1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=bonaire -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=CI %s
3 ; RUN: llc -march=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
6 define amdgpu_kernel void @atomic_add_i64_offset(ptr addrspace(1) %out, i64 %in) {
7 ; CI-LABEL: atomic_add_i64_offset:
8 ; CI: ; %bb.0: ; %entry
9 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
10 ; CI-NEXT: s_waitcnt lgkmcnt(0)
11 ; CI-NEXT: v_mov_b32_e32 v0, s2
12 ; CI-NEXT: v_mov_b32_e32 v1, s3
13 ; CI-NEXT: s_mov_b32 s3, 0xf000
14 ; CI-NEXT: s_mov_b32 s2, -1
15 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32
17 ; CI-NEXT: s_waitcnt vmcnt(0)
18 ; CI-NEXT: buffer_wbinvl1_vol
19 ; CI-NEXT: s_endpgm
20 ;
21 ; VI-LABEL: atomic_add_i64_offset:
22 ; VI: ; %bb.0: ; %entry
23 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
24 ; VI-NEXT: s_waitcnt lgkmcnt(0)
25 ; VI-NEXT: v_mov_b32_e32 v0, s2
26 ; VI-NEXT: v_mov_b32_e32 v1, s3
27 ; VI-NEXT: s_mov_b32 s3, 0xf000
28 ; VI-NEXT: s_mov_b32 s2, -1
29 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
30 ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:32
31 ; VI-NEXT: s_waitcnt vmcnt(0)
32 ; VI-NEXT: buffer_wbinvl1_vol
33 ; VI-NEXT: s_endpgm
34 ;
35 ; GFX9-LABEL: atomic_add_i64_offset:
36 ; GFX9: ; %bb.0: ; %entry
37 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
38 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
39 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
40 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
41 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
42 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
43 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
44 ; GFX9-NEXT: s_waitcnt vmcnt(0)
45 ; GFX9-NEXT: buffer_wbinvl1_vol
46 ; GFX9-NEXT: s_endpgm
47 entry:
48 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
49 %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
50 ret void
51 }
53 define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
54 ; CI-LABEL: atomic_add_i64_ret_offset:
55 ; CI: ; %bb.0: ; %entry
56 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
57 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
58 ; CI-NEXT: s_mov_b32 s3, 0xf000
59 ; CI-NEXT: s_mov_b32 s2, -1
60 ; CI-NEXT: s_waitcnt lgkmcnt(0)
61 ; CI-NEXT: s_mov_b32 s0, s6
62 ; CI-NEXT: s_mov_b32 s1, s7
63 ; CI-NEXT: v_mov_b32_e32 v0, s8
64 ; CI-NEXT: v_mov_b32_e32 v1, s9
65 ; CI-NEXT: s_mov_b32 s6, s2
66 ; CI-NEXT: s_mov_b32 s7, s3
67 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
68 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc
69 ; CI-NEXT: s_waitcnt vmcnt(0)
70 ; CI-NEXT: buffer_wbinvl1_vol
71 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
72 ; CI-NEXT: s_endpgm
73 ;
74 ; VI-LABEL: atomic_add_i64_ret_offset:
75 ; VI: ; %bb.0: ; %entry
76 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
77 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
78 ; VI-NEXT: s_mov_b32 s3, 0xf000
79 ; VI-NEXT: s_mov_b32 s2, -1
80 ; VI-NEXT: s_waitcnt lgkmcnt(0)
81 ; VI-NEXT: s_mov_b32 s0, s6
82 ; VI-NEXT: s_mov_b32 s1, s7
83 ; VI-NEXT: v_mov_b32_e32 v0, s8
84 ; VI-NEXT: v_mov_b32_e32 v1, s9
85 ; VI-NEXT: s_mov_b32 s6, s2
86 ; VI-NEXT: s_mov_b32 s7, s3
87 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
88 ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32 glc
89 ; VI-NEXT: s_waitcnt vmcnt(0)
90 ; VI-NEXT: buffer_wbinvl1_vol
91 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
92 ; VI-NEXT: s_endpgm
93 ;
94 ; GFX9-LABEL: atomic_add_i64_ret_offset:
95 ; GFX9: ; %bb.0: ; %entry
96 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
97 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
98 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
99 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
100 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
101 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
102 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
103 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
104 ; GFX9-NEXT: s_waitcnt vmcnt(0)
105 ; GFX9-NEXT: buffer_wbinvl1_vol
106 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
107 ; GFX9-NEXT: s_endpgm
108 entry:
109 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
110 %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
111 store i64 %tmp0, ptr addrspace(1) %out2
112 ret void
113 }
115 define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
116 ; CI-LABEL: atomic_add_i64_addr64_offset:
117 ; CI: ; %bb.0: ; %entry
118 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
119 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
120 ; CI-NEXT: s_waitcnt lgkmcnt(0)
121 ; CI-NEXT: v_mov_b32_e32 v0, s6
122 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
123 ; CI-NEXT: v_mov_b32_e32 v3, s1
124 ; CI-NEXT: v_mov_b32_e32 v1, s7
125 ; CI-NEXT: s_mov_b32 s7, 0xf000
126 ; CI-NEXT: s_mov_b32 s6, 0
127 ; CI-NEXT: v_mov_b32_e32 v2, s0
128 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
129 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
130 ; CI-NEXT: s_waitcnt vmcnt(0)
131 ; CI-NEXT: buffer_wbinvl1_vol
132 ; CI-NEXT: s_endpgm
133 ;
134 ; VI-LABEL: atomic_add_i64_addr64_offset:
135 ; VI: ; %bb.0: ; %entry
136 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
137 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
138 ; VI-NEXT: s_waitcnt lgkmcnt(0)
139 ; VI-NEXT: v_mov_b32_e32 v0, s6
140 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
141 ; VI-NEXT: s_add_u32 s0, s4, s0
142 ; VI-NEXT: s_addc_u32 s1, s5, s1
143 ; VI-NEXT: s_add_u32 s0, s0, 32
144 ; VI-NEXT: s_addc_u32 s1, s1, 0
145 ; VI-NEXT: v_mov_b32_e32 v3, s1
146 ; VI-NEXT: v_mov_b32_e32 v1, s7
147 ; VI-NEXT: v_mov_b32_e32 v2, s0
148 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
149 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
150 ; VI-NEXT: s_waitcnt vmcnt(0)
151 ; VI-NEXT: buffer_wbinvl1_vol
152 ; VI-NEXT: s_endpgm
153 ;
154 ; GFX9-LABEL: atomic_add_i64_addr64_offset:
155 ; GFX9: ; %bb.0: ; %entry
156 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
157 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
158 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
159 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
160 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
161 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
162 ; GFX9-NEXT: s_add_u32 s0, s4, s0
163 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
164 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
165 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
166 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1] offset:32
167 ; GFX9-NEXT: s_waitcnt vmcnt(0)
168 ; GFX9-NEXT: buffer_wbinvl1_vol
169 ; GFX9-NEXT: s_endpgm
170 entry:
171 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
172 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
173 %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
174 ret void
175 }
177 define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
178 ; CI-LABEL: atomic_add_i64_ret_addr64_offset:
179 ; CI: ; %bb.0: ; %entry
180 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
181 ; CI-NEXT: s_mov_b32 s11, 0xf000
182 ; CI-NEXT: s_mov_b32 s10, -1
183 ; CI-NEXT: s_waitcnt lgkmcnt(0)
184 ; CI-NEXT: v_mov_b32_e32 v0, s4
185 ; CI-NEXT: v_mov_b32_e32 v1, s5
186 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
187 ; CI-NEXT: v_mov_b32_e32 v2, s4
188 ; CI-NEXT: s_mov_b32 s8, s2
189 ; CI-NEXT: s_mov_b32 s9, s3
190 ; CI-NEXT: s_mov_b32 s2, 0
191 ; CI-NEXT: s_mov_b32 s3, s11
192 ; CI-NEXT: v_mov_b32_e32 v3, s5
193 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
194 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
195 ; CI-NEXT: s_waitcnt vmcnt(0)
196 ; CI-NEXT: buffer_wbinvl1_vol
197 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
198 ; CI-NEXT: s_endpgm
199 ;
200 ; VI-LABEL: atomic_add_i64_ret_addr64_offset:
201 ; VI: ; %bb.0: ; %entry
202 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
203 ; VI-NEXT: s_waitcnt lgkmcnt(0)
204 ; VI-NEXT: v_mov_b32_e32 v0, s4
205 ; VI-NEXT: v_mov_b32_e32 v1, s5
206 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
207 ; VI-NEXT: s_add_u32 s0, s0, s4
208 ; VI-NEXT: s_addc_u32 s1, s1, s5
209 ; VI-NEXT: s_add_u32 s0, s0, 32
210 ; VI-NEXT: s_addc_u32 s1, s1, 0
211 ; VI-NEXT: v_mov_b32_e32 v3, s1
212 ; VI-NEXT: v_mov_b32_e32 v2, s0
213 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
214 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
215 ; VI-NEXT: s_waitcnt vmcnt(0)
216 ; VI-NEXT: buffer_wbinvl1_vol
217 ; VI-NEXT: s_mov_b32 s7, 0xf000
218 ; VI-NEXT: s_mov_b32 s6, -1
219 ; VI-NEXT: s_mov_b32 s4, s2
220 ; VI-NEXT: s_mov_b32 s5, s3
221 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
222 ; VI-NEXT: s_endpgm
223 ;
224 ; GFX9-LABEL: atomic_add_i64_ret_addr64_offset:
225 ; GFX9: ; %bb.0: ; %entry
226 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
227 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
228 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
229 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
230 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
231 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
232 ; GFX9-NEXT: s_add_u32 s0, s0, s4
233 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
234 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
235 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
236 ; GFX9-NEXT: s_waitcnt vmcnt(0)
237 ; GFX9-NEXT: buffer_wbinvl1_vol
238 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
239 ; GFX9-NEXT: s_endpgm
240 entry:
241 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
242 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
243 %tmp0 = atomicrmw volatile add ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
244 store i64 %tmp0, ptr addrspace(1) %out2
245 ret void
246 }
248 define amdgpu_kernel void @atomic_add_i64(ptr addrspace(1) %out, i64 %in) {
249 ; CI-LABEL: atomic_add_i64:
250 ; CI: ; %bb.0: ; %entry
251 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
252 ; CI-NEXT: s_mov_b32 s7, 0xf000
253 ; CI-NEXT: s_mov_b32 s6, -1
254 ; CI-NEXT: s_waitcnt lgkmcnt(0)
255 ; CI-NEXT: s_mov_b32 s4, s0
256 ; CI-NEXT: s_mov_b32 s5, s1
257 ; CI-NEXT: v_mov_b32_e32 v0, s2
258 ; CI-NEXT: v_mov_b32_e32 v1, s3
259 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
260 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
261 ; CI-NEXT: s_waitcnt vmcnt(0)
262 ; CI-NEXT: buffer_wbinvl1_vol
263 ; CI-NEXT: s_endpgm
264 ;
265 ; VI-LABEL: atomic_add_i64:
266 ; VI: ; %bb.0: ; %entry
267 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
268 ; VI-NEXT: s_mov_b32 s7, 0xf000
269 ; VI-NEXT: s_mov_b32 s6, -1
270 ; VI-NEXT: s_waitcnt lgkmcnt(0)
271 ; VI-NEXT: s_mov_b32 s4, s0
272 ; VI-NEXT: s_mov_b32 s5, s1
273 ; VI-NEXT: v_mov_b32_e32 v0, s2
274 ; VI-NEXT: v_mov_b32_e32 v1, s3
275 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
276 ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
277 ; VI-NEXT: s_waitcnt vmcnt(0)
278 ; VI-NEXT: buffer_wbinvl1_vol
279 ; VI-NEXT: s_endpgm
280 ;
281 ; GFX9-LABEL: atomic_add_i64:
282 ; GFX9: ; %bb.0: ; %entry
283 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
284 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
285 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
286 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
287 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
288 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
289 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1]
290 ; GFX9-NEXT: s_waitcnt vmcnt(0)
291 ; GFX9-NEXT: buffer_wbinvl1_vol
292 ; GFX9-NEXT: s_endpgm
293 entry:
294 %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
295 ret void
296 }
298 define amdgpu_kernel void @atomic_add_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
299 ; CI-LABEL: atomic_add_i64_ret:
300 ; CI: ; %bb.0: ; %entry
301 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
302 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
303 ; CI-NEXT: s_mov_b32 s3, 0xf000
304 ; CI-NEXT: s_mov_b32 s2, -1
305 ; CI-NEXT: s_waitcnt lgkmcnt(0)
306 ; CI-NEXT: s_mov_b32 s0, s4
307 ; CI-NEXT: s_mov_b32 s1, s5
308 ; CI-NEXT: v_mov_b32_e32 v0, s8
309 ; CI-NEXT: v_mov_b32_e32 v1, s9
310 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
311 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc
312 ; CI-NEXT: s_waitcnt vmcnt(0)
313 ; CI-NEXT: buffer_wbinvl1_vol
314 ; CI-NEXT: s_mov_b32 s0, s6
315 ; CI-NEXT: s_mov_b32 s1, s7
316 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
317 ; CI-NEXT: s_endpgm
318 ;
319 ; VI-LABEL: atomic_add_i64_ret:
320 ; VI: ; %bb.0: ; %entry
321 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
322 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
323 ; VI-NEXT: s_mov_b32 s3, 0xf000
324 ; VI-NEXT: s_mov_b32 s2, -1
325 ; VI-NEXT: s_waitcnt lgkmcnt(0)
326 ; VI-NEXT: s_mov_b32 s0, s4
327 ; VI-NEXT: s_mov_b32 s1, s5
328 ; VI-NEXT: v_mov_b32_e32 v0, s8
329 ; VI-NEXT: v_mov_b32_e32 v1, s9
330 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
331 ; VI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 glc
332 ; VI-NEXT: s_waitcnt vmcnt(0)
333 ; VI-NEXT: buffer_wbinvl1_vol
334 ; VI-NEXT: s_mov_b32 s0, s6
335 ; VI-NEXT: s_mov_b32 s1, s7
336 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
337 ; VI-NEXT: s_endpgm
338 ;
339 ; GFX9-LABEL: atomic_add_i64_ret:
340 ; GFX9: ; %bb.0: ; %entry
341 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
342 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
343 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
344 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
345 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
346 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
347 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
348 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[4:5] glc
349 ; GFX9-NEXT: s_waitcnt vmcnt(0)
350 ; GFX9-NEXT: buffer_wbinvl1_vol
351 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
352 ; GFX9-NEXT: s_endpgm
353 entry:
354 %tmp0 = atomicrmw volatile add ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
355 store i64 %tmp0, ptr addrspace(1) %out2
356 ret void
357 }
359 define amdgpu_kernel void @atomic_add_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
360 ; CI-LABEL: atomic_add_i64_addr64:
361 ; CI: ; %bb.0: ; %entry
362 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
363 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
364 ; CI-NEXT: s_waitcnt lgkmcnt(0)
365 ; CI-NEXT: v_mov_b32_e32 v0, s6
366 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
367 ; CI-NEXT: v_mov_b32_e32 v3, s1
368 ; CI-NEXT: v_mov_b32_e32 v1, s7
369 ; CI-NEXT: s_mov_b32 s7, 0xf000
370 ; CI-NEXT: s_mov_b32 s6, 0
371 ; CI-NEXT: v_mov_b32_e32 v2, s0
372 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
373 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[4:7], 0 addr64
374 ; CI-NEXT: s_waitcnt vmcnt(0)
375 ; CI-NEXT: buffer_wbinvl1_vol
376 ; CI-NEXT: s_endpgm
377 ;
378 ; VI-LABEL: atomic_add_i64_addr64:
379 ; VI: ; %bb.0: ; %entry
380 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
381 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
382 ; VI-NEXT: s_waitcnt lgkmcnt(0)
383 ; VI-NEXT: v_mov_b32_e32 v0, s6
384 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
385 ; VI-NEXT: s_add_u32 s0, s4, s0
386 ; VI-NEXT: s_addc_u32 s1, s5, s1
387 ; VI-NEXT: v_mov_b32_e32 v3, s1
388 ; VI-NEXT: v_mov_b32_e32 v1, s7
389 ; VI-NEXT: v_mov_b32_e32 v2, s0
390 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
391 ; VI-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
392 ; VI-NEXT: s_waitcnt vmcnt(0)
393 ; VI-NEXT: buffer_wbinvl1_vol
394 ; VI-NEXT: s_endpgm
395 ;
396 ; GFX9-LABEL: atomic_add_i64_addr64:
397 ; GFX9: ; %bb.0: ; %entry
398 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
399 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
400 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
401 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
402 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
403 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
404 ; GFX9-NEXT: s_add_u32 s0, s4, s0
405 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
406 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
407 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
408 ; GFX9-NEXT: global_atomic_add_x2 v2, v[0:1], s[0:1]
409 ; GFX9-NEXT: s_waitcnt vmcnt(0)
410 ; GFX9-NEXT: buffer_wbinvl1_vol
411 ; GFX9-NEXT: s_endpgm
412 entry:
413 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
414 %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
415 ret void
416 }
418 define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
419 ; CI-LABEL: atomic_add_i64_ret_addr64:
420 ; CI: ; %bb.0: ; %entry
421 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
422 ; CI-NEXT: s_mov_b32 s11, 0xf000
423 ; CI-NEXT: s_mov_b32 s10, -1
424 ; CI-NEXT: s_waitcnt lgkmcnt(0)
425 ; CI-NEXT: v_mov_b32_e32 v0, s4
426 ; CI-NEXT: v_mov_b32_e32 v1, s5
427 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
428 ; CI-NEXT: v_mov_b32_e32 v2, s4
429 ; CI-NEXT: s_mov_b32 s8, s2
430 ; CI-NEXT: s_mov_b32 s9, s3
431 ; CI-NEXT: s_mov_b32 s2, 0
432 ; CI-NEXT: s_mov_b32 s3, s11
433 ; CI-NEXT: v_mov_b32_e32 v3, s5
434 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
435 ; CI-NEXT: buffer_atomic_add_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
436 ; CI-NEXT: s_waitcnt vmcnt(0)
437 ; CI-NEXT: buffer_wbinvl1_vol
438 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
439 ; CI-NEXT: s_endpgm
440 ;
441 ; VI-LABEL: atomic_add_i64_ret_addr64:
442 ; VI: ; %bb.0: ; %entry
443 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
444 ; VI-NEXT: s_waitcnt lgkmcnt(0)
445 ; VI-NEXT: v_mov_b32_e32 v0, s4
446 ; VI-NEXT: v_mov_b32_e32 v1, s5
447 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
448 ; VI-NEXT: s_add_u32 s0, s0, s4
449 ; VI-NEXT: s_addc_u32 s1, s1, s5
450 ; VI-NEXT: v_mov_b32_e32 v3, s1
451 ; VI-NEXT: v_mov_b32_e32 v2, s0
452 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
453 ; VI-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
454 ; VI-NEXT: s_waitcnt vmcnt(0)
455 ; VI-NEXT: buffer_wbinvl1_vol
456 ; VI-NEXT: s_mov_b32 s7, 0xf000
457 ; VI-NEXT: s_mov_b32 s6, -1
458 ; VI-NEXT: s_mov_b32 s4, s2
459 ; VI-NEXT: s_mov_b32 s5, s3
460 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
461 ; VI-NEXT: s_endpgm
462 ;
463 ; GFX9-LABEL: atomic_add_i64_ret_addr64:
464 ; GFX9: ; %bb.0: ; %entry
465 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
466 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
467 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
468 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
469 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
470 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
471 ; GFX9-NEXT: s_add_u32 s0, s0, s4
472 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
473 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
474 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v2, v[0:1], s[0:1] glc
475 ; GFX9-NEXT: s_waitcnt vmcnt(0)
476 ; GFX9-NEXT: buffer_wbinvl1_vol
477 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
478 ; GFX9-NEXT: s_endpgm
479 entry:
480 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
481 %tmp0 = atomicrmw volatile add ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
482 store i64 %tmp0, ptr addrspace(1) %out2
483 ret void
484 }
486 define amdgpu_kernel void @atomic_and_i64_offset(ptr addrspace(1) %out, i64 %in) {
487 ; CI-LABEL: atomic_and_i64_offset:
488 ; CI: ; %bb.0: ; %entry
489 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
490 ; CI-NEXT: s_waitcnt lgkmcnt(0)
491 ; CI-NEXT: v_mov_b32_e32 v0, s2
492 ; CI-NEXT: v_mov_b32_e32 v1, s3
493 ; CI-NEXT: s_mov_b32 s3, 0xf000
494 ; CI-NEXT: s_mov_b32 s2, -1
495 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
496 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32
497 ; CI-NEXT: s_waitcnt vmcnt(0)
498 ; CI-NEXT: buffer_wbinvl1_vol
499 ; CI-NEXT: s_endpgm
500 ;
501 ; VI-LABEL: atomic_and_i64_offset:
502 ; VI: ; %bb.0: ; %entry
503 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
504 ; VI-NEXT: s_waitcnt lgkmcnt(0)
505 ; VI-NEXT: v_mov_b32_e32 v0, s2
506 ; VI-NEXT: v_mov_b32_e32 v1, s3
507 ; VI-NEXT: s_mov_b32 s3, 0xf000
508 ; VI-NEXT: s_mov_b32 s2, -1
509 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
510 ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:32
511 ; VI-NEXT: s_waitcnt vmcnt(0)
512 ; VI-NEXT: buffer_wbinvl1_vol
513 ; VI-NEXT: s_endpgm
514 ;
515 ; GFX9-LABEL: atomic_and_i64_offset:
516 ; GFX9: ; %bb.0: ; %entry
517 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
518 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
519 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
520 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
521 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
522 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
523 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
524 ; GFX9-NEXT: s_waitcnt vmcnt(0)
525 ; GFX9-NEXT: buffer_wbinvl1_vol
526 ; GFX9-NEXT: s_endpgm
527 entry:
528 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
529 %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
530 ret void
531 }
533 define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
534 ; CI-LABEL: atomic_and_i64_ret_offset:
535 ; CI: ; %bb.0: ; %entry
536 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
537 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
538 ; CI-NEXT: s_mov_b32 s3, 0xf000
539 ; CI-NEXT: s_mov_b32 s2, -1
540 ; CI-NEXT: s_waitcnt lgkmcnt(0)
541 ; CI-NEXT: s_mov_b32 s0, s6
542 ; CI-NEXT: s_mov_b32 s1, s7
543 ; CI-NEXT: v_mov_b32_e32 v0, s8
544 ; CI-NEXT: v_mov_b32_e32 v1, s9
545 ; CI-NEXT: s_mov_b32 s6, s2
546 ; CI-NEXT: s_mov_b32 s7, s3
547 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
548 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc
549 ; CI-NEXT: s_waitcnt vmcnt(0)
550 ; CI-NEXT: buffer_wbinvl1_vol
551 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
552 ; CI-NEXT: s_endpgm
553 ;
554 ; VI-LABEL: atomic_and_i64_ret_offset:
555 ; VI: ; %bb.0: ; %entry
556 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
557 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
558 ; VI-NEXT: s_mov_b32 s3, 0xf000
559 ; VI-NEXT: s_mov_b32 s2, -1
560 ; VI-NEXT: s_waitcnt lgkmcnt(0)
561 ; VI-NEXT: s_mov_b32 s0, s6
562 ; VI-NEXT: s_mov_b32 s1, s7
563 ; VI-NEXT: v_mov_b32_e32 v0, s8
564 ; VI-NEXT: v_mov_b32_e32 v1, s9
565 ; VI-NEXT: s_mov_b32 s6, s2
566 ; VI-NEXT: s_mov_b32 s7, s3
567 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
568 ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32 glc
569 ; VI-NEXT: s_waitcnt vmcnt(0)
570 ; VI-NEXT: buffer_wbinvl1_vol
571 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
572 ; VI-NEXT: s_endpgm
573 ;
574 ; GFX9-LABEL: atomic_and_i64_ret_offset:
575 ; GFX9: ; %bb.0: ; %entry
576 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
577 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
578 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
579 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
581 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
582 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
583 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
584 ; GFX9-NEXT: s_waitcnt vmcnt(0)
585 ; GFX9-NEXT: buffer_wbinvl1_vol
586 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
587 ; GFX9-NEXT: s_endpgm
588 entry:
589 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
590 %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
591 store i64 %tmp0, ptr addrspace(1) %out2
592 ret void
593 }
595 define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
596 ; CI-LABEL: atomic_and_i64_addr64_offset:
597 ; CI: ; %bb.0: ; %entry
598 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
599 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
600 ; CI-NEXT: s_waitcnt lgkmcnt(0)
601 ; CI-NEXT: v_mov_b32_e32 v0, s6
602 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
603 ; CI-NEXT: v_mov_b32_e32 v3, s1
604 ; CI-NEXT: v_mov_b32_e32 v1, s7
605 ; CI-NEXT: s_mov_b32 s7, 0xf000
606 ; CI-NEXT: s_mov_b32 s6, 0
607 ; CI-NEXT: v_mov_b32_e32 v2, s0
608 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
609 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
610 ; CI-NEXT: s_waitcnt vmcnt(0)
611 ; CI-NEXT: buffer_wbinvl1_vol
612 ; CI-NEXT: s_endpgm
613 ;
614 ; VI-LABEL: atomic_and_i64_addr64_offset:
615 ; VI: ; %bb.0: ; %entry
616 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
617 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
618 ; VI-NEXT: s_waitcnt lgkmcnt(0)
619 ; VI-NEXT: v_mov_b32_e32 v0, s6
620 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
621 ; VI-NEXT: s_add_u32 s0, s4, s0
622 ; VI-NEXT: s_addc_u32 s1, s5, s1
623 ; VI-NEXT: s_add_u32 s0, s0, 32
624 ; VI-NEXT: s_addc_u32 s1, s1, 0
625 ; VI-NEXT: v_mov_b32_e32 v3, s1
626 ; VI-NEXT: v_mov_b32_e32 v1, s7
627 ; VI-NEXT: v_mov_b32_e32 v2, s0
628 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
629 ; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
630 ; VI-NEXT: s_waitcnt vmcnt(0)
631 ; VI-NEXT: buffer_wbinvl1_vol
632 ; VI-NEXT: s_endpgm
633 ;
634 ; GFX9-LABEL: atomic_and_i64_addr64_offset:
635 ; GFX9: ; %bb.0: ; %entry
636 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
637 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
638 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
639 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
640 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
641 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
642 ; GFX9-NEXT: s_add_u32 s0, s4, s0
643 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
644 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
645 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
646 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1] offset:32
647 ; GFX9-NEXT: s_waitcnt vmcnt(0)
648 ; GFX9-NEXT: buffer_wbinvl1_vol
649 ; GFX9-NEXT: s_endpgm
650 entry:
651 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
652 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
653 %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
654 ret void
655 }
657 define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
658 ; CI-LABEL: atomic_and_i64_ret_addr64_offset:
659 ; CI: ; %bb.0: ; %entry
660 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
661 ; CI-NEXT: s_mov_b32 s11, 0xf000
662 ; CI-NEXT: s_mov_b32 s10, -1
663 ; CI-NEXT: s_waitcnt lgkmcnt(0)
664 ; CI-NEXT: v_mov_b32_e32 v0, s4
665 ; CI-NEXT: v_mov_b32_e32 v1, s5
666 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
667 ; CI-NEXT: v_mov_b32_e32 v2, s4
668 ; CI-NEXT: s_mov_b32 s8, s2
669 ; CI-NEXT: s_mov_b32 s9, s3
670 ; CI-NEXT: s_mov_b32 s2, 0
671 ; CI-NEXT: s_mov_b32 s3, s11
672 ; CI-NEXT: v_mov_b32_e32 v3, s5
673 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
674 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
675 ; CI-NEXT: s_waitcnt vmcnt(0)
676 ; CI-NEXT: buffer_wbinvl1_vol
677 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
678 ; CI-NEXT: s_endpgm
679 ;
680 ; VI-LABEL: atomic_and_i64_ret_addr64_offset:
681 ; VI: ; %bb.0: ; %entry
682 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
683 ; VI-NEXT: s_waitcnt lgkmcnt(0)
684 ; VI-NEXT: v_mov_b32_e32 v0, s4
685 ; VI-NEXT: v_mov_b32_e32 v1, s5
686 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
687 ; VI-NEXT: s_add_u32 s0, s0, s4
688 ; VI-NEXT: s_addc_u32 s1, s1, s5
689 ; VI-NEXT: s_add_u32 s0, s0, 32
690 ; VI-NEXT: s_addc_u32 s1, s1, 0
691 ; VI-NEXT: v_mov_b32_e32 v3, s1
692 ; VI-NEXT: v_mov_b32_e32 v2, s0
693 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
694 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
695 ; VI-NEXT: s_waitcnt vmcnt(0)
696 ; VI-NEXT: buffer_wbinvl1_vol
697 ; VI-NEXT: s_mov_b32 s7, 0xf000
698 ; VI-NEXT: s_mov_b32 s6, -1
699 ; VI-NEXT: s_mov_b32 s4, s2
700 ; VI-NEXT: s_mov_b32 s5, s3
701 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
702 ; VI-NEXT: s_endpgm
703 ;
704 ; GFX9-LABEL: atomic_and_i64_ret_addr64_offset:
705 ; GFX9: ; %bb.0: ; %entry
706 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
707 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
708 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
710 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
711 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
712 ; GFX9-NEXT: s_add_u32 s0, s0, s4
713 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
714 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
715 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
716 ; GFX9-NEXT: s_waitcnt vmcnt(0)
717 ; GFX9-NEXT: buffer_wbinvl1_vol
718 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
719 ; GFX9-NEXT: s_endpgm
720 entry:
721 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
722 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
723 %tmp0 = atomicrmw volatile and ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
724 store i64 %tmp0, ptr addrspace(1) %out2
725 ret void
726 }
728 define amdgpu_kernel void @atomic_and_i64(ptr addrspace(1) %out, i64 %in) {
729 ; CI-LABEL: atomic_and_i64:
730 ; CI: ; %bb.0: ; %entry
731 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
732 ; CI-NEXT: s_mov_b32 s7, 0xf000
733 ; CI-NEXT: s_mov_b32 s6, -1
734 ; CI-NEXT: s_waitcnt lgkmcnt(0)
735 ; CI-NEXT: s_mov_b32 s4, s0
736 ; CI-NEXT: s_mov_b32 s5, s1
737 ; CI-NEXT: v_mov_b32_e32 v0, s2
738 ; CI-NEXT: v_mov_b32_e32 v1, s3
739 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
740 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
741 ; CI-NEXT: s_waitcnt vmcnt(0)
742 ; CI-NEXT: buffer_wbinvl1_vol
743 ; CI-NEXT: s_endpgm
744 ;
745 ; VI-LABEL: atomic_and_i64:
746 ; VI: ; %bb.0: ; %entry
747 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
748 ; VI-NEXT: s_mov_b32 s7, 0xf000
749 ; VI-NEXT: s_mov_b32 s6, -1
750 ; VI-NEXT: s_waitcnt lgkmcnt(0)
751 ; VI-NEXT: s_mov_b32 s4, s0
752 ; VI-NEXT: s_mov_b32 s5, s1
753 ; VI-NEXT: v_mov_b32_e32 v0, s2
754 ; VI-NEXT: v_mov_b32_e32 v1, s3
755 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
756 ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
757 ; VI-NEXT: s_waitcnt vmcnt(0)
758 ; VI-NEXT: buffer_wbinvl1_vol
759 ; VI-NEXT: s_endpgm
760 ;
761 ; GFX9-LABEL: atomic_and_i64:
762 ; GFX9: ; %bb.0: ; %entry
763 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
764 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
765 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
766 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
767 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
768 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
769 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1]
770 ; GFX9-NEXT: s_waitcnt vmcnt(0)
771 ; GFX9-NEXT: buffer_wbinvl1_vol
772 ; GFX9-NEXT: s_endpgm
773 entry:
774 %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
775 ret void
776 }
778 define amdgpu_kernel void @atomic_and_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
779 ; CI-LABEL: atomic_and_i64_ret:
780 ; CI: ; %bb.0: ; %entry
781 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
782 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
783 ; CI-NEXT: s_mov_b32 s3, 0xf000
784 ; CI-NEXT: s_mov_b32 s2, -1
785 ; CI-NEXT: s_waitcnt lgkmcnt(0)
786 ; CI-NEXT: s_mov_b32 s0, s4
787 ; CI-NEXT: s_mov_b32 s1, s5
788 ; CI-NEXT: v_mov_b32_e32 v0, s8
789 ; CI-NEXT: v_mov_b32_e32 v1, s9
790 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
791 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc
792 ; CI-NEXT: s_waitcnt vmcnt(0)
793 ; CI-NEXT: buffer_wbinvl1_vol
794 ; CI-NEXT: s_mov_b32 s0, s6
795 ; CI-NEXT: s_mov_b32 s1, s7
796 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
797 ; CI-NEXT: s_endpgm
798 ;
799 ; VI-LABEL: atomic_and_i64_ret:
800 ; VI: ; %bb.0: ; %entry
801 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
802 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
803 ; VI-NEXT: s_mov_b32 s3, 0xf000
804 ; VI-NEXT: s_mov_b32 s2, -1
805 ; VI-NEXT: s_waitcnt lgkmcnt(0)
806 ; VI-NEXT: s_mov_b32 s0, s4
807 ; VI-NEXT: s_mov_b32 s1, s5
808 ; VI-NEXT: v_mov_b32_e32 v0, s8
809 ; VI-NEXT: v_mov_b32_e32 v1, s9
810 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
811 ; VI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 glc
812 ; VI-NEXT: s_waitcnt vmcnt(0)
813 ; VI-NEXT: buffer_wbinvl1_vol
814 ; VI-NEXT: s_mov_b32 s0, s6
815 ; VI-NEXT: s_mov_b32 s1, s7
816 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
817 ; VI-NEXT: s_endpgm
818 ;
819 ; GFX9-LABEL: atomic_and_i64_ret:
820 ; GFX9: ; %bb.0: ; %entry
821 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
822 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
823 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
824 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
825 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
826 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
827 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
828 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[4:5] glc
829 ; GFX9-NEXT: s_waitcnt vmcnt(0)
830 ; GFX9-NEXT: buffer_wbinvl1_vol
831 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
832 ; GFX9-NEXT: s_endpgm
833 entry:
834 %tmp0 = atomicrmw volatile and ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
835 store i64 %tmp0, ptr addrspace(1) %out2
836 ret void
837 }
839 define amdgpu_kernel void @atomic_and_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
840 ; CI-LABEL: atomic_and_i64_addr64:
841 ; CI: ; %bb.0: ; %entry
842 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
843 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
844 ; CI-NEXT: s_waitcnt lgkmcnt(0)
845 ; CI-NEXT: v_mov_b32_e32 v0, s6
846 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
847 ; CI-NEXT: v_mov_b32_e32 v3, s1
848 ; CI-NEXT: v_mov_b32_e32 v1, s7
849 ; CI-NEXT: s_mov_b32 s7, 0xf000
850 ; CI-NEXT: s_mov_b32 s6, 0
851 ; CI-NEXT: v_mov_b32_e32 v2, s0
852 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
853 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[4:7], 0 addr64
854 ; CI-NEXT: s_waitcnt vmcnt(0)
855 ; CI-NEXT: buffer_wbinvl1_vol
856 ; CI-NEXT: s_endpgm
857 ;
858 ; VI-LABEL: atomic_and_i64_addr64:
859 ; VI: ; %bb.0: ; %entry
860 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
861 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
862 ; VI-NEXT: s_waitcnt lgkmcnt(0)
863 ; VI-NEXT: v_mov_b32_e32 v0, s6
864 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
865 ; VI-NEXT: s_add_u32 s0, s4, s0
866 ; VI-NEXT: s_addc_u32 s1, s5, s1
867 ; VI-NEXT: v_mov_b32_e32 v3, s1
868 ; VI-NEXT: v_mov_b32_e32 v1, s7
869 ; VI-NEXT: v_mov_b32_e32 v2, s0
870 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
871 ; VI-NEXT: flat_atomic_and_x2 v[2:3], v[0:1]
872 ; VI-NEXT: s_waitcnt vmcnt(0)
873 ; VI-NEXT: buffer_wbinvl1_vol
874 ; VI-NEXT: s_endpgm
875 ;
876 ; GFX9-LABEL: atomic_and_i64_addr64:
877 ; GFX9: ; %bb.0: ; %entry
878 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
879 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
880 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
881 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
882 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
883 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
884 ; GFX9-NEXT: s_add_u32 s0, s4, s0
885 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
886 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
887 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
888 ; GFX9-NEXT: global_atomic_and_x2 v2, v[0:1], s[0:1]
889 ; GFX9-NEXT: s_waitcnt vmcnt(0)
890 ; GFX9-NEXT: buffer_wbinvl1_vol
891 ; GFX9-NEXT: s_endpgm
892 entry:
893 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
894 %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
895 ret void
896 }
898 define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
899 ; CI-LABEL: atomic_and_i64_ret_addr64:
900 ; CI: ; %bb.0: ; %entry
901 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
902 ; CI-NEXT: s_mov_b32 s11, 0xf000
903 ; CI-NEXT: s_mov_b32 s10, -1
904 ; CI-NEXT: s_waitcnt lgkmcnt(0)
905 ; CI-NEXT: v_mov_b32_e32 v0, s4
906 ; CI-NEXT: v_mov_b32_e32 v1, s5
907 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
908 ; CI-NEXT: v_mov_b32_e32 v2, s4
909 ; CI-NEXT: s_mov_b32 s8, s2
910 ; CI-NEXT: s_mov_b32 s9, s3
911 ; CI-NEXT: s_mov_b32 s2, 0
912 ; CI-NEXT: s_mov_b32 s3, s11
913 ; CI-NEXT: v_mov_b32_e32 v3, s5
914 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
915 ; CI-NEXT: buffer_atomic_and_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
916 ; CI-NEXT: s_waitcnt vmcnt(0)
917 ; CI-NEXT: buffer_wbinvl1_vol
918 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
919 ; CI-NEXT: s_endpgm
920 ;
921 ; VI-LABEL: atomic_and_i64_ret_addr64:
922 ; VI: ; %bb.0: ; %entry
923 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
924 ; VI-NEXT: s_waitcnt lgkmcnt(0)
925 ; VI-NEXT: v_mov_b32_e32 v0, s4
926 ; VI-NEXT: v_mov_b32_e32 v1, s5
927 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
928 ; VI-NEXT: s_add_u32 s0, s0, s4
929 ; VI-NEXT: s_addc_u32 s1, s1, s5
930 ; VI-NEXT: v_mov_b32_e32 v3, s1
931 ; VI-NEXT: v_mov_b32_e32 v2, s0
932 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
933 ; VI-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc
934 ; VI-NEXT: s_waitcnt vmcnt(0)
935 ; VI-NEXT: buffer_wbinvl1_vol
936 ; VI-NEXT: s_mov_b32 s7, 0xf000
937 ; VI-NEXT: s_mov_b32 s6, -1
938 ; VI-NEXT: s_mov_b32 s4, s2
939 ; VI-NEXT: s_mov_b32 s5, s3
940 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
941 ; VI-NEXT: s_endpgm
942 ;
943 ; GFX9-LABEL: atomic_and_i64_ret_addr64:
944 ; GFX9: ; %bb.0: ; %entry
945 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
946 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
947 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
948 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
949 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
950 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
951 ; GFX9-NEXT: s_add_u32 s0, s0, s4
952 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
953 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
954 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v2, v[0:1], s[0:1] glc
955 ; GFX9-NEXT: s_waitcnt vmcnt(0)
956 ; GFX9-NEXT: buffer_wbinvl1_vol
957 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
958 ; GFX9-NEXT: s_endpgm
959 entry:
960 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
961 %tmp0 = atomicrmw volatile and ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
962 store i64 %tmp0, ptr addrspace(1) %out2
963 ret void
964 }
966 define amdgpu_kernel void @atomic_sub_i64_offset(ptr addrspace(1) %out, i64 %in) {
967 ; CI-LABEL: atomic_sub_i64_offset:
968 ; CI: ; %bb.0: ; %entry
969 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
970 ; CI-NEXT: s_waitcnt lgkmcnt(0)
971 ; CI-NEXT: v_mov_b32_e32 v0, s2
972 ; CI-NEXT: v_mov_b32_e32 v1, s3
973 ; CI-NEXT: s_mov_b32 s3, 0xf000
974 ; CI-NEXT: s_mov_b32 s2, -1
975 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
976 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32
977 ; CI-NEXT: s_waitcnt vmcnt(0)
978 ; CI-NEXT: buffer_wbinvl1_vol
979 ; CI-NEXT: s_endpgm
980 ;
981 ; VI-LABEL: atomic_sub_i64_offset:
982 ; VI: ; %bb.0: ; %entry
983 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
984 ; VI-NEXT: s_waitcnt lgkmcnt(0)
985 ; VI-NEXT: v_mov_b32_e32 v0, s2
986 ; VI-NEXT: v_mov_b32_e32 v1, s3
987 ; VI-NEXT: s_mov_b32 s3, 0xf000
988 ; VI-NEXT: s_mov_b32 s2, -1
989 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
990 ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:32
991 ; VI-NEXT: s_waitcnt vmcnt(0)
992 ; VI-NEXT: buffer_wbinvl1_vol
993 ; VI-NEXT: s_endpgm
994 ;
995 ; GFX9-LABEL: atomic_sub_i64_offset:
996 ; GFX9: ; %bb.0: ; %entry
997 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
998 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
999 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1000 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1001 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1002 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1003 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
1004 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1005 ; GFX9-NEXT: buffer_wbinvl1_vol
1006 ; GFX9-NEXT: s_endpgm
1007 entry:
1008 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1009 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
1010 ret void
1011 }
1013 define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
1014 ; CI-LABEL: atomic_sub_i64_ret_offset:
1015 ; CI: ; %bb.0: ; %entry
1016 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1017 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1018 ; CI-NEXT: s_mov_b32 s3, 0xf000
1019 ; CI-NEXT: s_mov_b32 s2, -1
1020 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1021 ; CI-NEXT: s_mov_b32 s0, s6
1022 ; CI-NEXT: s_mov_b32 s1, s7
1023 ; CI-NEXT: v_mov_b32_e32 v0, s8
1024 ; CI-NEXT: v_mov_b32_e32 v1, s9
1025 ; CI-NEXT: s_mov_b32 s6, s2
1026 ; CI-NEXT: s_mov_b32 s7, s3
1027 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1028 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1029 ; CI-NEXT: s_waitcnt vmcnt(0)
1030 ; CI-NEXT: buffer_wbinvl1_vol
1031 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1032 ; CI-NEXT: s_endpgm
1033 ;
1034 ; VI-LABEL: atomic_sub_i64_ret_offset:
1035 ; VI: ; %bb.0: ; %entry
1036 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1037 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1038 ; VI-NEXT: s_mov_b32 s3, 0xf000
1039 ; VI-NEXT: s_mov_b32 s2, -1
1040 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1041 ; VI-NEXT: s_mov_b32 s0, s6
1042 ; VI-NEXT: s_mov_b32 s1, s7
1043 ; VI-NEXT: v_mov_b32_e32 v0, s8
1044 ; VI-NEXT: v_mov_b32_e32 v1, s9
1045 ; VI-NEXT: s_mov_b32 s6, s2
1046 ; VI-NEXT: s_mov_b32 s7, s3
1047 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1048 ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1049 ; VI-NEXT: s_waitcnt vmcnt(0)
1050 ; VI-NEXT: buffer_wbinvl1_vol
1051 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1052 ; VI-NEXT: s_endpgm
1053 ;
1054 ; GFX9-LABEL: atomic_sub_i64_ret_offset:
1055 ; GFX9: ; %bb.0: ; %entry
1056 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1057 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1058 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1059 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1060 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1061 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1062 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1063 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1064 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1065 ; GFX9-NEXT: buffer_wbinvl1_vol
1066 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1067 ; GFX9-NEXT: s_endpgm
1068 entry:
1069 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1070 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
1071 store i64 %tmp0, ptr addrspace(1) %out2
1072 ret void
1073 }
1075 define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
1076 ; CI-LABEL: atomic_sub_i64_addr64_offset:
1077 ; CI: ; %bb.0: ; %entry
1078 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1079 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1080 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1081 ; CI-NEXT: v_mov_b32_e32 v0, s6
1082 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1083 ; CI-NEXT: v_mov_b32_e32 v3, s1
1084 ; CI-NEXT: v_mov_b32_e32 v1, s7
1085 ; CI-NEXT: s_mov_b32 s7, 0xf000
1086 ; CI-NEXT: s_mov_b32 s6, 0
1087 ; CI-NEXT: v_mov_b32_e32 v2, s0
1088 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1089 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
1090 ; CI-NEXT: s_waitcnt vmcnt(0)
1091 ; CI-NEXT: buffer_wbinvl1_vol
1092 ; CI-NEXT: s_endpgm
1093 ;
1094 ; VI-LABEL: atomic_sub_i64_addr64_offset:
1095 ; VI: ; %bb.0: ; %entry
1096 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1097 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1098 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1099 ; VI-NEXT: v_mov_b32_e32 v0, s6
1100 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1101 ; VI-NEXT: s_add_u32 s0, s4, s0
1102 ; VI-NEXT: s_addc_u32 s1, s5, s1
1103 ; VI-NEXT: s_add_u32 s0, s0, 32
1104 ; VI-NEXT: s_addc_u32 s1, s1, 0
1105 ; VI-NEXT: v_mov_b32_e32 v3, s1
1106 ; VI-NEXT: v_mov_b32_e32 v1, s7
1107 ; VI-NEXT: v_mov_b32_e32 v2, s0
1108 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1109 ; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1110 ; VI-NEXT: s_waitcnt vmcnt(0)
1111 ; VI-NEXT: buffer_wbinvl1_vol
1112 ; VI-NEXT: s_endpgm
1113 ;
1114 ; GFX9-LABEL: atomic_sub_i64_addr64_offset:
1115 ; GFX9: ; %bb.0: ; %entry
1116 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1117 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1118 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1119 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1120 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1121 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
1122 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1123 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1124 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1125 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1126 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1] offset:32
1127 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1128 ; GFX9-NEXT: buffer_wbinvl1_vol
1129 ; GFX9-NEXT: s_endpgm
1130 entry:
1131 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1132 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
1133 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
1134 ret void
1135 }
1137 define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
1138 ; CI-LABEL: atomic_sub_i64_ret_addr64_offset:
1139 ; CI: ; %bb.0: ; %entry
1140 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1141 ; CI-NEXT: s_mov_b32 s11, 0xf000
1142 ; CI-NEXT: s_mov_b32 s10, -1
1143 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1144 ; CI-NEXT: v_mov_b32_e32 v0, s4
1145 ; CI-NEXT: v_mov_b32_e32 v1, s5
1146 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1147 ; CI-NEXT: v_mov_b32_e32 v2, s4
1148 ; CI-NEXT: s_mov_b32 s8, s2
1149 ; CI-NEXT: s_mov_b32 s9, s3
1150 ; CI-NEXT: s_mov_b32 s2, 0
1151 ; CI-NEXT: s_mov_b32 s3, s11
1152 ; CI-NEXT: v_mov_b32_e32 v3, s5
1153 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1154 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
1155 ; CI-NEXT: s_waitcnt vmcnt(0)
1156 ; CI-NEXT: buffer_wbinvl1_vol
1157 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1158 ; CI-NEXT: s_endpgm
1159 ;
1160 ; VI-LABEL: atomic_sub_i64_ret_addr64_offset:
1161 ; VI: ; %bb.0: ; %entry
1162 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1163 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1164 ; VI-NEXT: v_mov_b32_e32 v0, s4
1165 ; VI-NEXT: v_mov_b32_e32 v1, s5
1166 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1167 ; VI-NEXT: s_add_u32 s0, s0, s4
1168 ; VI-NEXT: s_addc_u32 s1, s1, s5
1169 ; VI-NEXT: s_add_u32 s0, s0, 32
1170 ; VI-NEXT: s_addc_u32 s1, s1, 0
1171 ; VI-NEXT: v_mov_b32_e32 v3, s1
1172 ; VI-NEXT: v_mov_b32_e32 v2, s0
1173 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1174 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1175 ; VI-NEXT: s_waitcnt vmcnt(0)
1176 ; VI-NEXT: buffer_wbinvl1_vol
1177 ; VI-NEXT: s_mov_b32 s7, 0xf000
1178 ; VI-NEXT: s_mov_b32 s6, -1
1179 ; VI-NEXT: s_mov_b32 s4, s2
1180 ; VI-NEXT: s_mov_b32 s5, s3
1181 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1182 ; VI-NEXT: s_endpgm
1183 ;
1184 ; GFX9-LABEL: atomic_sub_i64_ret_addr64_offset:
1185 ; GFX9: ; %bb.0: ; %entry
1186 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1187 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1188 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1189 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1190 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1191 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1192 ; GFX9-NEXT: s_add_u32 s0, s0, s4
1193 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
1194 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1195 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
1196 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1197 ; GFX9-NEXT: buffer_wbinvl1_vol
1198 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1199 ; GFX9-NEXT: s_endpgm
1200 entry:
1201 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1202 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
1203 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
1204 store i64 %tmp0, ptr addrspace(1) %out2
1205 ret void
1206 }
1208 define amdgpu_kernel void @atomic_sub_i64(ptr addrspace(1) %out, i64 %in) {
1209 ; CI-LABEL: atomic_sub_i64:
1210 ; CI: ; %bb.0: ; %entry
1211 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1212 ; CI-NEXT: s_mov_b32 s7, 0xf000
1213 ; CI-NEXT: s_mov_b32 s6, -1
1214 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1215 ; CI-NEXT: s_mov_b32 s4, s0
1216 ; CI-NEXT: s_mov_b32 s5, s1
1217 ; CI-NEXT: v_mov_b32_e32 v0, s2
1218 ; CI-NEXT: v_mov_b32_e32 v1, s3
1219 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1220 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
1221 ; CI-NEXT: s_waitcnt vmcnt(0)
1222 ; CI-NEXT: buffer_wbinvl1_vol
1223 ; CI-NEXT: s_endpgm
1224 ;
1225 ; VI-LABEL: atomic_sub_i64:
1226 ; VI: ; %bb.0: ; %entry
1227 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1228 ; VI-NEXT: s_mov_b32 s7, 0xf000
1229 ; VI-NEXT: s_mov_b32 s6, -1
1230 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1231 ; VI-NEXT: s_mov_b32 s4, s0
1232 ; VI-NEXT: s_mov_b32 s5, s1
1233 ; VI-NEXT: v_mov_b32_e32 v0, s2
1234 ; VI-NEXT: v_mov_b32_e32 v1, s3
1235 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1236 ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
1237 ; VI-NEXT: s_waitcnt vmcnt(0)
1238 ; VI-NEXT: buffer_wbinvl1_vol
1239 ; VI-NEXT: s_endpgm
1240 ;
1241 ; GFX9-LABEL: atomic_sub_i64:
1242 ; GFX9: ; %bb.0: ; %entry
1243 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1244 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1245 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1246 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1247 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1248 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1249 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1]
1250 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1251 ; GFX9-NEXT: buffer_wbinvl1_vol
1252 ; GFX9-NEXT: s_endpgm
1253 entry:
1254 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
1255 ret void
1256 }
1258 define amdgpu_kernel void @atomic_sub_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
1259 ; CI-LABEL: atomic_sub_i64_ret:
1260 ; CI: ; %bb.0: ; %entry
1261 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1262 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1263 ; CI-NEXT: s_mov_b32 s3, 0xf000
1264 ; CI-NEXT: s_mov_b32 s2, -1
1265 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1266 ; CI-NEXT: s_mov_b32 s0, s4
1267 ; CI-NEXT: s_mov_b32 s1, s5
1268 ; CI-NEXT: v_mov_b32_e32 v0, s8
1269 ; CI-NEXT: v_mov_b32_e32 v1, s9
1270 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1271 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc
1272 ; CI-NEXT: s_waitcnt vmcnt(0)
1273 ; CI-NEXT: buffer_wbinvl1_vol
1274 ; CI-NEXT: s_mov_b32 s0, s6
1275 ; CI-NEXT: s_mov_b32 s1, s7
1276 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1277 ; CI-NEXT: s_endpgm
1278 ;
1279 ; VI-LABEL: atomic_sub_i64_ret:
1280 ; VI: ; %bb.0: ; %entry
1281 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1282 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1283 ; VI-NEXT: s_mov_b32 s3, 0xf000
1284 ; VI-NEXT: s_mov_b32 s2, -1
1285 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1286 ; VI-NEXT: s_mov_b32 s0, s4
1287 ; VI-NEXT: s_mov_b32 s1, s5
1288 ; VI-NEXT: v_mov_b32_e32 v0, s8
1289 ; VI-NEXT: v_mov_b32_e32 v1, s9
1290 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1291 ; VI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 glc
1292 ; VI-NEXT: s_waitcnt vmcnt(0)
1293 ; VI-NEXT: buffer_wbinvl1_vol
1294 ; VI-NEXT: s_mov_b32 s0, s6
1295 ; VI-NEXT: s_mov_b32 s1, s7
1296 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1297 ; VI-NEXT: s_endpgm
1298 ;
1299 ; GFX9-LABEL: atomic_sub_i64_ret:
1300 ; GFX9: ; %bb.0: ; %entry
1301 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1302 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1303 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1304 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1305 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1306 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1307 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1308 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[4:5] glc
1309 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1310 ; GFX9-NEXT: buffer_wbinvl1_vol
1311 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1312 ; GFX9-NEXT: s_endpgm
1313 entry:
1314 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
1315 store i64 %tmp0, ptr addrspace(1) %out2
1316 ret void
1317 }
1319 define amdgpu_kernel void @atomic_sub_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
1320 ; CI-LABEL: atomic_sub_i64_addr64:
1321 ; CI: ; %bb.0: ; %entry
1322 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1323 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1324 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1325 ; CI-NEXT: v_mov_b32_e32 v0, s6
1326 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1327 ; CI-NEXT: v_mov_b32_e32 v3, s1
1328 ; CI-NEXT: v_mov_b32_e32 v1, s7
1329 ; CI-NEXT: s_mov_b32 s7, 0xf000
1330 ; CI-NEXT: s_mov_b32 s6, 0
1331 ; CI-NEXT: v_mov_b32_e32 v2, s0
1332 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1333 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[4:7], 0 addr64
1334 ; CI-NEXT: s_waitcnt vmcnt(0)
1335 ; CI-NEXT: buffer_wbinvl1_vol
1336 ; CI-NEXT: s_endpgm
1337 ;
1338 ; VI-LABEL: atomic_sub_i64_addr64:
1339 ; VI: ; %bb.0: ; %entry
1340 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1341 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1342 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1343 ; VI-NEXT: v_mov_b32_e32 v0, s6
1344 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1345 ; VI-NEXT: s_add_u32 s0, s4, s0
1346 ; VI-NEXT: s_addc_u32 s1, s5, s1
1347 ; VI-NEXT: v_mov_b32_e32 v3, s1
1348 ; VI-NEXT: v_mov_b32_e32 v1, s7
1349 ; VI-NEXT: v_mov_b32_e32 v2, s0
1350 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1351 ; VI-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1]
1352 ; VI-NEXT: s_waitcnt vmcnt(0)
1353 ; VI-NEXT: buffer_wbinvl1_vol
1354 ; VI-NEXT: s_endpgm
1355 ;
1356 ; GFX9-LABEL: atomic_sub_i64_addr64:
1357 ; GFX9: ; %bb.0: ; %entry
1358 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1359 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1360 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1361 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1362 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1363 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
1364 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1365 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1366 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1367 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1368 ; GFX9-NEXT: global_atomic_sub_x2 v2, v[0:1], s[0:1]
1369 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1370 ; GFX9-NEXT: buffer_wbinvl1_vol
1371 ; GFX9-NEXT: s_endpgm
1372 entry:
1373 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1374 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
1375 ret void
1376 }
1378 define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
1379 ; CI-LABEL: atomic_sub_i64_ret_addr64:
1380 ; CI: ; %bb.0: ; %entry
1381 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1382 ; CI-NEXT: s_mov_b32 s11, 0xf000
1383 ; CI-NEXT: s_mov_b32 s10, -1
1384 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1385 ; CI-NEXT: v_mov_b32_e32 v0, s4
1386 ; CI-NEXT: v_mov_b32_e32 v1, s5
1387 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1388 ; CI-NEXT: v_mov_b32_e32 v2, s4
1389 ; CI-NEXT: s_mov_b32 s8, s2
1390 ; CI-NEXT: s_mov_b32 s9, s3
1391 ; CI-NEXT: s_mov_b32 s2, 0
1392 ; CI-NEXT: s_mov_b32 s3, s11
1393 ; CI-NEXT: v_mov_b32_e32 v3, s5
1394 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1395 ; CI-NEXT: buffer_atomic_sub_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
1396 ; CI-NEXT: s_waitcnt vmcnt(0)
1397 ; CI-NEXT: buffer_wbinvl1_vol
1398 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1399 ; CI-NEXT: s_endpgm
1400 ;
1401 ; VI-LABEL: atomic_sub_i64_ret_addr64:
1402 ; VI: ; %bb.0: ; %entry
1403 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1404 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1405 ; VI-NEXT: v_mov_b32_e32 v0, s4
1406 ; VI-NEXT: v_mov_b32_e32 v1, s5
1407 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1408 ; VI-NEXT: s_add_u32 s0, s0, s4
1409 ; VI-NEXT: s_addc_u32 s1, s1, s5
1410 ; VI-NEXT: v_mov_b32_e32 v3, s1
1411 ; VI-NEXT: v_mov_b32_e32 v2, s0
1412 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1413 ; VI-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc
1414 ; VI-NEXT: s_waitcnt vmcnt(0)
1415 ; VI-NEXT: buffer_wbinvl1_vol
1416 ; VI-NEXT: s_mov_b32 s7, 0xf000
1417 ; VI-NEXT: s_mov_b32 s6, -1
1418 ; VI-NEXT: s_mov_b32 s4, s2
1419 ; VI-NEXT: s_mov_b32 s5, s3
1420 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1421 ; VI-NEXT: s_endpgm
1422 ;
1423 ; GFX9-LABEL: atomic_sub_i64_ret_addr64:
1424 ; GFX9: ; %bb.0: ; %entry
1425 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1426 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1427 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1428 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1429 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1430 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1431 ; GFX9-NEXT: s_add_u32 s0, s0, s4
1432 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
1433 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1434 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v2, v[0:1], s[0:1] glc
1435 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1436 ; GFX9-NEXT: buffer_wbinvl1_vol
1437 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1438 ; GFX9-NEXT: s_endpgm
1439 entry:
1440 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1441 %tmp0 = atomicrmw volatile sub ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
1442 store i64 %tmp0, ptr addrspace(1) %out2
1443 ret void
1444 }
1446 define amdgpu_kernel void @atomic_max_i64_offset(ptr addrspace(1) %out, i64 %in) {
1447 ; CI-LABEL: atomic_max_i64_offset:
1448 ; CI: ; %bb.0: ; %entry
1449 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1450 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1451 ; CI-NEXT: v_mov_b32_e32 v0, s2
1452 ; CI-NEXT: v_mov_b32_e32 v1, s3
1453 ; CI-NEXT: s_mov_b32 s3, 0xf000
1454 ; CI-NEXT: s_mov_b32 s2, -1
1455 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1456 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32
1457 ; CI-NEXT: s_endpgm
1458 ;
1459 ; VI-LABEL: atomic_max_i64_offset:
1460 ; VI: ; %bb.0: ; %entry
1461 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1462 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1463 ; VI-NEXT: v_mov_b32_e32 v0, s2
1464 ; VI-NEXT: v_mov_b32_e32 v1, s3
1465 ; VI-NEXT: s_mov_b32 s3, 0xf000
1466 ; VI-NEXT: s_mov_b32 s2, -1
1467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1468 ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:32
1469 ; VI-NEXT: s_endpgm
1470 ;
1471 ; GFX9-LABEL: atomic_max_i64_offset:
1472 ; GFX9: ; %bb.0: ; %entry
1473 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1474 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1475 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1476 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1477 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1478 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1479 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
1480 ; GFX9-NEXT: s_endpgm
1481 entry:
1482 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1483 %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
1484 ret void
1485 }
1487 define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
1488 ; CI-LABEL: atomic_max_i64_ret_offset:
1489 ; CI: ; %bb.0: ; %entry
1490 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1491 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1492 ; CI-NEXT: s_mov_b32 s3, 0xf000
1493 ; CI-NEXT: s_mov_b32 s2, -1
1494 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1495 ; CI-NEXT: s_mov_b32 s0, s6
1496 ; CI-NEXT: s_mov_b32 s1, s7
1497 ; CI-NEXT: v_mov_b32_e32 v0, s8
1498 ; CI-NEXT: v_mov_b32_e32 v1, s9
1499 ; CI-NEXT: s_mov_b32 s6, s2
1500 ; CI-NEXT: s_mov_b32 s7, s3
1501 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1502 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1503 ; CI-NEXT: s_waitcnt vmcnt(0)
1504 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1505 ; CI-NEXT: s_endpgm
1506 ;
1507 ; VI-LABEL: atomic_max_i64_ret_offset:
1508 ; VI: ; %bb.0: ; %entry
1509 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1510 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1511 ; VI-NEXT: s_mov_b32 s3, 0xf000
1512 ; VI-NEXT: s_mov_b32 s2, -1
1513 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1514 ; VI-NEXT: s_mov_b32 s0, s6
1515 ; VI-NEXT: s_mov_b32 s1, s7
1516 ; VI-NEXT: v_mov_b32_e32 v0, s8
1517 ; VI-NEXT: v_mov_b32_e32 v1, s9
1518 ; VI-NEXT: s_mov_b32 s6, s2
1519 ; VI-NEXT: s_mov_b32 s7, s3
1520 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1521 ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1522 ; VI-NEXT: s_waitcnt vmcnt(0)
1523 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1526 ; GFX9-LABEL: atomic_max_i64_ret_offset:
1527 ; GFX9: ; %bb.0: ; %entry
1528 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1529 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1530 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1531 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1532 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1533 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1534 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1535 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1536 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1537 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1538 ; GFX9-NEXT: s_endpgm
1540 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1541 %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
1542 store i64 %tmp0, ptr addrspace(1) %out2
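; _addr64 variants index %out by a runtime i64. CI uses buffer addr64
; addressing, VI forms the address with s_add_u32/s_addc_u32 (folding any
; constant offset into the scalar address) and uses flat atomics, and GFX9
; adds the index into the SGPR base of the global atomic.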
1546 define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
1547 ; CI-LABEL: atomic_max_i64_addr64_offset:
1548 ; CI: ; %bb.0: ; %entry
1549 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1550 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1551 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1552 ; CI-NEXT: v_mov_b32_e32 v0, s6
1553 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1554 ; CI-NEXT: v_mov_b32_e32 v3, s1
1555 ; CI-NEXT: v_mov_b32_e32 v1, s7
1556 ; CI-NEXT: s_mov_b32 s7, 0xf000
1557 ; CI-NEXT: s_mov_b32 s6, 0
1558 ; CI-NEXT: v_mov_b32_e32 v2, s0
1559 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1560 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
1563 ; VI-LABEL: atomic_max_i64_addr64_offset:
1564 ; VI: ; %bb.0: ; %entry
1565 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1566 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1567 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1568 ; VI-NEXT: v_mov_b32_e32 v0, s6
1569 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1570 ; VI-NEXT: s_add_u32 s0, s4, s0
1571 ; VI-NEXT: s_addc_u32 s1, s5, s1
1572 ; VI-NEXT: s_add_u32 s0, s0, 32
1573 ; VI-NEXT: s_addc_u32 s1, s1, 0
1574 ; VI-NEXT: v_mov_b32_e32 v3, s1
1575 ; VI-NEXT: v_mov_b32_e32 v1, s7
1576 ; VI-NEXT: v_mov_b32_e32 v2, s0
1577 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1578 ; VI-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
1581 ; GFX9-LABEL: atomic_max_i64_addr64_offset:
1582 ; GFX9: ; %bb.0: ; %entry
1583 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1584 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1585 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1586 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1587 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1588 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
1589 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1590 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1591 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1592 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1593 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1] offset:32
1594 ; GFX9-NEXT: s_endpgm
1596 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1597 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
1598 %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
1602 define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
1603 ; CI-LABEL: atomic_max_i64_ret_addr64_offset:
1604 ; CI: ; %bb.0: ; %entry
1605 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1606 ; CI-NEXT: s_mov_b32 s11, 0xf000
1607 ; CI-NEXT: s_mov_b32 s10, -1
1608 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1609 ; CI-NEXT: v_mov_b32_e32 v0, s4
1610 ; CI-NEXT: v_mov_b32_e32 v1, s5
1611 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1612 ; CI-NEXT: v_mov_b32_e32 v2, s4
1613 ; CI-NEXT: s_mov_b32 s8, s2
1614 ; CI-NEXT: s_mov_b32 s9, s3
1615 ; CI-NEXT: s_mov_b32 s2, 0
1616 ; CI-NEXT: s_mov_b32 s3, s11
1617 ; CI-NEXT: v_mov_b32_e32 v3, s5
1618 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1619 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
1620 ; CI-NEXT: s_waitcnt vmcnt(0)
1621 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1624 ; VI-LABEL: atomic_max_i64_ret_addr64_offset:
1625 ; VI: ; %bb.0: ; %entry
1626 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1627 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1628 ; VI-NEXT: v_mov_b32_e32 v0, s4
1629 ; VI-NEXT: v_mov_b32_e32 v1, s5
1630 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1631 ; VI-NEXT: s_add_u32 s0, s0, s4
1632 ; VI-NEXT: s_addc_u32 s1, s1, s5
1633 ; VI-NEXT: s_add_u32 s0, s0, 32
1634 ; VI-NEXT: s_addc_u32 s1, s1, 0
1635 ; VI-NEXT: v_mov_b32_e32 v3, s1
1636 ; VI-NEXT: v_mov_b32_e32 v2, s0
1637 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1638 ; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
1639 ; VI-NEXT: s_mov_b32 s7, 0xf000
1640 ; VI-NEXT: s_mov_b32 s6, -1
1641 ; VI-NEXT: s_mov_b32 s4, s2
1642 ; VI-NEXT: s_mov_b32 s5, s3
1643 ; VI-NEXT: s_waitcnt vmcnt(0)
1644 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1647 ; GFX9-LABEL: atomic_max_i64_ret_addr64_offset:
1648 ; GFX9: ; %bb.0: ; %entry
1649 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1650 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1651 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1652 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1653 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1654 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1655 ; GFX9-NEXT: s_add_u32 s0, s0, s4
1656 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
1657 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1658 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
1659 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1660 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1661 ; GFX9-NEXT: s_endpgm
1663 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1664 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
1665 %tmp0 = atomicrmw volatile max ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
1666 store i64 %tmp0, ptr addrspace(1) %out2
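; Baseline forms with neither a constant offset nor a runtime index; the
; atomic addresses %out directly and no offset: modifier is expected.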
1670 define amdgpu_kernel void @atomic_max_i64(ptr addrspace(1) %out, i64 %in) {
1671 ; CI-LABEL: atomic_max_i64:
1672 ; CI: ; %bb.0: ; %entry
1673 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1674 ; CI-NEXT: s_mov_b32 s7, 0xf000
1675 ; CI-NEXT: s_mov_b32 s6, -1
1676 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1677 ; CI-NEXT: s_mov_b32 s4, s0
1678 ; CI-NEXT: s_mov_b32 s5, s1
1679 ; CI-NEXT: v_mov_b32_e32 v0, s2
1680 ; CI-NEXT: v_mov_b32_e32 v1, s3
1681 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1682 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0
1685 ; VI-LABEL: atomic_max_i64:
1686 ; VI: ; %bb.0: ; %entry
1687 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1688 ; VI-NEXT: s_mov_b32 s7, 0xf000
1689 ; VI-NEXT: s_mov_b32 s6, -1
1690 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1691 ; VI-NEXT: s_mov_b32 s4, s0
1692 ; VI-NEXT: s_mov_b32 s5, s1
1693 ; VI-NEXT: v_mov_b32_e32 v0, s2
1694 ; VI-NEXT: v_mov_b32_e32 v1, s3
1695 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1696 ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[4:7], 0
1699 ; GFX9-LABEL: atomic_max_i64:
1700 ; GFX9: ; %bb.0: ; %entry
1701 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1702 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1703 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1704 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1705 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1706 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1707 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1]
1708 ; GFX9-NEXT: s_endpgm
1710 %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
1714 define amdgpu_kernel void @atomic_max_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
1715 ; CI-LABEL: atomic_max_i64_ret:
1716 ; CI: ; %bb.0: ; %entry
1717 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1718 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1719 ; CI-NEXT: s_mov_b32 s3, 0xf000
1720 ; CI-NEXT: s_mov_b32 s2, -1
1721 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1722 ; CI-NEXT: s_mov_b32 s0, s4
1723 ; CI-NEXT: s_mov_b32 s1, s5
1724 ; CI-NEXT: v_mov_b32_e32 v0, s8
1725 ; CI-NEXT: v_mov_b32_e32 v1, s9
1726 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1727 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc
1728 ; CI-NEXT: s_mov_b32 s0, s6
1729 ; CI-NEXT: s_mov_b32 s1, s7
1730 ; CI-NEXT: s_waitcnt vmcnt(0)
1731 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1734 ; VI-LABEL: atomic_max_i64_ret:
1735 ; VI: ; %bb.0: ; %entry
1736 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1737 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1738 ; VI-NEXT: s_mov_b32 s3, 0xf000
1739 ; VI-NEXT: s_mov_b32 s2, -1
1740 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1741 ; VI-NEXT: s_mov_b32 s0, s4
1742 ; VI-NEXT: s_mov_b32 s1, s5
1743 ; VI-NEXT: v_mov_b32_e32 v0, s8
1744 ; VI-NEXT: v_mov_b32_e32 v1, s9
1745 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1746 ; VI-NEXT: buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 glc
1747 ; VI-NEXT: s_mov_b32 s0, s6
1748 ; VI-NEXT: s_mov_b32 s1, s7
1749 ; VI-NEXT: s_waitcnt vmcnt(0)
1750 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1753 ; GFX9-LABEL: atomic_max_i64_ret:
1754 ; GFX9: ; %bb.0: ; %entry
1755 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1756 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1757 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1758 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1759 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1760 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1761 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1762 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[4:5] glc
1763 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1764 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1765 ; GFX9-NEXT: s_endpgm
1767 %tmp0 = atomicrmw volatile max ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
1768 store i64 %tmp0, ptr addrspace(1) %out2
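; With a runtime index but no constant offset, VI needs only a single
; s_add_u32/s_addc_u32 pair to form the flat address.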
1772 define amdgpu_kernel void @atomic_max_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
1773 ; CI-LABEL: atomic_max_i64_addr64:
1774 ; CI: ; %bb.0: ; %entry
1775 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1776 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1777 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1778 ; CI-NEXT: v_mov_b32_e32 v0, s6
1779 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1780 ; CI-NEXT: v_mov_b32_e32 v3, s1
1781 ; CI-NEXT: v_mov_b32_e32 v1, s7
1782 ; CI-NEXT: s_mov_b32 s7, 0xf000
1783 ; CI-NEXT: s_mov_b32 s6, 0
1784 ; CI-NEXT: v_mov_b32_e32 v2, s0
1785 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1786 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[4:7], 0 addr64
1789 ; VI-LABEL: atomic_max_i64_addr64:
1790 ; VI: ; %bb.0: ; %entry
1791 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1792 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
1793 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1794 ; VI-NEXT: v_mov_b32_e32 v0, s6
1795 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1796 ; VI-NEXT: s_add_u32 s0, s4, s0
1797 ; VI-NEXT: s_addc_u32 s1, s5, s1
1798 ; VI-NEXT: v_mov_b32_e32 v3, s1
1799 ; VI-NEXT: v_mov_b32_e32 v1, s7
1800 ; VI-NEXT: v_mov_b32_e32 v2, s0
1801 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1802 ; VI-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1]
1805 ; GFX9-LABEL: atomic_max_i64_addr64:
1806 ; GFX9: ; %bb.0: ; %entry
1807 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1808 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1809 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1810 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1811 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
1812 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
1813 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1814 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
1815 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1816 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1817 ; GFX9-NEXT: global_atomic_smax_x2 v2, v[0:1], s[0:1]
1818 ; GFX9-NEXT: s_endpgm
1820 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1821 %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
1825 define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
1826 ; CI-LABEL: atomic_max_i64_ret_addr64:
1827 ; CI: ; %bb.0: ; %entry
1828 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
1829 ; CI-NEXT: s_mov_b32 s11, 0xf000
1830 ; CI-NEXT: s_mov_b32 s10, -1
1831 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1832 ; CI-NEXT: v_mov_b32_e32 v0, s4
1833 ; CI-NEXT: v_mov_b32_e32 v1, s5
1834 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1835 ; CI-NEXT: v_mov_b32_e32 v2, s4
1836 ; CI-NEXT: s_mov_b32 s8, s2
1837 ; CI-NEXT: s_mov_b32 s9, s3
1838 ; CI-NEXT: s_mov_b32 s2, 0
1839 ; CI-NEXT: s_mov_b32 s3, s11
1840 ; CI-NEXT: v_mov_b32_e32 v3, s5
1841 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1842 ; CI-NEXT: buffer_atomic_smax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
1843 ; CI-NEXT: s_waitcnt vmcnt(0)
1844 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
1847 ; VI-LABEL: atomic_max_i64_ret_addr64:
1848 ; VI: ; %bb.0: ; %entry
1849 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1850 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1851 ; VI-NEXT: v_mov_b32_e32 v0, s4
1852 ; VI-NEXT: v_mov_b32_e32 v1, s5
1853 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1854 ; VI-NEXT: s_add_u32 s0, s0, s4
1855 ; VI-NEXT: s_addc_u32 s1, s1, s5
1856 ; VI-NEXT: v_mov_b32_e32 v3, s1
1857 ; VI-NEXT: v_mov_b32_e32 v2, s0
1858 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1859 ; VI-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc
1860 ; VI-NEXT: s_mov_b32 s7, 0xf000
1861 ; VI-NEXT: s_mov_b32 s6, -1
1862 ; VI-NEXT: s_mov_b32 s4, s2
1863 ; VI-NEXT: s_mov_b32 s5, s3
1864 ; VI-NEXT: s_waitcnt vmcnt(0)
1865 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1868 ; GFX9-LABEL: atomic_max_i64_ret_addr64:
1869 ; GFX9: ; %bb.0: ; %entry
1870 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
1871 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1872 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1873 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
1874 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
1875 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
1876 ; GFX9-NEXT: s_add_u32 s0, s0, s4
1877 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
1878 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1879 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v2, v[0:1], s[0:1] glc
1880 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1881 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
1882 ; GFX9-NEXT: s_endpgm
1884 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
1885 %tmp0 = atomicrmw volatile max ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
1886 store i64 %tmp0, ptr addrspace(1) %out2
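; Unsigned max: same variant matrix as the signed max tests, expecting
; buffer_atomic_umax_x2 / flat_atomic_umax_x2 / global_atomic_umax_x2.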
1890 define amdgpu_kernel void @atomic_umax_i64_offset(ptr addrspace(1) %out, i64 %in) {
1891 ; CI-LABEL: atomic_umax_i64_offset:
1892 ; CI: ; %bb.0: ; %entry
1893 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
1894 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1895 ; CI-NEXT: v_mov_b32_e32 v0, s2
1896 ; CI-NEXT: v_mov_b32_e32 v1, s3
1897 ; CI-NEXT: s_mov_b32 s3, 0xf000
1898 ; CI-NEXT: s_mov_b32 s2, -1
1899 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1900 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32
1903 ; VI-LABEL: atomic_umax_i64_offset:
1904 ; VI: ; %bb.0: ; %entry
1905 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1906 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1907 ; VI-NEXT: v_mov_b32_e32 v0, s2
1908 ; VI-NEXT: v_mov_b32_e32 v1, s3
1909 ; VI-NEXT: s_mov_b32 s3, 0xf000
1910 ; VI-NEXT: s_mov_b32 s2, -1
1911 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1912 ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:32
1915 ; GFX9-LABEL: atomic_umax_i64_offset:
1916 ; GFX9: ; %bb.0: ; %entry
1917 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1918 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1919 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1920 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1921 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1922 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1923 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
1924 ; GFX9-NEXT: s_endpgm
1926 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1927 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
1931 define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
1932 ; CI-LABEL: atomic_umax_i64_ret_offset:
1933 ; CI: ; %bb.0: ; %entry
1934 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1935 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
1936 ; CI-NEXT: s_mov_b32 s3, 0xf000
1937 ; CI-NEXT: s_mov_b32 s2, -1
1938 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1939 ; CI-NEXT: s_mov_b32 s0, s6
1940 ; CI-NEXT: s_mov_b32 s1, s7
1941 ; CI-NEXT: v_mov_b32_e32 v0, s8
1942 ; CI-NEXT: v_mov_b32_e32 v1, s9
1943 ; CI-NEXT: s_mov_b32 s6, s2
1944 ; CI-NEXT: s_mov_b32 s7, s3
1945 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1946 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1947 ; CI-NEXT: s_waitcnt vmcnt(0)
1948 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1951 ; VI-LABEL: atomic_umax_i64_ret_offset:
1952 ; VI: ; %bb.0: ; %entry
1953 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1954 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
1955 ; VI-NEXT: s_mov_b32 s3, 0xf000
1956 ; VI-NEXT: s_mov_b32 s2, -1
1957 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1958 ; VI-NEXT: s_mov_b32 s0, s6
1959 ; VI-NEXT: s_mov_b32 s1, s7
1960 ; VI-NEXT: v_mov_b32_e32 v0, s8
1961 ; VI-NEXT: v_mov_b32_e32 v1, s9
1962 ; VI-NEXT: s_mov_b32 s6, s2
1963 ; VI-NEXT: s_mov_b32 s7, s3
1964 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1965 ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0 offset:32 glc
1966 ; VI-NEXT: s_waitcnt vmcnt(0)
1967 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1970 ; GFX9-LABEL: atomic_umax_i64_ret_offset:
1971 ; GFX9: ; %bb.0: ; %entry
1972 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1973 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1974 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1975 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1976 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
1977 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
1978 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1979 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
1980 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1981 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
1982 ; GFX9-NEXT: s_endpgm
1984 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
1985 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
1986 store i64 %tmp0, ptr addrspace(1) %out2
1990 define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
1991 ; CI-LABEL: atomic_umax_i64_addr64_offset:
1992 ; CI: ; %bb.0: ; %entry
1993 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1994 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
1995 ; CI-NEXT: s_waitcnt lgkmcnt(0)
1996 ; CI-NEXT: v_mov_b32_e32 v0, s6
1997 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
1998 ; CI-NEXT: v_mov_b32_e32 v3, s1
1999 ; CI-NEXT: v_mov_b32_e32 v1, s7
2000 ; CI-NEXT: s_mov_b32 s7, 0xf000
2001 ; CI-NEXT: s_mov_b32 s6, 0
2002 ; CI-NEXT: v_mov_b32_e32 v2, s0
2003 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2004 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
2007 ; VI-LABEL: atomic_umax_i64_addr64_offset:
2008 ; VI: ; %bb.0: ; %entry
2009 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2010 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2011 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2012 ; VI-NEXT: v_mov_b32_e32 v0, s6
2013 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2014 ; VI-NEXT: s_add_u32 s0, s4, s0
2015 ; VI-NEXT: s_addc_u32 s1, s5, s1
2016 ; VI-NEXT: s_add_u32 s0, s0, 32
2017 ; VI-NEXT: s_addc_u32 s1, s1, 0
2018 ; VI-NEXT: v_mov_b32_e32 v3, s1
2019 ; VI-NEXT: v_mov_b32_e32 v1, s7
2020 ; VI-NEXT: v_mov_b32_e32 v2, s0
2021 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2022 ; VI-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
2025 ; GFX9-LABEL: atomic_umax_i64_addr64_offset:
2026 ; GFX9: ; %bb.0: ; %entry
2027 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2028 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2029 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2030 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2031 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2032 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
2033 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2034 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2035 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2036 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2037 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1] offset:32
2038 ; GFX9-NEXT: s_endpgm
2040 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2041 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
2042 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2046 define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
2047 ; CI-LABEL: atomic_umax_i64_ret_addr64_offset:
2048 ; CI: ; %bb.0: ; %entry
2049 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
2050 ; CI-NEXT: s_mov_b32 s11, 0xf000
2051 ; CI-NEXT: s_mov_b32 s10, -1
2052 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2053 ; CI-NEXT: v_mov_b32_e32 v0, s4
2054 ; CI-NEXT: v_mov_b32_e32 v1, s5
2055 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2056 ; CI-NEXT: v_mov_b32_e32 v2, s4
2057 ; CI-NEXT: s_mov_b32 s8, s2
2058 ; CI-NEXT: s_mov_b32 s9, s3
2059 ; CI-NEXT: s_mov_b32 s2, 0
2060 ; CI-NEXT: s_mov_b32 s3, s11
2061 ; CI-NEXT: v_mov_b32_e32 v3, s5
2062 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2063 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
2064 ; CI-NEXT: s_waitcnt vmcnt(0)
2065 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2068 ; VI-LABEL: atomic_umax_i64_ret_addr64_offset:
2069 ; VI: ; %bb.0: ; %entry
2070 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2071 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2072 ; VI-NEXT: v_mov_b32_e32 v0, s4
2073 ; VI-NEXT: v_mov_b32_e32 v1, s5
2074 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2075 ; VI-NEXT: s_add_u32 s0, s0, s4
2076 ; VI-NEXT: s_addc_u32 s1, s1, s5
2077 ; VI-NEXT: s_add_u32 s0, s0, 32
2078 ; VI-NEXT: s_addc_u32 s1, s1, 0
2079 ; VI-NEXT: v_mov_b32_e32 v3, s1
2080 ; VI-NEXT: v_mov_b32_e32 v2, s0
2081 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2082 ; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
2083 ; VI-NEXT: s_mov_b32 s7, 0xf000
2084 ; VI-NEXT: s_mov_b32 s6, -1
2085 ; VI-NEXT: s_mov_b32 s4, s2
2086 ; VI-NEXT: s_mov_b32 s5, s3
2087 ; VI-NEXT: s_waitcnt vmcnt(0)
2088 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2091 ; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset:
2092 ; GFX9: ; %bb.0: ; %entry
2093 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2094 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2095 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2096 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2097 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2098 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2099 ; GFX9-NEXT: s_add_u32 s0, s0, s4
2100 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
2101 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2102 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
2103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2104 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
2105 ; GFX9-NEXT: s_endpgm
2107 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2108 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
2109 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2110 store i64 %tmp0, ptr addrspace(1) %out2
2114 define amdgpu_kernel void @atomic_umax_i64(ptr addrspace(1) %out, i64 %in) {
2115 ; CI-LABEL: atomic_umax_i64:
2116 ; CI: ; %bb.0: ; %entry
2117 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2118 ; CI-NEXT: s_mov_b32 s7, 0xf000
2119 ; CI-NEXT: s_mov_b32 s6, -1
2120 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2121 ; CI-NEXT: s_mov_b32 s4, s0
2122 ; CI-NEXT: s_mov_b32 s5, s1
2123 ; CI-NEXT: v_mov_b32_e32 v0, s2
2124 ; CI-NEXT: v_mov_b32_e32 v1, s3
2125 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2126 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0
2129 ; VI-LABEL: atomic_umax_i64:
2130 ; VI: ; %bb.0: ; %entry
2131 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2132 ; VI-NEXT: s_mov_b32 s7, 0xf000
2133 ; VI-NEXT: s_mov_b32 s6, -1
2134 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2135 ; VI-NEXT: s_mov_b32 s4, s0
2136 ; VI-NEXT: s_mov_b32 s5, s1
2137 ; VI-NEXT: v_mov_b32_e32 v0, s2
2138 ; VI-NEXT: v_mov_b32_e32 v1, s3
2139 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2140 ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[4:7], 0
2143 ; GFX9-LABEL: atomic_umax_i64:
2144 ; GFX9: ; %bb.0: ; %entry
2145 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2146 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2147 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2148 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2149 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2150 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2151 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1]
2152 ; GFX9-NEXT: s_endpgm
2154 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
2158 define amdgpu_kernel void @atomic_umax_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
2159 ; CI-LABEL: atomic_umax_i64_ret:
2160 ; CI: ; %bb.0: ; %entry
2161 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2162 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2163 ; CI-NEXT: s_mov_b32 s3, 0xf000
2164 ; CI-NEXT: s_mov_b32 s2, -1
2165 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2166 ; CI-NEXT: s_mov_b32 s0, s4
2167 ; CI-NEXT: s_mov_b32 s1, s5
2168 ; CI-NEXT: v_mov_b32_e32 v0, s8
2169 ; CI-NEXT: v_mov_b32_e32 v1, s9
2170 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2171 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc
2172 ; CI-NEXT: s_mov_b32 s0, s6
2173 ; CI-NEXT: s_mov_b32 s1, s7
2174 ; CI-NEXT: s_waitcnt vmcnt(0)
2175 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2178 ; VI-LABEL: atomic_umax_i64_ret:
2179 ; VI: ; %bb.0: ; %entry
2180 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2181 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2182 ; VI-NEXT: s_mov_b32 s3, 0xf000
2183 ; VI-NEXT: s_mov_b32 s2, -1
2184 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2185 ; VI-NEXT: s_mov_b32 s0, s4
2186 ; VI-NEXT: s_mov_b32 s1, s5
2187 ; VI-NEXT: v_mov_b32_e32 v0, s8
2188 ; VI-NEXT: v_mov_b32_e32 v1, s9
2189 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2190 ; VI-NEXT: buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 glc
2191 ; VI-NEXT: s_mov_b32 s0, s6
2192 ; VI-NEXT: s_mov_b32 s1, s7
2193 ; VI-NEXT: s_waitcnt vmcnt(0)
2194 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2197 ; GFX9-LABEL: atomic_umax_i64_ret:
2198 ; GFX9: ; %bb.0: ; %entry
2199 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2200 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2201 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2202 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2203 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2204 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2205 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2206 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[4:5] glc
2207 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2208 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2209 ; GFX9-NEXT: s_endpgm
2211 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
2212 store i64 %tmp0, ptr addrspace(1) %out2
2216 define amdgpu_kernel void @atomic_umax_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
2217 ; CI-LABEL: atomic_umax_i64_addr64:
2218 ; CI: ; %bb.0: ; %entry
2219 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2220 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2221 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2222 ; CI-NEXT: v_mov_b32_e32 v0, s6
2223 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2224 ; CI-NEXT: v_mov_b32_e32 v3, s1
2225 ; CI-NEXT: v_mov_b32_e32 v1, s7
2226 ; CI-NEXT: s_mov_b32 s7, 0xf000
2227 ; CI-NEXT: s_mov_b32 s6, 0
2228 ; CI-NEXT: v_mov_b32_e32 v2, s0
2229 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2230 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[4:7], 0 addr64
2233 ; VI-LABEL: atomic_umax_i64_addr64:
2234 ; VI: ; %bb.0: ; %entry
2235 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2236 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2237 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2238 ; VI-NEXT: v_mov_b32_e32 v0, s6
2239 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2240 ; VI-NEXT: s_add_u32 s0, s4, s0
2241 ; VI-NEXT: s_addc_u32 s1, s5, s1
2242 ; VI-NEXT: v_mov_b32_e32 v3, s1
2243 ; VI-NEXT: v_mov_b32_e32 v1, s7
2244 ; VI-NEXT: v_mov_b32_e32 v2, s0
2245 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2246 ; VI-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1]
2249 ; GFX9-LABEL: atomic_umax_i64_addr64:
2250 ; GFX9: ; %bb.0: ; %entry
2251 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2252 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2253 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2254 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2255 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2256 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
2257 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2258 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2259 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2261 ; GFX9-NEXT: global_atomic_umax_x2 v2, v[0:1], s[0:1]
2262 ; GFX9-NEXT: s_endpgm
2264 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2265 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
2269 define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
2270 ; CI-LABEL: atomic_umax_i64_ret_addr64:
2271 ; CI: ; %bb.0: ; %entry
2272 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
2273 ; CI-NEXT: s_mov_b32 s11, 0xf000
2274 ; CI-NEXT: s_mov_b32 s10, -1
2275 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2276 ; CI-NEXT: v_mov_b32_e32 v0, s4
2277 ; CI-NEXT: v_mov_b32_e32 v1, s5
2278 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2279 ; CI-NEXT: v_mov_b32_e32 v2, s4
2280 ; CI-NEXT: s_mov_b32 s8, s2
2281 ; CI-NEXT: s_mov_b32 s9, s3
2282 ; CI-NEXT: s_mov_b32 s2, 0
2283 ; CI-NEXT: s_mov_b32 s3, s11
2284 ; CI-NEXT: v_mov_b32_e32 v3, s5
2285 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2286 ; CI-NEXT: buffer_atomic_umax_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
2287 ; CI-NEXT: s_waitcnt vmcnt(0)
2288 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2291 ; VI-LABEL: atomic_umax_i64_ret_addr64:
2292 ; VI: ; %bb.0: ; %entry
2293 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2294 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2295 ; VI-NEXT: v_mov_b32_e32 v0, s4
2296 ; VI-NEXT: v_mov_b32_e32 v1, s5
2297 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2298 ; VI-NEXT: s_add_u32 s0, s0, s4
2299 ; VI-NEXT: s_addc_u32 s1, s1, s5
2300 ; VI-NEXT: v_mov_b32_e32 v3, s1
2301 ; VI-NEXT: v_mov_b32_e32 v2, s0
2302 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2303 ; VI-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc
2304 ; VI-NEXT: s_mov_b32 s7, 0xf000
2305 ; VI-NEXT: s_mov_b32 s6, -1
2306 ; VI-NEXT: s_mov_b32 s4, s2
2307 ; VI-NEXT: s_mov_b32 s5, s3
2308 ; VI-NEXT: s_waitcnt vmcnt(0)
2309 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2312 ; GFX9-LABEL: atomic_umax_i64_ret_addr64:
2313 ; GFX9: ; %bb.0: ; %entry
2314 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2315 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2316 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2317 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2318 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2319 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2320 ; GFX9-NEXT: s_add_u32 s0, s0, s4
2321 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
2322 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2323 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v2, v[0:1], s[0:1] glc
2324 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2325 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
2326 ; GFX9-NEXT: s_endpgm
2328 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2329 %tmp0 = atomicrmw volatile umax ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
2330 store i64 %tmp0, ptr addrspace(1) %out2
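; Signed min: same variant matrix, expecting buffer_atomic_smin_x2 /
; flat_atomic_smin_x2 / global_atomic_smin_x2.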
2334 define amdgpu_kernel void @atomic_min_i64_offset(ptr addrspace(1) %out, i64 %in) {
2335 ; CI-LABEL: atomic_min_i64_offset:
2336 ; CI: ; %bb.0: ; %entry
2337 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2338 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2339 ; CI-NEXT: v_mov_b32_e32 v0, s2
2340 ; CI-NEXT: v_mov_b32_e32 v1, s3
2341 ; CI-NEXT: s_mov_b32 s3, 0xf000
2342 ; CI-NEXT: s_mov_b32 s2, -1
2343 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2344 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32
2347 ; VI-LABEL: atomic_min_i64_offset:
2348 ; VI: ; %bb.0: ; %entry
2349 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2350 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2351 ; VI-NEXT: v_mov_b32_e32 v0, s2
2352 ; VI-NEXT: v_mov_b32_e32 v1, s3
2353 ; VI-NEXT: s_mov_b32 s3, 0xf000
2354 ; VI-NEXT: s_mov_b32 s2, -1
2355 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2356 ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:32
2359 ; GFX9-LABEL: atomic_min_i64_offset:
2360 ; GFX9: ; %bb.0: ; %entry
2361 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2362 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2363 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2364 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2365 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2366 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2367 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
2368 ; GFX9-NEXT: s_endpgm
2370 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2371 %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2375 define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
2376 ; CI-LABEL: atomic_min_i64_ret_offset:
2377 ; CI: ; %bb.0: ; %entry
2378 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2379 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2380 ; CI-NEXT: s_mov_b32 s3, 0xf000
2381 ; CI-NEXT: s_mov_b32 s2, -1
2382 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2383 ; CI-NEXT: s_mov_b32 s0, s6
2384 ; CI-NEXT: s_mov_b32 s1, s7
2385 ; CI-NEXT: v_mov_b32_e32 v0, s8
2386 ; CI-NEXT: v_mov_b32_e32 v1, s9
2387 ; CI-NEXT: s_mov_b32 s6, s2
2388 ; CI-NEXT: s_mov_b32 s7, s3
2389 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2390 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2391 ; CI-NEXT: s_waitcnt vmcnt(0)
2392 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2395 ; VI-LABEL: atomic_min_i64_ret_offset:
2396 ; VI: ; %bb.0: ; %entry
2397 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2398 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2399 ; VI-NEXT: s_mov_b32 s3, 0xf000
2400 ; VI-NEXT: s_mov_b32 s2, -1
2401 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2402 ; VI-NEXT: s_mov_b32 s0, s6
2403 ; VI-NEXT: s_mov_b32 s1, s7
2404 ; VI-NEXT: v_mov_b32_e32 v0, s8
2405 ; VI-NEXT: v_mov_b32_e32 v1, s9
2406 ; VI-NEXT: s_mov_b32 s6, s2
2407 ; VI-NEXT: s_mov_b32 s7, s3
2408 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2409 ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2410 ; VI-NEXT: s_waitcnt vmcnt(0)
2411 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2414 ; GFX9-LABEL: atomic_min_i64_ret_offset:
2415 ; GFX9: ; %bb.0: ; %entry
2416 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2417 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2418 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2419 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2420 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2421 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2422 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2423 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
2424 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2425 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2426 ; GFX9-NEXT: s_endpgm
2428 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2429 %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2430 store i64 %tmp0, ptr addrspace(1) %out2
2434 define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
2435 ; CI-LABEL: atomic_min_i64_addr64_offset:
2436 ; CI: ; %bb.0: ; %entry
2437 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2438 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2439 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2440 ; CI-NEXT: v_mov_b32_e32 v0, s6
2441 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2442 ; CI-NEXT: v_mov_b32_e32 v3, s1
2443 ; CI-NEXT: v_mov_b32_e32 v1, s7
2444 ; CI-NEXT: s_mov_b32 s7, 0xf000
2445 ; CI-NEXT: s_mov_b32 s6, 0
2446 ; CI-NEXT: v_mov_b32_e32 v2, s0
2447 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2448 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
2451 ; VI-LABEL: atomic_min_i64_addr64_offset:
2452 ; VI: ; %bb.0: ; %entry
2453 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2454 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2455 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2456 ; VI-NEXT: v_mov_b32_e32 v0, s6
2457 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2458 ; VI-NEXT: s_add_u32 s0, s4, s0
2459 ; VI-NEXT: s_addc_u32 s1, s5, s1
2460 ; VI-NEXT: s_add_u32 s0, s0, 32
2461 ; VI-NEXT: s_addc_u32 s1, s1, 0
2462 ; VI-NEXT: v_mov_b32_e32 v3, s1
2463 ; VI-NEXT: v_mov_b32_e32 v1, s7
2464 ; VI-NEXT: v_mov_b32_e32 v2, s0
2465 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2466 ; VI-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
2469 ; GFX9-LABEL: atomic_min_i64_addr64_offset:
2470 ; GFX9: ; %bb.0: ; %entry
2471 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2472 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2473 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2474 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2475 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2476 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
2477 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2478 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2479 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2480 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2481 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1] offset:32
2482 ; GFX9-NEXT: s_endpgm
2484 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2485 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
2486 %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2490 define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
2491 ; CI-LABEL: atomic_min_i64_ret_addr64_offset:
2492 ; CI: ; %bb.0: ; %entry
2493 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
2494 ; CI-NEXT: s_mov_b32 s11, 0xf000
2495 ; CI-NEXT: s_mov_b32 s10, -1
2496 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2497 ; CI-NEXT: v_mov_b32_e32 v0, s4
2498 ; CI-NEXT: v_mov_b32_e32 v1, s5
2499 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2500 ; CI-NEXT: v_mov_b32_e32 v2, s4
2501 ; CI-NEXT: s_mov_b32 s8, s2
2502 ; CI-NEXT: s_mov_b32 s9, s3
2503 ; CI-NEXT: s_mov_b32 s2, 0
2504 ; CI-NEXT: s_mov_b32 s3, s11
2505 ; CI-NEXT: v_mov_b32_e32 v3, s5
2506 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2507 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
2508 ; CI-NEXT: s_waitcnt vmcnt(0)
2509 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2512 ; VI-LABEL: atomic_min_i64_ret_addr64_offset:
2513 ; VI: ; %bb.0: ; %entry
2514 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2515 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2516 ; VI-NEXT: v_mov_b32_e32 v0, s4
2517 ; VI-NEXT: v_mov_b32_e32 v1, s5
2518 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2519 ; VI-NEXT: s_add_u32 s0, s0, s4
2520 ; VI-NEXT: s_addc_u32 s1, s1, s5
2521 ; VI-NEXT: s_add_u32 s0, s0, 32
2522 ; VI-NEXT: s_addc_u32 s1, s1, 0
2523 ; VI-NEXT: v_mov_b32_e32 v3, s1
2524 ; VI-NEXT: v_mov_b32_e32 v2, s0
2525 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2526 ; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
2527 ; VI-NEXT: s_mov_b32 s7, 0xf000
2528 ; VI-NEXT: s_mov_b32 s6, -1
2529 ; VI-NEXT: s_mov_b32 s4, s2
2530 ; VI-NEXT: s_mov_b32 s5, s3
2531 ; VI-NEXT: s_waitcnt vmcnt(0)
2532 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2535 ; GFX9-LABEL: atomic_min_i64_ret_addr64_offset:
2536 ; GFX9: ; %bb.0: ; %entry
2537 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2538 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2539 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2540 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2541 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2542 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2543 ; GFX9-NEXT: s_add_u32 s0, s0, s4
2544 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
2545 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2546 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
2547 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2548 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
2549 ; GFX9-NEXT: s_endpgm
2551 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2552 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
2553 %tmp0 = atomicrmw volatile min ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2554 store i64 %tmp0, ptr addrspace(1) %out2
2558 define amdgpu_kernel void @atomic_min_i64(ptr addrspace(1) %out, i64 %in) {
2559 ; CI-LABEL: atomic_min_i64:
2560 ; CI: ; %bb.0: ; %entry
2561 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2562 ; CI-NEXT: s_mov_b32 s7, 0xf000
2563 ; CI-NEXT: s_mov_b32 s6, -1
2564 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2565 ; CI-NEXT: s_mov_b32 s4, s0
2566 ; CI-NEXT: s_mov_b32 s5, s1
2567 ; CI-NEXT: v_mov_b32_e32 v0, s2
2568 ; CI-NEXT: v_mov_b32_e32 v1, s3
2569 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2570 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0
2573 ; VI-LABEL: atomic_min_i64:
2574 ; VI: ; %bb.0: ; %entry
2575 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2576 ; VI-NEXT: s_mov_b32 s7, 0xf000
2577 ; VI-NEXT: s_mov_b32 s6, -1
2578 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2579 ; VI-NEXT: s_mov_b32 s4, s0
2580 ; VI-NEXT: s_mov_b32 s5, s1
2581 ; VI-NEXT: v_mov_b32_e32 v0, s2
2582 ; VI-NEXT: v_mov_b32_e32 v1, s3
2583 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2584 ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[4:7], 0
2587 ; GFX9-LABEL: atomic_min_i64:
2588 ; GFX9: ; %bb.0: ; %entry
2589 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2590 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2591 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2592 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2593 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2594 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2595 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1]
2596 ; GFX9-NEXT: s_endpgm
2598 %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
2602 define amdgpu_kernel void @atomic_min_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
2603 ; CI-LABEL: atomic_min_i64_ret:
2604 ; CI: ; %bb.0: ; %entry
2605 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2606 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2607 ; CI-NEXT: s_mov_b32 s3, 0xf000
2608 ; CI-NEXT: s_mov_b32 s2, -1
2609 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2610 ; CI-NEXT: s_mov_b32 s0, s4
2611 ; CI-NEXT: s_mov_b32 s1, s5
2612 ; CI-NEXT: v_mov_b32_e32 v0, s8
2613 ; CI-NEXT: v_mov_b32_e32 v1, s9
2614 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2615 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc
2616 ; CI-NEXT: s_mov_b32 s0, s6
2617 ; CI-NEXT: s_mov_b32 s1, s7
2618 ; CI-NEXT: s_waitcnt vmcnt(0)
2619 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2622 ; VI-LABEL: atomic_min_i64_ret:
2623 ; VI: ; %bb.0: ; %entry
2624 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2625 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2626 ; VI-NEXT: s_mov_b32 s3, 0xf000
2627 ; VI-NEXT: s_mov_b32 s2, -1
2628 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2629 ; VI-NEXT: s_mov_b32 s0, s4
2630 ; VI-NEXT: s_mov_b32 s1, s5
2631 ; VI-NEXT: v_mov_b32_e32 v0, s8
2632 ; VI-NEXT: v_mov_b32_e32 v1, s9
2633 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2634 ; VI-NEXT: buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 glc
2635 ; VI-NEXT: s_mov_b32 s0, s6
2636 ; VI-NEXT: s_mov_b32 s1, s7
2637 ; VI-NEXT: s_waitcnt vmcnt(0)
2638 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2641 ; GFX9-LABEL: atomic_min_i64_ret:
2642 ; GFX9: ; %bb.0: ; %entry
2643 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2644 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2645 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2646 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2647 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2648 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2649 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2650 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[4:5] glc
2651 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2652 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2653 ; GFX9-NEXT: s_endpgm
2655 %tmp0 = atomicrmw volatile min ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
2656 store i64 %tmp0, ptr addrspace(1) %out2
2660 define amdgpu_kernel void @atomic_min_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
2661 ; CI-LABEL: atomic_min_i64_addr64:
2662 ; CI: ; %bb.0: ; %entry
2663 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2664 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2665 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2666 ; CI-NEXT: v_mov_b32_e32 v0, s6
2667 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2668 ; CI-NEXT: v_mov_b32_e32 v3, s1
2669 ; CI-NEXT: v_mov_b32_e32 v1, s7
2670 ; CI-NEXT: s_mov_b32 s7, 0xf000
2671 ; CI-NEXT: s_mov_b32 s6, 0
2672 ; CI-NEXT: v_mov_b32_e32 v2, s0
2673 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2674 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[4:7], 0 addr64
2677 ; VI-LABEL: atomic_min_i64_addr64:
2678 ; VI: ; %bb.0: ; %entry
2679 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2680 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2681 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2682 ; VI-NEXT: v_mov_b32_e32 v0, s6
2683 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2684 ; VI-NEXT: s_add_u32 s0, s4, s0
2685 ; VI-NEXT: s_addc_u32 s1, s5, s1
2686 ; VI-NEXT: v_mov_b32_e32 v3, s1
2687 ; VI-NEXT: v_mov_b32_e32 v1, s7
2688 ; VI-NEXT: v_mov_b32_e32 v2, s0
2689 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2690 ; VI-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1]
2693 ; GFX9-LABEL: atomic_min_i64_addr64:
2694 ; GFX9: ; %bb.0: ; %entry
2695 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2696 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2697 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2698 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2699 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2700 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
2701 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2702 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2703 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2704 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2705 ; GFX9-NEXT: global_atomic_smin_x2 v2, v[0:1], s[0:1]
2706 ; GFX9-NEXT: s_endpgm
2708 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2709 %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
2713 define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
2714 ; CI-LABEL: atomic_min_i64_ret_addr64:
2715 ; CI: ; %bb.0: ; %entry
2716 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
2717 ; CI-NEXT: s_mov_b32 s11, 0xf000
2718 ; CI-NEXT: s_mov_b32 s10, -1
2719 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2720 ; CI-NEXT: v_mov_b32_e32 v0, s4
2721 ; CI-NEXT: v_mov_b32_e32 v1, s5
2722 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2723 ; CI-NEXT: v_mov_b32_e32 v2, s4
2724 ; CI-NEXT: s_mov_b32 s8, s2
2725 ; CI-NEXT: s_mov_b32 s9, s3
2726 ; CI-NEXT: s_mov_b32 s2, 0
2727 ; CI-NEXT: s_mov_b32 s3, s11
2728 ; CI-NEXT: v_mov_b32_e32 v3, s5
2729 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2730 ; CI-NEXT: buffer_atomic_smin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
2731 ; CI-NEXT: s_waitcnt vmcnt(0)
2732 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2735 ; VI-LABEL: atomic_min_i64_ret_addr64:
2736 ; VI: ; %bb.0: ; %entry
2737 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2738 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2739 ; VI-NEXT: v_mov_b32_e32 v0, s4
2740 ; VI-NEXT: v_mov_b32_e32 v1, s5
2741 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2742 ; VI-NEXT: s_add_u32 s0, s0, s4
2743 ; VI-NEXT: s_addc_u32 s1, s1, s5
2744 ; VI-NEXT: v_mov_b32_e32 v3, s1
2745 ; VI-NEXT: v_mov_b32_e32 v2, s0
2746 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2747 ; VI-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc
2748 ; VI-NEXT: s_mov_b32 s7, 0xf000
2749 ; VI-NEXT: s_mov_b32 s6, -1
2750 ; VI-NEXT: s_mov_b32 s4, s2
2751 ; VI-NEXT: s_mov_b32 s5, s3
2752 ; VI-NEXT: s_waitcnt vmcnt(0)
2753 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2756 ; GFX9-LABEL: atomic_min_i64_ret_addr64:
2757 ; GFX9: ; %bb.0: ; %entry
2758 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2759 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2760 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2761 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2762 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2763 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2764 ; GFX9-NEXT: s_add_u32 s0, s0, s4
2765 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
2766 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2767 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v2, v[0:1], s[0:1] glc
2768 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2769 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
2770 ; GFX9-NEXT: s_endpgm
2772 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2773 %tmp0 = atomicrmw volatile min ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
2774 store i64 %tmp0, ptr addrspace(1) %out2
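; Unsigned min: same variant matrix, expecting buffer_atomic_umin_x2 /
; flat_atomic_umin_x2 / global_atomic_umin_x2.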
2778 define amdgpu_kernel void @atomic_umin_i64_offset(ptr addrspace(1) %out, i64 %in) {
2779 ; CI-LABEL: atomic_umin_i64_offset:
2780 ; CI: ; %bb.0: ; %entry
2781 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
2782 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2783 ; CI-NEXT: v_mov_b32_e32 v0, s2
2784 ; CI-NEXT: v_mov_b32_e32 v1, s3
2785 ; CI-NEXT: s_mov_b32 s3, 0xf000
2786 ; CI-NEXT: s_mov_b32 s2, -1
2787 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2788 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32
2791 ; VI-LABEL: atomic_umin_i64_offset:
2792 ; VI: ; %bb.0: ; %entry
2793 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2794 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2795 ; VI-NEXT: v_mov_b32_e32 v0, s2
2796 ; VI-NEXT: v_mov_b32_e32 v1, s3
2797 ; VI-NEXT: s_mov_b32 s3, 0xf000
2798 ; VI-NEXT: s_mov_b32 s2, -1
2799 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2800 ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:32
2803 ; GFX9-LABEL: atomic_umin_i64_offset:
2804 ; GFX9: ; %bb.0: ; %entry
2805 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2806 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2807 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2808 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2809 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2810 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2811 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
2812 ; GFX9-NEXT: s_endpgm
2814 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2815 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2819 define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
2820 ; CI-LABEL: atomic_umin_i64_ret_offset:
2821 ; CI: ; %bb.0: ; %entry
2822 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2823 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
2824 ; CI-NEXT: s_mov_b32 s3, 0xf000
2825 ; CI-NEXT: s_mov_b32 s2, -1
2826 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2827 ; CI-NEXT: s_mov_b32 s0, s6
2828 ; CI-NEXT: s_mov_b32 s1, s7
2829 ; CI-NEXT: v_mov_b32_e32 v0, s8
2830 ; CI-NEXT: v_mov_b32_e32 v1, s9
2831 ; CI-NEXT: s_mov_b32 s6, s2
2832 ; CI-NEXT: s_mov_b32 s7, s3
2833 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2834 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2835 ; CI-NEXT: s_waitcnt vmcnt(0)
2836 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2839 ; VI-LABEL: atomic_umin_i64_ret_offset:
2840 ; VI: ; %bb.0: ; %entry
2841 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2842 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
2843 ; VI-NEXT: s_mov_b32 s3, 0xf000
2844 ; VI-NEXT: s_mov_b32 s2, -1
2845 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2846 ; VI-NEXT: s_mov_b32 s0, s6
2847 ; VI-NEXT: s_mov_b32 s1, s7
2848 ; VI-NEXT: v_mov_b32_e32 v0, s8
2849 ; VI-NEXT: v_mov_b32_e32 v1, s9
2850 ; VI-NEXT: s_mov_b32 s6, s2
2851 ; VI-NEXT: s_mov_b32 s7, s3
2852 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2853 ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0 offset:32 glc
2854 ; VI-NEXT: s_waitcnt vmcnt(0)
2855 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
2858 ; GFX9-LABEL: atomic_umin_i64_ret_offset:
2859 ; GFX9: ; %bb.0: ; %entry
2860 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2861 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2862 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2863 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2864 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
2865 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
2866 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2867 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
2868 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2869 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
2870 ; GFX9-NEXT: s_endpgm
2872 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
2873 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2874 store i64 %tmp0, ptr addrspace(1) %out2
2878 define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
2879 ; CI-LABEL: atomic_umin_i64_addr64_offset:
2880 ; CI: ; %bb.0: ; %entry
2881 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2882 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
2883 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2884 ; CI-NEXT: v_mov_b32_e32 v0, s6
2885 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2886 ; CI-NEXT: v_mov_b32_e32 v3, s1
2887 ; CI-NEXT: v_mov_b32_e32 v1, s7
2888 ; CI-NEXT: s_mov_b32 s7, 0xf000
2889 ; CI-NEXT: s_mov_b32 s6, 0
2890 ; CI-NEXT: v_mov_b32_e32 v2, s0
2891 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2892 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
2895 ; VI-LABEL: atomic_umin_i64_addr64_offset:
2896 ; VI: ; %bb.0: ; %entry
2897 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2898 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
2899 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2900 ; VI-NEXT: v_mov_b32_e32 v0, s6
2901 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
2902 ; VI-NEXT: s_add_u32 s0, s4, s0
2903 ; VI-NEXT: s_addc_u32 s1, s5, s1
2904 ; VI-NEXT: s_add_u32 s0, s0, 32
2905 ; VI-NEXT: s_addc_u32 s1, s1, 0
2906 ; VI-NEXT: v_mov_b32_e32 v3, s1
2907 ; VI-NEXT: v_mov_b32_e32 v1, s7
2908 ; VI-NEXT: v_mov_b32_e32 v2, s0
2909 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2910 ; VI-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
2913 ; GFX9-LABEL: atomic_umin_i64_addr64_offset:
2914 ; GFX9: ; %bb.0: ; %entry
2915 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2916 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2917 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2918 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2919 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
2920 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
2921 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2922 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
2923 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2924 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2925 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1] offset:32
2926 ; GFX9-NEXT: s_endpgm
2928 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2929 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
2930 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2934 define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
2935 ; CI-LABEL: atomic_umin_i64_ret_addr64_offset:
2936 ; CI: ; %bb.0: ; %entry
2937 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
2938 ; CI-NEXT: s_mov_b32 s11, 0xf000
2939 ; CI-NEXT: s_mov_b32 s10, -1
2940 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2941 ; CI-NEXT: v_mov_b32_e32 v0, s4
2942 ; CI-NEXT: v_mov_b32_e32 v1, s5
2943 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2944 ; CI-NEXT: v_mov_b32_e32 v2, s4
2945 ; CI-NEXT: s_mov_b32 s8, s2
2946 ; CI-NEXT: s_mov_b32 s9, s3
2947 ; CI-NEXT: s_mov_b32 s2, 0
2948 ; CI-NEXT: s_mov_b32 s3, s11
2949 ; CI-NEXT: v_mov_b32_e32 v3, s5
2950 ; CI-NEXT: s_waitcnt lgkmcnt(0)
2951 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
2952 ; CI-NEXT: s_waitcnt vmcnt(0)
2953 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
2956 ; VI-LABEL: atomic_umin_i64_ret_addr64_offset:
2957 ; VI: ; %bb.0: ; %entry
2958 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2959 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2960 ; VI-NEXT: v_mov_b32_e32 v0, s4
2961 ; VI-NEXT: v_mov_b32_e32 v1, s5
2962 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2963 ; VI-NEXT: s_add_u32 s0, s0, s4
2964 ; VI-NEXT: s_addc_u32 s1, s1, s5
2965 ; VI-NEXT: s_add_u32 s0, s0, 32
2966 ; VI-NEXT: s_addc_u32 s1, s1, 0
2967 ; VI-NEXT: v_mov_b32_e32 v3, s1
2968 ; VI-NEXT: v_mov_b32_e32 v2, s0
2969 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2970 ; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
2971 ; VI-NEXT: s_mov_b32 s7, 0xf000
2972 ; VI-NEXT: s_mov_b32 s6, -1
2973 ; VI-NEXT: s_mov_b32 s4, s2
2974 ; VI-NEXT: s_mov_b32 s5, s3
2975 ; VI-NEXT: s_waitcnt vmcnt(0)
2976 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2979 ; GFX9-LABEL: atomic_umin_i64_ret_addr64_offset:
2980 ; GFX9: ; %bb.0: ; %entry
2981 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
2982 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2983 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2984 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
2985 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
2986 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
2987 ; GFX9-NEXT: s_add_u32 s0, s0, s4
2988 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
2989 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2990 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
2991 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2992 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
2993 ; GFX9-NEXT: s_endpgm
2995 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
2996 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
2997 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %gep, i64 %in syncscope("workgroup") seq_cst
2998 store i64 %tmp0, ptr addrspace(1) %out2
3002 define amdgpu_kernel void @atomic_umin_i64(ptr addrspace(1) %out, i64 %in) {
3003 ; CI-LABEL: atomic_umin_i64:
3004 ; CI: ; %bb.0: ; %entry
3005 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3006 ; CI-NEXT: s_mov_b32 s7, 0xf000
3007 ; CI-NEXT: s_mov_b32 s6, -1
3008 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3009 ; CI-NEXT: s_mov_b32 s4, s0
3010 ; CI-NEXT: s_mov_b32 s5, s1
3011 ; CI-NEXT: v_mov_b32_e32 v0, s2
3012 ; CI-NEXT: v_mov_b32_e32 v1, s3
3013 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3014 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0
3017 ; VI-LABEL: atomic_umin_i64:
3018 ; VI: ; %bb.0: ; %entry
3019 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3020 ; VI-NEXT: s_mov_b32 s7, 0xf000
3021 ; VI-NEXT: s_mov_b32 s6, -1
3022 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3023 ; VI-NEXT: s_mov_b32 s4, s0
3024 ; VI-NEXT: s_mov_b32 s5, s1
3025 ; VI-NEXT: v_mov_b32_e32 v0, s2
3026 ; VI-NEXT: v_mov_b32_e32 v1, s3
3027 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3028 ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[4:7], 0
3031 ; GFX9-LABEL: atomic_umin_i64:
3032 ; GFX9: ; %bb.0: ; %entry
3033 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3034 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3035 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3036 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3037 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3038 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3039 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1]
3040 ; GFX9-NEXT: s_endpgm
3042 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
3046 define amdgpu_kernel void @atomic_umin_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
3047 ; CI-LABEL: atomic_umin_i64_ret:
3048 ; CI: ; %bb.0: ; %entry
3049 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3050 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3051 ; CI-NEXT: s_mov_b32 s3, 0xf000
3052 ; CI-NEXT: s_mov_b32 s2, -1
3053 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3054 ; CI-NEXT: s_mov_b32 s0, s4
3055 ; CI-NEXT: s_mov_b32 s1, s5
3056 ; CI-NEXT: v_mov_b32_e32 v0, s8
3057 ; CI-NEXT: v_mov_b32_e32 v1, s9
3058 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3059 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc
3060 ; CI-NEXT: s_mov_b32 s0, s6
3061 ; CI-NEXT: s_mov_b32 s1, s7
3062 ; CI-NEXT: s_waitcnt vmcnt(0)
3063 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3066 ; VI-LABEL: atomic_umin_i64_ret:
3067 ; VI: ; %bb.0: ; %entry
3068 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3069 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
3070 ; VI-NEXT: s_mov_b32 s3, 0xf000
3071 ; VI-NEXT: s_mov_b32 s2, -1
3072 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3073 ; VI-NEXT: s_mov_b32 s0, s4
3074 ; VI-NEXT: s_mov_b32 s1, s5
3075 ; VI-NEXT: v_mov_b32_e32 v0, s8
3076 ; VI-NEXT: v_mov_b32_e32 v1, s9
3077 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3078 ; VI-NEXT: buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 glc
3079 ; VI-NEXT: s_mov_b32 s0, s6
3080 ; VI-NEXT: s_mov_b32 s1, s7
3081 ; VI-NEXT: s_waitcnt vmcnt(0)
3082 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3085 ; GFX9-LABEL: atomic_umin_i64_ret:
3086 ; GFX9: ; %bb.0: ; %entry
3087 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3088 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3089 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3090 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3091 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3092 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3093 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3094 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[4:5] glc
3095 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3096 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
3097 ; GFX9-NEXT: s_endpgm
3099 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %out, i64 %in syncscope("workgroup") seq_cst
3100 store i64 %tmp0, ptr addrspace(1) %out2
3104 define amdgpu_kernel void @atomic_umin_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
3105 ; CI-LABEL: atomic_umin_i64_addr64:
3106 ; CI: ; %bb.0: ; %entry
3107 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3108 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3109 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3110 ; CI-NEXT: v_mov_b32_e32 v0, s6
3111 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3112 ; CI-NEXT: v_mov_b32_e32 v3, s1
3113 ; CI-NEXT: v_mov_b32_e32 v1, s7
3114 ; CI-NEXT: s_mov_b32 s7, 0xf000
3115 ; CI-NEXT: s_mov_b32 s6, 0
3116 ; CI-NEXT: v_mov_b32_e32 v2, s0
3117 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3118 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[4:7], 0 addr64
3121 ; VI-LABEL: atomic_umin_i64_addr64:
3122 ; VI: ; %bb.0: ; %entry
3123 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3124 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3125 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3126 ; VI-NEXT: v_mov_b32_e32 v0, s6
3127 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3128 ; VI-NEXT: s_add_u32 s0, s4, s0
3129 ; VI-NEXT: s_addc_u32 s1, s5, s1
3130 ; VI-NEXT: v_mov_b32_e32 v3, s1
3131 ; VI-NEXT: v_mov_b32_e32 v1, s7
3132 ; VI-NEXT: v_mov_b32_e32 v2, s0
3133 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3134 ; VI-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1]
3137 ; GFX9-LABEL: atomic_umin_i64_addr64:
3138 ; GFX9: ; %bb.0: ; %entry
3139 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3140 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3141 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3142 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3143 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3144 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
3145 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3146 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3147 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3148 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3149 ; GFX9-NEXT: global_atomic_umin_x2 v2, v[0:1], s[0:1]
3150 ; GFX9-NEXT: s_endpgm
3152 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3153 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
3157 define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
3158 ; CI-LABEL: atomic_umin_i64_ret_addr64:
3159 ; CI: ; %bb.0: ; %entry
3160 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
3161 ; CI-NEXT: s_mov_b32 s11, 0xf000
3162 ; CI-NEXT: s_mov_b32 s10, -1
3163 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3164 ; CI-NEXT: v_mov_b32_e32 v0, s4
3165 ; CI-NEXT: v_mov_b32_e32 v1, s5
3166 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3167 ; CI-NEXT: v_mov_b32_e32 v2, s4
3168 ; CI-NEXT: s_mov_b32 s8, s2
3169 ; CI-NEXT: s_mov_b32 s9, s3
3170 ; CI-NEXT: s_mov_b32 s2, 0
3171 ; CI-NEXT: s_mov_b32 s3, s11
3172 ; CI-NEXT: v_mov_b32_e32 v3, s5
3173 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3174 ; CI-NEXT: buffer_atomic_umin_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
3175 ; CI-NEXT: s_waitcnt vmcnt(0)
3176 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3179 ; VI-LABEL: atomic_umin_i64_ret_addr64:
3180 ; VI: ; %bb.0: ; %entry
3181 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3182 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3183 ; VI-NEXT: v_mov_b32_e32 v0, s4
3184 ; VI-NEXT: v_mov_b32_e32 v1, s5
3185 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3186 ; VI-NEXT: s_add_u32 s0, s0, s4
3187 ; VI-NEXT: s_addc_u32 s1, s1, s5
3188 ; VI-NEXT: v_mov_b32_e32 v3, s1
3189 ; VI-NEXT: v_mov_b32_e32 v2, s0
3190 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3191 ; VI-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc
3192 ; VI-NEXT: s_mov_b32 s7, 0xf000
3193 ; VI-NEXT: s_mov_b32 s6, -1
3194 ; VI-NEXT: s_mov_b32 s4, s2
3195 ; VI-NEXT: s_mov_b32 s5, s3
3196 ; VI-NEXT: s_waitcnt vmcnt(0)
3197 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3200 ; GFX9-LABEL: atomic_umin_i64_ret_addr64:
3201 ; GFX9: ; %bb.0: ; %entry
3202 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3203 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3204 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3205 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3206 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3207 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3208 ; GFX9-NEXT: s_add_u32 s0, s0, s4
3209 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
3210 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3211 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v2, v[0:1], s[0:1] glc
3212 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3213 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
3214 ; GFX9-NEXT: s_endpgm
3216 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3217 %tmp0 = atomicrmw volatile umin ptr addrspace(1) %ptr, i64 %in syncscope("workgroup") seq_cst
3218 store i64 %tmp0, ptr addrspace(1) %out2
3222 define amdgpu_kernel void @atomic_or_i64_offset(ptr addrspace(1) %out, i64 %in) {
3223 ; CI-LABEL: atomic_or_i64_offset:
3224 ; CI: ; %bb.0: ; %entry
3225 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3226 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3227 ; CI-NEXT: v_mov_b32_e32 v0, s2
3228 ; CI-NEXT: v_mov_b32_e32 v1, s3
3229 ; CI-NEXT: s_mov_b32 s3, 0xf000
3230 ; CI-NEXT: s_mov_b32 s2, -1
3231 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3232 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32
3233 ; CI-NEXT: s_waitcnt vmcnt(0)
3234 ; CI-NEXT: buffer_wbinvl1_vol
3237 ; VI-LABEL: atomic_or_i64_offset:
3238 ; VI: ; %bb.0: ; %entry
3239 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3240 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3241 ; VI-NEXT: v_mov_b32_e32 v0, s2
3242 ; VI-NEXT: v_mov_b32_e32 v1, s3
3243 ; VI-NEXT: s_mov_b32 s3, 0xf000
3244 ; VI-NEXT: s_mov_b32 s2, -1
3245 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3246 ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:32
3247 ; VI-NEXT: s_waitcnt vmcnt(0)
3248 ; VI-NEXT: buffer_wbinvl1_vol
3251 ; GFX9-LABEL: atomic_or_i64_offset:
3252 ; GFX9: ; %bb.0: ; %entry
3253 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3254 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3255 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3256 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3257 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3258 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3259 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
3260 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3261 ; GFX9-NEXT: buffer_wbinvl1_vol
3262 ; GFX9-NEXT: s_endpgm
3264 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3265 %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3269 define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
3270 ; CI-LABEL: atomic_or_i64_ret_offset:
3271 ; CI: ; %bb.0: ; %entry
3272 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3273 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3274 ; CI-NEXT: s_mov_b32 s3, 0xf000
3275 ; CI-NEXT: s_mov_b32 s2, -1
3276 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3277 ; CI-NEXT: s_mov_b32 s0, s6
3278 ; CI-NEXT: s_mov_b32 s1, s7
3279 ; CI-NEXT: v_mov_b32_e32 v0, s8
3280 ; CI-NEXT: v_mov_b32_e32 v1, s9
3281 ; CI-NEXT: s_mov_b32 s6, s2
3282 ; CI-NEXT: s_mov_b32 s7, s3
3283 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3284 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3285 ; CI-NEXT: s_waitcnt vmcnt(0)
3286 ; CI-NEXT: buffer_wbinvl1_vol
3287 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3290 ; VI-LABEL: atomic_or_i64_ret_offset:
3291 ; VI: ; %bb.0: ; %entry
3292 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3293 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
3294 ; VI-NEXT: s_mov_b32 s3, 0xf000
3295 ; VI-NEXT: s_mov_b32 s2, -1
3296 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3297 ; VI-NEXT: s_mov_b32 s0, s6
3298 ; VI-NEXT: s_mov_b32 s1, s7
3299 ; VI-NEXT: v_mov_b32_e32 v0, s8
3300 ; VI-NEXT: v_mov_b32_e32 v1, s9
3301 ; VI-NEXT: s_mov_b32 s6, s2
3302 ; VI-NEXT: s_mov_b32 s7, s3
3303 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3304 ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3305 ; VI-NEXT: s_waitcnt vmcnt(0)
3306 ; VI-NEXT: buffer_wbinvl1_vol
3307 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3310 ; GFX9-LABEL: atomic_or_i64_ret_offset:
3311 ; GFX9: ; %bb.0: ; %entry
3312 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3313 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3314 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3315 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3316 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3317 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3318 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3319 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
3320 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3321 ; GFX9-NEXT: buffer_wbinvl1_vol
3322 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
3323 ; GFX9-NEXT: s_endpgm
3325 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3326 %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3327 store i64 %tmp0, ptr addrspace(1) %out2
3331 define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
3332 ; CI-LABEL: atomic_or_i64_addr64_offset:
3333 ; CI: ; %bb.0: ; %entry
3334 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3335 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3336 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3337 ; CI-NEXT: v_mov_b32_e32 v0, s6
3338 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3339 ; CI-NEXT: v_mov_b32_e32 v3, s1
3340 ; CI-NEXT: v_mov_b32_e32 v1, s7
3341 ; CI-NEXT: s_mov_b32 s7, 0xf000
3342 ; CI-NEXT: s_mov_b32 s6, 0
3343 ; CI-NEXT: v_mov_b32_e32 v2, s0
3344 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3345 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
3346 ; CI-NEXT: s_waitcnt vmcnt(0)
3347 ; CI-NEXT: buffer_wbinvl1_vol
3350 ; VI-LABEL: atomic_or_i64_addr64_offset:
3351 ; VI: ; %bb.0: ; %entry
3352 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3353 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3354 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3355 ; VI-NEXT: v_mov_b32_e32 v0, s6
3356 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3357 ; VI-NEXT: s_add_u32 s0, s4, s0
3358 ; VI-NEXT: s_addc_u32 s1, s5, s1
3359 ; VI-NEXT: s_add_u32 s0, s0, 32
3360 ; VI-NEXT: s_addc_u32 s1, s1, 0
3361 ; VI-NEXT: v_mov_b32_e32 v3, s1
3362 ; VI-NEXT: v_mov_b32_e32 v1, s7
3363 ; VI-NEXT: v_mov_b32_e32 v2, s0
3364 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3365 ; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
3366 ; VI-NEXT: s_waitcnt vmcnt(0)
3367 ; VI-NEXT: buffer_wbinvl1_vol
3370 ; GFX9-LABEL: atomic_or_i64_addr64_offset:
3371 ; GFX9: ; %bb.0: ; %entry
3372 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3373 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3374 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3375 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3376 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3377 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
3378 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3379 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3380 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3381 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3382 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1] offset:32
3383 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3384 ; GFX9-NEXT: buffer_wbinvl1_vol
3385 ; GFX9-NEXT: s_endpgm
3387 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3388 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
3389 %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3393 define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
3394 ; CI-LABEL: atomic_or_i64_ret_addr64_offset:
3395 ; CI: ; %bb.0: ; %entry
3396 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
3397 ; CI-NEXT: s_mov_b32 s11, 0xf000
3398 ; CI-NEXT: s_mov_b32 s10, -1
3399 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3400 ; CI-NEXT: v_mov_b32_e32 v0, s4
3401 ; CI-NEXT: v_mov_b32_e32 v1, s5
3402 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3403 ; CI-NEXT: v_mov_b32_e32 v2, s4
3404 ; CI-NEXT: s_mov_b32 s8, s2
3405 ; CI-NEXT: s_mov_b32 s9, s3
3406 ; CI-NEXT: s_mov_b32 s2, 0
3407 ; CI-NEXT: s_mov_b32 s3, s11
3408 ; CI-NEXT: v_mov_b32_e32 v3, s5
3409 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3410 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
3411 ; CI-NEXT: s_waitcnt vmcnt(0)
3412 ; CI-NEXT: buffer_wbinvl1_vol
3413 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3416 ; VI-LABEL: atomic_or_i64_ret_addr64_offset:
3417 ; VI: ; %bb.0: ; %entry
3418 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3419 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3420 ; VI-NEXT: v_mov_b32_e32 v0, s4
3421 ; VI-NEXT: v_mov_b32_e32 v1, s5
3422 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3423 ; VI-NEXT: s_add_u32 s0, s0, s4
3424 ; VI-NEXT: s_addc_u32 s1, s1, s5
3425 ; VI-NEXT: s_add_u32 s0, s0, 32
3426 ; VI-NEXT: s_addc_u32 s1, s1, 0
3427 ; VI-NEXT: v_mov_b32_e32 v3, s1
3428 ; VI-NEXT: v_mov_b32_e32 v2, s0
3429 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3430 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
3431 ; VI-NEXT: s_waitcnt vmcnt(0)
3432 ; VI-NEXT: buffer_wbinvl1_vol
3433 ; VI-NEXT: s_mov_b32 s7, 0xf000
3434 ; VI-NEXT: s_mov_b32 s6, -1
3435 ; VI-NEXT: s_mov_b32 s4, s2
3436 ; VI-NEXT: s_mov_b32 s5, s3
3437 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3440 ; GFX9-LABEL: atomic_or_i64_ret_addr64_offset:
3441 ; GFX9: ; %bb.0: ; %entry
3442 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3443 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3444 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3445 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3446 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3447 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3448 ; GFX9-NEXT: s_add_u32 s0, s0, s4
3449 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
3450 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3451 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
3452 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3453 ; GFX9-NEXT: buffer_wbinvl1_vol
3454 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
3455 ; GFX9-NEXT: s_endpgm
3457 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3458 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
3459 %tmp0 = atomicrmw volatile or ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3460 store i64 %tmp0, ptr addrspace(1) %out2
3464 define amdgpu_kernel void @atomic_or_i64(ptr addrspace(1) %out, i64 %in) {
3465 ; CI-LABEL: atomic_or_i64:
3466 ; CI: ; %bb.0: ; %entry
3467 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3468 ; CI-NEXT: s_mov_b32 s7, 0xf000
3469 ; CI-NEXT: s_mov_b32 s6, -1
3470 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3471 ; CI-NEXT: s_mov_b32 s4, s0
3472 ; CI-NEXT: s_mov_b32 s5, s1
3473 ; CI-NEXT: v_mov_b32_e32 v0, s2
3474 ; CI-NEXT: v_mov_b32_e32 v1, s3
3475 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3476 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
3477 ; CI-NEXT: s_waitcnt vmcnt(0)
3478 ; CI-NEXT: buffer_wbinvl1_vol
3481 ; VI-LABEL: atomic_or_i64:
3482 ; VI: ; %bb.0: ; %entry
3483 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3484 ; VI-NEXT: s_mov_b32 s7, 0xf000
3485 ; VI-NEXT: s_mov_b32 s6, -1
3486 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3487 ; VI-NEXT: s_mov_b32 s4, s0
3488 ; VI-NEXT: s_mov_b32 s5, s1
3489 ; VI-NEXT: v_mov_b32_e32 v0, s2
3490 ; VI-NEXT: v_mov_b32_e32 v1, s3
3491 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3492 ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0
3493 ; VI-NEXT: s_waitcnt vmcnt(0)
3494 ; VI-NEXT: buffer_wbinvl1_vol
3497 ; GFX9-LABEL: atomic_or_i64:
3498 ; GFX9: ; %bb.0: ; %entry
3499 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3500 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3501 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3502 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3503 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3504 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3505 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1]
3506 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3507 ; GFX9-NEXT: buffer_wbinvl1_vol
3508 ; GFX9-NEXT: s_endpgm
3510 %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
3514 define amdgpu_kernel void @atomic_or_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
3515 ; CI-LABEL: atomic_or_i64_ret:
3516 ; CI: ; %bb.0: ; %entry
3517 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3518 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3519 ; CI-NEXT: s_mov_b32 s3, 0xf000
3520 ; CI-NEXT: s_mov_b32 s2, -1
3521 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3522 ; CI-NEXT: s_mov_b32 s0, s4
3523 ; CI-NEXT: s_mov_b32 s1, s5
3524 ; CI-NEXT: v_mov_b32_e32 v0, s8
3525 ; CI-NEXT: v_mov_b32_e32 v1, s9
3526 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3527 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc
3528 ; CI-NEXT: s_waitcnt vmcnt(0)
3529 ; CI-NEXT: buffer_wbinvl1_vol
3530 ; CI-NEXT: s_mov_b32 s0, s6
3531 ; CI-NEXT: s_mov_b32 s1, s7
3532 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3535 ; VI-LABEL: atomic_or_i64_ret:
3536 ; VI: ; %bb.0: ; %entry
3537 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3538 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
3539 ; VI-NEXT: s_mov_b32 s3, 0xf000
3540 ; VI-NEXT: s_mov_b32 s2, -1
3541 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3542 ; VI-NEXT: s_mov_b32 s0, s4
3543 ; VI-NEXT: s_mov_b32 s1, s5
3544 ; VI-NEXT: v_mov_b32_e32 v0, s8
3545 ; VI-NEXT: v_mov_b32_e32 v1, s9
3546 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3547 ; VI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 glc
3548 ; VI-NEXT: s_waitcnt vmcnt(0)
3549 ; VI-NEXT: buffer_wbinvl1_vol
3550 ; VI-NEXT: s_mov_b32 s0, s6
3551 ; VI-NEXT: s_mov_b32 s1, s7
3552 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3555 ; GFX9-LABEL: atomic_or_i64_ret:
3556 ; GFX9: ; %bb.0: ; %entry
3557 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3558 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3559 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3560 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3561 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3562 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3563 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3564 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[4:5] glc
3565 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3566 ; GFX9-NEXT: buffer_wbinvl1_vol
3567 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
3568 ; GFX9-NEXT: s_endpgm
3570 %tmp0 = atomicrmw volatile or ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
3571 store i64 %tmp0, ptr addrspace(1) %out2
3575 define amdgpu_kernel void @atomic_or_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
3576 ; CI-LABEL: atomic_or_i64_addr64:
3577 ; CI: ; %bb.0: ; %entry
3578 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3579 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3580 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3581 ; CI-NEXT: v_mov_b32_e32 v0, s6
3582 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3583 ; CI-NEXT: v_mov_b32_e32 v3, s1
3584 ; CI-NEXT: v_mov_b32_e32 v1, s7
3585 ; CI-NEXT: s_mov_b32 s7, 0xf000
3586 ; CI-NEXT: s_mov_b32 s6, 0
3587 ; CI-NEXT: v_mov_b32_e32 v2, s0
3588 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3589 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[4:7], 0 addr64
3590 ; CI-NEXT: s_waitcnt vmcnt(0)
3591 ; CI-NEXT: buffer_wbinvl1_vol
3594 ; VI-LABEL: atomic_or_i64_addr64:
3595 ; VI: ; %bb.0: ; %entry
3596 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3597 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3598 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3599 ; VI-NEXT: v_mov_b32_e32 v0, s6
3600 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3601 ; VI-NEXT: s_add_u32 s0, s4, s0
3602 ; VI-NEXT: s_addc_u32 s1, s5, s1
3603 ; VI-NEXT: v_mov_b32_e32 v3, s1
3604 ; VI-NEXT: v_mov_b32_e32 v1, s7
3605 ; VI-NEXT: v_mov_b32_e32 v2, s0
3606 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3607 ; VI-NEXT: flat_atomic_or_x2 v[2:3], v[0:1]
3608 ; VI-NEXT: s_waitcnt vmcnt(0)
3609 ; VI-NEXT: buffer_wbinvl1_vol
3612 ; GFX9-LABEL: atomic_or_i64_addr64:
3613 ; GFX9: ; %bb.0: ; %entry
3614 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3615 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3616 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3617 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3618 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3619 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
3620 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3621 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3622 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3623 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3624 ; GFX9-NEXT: global_atomic_or_x2 v2, v[0:1], s[0:1]
3625 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3626 ; GFX9-NEXT: buffer_wbinvl1_vol
3627 ; GFX9-NEXT: s_endpgm
3629 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3630 %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
3634 define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
3635 ; CI-LABEL: atomic_or_i64_ret_addr64:
3636 ; CI: ; %bb.0: ; %entry
3637 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
3638 ; CI-NEXT: s_mov_b32 s11, 0xf000
3639 ; CI-NEXT: s_mov_b32 s10, -1
3640 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3641 ; CI-NEXT: v_mov_b32_e32 v0, s4
3642 ; CI-NEXT: v_mov_b32_e32 v1, s5
3643 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3644 ; CI-NEXT: v_mov_b32_e32 v2, s4
3645 ; CI-NEXT: s_mov_b32 s8, s2
3646 ; CI-NEXT: s_mov_b32 s9, s3
3647 ; CI-NEXT: s_mov_b32 s2, 0
3648 ; CI-NEXT: s_mov_b32 s3, s11
3649 ; CI-NEXT: v_mov_b32_e32 v3, s5
3650 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3651 ; CI-NEXT: buffer_atomic_or_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
3652 ; CI-NEXT: s_waitcnt vmcnt(0)
3653 ; CI-NEXT: buffer_wbinvl1_vol
3654 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3657 ; VI-LABEL: atomic_or_i64_ret_addr64:
3658 ; VI: ; %bb.0: ; %entry
3659 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3660 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3661 ; VI-NEXT: v_mov_b32_e32 v0, s4
3662 ; VI-NEXT: v_mov_b32_e32 v1, s5
3663 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3664 ; VI-NEXT: s_add_u32 s0, s0, s4
3665 ; VI-NEXT: s_addc_u32 s1, s1, s5
3666 ; VI-NEXT: v_mov_b32_e32 v3, s1
3667 ; VI-NEXT: v_mov_b32_e32 v2, s0
3668 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3669 ; VI-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc
3670 ; VI-NEXT: s_waitcnt vmcnt(0)
3671 ; VI-NEXT: buffer_wbinvl1_vol
3672 ; VI-NEXT: s_mov_b32 s7, 0xf000
3673 ; VI-NEXT: s_mov_b32 s6, -1
3674 ; VI-NEXT: s_mov_b32 s4, s2
3675 ; VI-NEXT: s_mov_b32 s5, s3
3676 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
3679 ; GFX9-LABEL: atomic_or_i64_ret_addr64:
3680 ; GFX9: ; %bb.0: ; %entry
3681 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3682 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3683 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3684 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
3685 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
3686 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3687 ; GFX9-NEXT: s_add_u32 s0, s0, s4
3688 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
3689 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3690 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v2, v[0:1], s[0:1] glc
3691 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3692 ; GFX9-NEXT: buffer_wbinvl1_vol
3693 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
3694 ; GFX9-NEXT: s_endpgm
3696 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3697 %tmp0 = atomicrmw volatile or ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
3698 store i64 %tmp0, ptr addrspace(1) %out2
3702 define amdgpu_kernel void @atomic_xchg_i64_offset(ptr addrspace(1) %out, i64 %in) {
3703 ; CI-LABEL: atomic_xchg_i64_offset:
3704 ; CI: ; %bb.0: ; %entry
3705 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3706 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3707 ; CI-NEXT: v_mov_b32_e32 v0, s2
3708 ; CI-NEXT: v_mov_b32_e32 v1, s3
3709 ; CI-NEXT: s_mov_b32 s3, 0xf000
3710 ; CI-NEXT: s_mov_b32 s2, -1
3711 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3712 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
3713 ; CI-NEXT: s_waitcnt vmcnt(0)
3714 ; CI-NEXT: buffer_wbinvl1_vol
3717 ; VI-LABEL: atomic_xchg_i64_offset:
3718 ; VI: ; %bb.0: ; %entry
3719 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3720 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3721 ; VI-NEXT: v_mov_b32_e32 v0, s2
3722 ; VI-NEXT: v_mov_b32_e32 v1, s3
3723 ; VI-NEXT: s_mov_b32 s3, 0xf000
3724 ; VI-NEXT: s_mov_b32 s2, -1
3725 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3726 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
3727 ; VI-NEXT: s_waitcnt vmcnt(0)
3728 ; VI-NEXT: buffer_wbinvl1_vol
3731 ; GFX9-LABEL: atomic_xchg_i64_offset:
3732 ; GFX9: ; %bb.0: ; %entry
3733 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3734 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3735 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3736 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3737 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3738 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3739 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
3740 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3741 ; GFX9-NEXT: buffer_wbinvl1_vol
3742 ; GFX9-NEXT: s_endpgm
3744 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3745 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3749 define amdgpu_kernel void @atomic_xchg_f64_offset(ptr addrspace(1) %out, double %in) {
3750 ; CI-LABEL: atomic_xchg_f64_offset:
3751 ; CI: ; %bb.0: ; %entry
3752 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3753 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3754 ; CI-NEXT: v_mov_b32_e32 v0, s2
3755 ; CI-NEXT: v_mov_b32_e32 v1, s3
3756 ; CI-NEXT: s_mov_b32 s3, 0xf000
3757 ; CI-NEXT: s_mov_b32 s2, -1
3758 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3759 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
3760 ; CI-NEXT: s_waitcnt vmcnt(0)
3761 ; CI-NEXT: buffer_wbinvl1_vol
3764 ; VI-LABEL: atomic_xchg_f64_offset:
3765 ; VI: ; %bb.0: ; %entry
3766 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3767 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3768 ; VI-NEXT: v_mov_b32_e32 v0, s2
3769 ; VI-NEXT: v_mov_b32_e32 v1, s3
3770 ; VI-NEXT: s_mov_b32 s3, 0xf000
3771 ; VI-NEXT: s_mov_b32 s2, -1
3772 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3773 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
3774 ; VI-NEXT: s_waitcnt vmcnt(0)
3775 ; VI-NEXT: buffer_wbinvl1_vol
3778 ; GFX9-LABEL: atomic_xchg_f64_offset:
3779 ; GFX9: ; %bb.0: ; %entry
3780 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3781 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3782 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3783 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3784 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3785 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3786 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
3787 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3788 ; GFX9-NEXT: buffer_wbinvl1_vol
3789 ; GFX9-NEXT: s_endpgm
3791 %gep = getelementptr double, ptr addrspace(1) %out, i64 4
3792 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, double %in syncscope("agent") seq_cst
3796 define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr addrspace(1) %out, ptr %in) {
3797 ; CI-LABEL: atomic_xchg_pointer_offset:
3798 ; CI: ; %bb.0: ; %entry
3799 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
3800 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3801 ; CI-NEXT: v_mov_b32_e32 v0, s2
3802 ; CI-NEXT: v_mov_b32_e32 v1, s3
3803 ; CI-NEXT: s_mov_b32 s3, 0xf000
3804 ; CI-NEXT: s_mov_b32 s2, -1
3805 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3806 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
3807 ; CI-NEXT: s_waitcnt vmcnt(0)
3808 ; CI-NEXT: buffer_wbinvl1_vol
3811 ; VI-LABEL: atomic_xchg_pointer_offset:
3812 ; VI: ; %bb.0: ; %entry
3813 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3814 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3815 ; VI-NEXT: v_mov_b32_e32 v0, s2
3816 ; VI-NEXT: v_mov_b32_e32 v1, s3
3817 ; VI-NEXT: s_mov_b32 s3, 0xf000
3818 ; VI-NEXT: s_mov_b32 s2, -1
3819 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3820 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:32
3821 ; VI-NEXT: s_waitcnt vmcnt(0)
3822 ; VI-NEXT: buffer_wbinvl1_vol
3825 ; GFX9-LABEL: atomic_xchg_pointer_offset:
3826 ; GFX9: ; %bb.0: ; %entry
3827 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
3828 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3829 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3830 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3831 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3832 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3833 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
3834 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3835 ; GFX9-NEXT: buffer_wbinvl1_vol
3836 ; GFX9-NEXT: s_endpgm
3838 %gep = getelementptr ptr, ptr addrspace(1) %out, i64 4
3839 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, ptr %in syncscope("agent") seq_cst
3843 define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
3844 ; CI-LABEL: atomic_xchg_i64_ret_offset:
3845 ; CI: ; %bb.0: ; %entry
3846 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3847 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
3848 ; CI-NEXT: s_mov_b32 s3, 0xf000
3849 ; CI-NEXT: s_mov_b32 s2, -1
3850 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3851 ; CI-NEXT: s_mov_b32 s0, s6
3852 ; CI-NEXT: s_mov_b32 s1, s7
3853 ; CI-NEXT: v_mov_b32_e32 v0, s8
3854 ; CI-NEXT: v_mov_b32_e32 v1, s9
3855 ; CI-NEXT: s_mov_b32 s6, s2
3856 ; CI-NEXT: s_mov_b32 s7, s3
3857 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3858 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3859 ; CI-NEXT: s_waitcnt vmcnt(0)
3860 ; CI-NEXT: buffer_wbinvl1_vol
3861 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3864 ; VI-LABEL: atomic_xchg_i64_ret_offset:
3865 ; VI: ; %bb.0: ; %entry
3866 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3867 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
3868 ; VI-NEXT: s_mov_b32 s3, 0xf000
3869 ; VI-NEXT: s_mov_b32 s2, -1
3870 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3871 ; VI-NEXT: s_mov_b32 s0, s6
3872 ; VI-NEXT: s_mov_b32 s1, s7
3873 ; VI-NEXT: v_mov_b32_e32 v0, s8
3874 ; VI-NEXT: v_mov_b32_e32 v1, s9
3875 ; VI-NEXT: s_mov_b32 s6, s2
3876 ; VI-NEXT: s_mov_b32 s7, s3
3877 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3878 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32 glc
3879 ; VI-NEXT: s_waitcnt vmcnt(0)
3880 ; VI-NEXT: buffer_wbinvl1_vol
3881 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
3884 ; GFX9-LABEL: atomic_xchg_i64_ret_offset:
3885 ; GFX9: ; %bb.0: ; %entry
3886 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3887 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3888 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3889 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3890 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
3891 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
3892 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3893 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
3894 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3895 ; GFX9-NEXT: buffer_wbinvl1_vol
3896 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
3897 ; GFX9-NEXT: s_endpgm
3899 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
3900 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3901 store i64 %tmp0, ptr addrspace(1) %out2
3905 define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
3906 ; CI-LABEL: atomic_xchg_i64_addr64_offset:
3907 ; CI: ; %bb.0: ; %entry
3908 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3909 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
3910 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3911 ; CI-NEXT: v_mov_b32_e32 v0, s6
3912 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3913 ; CI-NEXT: v_mov_b32_e32 v3, s1
3914 ; CI-NEXT: v_mov_b32_e32 v1, s7
3915 ; CI-NEXT: s_mov_b32 s7, 0xf000
3916 ; CI-NEXT: s_mov_b32 s6, 0
3917 ; CI-NEXT: v_mov_b32_e32 v2, s0
3918 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3919 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
3920 ; CI-NEXT: s_waitcnt vmcnt(0)
3921 ; CI-NEXT: buffer_wbinvl1_vol
3924 ; VI-LABEL: atomic_xchg_i64_addr64_offset:
3925 ; VI: ; %bb.0: ; %entry
3926 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3927 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
3928 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3929 ; VI-NEXT: v_mov_b32_e32 v0, s6
3930 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
3931 ; VI-NEXT: s_add_u32 s0, s4, s0
3932 ; VI-NEXT: s_addc_u32 s1, s5, s1
3933 ; VI-NEXT: s_add_u32 s0, s0, 32
3934 ; VI-NEXT: s_addc_u32 s1, s1, 0
3935 ; VI-NEXT: v_mov_b32_e32 v3, s1
3936 ; VI-NEXT: v_mov_b32_e32 v1, s7
3937 ; VI-NEXT: v_mov_b32_e32 v2, s0
3938 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3939 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
3940 ; VI-NEXT: s_waitcnt vmcnt(0)
3941 ; VI-NEXT: buffer_wbinvl1_vol
3944 ; GFX9-LABEL: atomic_xchg_i64_addr64_offset:
3945 ; GFX9: ; %bb.0: ; %entry
3946 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3947 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3948 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3949 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3950 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
3951 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
3952 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3953 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
3954 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3955 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3956 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1] offset:32
3957 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3958 ; GFX9-NEXT: buffer_wbinvl1_vol
3959 ; GFX9-NEXT: s_endpgm
3961 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
3962 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
3963 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
3967 define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
3968 ; CI-LABEL: atomic_xchg_i64_ret_addr64_offset:
3969 ; CI: ; %bb.0: ; %entry
3970 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
3971 ; CI-NEXT: s_mov_b32 s11, 0xf000
3972 ; CI-NEXT: s_mov_b32 s10, -1
3973 ; CI-NEXT: s_waitcnt lgkmcnt(0)
3974 ; CI-NEXT: v_mov_b32_e32 v0, s4
3975 ; CI-NEXT: v_mov_b32_e32 v1, s5
3976 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3977 ; CI-NEXT: v_mov_b32_e32 v2, s4
3978 ; CI-NEXT: s_mov_b32 s8, s2
3979 ; CI-NEXT: s_mov_b32 s9, s3
3980 ; CI-NEXT: s_mov_b32 s2, 0
3981 ; CI-NEXT: s_mov_b32 s3, s11
3982 ; CI-NEXT: v_mov_b32_e32 v3, s5
3983 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3984 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
3985 ; CI-NEXT: s_waitcnt vmcnt(0)
3986 ; CI-NEXT: buffer_wbinvl1_vol
3987 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
3990 ; VI-LABEL: atomic_xchg_i64_ret_addr64_offset:
3991 ; VI: ; %bb.0: ; %entry
3992 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
3993 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3994 ; VI-NEXT: v_mov_b32_e32 v0, s4
3995 ; VI-NEXT: v_mov_b32_e32 v1, s5
3996 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
3997 ; VI-NEXT: s_add_u32 s0, s0, s4
3998 ; VI-NEXT: s_addc_u32 s1, s1, s5
3999 ; VI-NEXT: s_add_u32 s0, s0, 32
4000 ; VI-NEXT: s_addc_u32 s1, s1, 0
4001 ; VI-NEXT: v_mov_b32_e32 v3, s1
4002 ; VI-NEXT: v_mov_b32_e32 v2, s0
4003 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4004 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
4005 ; VI-NEXT: s_waitcnt vmcnt(0)
4006 ; VI-NEXT: buffer_wbinvl1_vol
4007 ; VI-NEXT: s_mov_b32 s7, 0xf000
4008 ; VI-NEXT: s_mov_b32 s6, -1
4009 ; VI-NEXT: s_mov_b32 s4, s2
4010 ; VI-NEXT: s_mov_b32 s5, s3
4011 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4014 ; GFX9-LABEL: atomic_xchg_i64_ret_addr64_offset:
4015 ; GFX9: ; %bb.0: ; %entry
4016 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4017 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4018 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4019 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4020 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4021 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4022 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4023 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4024 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4025 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
4026 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4027 ; GFX9-NEXT: buffer_wbinvl1_vol
4028 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
4029 ; GFX9-NEXT: s_endpgm
4031 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4032 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
4033 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
4034 store i64 %tmp0, ptr addrspace(1) %out2
4038 define amdgpu_kernel void @atomic_xchg_i64(ptr addrspace(1) %out, i64 %in) {
4039 ; CI-LABEL: atomic_xchg_i64:
4040 ; CI: ; %bb.0: ; %entry
4041 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4042 ; CI-NEXT: s_mov_b32 s7, 0xf000
4043 ; CI-NEXT: s_mov_b32 s6, -1
4044 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4045 ; CI-NEXT: s_mov_b32 s4, s0
4046 ; CI-NEXT: s_mov_b32 s5, s1
4047 ; CI-NEXT: v_mov_b32_e32 v0, s2
4048 ; CI-NEXT: v_mov_b32_e32 v1, s3
4049 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4050 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
4051 ; CI-NEXT: s_waitcnt vmcnt(0)
4052 ; CI-NEXT: buffer_wbinvl1_vol
4055 ; VI-LABEL: atomic_xchg_i64:
4056 ; VI: ; %bb.0: ; %entry
4057 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4058 ; VI-NEXT: s_mov_b32 s7, 0xf000
4059 ; VI-NEXT: s_mov_b32 s6, -1
4060 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4061 ; VI-NEXT: s_mov_b32 s4, s0
4062 ; VI-NEXT: s_mov_b32 s5, s1
4063 ; VI-NEXT: v_mov_b32_e32 v0, s2
4064 ; VI-NEXT: v_mov_b32_e32 v1, s3
4065 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4066 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
4067 ; VI-NEXT: s_waitcnt vmcnt(0)
4068 ; VI-NEXT: buffer_wbinvl1_vol
4071 ; GFX9-LABEL: atomic_xchg_i64:
4072 ; GFX9: ; %bb.0: ; %entry
4073 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4074 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4075 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4076 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4077 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4078 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4079 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1]
4080 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4081 ; GFX9-NEXT: buffer_wbinvl1_vol
4082 ; GFX9-NEXT: s_endpgm
4084 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
4088 define amdgpu_kernel void @atomic_xchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
4089 ; CI-LABEL: atomic_xchg_i64_ret:
4090 ; CI: ; %bb.0: ; %entry
4091 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4092 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4093 ; CI-NEXT: s_mov_b32 s3, 0xf000
4094 ; CI-NEXT: s_mov_b32 s2, -1
4095 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4096 ; CI-NEXT: s_mov_b32 s0, s4
4097 ; CI-NEXT: s_mov_b32 s1, s5
4098 ; CI-NEXT: v_mov_b32_e32 v0, s8
4099 ; CI-NEXT: v_mov_b32_e32 v1, s9
4100 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4101 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc
4102 ; CI-NEXT: s_waitcnt vmcnt(0)
4103 ; CI-NEXT: buffer_wbinvl1_vol
4104 ; CI-NEXT: s_mov_b32 s0, s6
4105 ; CI-NEXT: s_mov_b32 s1, s7
4106 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4109 ; VI-LABEL: atomic_xchg_i64_ret:
4110 ; VI: ; %bb.0: ; %entry
4111 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4112 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4113 ; VI-NEXT: s_mov_b32 s3, 0xf000
4114 ; VI-NEXT: s_mov_b32 s2, -1
4115 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4116 ; VI-NEXT: s_mov_b32 s0, s4
4117 ; VI-NEXT: s_mov_b32 s1, s5
4118 ; VI-NEXT: v_mov_b32_e32 v0, s8
4119 ; VI-NEXT: v_mov_b32_e32 v1, s9
4120 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4121 ; VI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 glc
4122 ; VI-NEXT: s_waitcnt vmcnt(0)
4123 ; VI-NEXT: buffer_wbinvl1_vol
4124 ; VI-NEXT: s_mov_b32 s0, s6
4125 ; VI-NEXT: s_mov_b32 s1, s7
4126 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4129 ; GFX9-LABEL: atomic_xchg_i64_ret:
4130 ; GFX9: ; %bb.0: ; %entry
4131 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4132 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4133 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4134 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4135 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4136 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4137 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4138 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[4:5] glc
4139 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4140 ; GFX9-NEXT: buffer_wbinvl1_vol
4141 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
4142 ; GFX9-NEXT: s_endpgm
4144 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
4145 store i64 %tmp0, ptr addrspace(1) %out2
4149 define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
4150 ; CI-LABEL: atomic_xchg_i64_addr64:
4151 ; CI: ; %bb.0: ; %entry
4152 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4153 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
4154 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4155 ; CI-NEXT: v_mov_b32_e32 v0, s6
4156 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
4157 ; CI-NEXT: v_mov_b32_e32 v3, s1
4158 ; CI-NEXT: v_mov_b32_e32 v1, s7
4159 ; CI-NEXT: s_mov_b32 s7, 0xf000
4160 ; CI-NEXT: s_mov_b32 s6, 0
4161 ; CI-NEXT: v_mov_b32_e32 v2, s0
4162 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4163 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[4:7], 0 addr64
4164 ; CI-NEXT: s_waitcnt vmcnt(0)
4165 ; CI-NEXT: buffer_wbinvl1_vol
4168 ; VI-LABEL: atomic_xchg_i64_addr64:
4169 ; VI: ; %bb.0: ; %entry
4170 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4171 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
4172 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4173 ; VI-NEXT: v_mov_b32_e32 v0, s6
4174 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
4175 ; VI-NEXT: s_add_u32 s0, s4, s0
4176 ; VI-NEXT: s_addc_u32 s1, s5, s1
4177 ; VI-NEXT: v_mov_b32_e32 v3, s1
4178 ; VI-NEXT: v_mov_b32_e32 v1, s7
4179 ; VI-NEXT: v_mov_b32_e32 v2, s0
4180 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4181 ; VI-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1]
4182 ; VI-NEXT: s_waitcnt vmcnt(0)
4183 ; VI-NEXT: buffer_wbinvl1_vol
4186 ; GFX9-LABEL: atomic_xchg_i64_addr64:
4187 ; GFX9: ; %bb.0: ; %entry
4188 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4189 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4190 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4191 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4192 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4193 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
4194 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4195 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4196 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4197 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4198 ; GFX9-NEXT: global_atomic_swap_x2 v2, v[0:1], s[0:1]
4199 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4200 ; GFX9-NEXT: buffer_wbinvl1_vol
4201 ; GFX9-NEXT: s_endpgm
4203 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4204 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
4208 define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
4209 ; CI-LABEL: atomic_xchg_i64_ret_addr64:
4210 ; CI: ; %bb.0: ; %entry
4211 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
4212 ; CI-NEXT: s_mov_b32 s11, 0xf000
4213 ; CI-NEXT: s_mov_b32 s10, -1
4214 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4215 ; CI-NEXT: v_mov_b32_e32 v0, s4
4216 ; CI-NEXT: v_mov_b32_e32 v1, s5
4217 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4218 ; CI-NEXT: v_mov_b32_e32 v2, s4
4219 ; CI-NEXT: s_mov_b32 s8, s2
4220 ; CI-NEXT: s_mov_b32 s9, s3
4221 ; CI-NEXT: s_mov_b32 s2, 0
4222 ; CI-NEXT: s_mov_b32 s3, s11
4223 ; CI-NEXT: v_mov_b32_e32 v3, s5
4224 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4225 ; CI-NEXT: buffer_atomic_swap_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
4226 ; CI-NEXT: s_waitcnt vmcnt(0)
4227 ; CI-NEXT: buffer_wbinvl1_vol
4228 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4231 ; VI-LABEL: atomic_xchg_i64_ret_addr64:
4232 ; VI: ; %bb.0: ; %entry
4233 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4234 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4235 ; VI-NEXT: v_mov_b32_e32 v0, s4
4236 ; VI-NEXT: v_mov_b32_e32 v1, s5
4237 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4238 ; VI-NEXT: s_add_u32 s0, s0, s4
4239 ; VI-NEXT: s_addc_u32 s1, s1, s5
4240 ; VI-NEXT: v_mov_b32_e32 v3, s1
4241 ; VI-NEXT: v_mov_b32_e32 v2, s0
4242 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4243 ; VI-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc
4244 ; VI-NEXT: s_waitcnt vmcnt(0)
4245 ; VI-NEXT: buffer_wbinvl1_vol
4246 ; VI-NEXT: s_mov_b32 s7, 0xf000
4247 ; VI-NEXT: s_mov_b32 s6, -1
4248 ; VI-NEXT: s_mov_b32 s4, s2
4249 ; VI-NEXT: s_mov_b32 s5, s3
4250 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4253 ; GFX9-LABEL: atomic_xchg_i64_ret_addr64:
4254 ; GFX9: ; %bb.0: ; %entry
4255 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4256 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4257 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4258 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4259 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4260 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4261 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4262 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4263 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4264 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v2, v[0:1], s[0:1] glc
4265 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4266 ; GFX9-NEXT: buffer_wbinvl1_vol
4267 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
4268 ; GFX9-NEXT: s_endpgm
4270 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4271 %tmp0 = atomicrmw volatile xchg ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
4272 store i64 %tmp0, ptr addrspace(1) %out2
4276 define amdgpu_kernel void @atomic_xor_i64_offset(ptr addrspace(1) %out, i64 %in) {
4277 ; CI-LABEL: atomic_xor_i64_offset:
4278 ; CI: ; %bb.0: ; %entry
4279 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4280 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4281 ; CI-NEXT: v_mov_b32_e32 v0, s2
4282 ; CI-NEXT: v_mov_b32_e32 v1, s3
4283 ; CI-NEXT: s_mov_b32 s3, 0xf000
4284 ; CI-NEXT: s_mov_b32 s2, -1
4285 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4286 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32
4287 ; CI-NEXT: s_waitcnt vmcnt(0)
4288 ; CI-NEXT: buffer_wbinvl1_vol
4291 ; VI-LABEL: atomic_xor_i64_offset:
4292 ; VI: ; %bb.0: ; %entry
4293 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4294 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4295 ; VI-NEXT: v_mov_b32_e32 v0, s2
4296 ; VI-NEXT: v_mov_b32_e32 v1, s3
4297 ; VI-NEXT: s_mov_b32 s3, 0xf000
4298 ; VI-NEXT: s_mov_b32 s2, -1
4299 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4300 ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:32
4301 ; VI-NEXT: s_waitcnt vmcnt(0)
4302 ; VI-NEXT: buffer_wbinvl1_vol
4305 ; GFX9-LABEL: atomic_xor_i64_offset:
4306 ; GFX9: ; %bb.0: ; %entry
4307 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4308 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4309 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4310 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4311 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4312 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4313 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
4314 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4315 ; GFX9-NEXT: buffer_wbinvl1_vol
4316 ; GFX9-NEXT: s_endpgm
4318 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4319 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
4323 define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
4324 ; CI-LABEL: atomic_xor_i64_ret_offset:
4325 ; CI: ; %bb.0: ; %entry
4326 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4327 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4328 ; CI-NEXT: s_mov_b32 s3, 0xf000
4329 ; CI-NEXT: s_mov_b32 s2, -1
4330 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4331 ; CI-NEXT: s_mov_b32 s0, s6
4332 ; CI-NEXT: s_mov_b32 s1, s7
4333 ; CI-NEXT: v_mov_b32_e32 v0, s8
4334 ; CI-NEXT: v_mov_b32_e32 v1, s9
4335 ; CI-NEXT: s_mov_b32 s6, s2
4336 ; CI-NEXT: s_mov_b32 s7, s3
4337 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4338 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc
4339 ; CI-NEXT: s_waitcnt vmcnt(0)
4340 ; CI-NEXT: buffer_wbinvl1_vol
4341 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4344 ; VI-LABEL: atomic_xor_i64_ret_offset:
4345 ; VI: ; %bb.0: ; %entry
4346 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4347 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4348 ; VI-NEXT: s_mov_b32 s3, 0xf000
4349 ; VI-NEXT: s_mov_b32 s2, -1
4350 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4351 ; VI-NEXT: s_mov_b32 s0, s6
4352 ; VI-NEXT: s_mov_b32 s1, s7
4353 ; VI-NEXT: v_mov_b32_e32 v0, s8
4354 ; VI-NEXT: v_mov_b32_e32 v1, s9
4355 ; VI-NEXT: s_mov_b32 s6, s2
4356 ; VI-NEXT: s_mov_b32 s7, s3
4357 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4358 ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 glc
4359 ; VI-NEXT: s_waitcnt vmcnt(0)
4360 ; VI-NEXT: buffer_wbinvl1_vol
4361 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4364 ; GFX9-LABEL: atomic_xor_i64_ret_offset:
4365 ; GFX9: ; %bb.0: ; %entry
4366 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4367 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4368 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4369 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4370 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4371 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4372 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4373 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
4374 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4375 ; GFX9-NEXT: buffer_wbinvl1_vol
4376 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
4377 ; GFX9-NEXT: s_endpgm
4379 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4380 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
4381 store i64 %tmp0, ptr addrspace(1) %out2
4385 define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
4386 ; CI-LABEL: atomic_xor_i64_addr64_offset:
4387 ; CI: ; %bb.0: ; %entry
4388 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4389 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
4390 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4391 ; CI-NEXT: v_mov_b32_e32 v0, s6
4392 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
4393 ; CI-NEXT: v_mov_b32_e32 v3, s1
4394 ; CI-NEXT: v_mov_b32_e32 v1, s7
4395 ; CI-NEXT: s_mov_b32 s7, 0xf000
4396 ; CI-NEXT: s_mov_b32 s6, 0
4397 ; CI-NEXT: v_mov_b32_e32 v2, s0
4398 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4399 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
4400 ; CI-NEXT: s_waitcnt vmcnt(0)
4401 ; CI-NEXT: buffer_wbinvl1_vol
4404 ; VI-LABEL: atomic_xor_i64_addr64_offset:
4405 ; VI: ; %bb.0: ; %entry
4406 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4407 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
4408 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4409 ; VI-NEXT: v_mov_b32_e32 v0, s6
4410 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
4411 ; VI-NEXT: s_add_u32 s0, s4, s0
4412 ; VI-NEXT: s_addc_u32 s1, s5, s1
4413 ; VI-NEXT: s_add_u32 s0, s0, 32
4414 ; VI-NEXT: s_addc_u32 s1, s1, 0
4415 ; VI-NEXT: v_mov_b32_e32 v3, s1
4416 ; VI-NEXT: v_mov_b32_e32 v1, s7
4417 ; VI-NEXT: v_mov_b32_e32 v2, s0
4418 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4419 ; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
4420 ; VI-NEXT: s_waitcnt vmcnt(0)
4421 ; VI-NEXT: buffer_wbinvl1_vol
4424 ; GFX9-LABEL: atomic_xor_i64_addr64_offset:
4425 ; GFX9: ; %bb.0: ; %entry
4426 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4427 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4428 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4429 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4430 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4431 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
4432 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4433 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4434 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4435 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4436 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1] offset:32
4437 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4438 ; GFX9-NEXT: buffer_wbinvl1_vol
4439 ; GFX9-NEXT: s_endpgm
4441 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4442 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
4443 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
4447 define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
4448 ; CI-LABEL: atomic_xor_i64_ret_addr64_offset:
4449 ; CI: ; %bb.0: ; %entry
4450 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
4451 ; CI-NEXT: s_mov_b32 s11, 0xf000
4452 ; CI-NEXT: s_mov_b32 s10, -1
4453 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4454 ; CI-NEXT: v_mov_b32_e32 v0, s4
4455 ; CI-NEXT: v_mov_b32_e32 v1, s5
4456 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4457 ; CI-NEXT: v_mov_b32_e32 v2, s4
4458 ; CI-NEXT: s_mov_b32 s8, s2
4459 ; CI-NEXT: s_mov_b32 s9, s3
4460 ; CI-NEXT: s_mov_b32 s2, 0
4461 ; CI-NEXT: s_mov_b32 s3, s11
4462 ; CI-NEXT: v_mov_b32_e32 v3, s5
4463 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4464 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32 glc
4465 ; CI-NEXT: s_waitcnt vmcnt(0)
4466 ; CI-NEXT: buffer_wbinvl1_vol
4467 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4470 ; VI-LABEL: atomic_xor_i64_ret_addr64_offset:
4471 ; VI: ; %bb.0: ; %entry
4472 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4473 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4474 ; VI-NEXT: v_mov_b32_e32 v0, s4
4475 ; VI-NEXT: v_mov_b32_e32 v1, s5
4476 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4477 ; VI-NEXT: s_add_u32 s0, s0, s4
4478 ; VI-NEXT: s_addc_u32 s1, s1, s5
4479 ; VI-NEXT: s_add_u32 s0, s0, 32
4480 ; VI-NEXT: s_addc_u32 s1, s1, 0
4481 ; VI-NEXT: v_mov_b32_e32 v3, s1
4482 ; VI-NEXT: v_mov_b32_e32 v2, s0
4483 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4484 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
4485 ; VI-NEXT: s_waitcnt vmcnt(0)
4486 ; VI-NEXT: buffer_wbinvl1_vol
4487 ; VI-NEXT: s_mov_b32 s7, 0xf000
4488 ; VI-NEXT: s_mov_b32 s6, -1
4489 ; VI-NEXT: s_mov_b32 s4, s2
4490 ; VI-NEXT: s_mov_b32 s5, s3
4491 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4494 ; GFX9-LABEL: atomic_xor_i64_ret_addr64_offset:
4495 ; GFX9: ; %bb.0: ; %entry
4496 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4497 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4498 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4499 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4500 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4501 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4502 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4503 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4504 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4505 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] offset:32 glc
4506 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4507 ; GFX9-NEXT: buffer_wbinvl1_vol
4508 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
4509 ; GFX9-NEXT: s_endpgm
4511 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4512 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
4513 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
4514 store i64 %tmp0, ptr addrspace(1) %out2
4518 define amdgpu_kernel void @atomic_xor_i64(ptr addrspace(1) %out, i64 %in) {
4519 ; CI-LABEL: atomic_xor_i64:
4520 ; CI: ; %bb.0: ; %entry
4521 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4522 ; CI-NEXT: s_mov_b32 s7, 0xf000
4523 ; CI-NEXT: s_mov_b32 s6, -1
4524 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4525 ; CI-NEXT: s_mov_b32 s4, s0
4526 ; CI-NEXT: s_mov_b32 s5, s1
4527 ; CI-NEXT: v_mov_b32_e32 v0, s2
4528 ; CI-NEXT: v_mov_b32_e32 v1, s3
4529 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4530 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
4531 ; CI-NEXT: s_waitcnt vmcnt(0)
4532 ; CI-NEXT: buffer_wbinvl1_vol
4535 ; VI-LABEL: atomic_xor_i64:
4536 ; VI: ; %bb.0: ; %entry
4537 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4538 ; VI-NEXT: s_mov_b32 s7, 0xf000
4539 ; VI-NEXT: s_mov_b32 s6, -1
4540 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4541 ; VI-NEXT: s_mov_b32 s4, s0
4542 ; VI-NEXT: s_mov_b32 s5, s1
4543 ; VI-NEXT: v_mov_b32_e32 v0, s2
4544 ; VI-NEXT: v_mov_b32_e32 v1, s3
4545 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4546 ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0
4547 ; VI-NEXT: s_waitcnt vmcnt(0)
4548 ; VI-NEXT: buffer_wbinvl1_vol
4551 ; GFX9-LABEL: atomic_xor_i64:
4552 ; GFX9: ; %bb.0: ; %entry
4553 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4554 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4555 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4556 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4557 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4558 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4559 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1]
4560 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4561 ; GFX9-NEXT: buffer_wbinvl1_vol
4562 ; GFX9-NEXT: s_endpgm
4564 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
4568 define amdgpu_kernel void @atomic_xor_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
4569 ; CI-LABEL: atomic_xor_i64_ret:
4570 ; CI: ; %bb.0: ; %entry
4571 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4572 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4573 ; CI-NEXT: s_mov_b32 s3, 0xf000
4574 ; CI-NEXT: s_mov_b32 s2, -1
4575 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4576 ; CI-NEXT: s_mov_b32 s0, s4
4577 ; CI-NEXT: s_mov_b32 s1, s5
4578 ; CI-NEXT: v_mov_b32_e32 v0, s8
4579 ; CI-NEXT: v_mov_b32_e32 v1, s9
4580 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4581 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc
4582 ; CI-NEXT: s_waitcnt vmcnt(0)
4583 ; CI-NEXT: buffer_wbinvl1_vol
4584 ; CI-NEXT: s_mov_b32 s0, s6
4585 ; CI-NEXT: s_mov_b32 s1, s7
4586 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4589 ; VI-LABEL: atomic_xor_i64_ret:
4590 ; VI: ; %bb.0: ; %entry
4591 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4592 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4593 ; VI-NEXT: s_mov_b32 s3, 0xf000
4594 ; VI-NEXT: s_mov_b32 s2, -1
4595 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4596 ; VI-NEXT: s_mov_b32 s0, s4
4597 ; VI-NEXT: s_mov_b32 s1, s5
4598 ; VI-NEXT: v_mov_b32_e32 v0, s8
4599 ; VI-NEXT: v_mov_b32_e32 v1, s9
4600 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4601 ; VI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 glc
4602 ; VI-NEXT: s_waitcnt vmcnt(0)
4603 ; VI-NEXT: buffer_wbinvl1_vol
4604 ; VI-NEXT: s_mov_b32 s0, s6
4605 ; VI-NEXT: s_mov_b32 s1, s7
4606 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
4609 ; GFX9-LABEL: atomic_xor_i64_ret:
4610 ; GFX9: ; %bb.0: ; %entry
4611 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4612 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4613 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4614 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4615 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4616 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4617 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4618 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[4:5] glc
4619 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4620 ; GFX9-NEXT: buffer_wbinvl1_vol
4621 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
4622 ; GFX9-NEXT: s_endpgm
4624 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %out, i64 %in syncscope("agent") seq_cst
4625 store i64 %tmp0, ptr addrspace(1) %out2
4629 define amdgpu_kernel void @atomic_xor_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index) {
4630 ; CI-LABEL: atomic_xor_i64_addr64:
4631 ; CI: ; %bb.0: ; %entry
4632 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4633 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
4634 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4635 ; CI-NEXT: v_mov_b32_e32 v0, s6
4636 ; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
4637 ; CI-NEXT: v_mov_b32_e32 v3, s1
4638 ; CI-NEXT: v_mov_b32_e32 v1, s7
4639 ; CI-NEXT: s_mov_b32 s7, 0xf000
4640 ; CI-NEXT: s_mov_b32 s6, 0
4641 ; CI-NEXT: v_mov_b32_e32 v2, s0
4642 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4643 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[4:7], 0 addr64
4644 ; CI-NEXT: s_waitcnt vmcnt(0)
4645 ; CI-NEXT: buffer_wbinvl1_vol
4648 ; VI-LABEL: atomic_xor_i64_addr64:
4649 ; VI: ; %bb.0: ; %entry
4650 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4651 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
4652 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4653 ; VI-NEXT: v_mov_b32_e32 v0, s6
4654 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
4655 ; VI-NEXT: s_add_u32 s0, s4, s0
4656 ; VI-NEXT: s_addc_u32 s1, s5, s1
4657 ; VI-NEXT: v_mov_b32_e32 v3, s1
4658 ; VI-NEXT: v_mov_b32_e32 v1, s7
4659 ; VI-NEXT: v_mov_b32_e32 v2, s0
4660 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4661 ; VI-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1]
4662 ; VI-NEXT: s_waitcnt vmcnt(0)
4663 ; VI-NEXT: buffer_wbinvl1_vol
4666 ; GFX9-LABEL: atomic_xor_i64_addr64:
4667 ; GFX9: ; %bb.0: ; %entry
4668 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4669 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4670 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4671 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4672 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4673 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
4674 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4675 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4676 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4677 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4678 ; GFX9-NEXT: global_atomic_xor_x2 v2, v[0:1], s[0:1]
4679 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4680 ; GFX9-NEXT: buffer_wbinvl1_vol
4681 ; GFX9-NEXT: s_endpgm
4683 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4684 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
4688 define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index) {
4689 ; CI-LABEL: atomic_xor_i64_ret_addr64:
4690 ; CI: ; %bb.0: ; %entry
4691 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
4692 ; CI-NEXT: s_mov_b32 s11, 0xf000
4693 ; CI-NEXT: s_mov_b32 s10, -1
4694 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4695 ; CI-NEXT: v_mov_b32_e32 v0, s4
4696 ; CI-NEXT: v_mov_b32_e32 v1, s5
4697 ; CI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4698 ; CI-NEXT: v_mov_b32_e32 v2, s4
4699 ; CI-NEXT: s_mov_b32 s8, s2
4700 ; CI-NEXT: s_mov_b32 s9, s3
4701 ; CI-NEXT: s_mov_b32 s2, 0
4702 ; CI-NEXT: s_mov_b32 s3, s11
4703 ; CI-NEXT: v_mov_b32_e32 v3, s5
4704 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4705 ; CI-NEXT: buffer_atomic_xor_x2 v[0:1], v[2:3], s[0:3], 0 addr64 glc
4706 ; CI-NEXT: s_waitcnt vmcnt(0)
4707 ; CI-NEXT: buffer_wbinvl1_vol
4708 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4711 ; VI-LABEL: atomic_xor_i64_ret_addr64:
4712 ; VI: ; %bb.0: ; %entry
4713 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4714 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4715 ; VI-NEXT: v_mov_b32_e32 v0, s4
4716 ; VI-NEXT: v_mov_b32_e32 v1, s5
4717 ; VI-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4718 ; VI-NEXT: s_add_u32 s0, s0, s4
4719 ; VI-NEXT: s_addc_u32 s1, s1, s5
4720 ; VI-NEXT: v_mov_b32_e32 v3, s1
4721 ; VI-NEXT: v_mov_b32_e32 v2, s0
4722 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4723 ; VI-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc
4724 ; VI-NEXT: s_waitcnt vmcnt(0)
4725 ; VI-NEXT: buffer_wbinvl1_vol
4726 ; VI-NEXT: s_mov_b32 s7, 0xf000
4727 ; VI-NEXT: s_mov_b32 s6, -1
4728 ; VI-NEXT: s_mov_b32 s4, s2
4729 ; VI-NEXT: s_mov_b32 s5, s3
4730 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
4733 ; GFX9-LABEL: atomic_xor_i64_ret_addr64:
4734 ; GFX9: ; %bb.0: ; %entry
4735 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4736 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4737 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4738 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4739 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4740 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
4741 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4742 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4743 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4744 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v2, v[0:1], s[0:1] glc
4745 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4746 ; GFX9-NEXT: buffer_wbinvl1_vol
4747 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
4748 ; GFX9-NEXT: s_endpgm
4750 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
4751 %tmp0 = atomicrmw volatile xor ptr addrspace(1) %ptr, i64 %in syncscope("agent") seq_cst
4752 store i64 %tmp0, ptr addrspace(1) %out2
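; cmpxchg tests: the new value (%in) is placed in v[0:1] and the comparator
; (%old) in v[2:3], and the combined v[0:3] is fed to *_atomic_cmpswap_x2.
; The returning variants read the original memory value back from v[0:1].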
4756 define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr addrspace(1) %out, i64 %in, i64 %old) {
4757 ; CI-LABEL: atomic_cmpxchg_i64_offset:
4758 ; CI: ; %bb.0: ; %entry
4759 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4760 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4761 ; CI-NEXT: s_mov_b32 s3, 0xf000
4762 ; CI-NEXT: s_mov_b32 s2, -1
4763 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4764 ; CI-NEXT: s_mov_b32 s0, s4
4765 ; CI-NEXT: s_mov_b32 s1, s5
4766 ; CI-NEXT: v_mov_b32_e32 v0, s6
4767 ; CI-NEXT: v_mov_b32_e32 v1, s7
4768 ; CI-NEXT: v_mov_b32_e32 v2, s8
4769 ; CI-NEXT: v_mov_b32_e32 v3, s9
4770 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4771 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32
4772 ; CI-NEXT: s_waitcnt vmcnt(0)
4773 ; CI-NEXT: buffer_wbinvl1_vol
4776 ; VI-LABEL: atomic_cmpxchg_i64_offset:
4777 ; VI: ; %bb.0: ; %entry
4778 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4779 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4780 ; VI-NEXT: s_mov_b32 s3, 0xf000
4781 ; VI-NEXT: s_mov_b32 s2, -1
4782 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4783 ; VI-NEXT: s_mov_b32 s0, s4
4784 ; VI-NEXT: s_mov_b32 s1, s5
4785 ; VI-NEXT: v_mov_b32_e32 v0, s6
4786 ; VI-NEXT: v_mov_b32_e32 v1, s7
4787 ; VI-NEXT: v_mov_b32_e32 v2, s8
4788 ; VI-NEXT: v_mov_b32_e32 v3, s9
4789 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4790 ; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32
4791 ; VI-NEXT: s_waitcnt vmcnt(0)
4792 ; VI-NEXT: buffer_wbinvl1_vol
4795 ; GFX9-LABEL: atomic_cmpxchg_i64_offset:
4796 ; GFX9: ; %bb.0: ; %entry
4797 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4798 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4799 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4800 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4801 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4802 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4803 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
4804 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
4805 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4806 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:32
4807 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4808 ; GFX9-NEXT: buffer_wbinvl1_vol
4809 ; GFX9-NEXT: s_endpgm
4811 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4812 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
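; The 72000-byte offset (8 * 9000 = 0x11940) does not fit in an immediate
; offset field: CI and VI keep it in an SGPR used as the buffer soffset
; operand, while GFX9 splits it into a 0x11000 base in the address VGPR plus
; an offset:2368 immediate (0x11000 + 2368 = 72000).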
4816 define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr addrspace(1) %out, i64 %in, i64 %old) {
4817 ; CI-LABEL: atomic_cmpxchg_i64_soffset:
4818 ; CI: ; %bb.0: ; %entry
4819 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4820 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4821 ; CI-NEXT: s_mov_b32 s3, 0xf000
4822 ; CI-NEXT: s_mov_b32 s2, -1
4823 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4824 ; CI-NEXT: s_mov_b32 s0, s4
4825 ; CI-NEXT: s_mov_b32 s1, s5
4826 ; CI-NEXT: s_mov_b32 s4, 0x11940
4827 ; CI-NEXT: v_mov_b32_e32 v0, s6
4828 ; CI-NEXT: v_mov_b32_e32 v1, s7
4829 ; CI-NEXT: v_mov_b32_e32 v2, s8
4830 ; CI-NEXT: v_mov_b32_e32 v3, s9
4831 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4832 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4
4833 ; CI-NEXT: s_waitcnt vmcnt(0)
4834 ; CI-NEXT: buffer_wbinvl1_vol
4837 ; VI-LABEL: atomic_cmpxchg_i64_soffset:
4838 ; VI: ; %bb.0: ; %entry
4839 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4840 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4841 ; VI-NEXT: s_mov_b32 s3, 0xf000
4842 ; VI-NEXT: s_mov_b32 s2, -1
4843 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4844 ; VI-NEXT: s_mov_b32 s0, s4
4845 ; VI-NEXT: s_mov_b32 s1, s5
4846 ; VI-NEXT: s_mov_b32 s4, 0x11940
4847 ; VI-NEXT: v_mov_b32_e32 v0, s6
4848 ; VI-NEXT: v_mov_b32_e32 v1, s7
4849 ; VI-NEXT: v_mov_b32_e32 v2, s8
4850 ; VI-NEXT: v_mov_b32_e32 v3, s9
4851 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4852 ; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s4
4853 ; VI-NEXT: s_waitcnt vmcnt(0)
4854 ; VI-NEXT: buffer_wbinvl1_vol
4857 ; GFX9-LABEL: atomic_cmpxchg_i64_soffset:
4858 ; GFX9: ; %bb.0: ; %entry
4859 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4860 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4861 ; GFX9-NEXT: v_mov_b32_e32 v4, 0x11000
4862 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4863 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4864 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4865 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
4866 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
4867 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4868 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5] offset:2368
4869 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4870 ; GFX9-NEXT: buffer_wbinvl1_vol
4871 ; GFX9-NEXT: s_endpgm
4873 %gep = getelementptr i64, ptr addrspace(1) %out, i64 9000
4874 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
4878 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) {
4879 ; CI-LABEL: atomic_cmpxchg_i64_ret_offset:
4880 ; CI: ; %bb.0: ; %entry
4881 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
4882 ; CI-NEXT: s_mov_b32 s11, 0xf000
4883 ; CI-NEXT: s_mov_b32 s10, -1
4884 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4885 ; CI-NEXT: s_mov_b32 s8, s2
4886 ; CI-NEXT: s_mov_b32 s9, s3
4887 ; CI-NEXT: s_mov_b32 s2, s10
4888 ; CI-NEXT: s_mov_b32 s3, s11
4889 ; CI-NEXT: v_mov_b32_e32 v0, s4
4890 ; CI-NEXT: v_mov_b32_e32 v1, s5
4891 ; CI-NEXT: v_mov_b32_e32 v2, s6
4892 ; CI-NEXT: v_mov_b32_e32 v3, s7
4893 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4894 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 glc
4895 ; CI-NEXT: s_waitcnt vmcnt(0)
4896 ; CI-NEXT: buffer_wbinvl1_vol
4897 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4900 ; VI-LABEL: atomic_cmpxchg_i64_ret_offset:
4901 ; VI: ; %bb.0: ; %entry
4902 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4903 ; VI-NEXT: s_mov_b32 s11, 0xf000
4904 ; VI-NEXT: s_mov_b32 s10, -1
4905 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4906 ; VI-NEXT: s_mov_b32 s8, s2
4907 ; VI-NEXT: s_mov_b32 s9, s3
4908 ; VI-NEXT: s_mov_b32 s2, s10
4909 ; VI-NEXT: s_mov_b32 s3, s11
4910 ; VI-NEXT: v_mov_b32_e32 v0, s4
4911 ; VI-NEXT: v_mov_b32_e32 v1, s5
4912 ; VI-NEXT: v_mov_b32_e32 v2, s6
4913 ; VI-NEXT: v_mov_b32_e32 v3, s7
4914 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4915 ; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:32 glc
4916 ; VI-NEXT: s_waitcnt vmcnt(0)
4917 ; VI-NEXT: buffer_wbinvl1_vol
4918 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
4921 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_offset:
4922 ; GFX9: ; %bb.0: ; %entry
4923 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4924 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4925 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4926 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
4927 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
4928 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
4929 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
4930 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4931 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] offset:32 glc
4932 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4933 ; GFX9-NEXT: buffer_wbinvl1_vol
4934 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
4935 ; GFX9-NEXT: s_endpgm
4937 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
4938 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
4939 %extract0 = extractvalue { i64, i1 } %val, 0
4940 store i64 %extract0, ptr addrspace(1) %out2
4944 define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) {
4945 ; CI-LABEL: atomic_cmpxchg_i64_addr64_offset:
4946 ; CI: ; %bb.0: ; %entry
4947 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
4948 ; CI-NEXT: s_mov_b32 s11, 0xf000
4949 ; CI-NEXT: s_mov_b32 s10, 0
4950 ; CI-NEXT: s_waitcnt lgkmcnt(0)
4951 ; CI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4952 ; CI-NEXT: v_mov_b32_e32 v4, s4
4953 ; CI-NEXT: s_mov_b64 s[8:9], s[0:1]
4954 ; CI-NEXT: v_mov_b32_e32 v0, s2
4955 ; CI-NEXT: v_mov_b32_e32 v1, s3
4956 ; CI-NEXT: v_mov_b32_e32 v2, s6
4957 ; CI-NEXT: v_mov_b32_e32 v3, s7
4958 ; CI-NEXT: v_mov_b32_e32 v5, s5
4959 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4960 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64 offset:32
4961 ; CI-NEXT: s_waitcnt vmcnt(0)
4962 ; CI-NEXT: buffer_wbinvl1_vol
4965 ; VI-LABEL: atomic_cmpxchg_i64_addr64_offset:
4966 ; VI: ; %bb.0: ; %entry
4967 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4968 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4969 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4970 ; VI-NEXT: s_add_u32 s0, s0, s4
4971 ; VI-NEXT: s_addc_u32 s1, s1, s5
4972 ; VI-NEXT: s_add_u32 s0, s0, 32
4973 ; VI-NEXT: s_addc_u32 s1, s1, 0
4974 ; VI-NEXT: v_mov_b32_e32 v5, s1
4975 ; VI-NEXT: v_mov_b32_e32 v0, s2
4976 ; VI-NEXT: v_mov_b32_e32 v1, s3
4977 ; VI-NEXT: v_mov_b32_e32 v2, s6
4978 ; VI-NEXT: v_mov_b32_e32 v3, s7
4979 ; VI-NEXT: v_mov_b32_e32 v4, s0
4980 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4981 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
4982 ; VI-NEXT: s_waitcnt vmcnt(0)
4983 ; VI-NEXT: buffer_wbinvl1_vol
4986 ; GFX9-LABEL: atomic_cmpxchg_i64_addr64_offset:
4987 ; GFX9: ; %bb.0: ; %entry
4988 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
4989 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
4990 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4991 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
4992 ; GFX9-NEXT: s_add_u32 s0, s0, s4
4993 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4994 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4995 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
4996 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
4997 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
4998 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4999 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1] offset:32
5000 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5001 ; GFX9-NEXT: buffer_wbinvl1_vol
5002 ; GFX9-NEXT: s_endpgm
5004 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5005 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
5006 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
5010 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) {
5011 ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
5012 ; CI: ; %bb.0: ; %entry
5013 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
5014 ; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
5015 ; CI-NEXT: s_mov_b32 s3, 0xf000
5016 ; CI-NEXT: s_mov_b32 s2, -1
5017 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5018 ; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3
5019 ; CI-NEXT: v_mov_b32_e32 v4, s10
5020 ; CI-NEXT: s_mov_b32 s0, s6
5021 ; CI-NEXT: s_mov_b32 s1, s7
5022 ; CI-NEXT: s_mov_b32 s6, 0
5023 ; CI-NEXT: s_mov_b32 s7, s3
5024 ; CI-NEXT: v_mov_b32_e32 v0, s8
5025 ; CI-NEXT: v_mov_b32_e32 v1, s9
5026 ; CI-NEXT: v_mov_b32_e32 v2, s12
5027 ; CI-NEXT: v_mov_b32_e32 v3, s13
5028 ; CI-NEXT: v_mov_b32_e32 v5, s11
5029 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5030 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 offset:32 glc
5031 ; CI-NEXT: s_waitcnt vmcnt(0)
5032 ; CI-NEXT: buffer_wbinvl1_vol
5033 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5036 ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
5037 ; VI: ; %bb.0: ; %entry
5038 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
5039 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
5040 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5041 ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
5042 ; VI-NEXT: v_mov_b32_e32 v2, s0
5043 ; VI-NEXT: s_add_u32 s0, s4, s2
5044 ; VI-NEXT: s_addc_u32 s3, s5, s3
5045 ; VI-NEXT: s_add_u32 s2, s0, 32
5046 ; VI-NEXT: s_addc_u32 s3, s3, 0
5047 ; VI-NEXT: v_mov_b32_e32 v5, s3
5048 ; VI-NEXT: v_mov_b32_e32 v0, s8
5049 ; VI-NEXT: v_mov_b32_e32 v1, s9
5050 ; VI-NEXT: v_mov_b32_e32 v3, s1
5051 ; VI-NEXT: v_mov_b32_e32 v4, s2
5052 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5053 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5054 ; VI-NEXT: s_waitcnt vmcnt(0)
5055 ; VI-NEXT: buffer_wbinvl1_vol
5056 ; VI-NEXT: s_mov_b32 s3, 0xf000
5057 ; VI-NEXT: s_mov_b32 s2, -1
5058 ; VI-NEXT: s_mov_b32 s0, s6
5059 ; VI-NEXT: s_mov_b32 s1, s7
5060 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5063 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64_offset:
5064 ; GFX9: ; %bb.0: ; %entry
5065 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
5066 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5067 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
5068 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5069 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
5070 ; GFX9-NEXT: s_add_u32 s2, s4, s2
5071 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
5072 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
5073 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
5074 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
5075 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5076 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5077 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] offset:32 glc
5078 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5079 ; GFX9-NEXT: buffer_wbinvl1_vol
5080 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
5081 ; GFX9-NEXT: s_endpgm
5083 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5084 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
5085 %val = cmpxchg volatile ptr addrspace(1) %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
5086 %extract0 = extractvalue { i64, i1 } %val, 0
5087 store i64 %extract0, ptr addrspace(1) %out2
5091 define amdgpu_kernel void @atomic_cmpxchg_i64(ptr addrspace(1) %out, i64 %in, i64 %old) {
5092 ; CI-LABEL: atomic_cmpxchg_i64:
5093 ; CI: ; %bb.0: ; %entry
5094 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5095 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5096 ; CI-NEXT: s_mov_b32 s3, 0xf000
5097 ; CI-NEXT: s_mov_b32 s2, -1
5098 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5099 ; CI-NEXT: s_mov_b32 s0, s4
5100 ; CI-NEXT: s_mov_b32 s1, s5
5101 ; CI-NEXT: v_mov_b32_e32 v0, s6
5102 ; CI-NEXT: v_mov_b32_e32 v1, s7
5103 ; CI-NEXT: v_mov_b32_e32 v2, s8
5104 ; CI-NEXT: v_mov_b32_e32 v3, s9
5105 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5106 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0
5107 ; CI-NEXT: s_waitcnt vmcnt(0)
5108 ; CI-NEXT: buffer_wbinvl1_vol
5111 ; VI-LABEL: atomic_cmpxchg_i64:
5112 ; VI: ; %bb.0: ; %entry
5113 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5114 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
5115 ; VI-NEXT: s_mov_b32 s3, 0xf000
5116 ; VI-NEXT: s_mov_b32 s2, -1
5117 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5118 ; VI-NEXT: s_mov_b32 s0, s4
5119 ; VI-NEXT: s_mov_b32 s1, s5
5120 ; VI-NEXT: v_mov_b32_e32 v0, s6
5121 ; VI-NEXT: v_mov_b32_e32 v1, s7
5122 ; VI-NEXT: v_mov_b32_e32 v2, s8
5123 ; VI-NEXT: v_mov_b32_e32 v3, s9
5124 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5125 ; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0
5126 ; VI-NEXT: s_waitcnt vmcnt(0)
5127 ; VI-NEXT: buffer_wbinvl1_vol
5130 ; GFX9-LABEL: atomic_cmpxchg_i64:
5131 ; GFX9: ; %bb.0: ; %entry
5132 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5133 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5134 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5135 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5136 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
5137 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
5138 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
5139 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
5140 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5141 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[4:5]
5142 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5143 ; GFX9-NEXT: buffer_wbinvl1_vol
5144 ; GFX9-NEXT: s_endpgm
5146 %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
5150 define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %old) {
5151 ; CI-LABEL: atomic_cmpxchg_i64_ret:
5152 ; CI: ; %bb.0: ; %entry
5153 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
5154 ; CI-NEXT: s_mov_b32 s11, 0xf000
5155 ; CI-NEXT: s_mov_b32 s10, -1
5156 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5157 ; CI-NEXT: s_mov_b32 s8, s0
5158 ; CI-NEXT: s_mov_b32 s9, s1
5159 ; CI-NEXT: v_mov_b32_e32 v0, s4
5160 ; CI-NEXT: v_mov_b32_e32 v1, s5
5161 ; CI-NEXT: v_mov_b32_e32 v2, s6
5162 ; CI-NEXT: v_mov_b32_e32 v3, s7
5163 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5164 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
5165 ; CI-NEXT: s_waitcnt vmcnt(0)
5166 ; CI-NEXT: buffer_wbinvl1_vol
5167 ; CI-NEXT: s_mov_b32 s8, s2
5168 ; CI-NEXT: s_mov_b32 s9, s3
5169 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5172 ; VI-LABEL: atomic_cmpxchg_i64_ret:
5173 ; VI: ; %bb.0: ; %entry
5174 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5175 ; VI-NEXT: s_mov_b32 s11, 0xf000
5176 ; VI-NEXT: s_mov_b32 s10, -1
5177 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5178 ; VI-NEXT: s_mov_b32 s8, s0
5179 ; VI-NEXT: s_mov_b32 s9, s1
5180 ; VI-NEXT: v_mov_b32_e32 v0, s4
5181 ; VI-NEXT: v_mov_b32_e32 v1, s5
5182 ; VI-NEXT: v_mov_b32_e32 v2, s6
5183 ; VI-NEXT: v_mov_b32_e32 v3, s7
5184 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5185 ; VI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[8:11], 0 glc
5186 ; VI-NEXT: s_waitcnt vmcnt(0)
5187 ; VI-NEXT: buffer_wbinvl1_vol
5188 ; VI-NEXT: s_mov_b32 s8, s2
5189 ; VI-NEXT: s_mov_b32 s9, s3
5190 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0
5193 ; GFX9-LABEL: atomic_cmpxchg_i64_ret:
5194 ; GFX9: ; %bb.0: ; %entry
5195 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5196 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5197 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5198 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5199 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5200 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
5201 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
5202 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5203 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
5204 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5205 ; GFX9-NEXT: buffer_wbinvl1_vol
5206 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[2:3]
5207 ; GFX9-NEXT: s_endpgm
5209 %val = cmpxchg volatile ptr addrspace(1) %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
5210 %extract0 = extractvalue { i64, i1 } %val, 0
5211 store i64 %extract0, ptr addrspace(1) %out2
5215 define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr addrspace(1) %out, i64 %in, i64 %index, i64 %old) {
5216 ; CI-LABEL: atomic_cmpxchg_i64_addr64:
5217 ; CI: ; %bb.0: ; %entry
5218 ; CI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9
5219 ; CI-NEXT: s_mov_b32 s11, 0xf000
5220 ; CI-NEXT: s_mov_b32 s10, 0
5221 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5222 ; CI-NEXT: s_mov_b64 s[8:9], s[0:1]
5223 ; CI-NEXT: s_lshl_b64 s[0:1], s[4:5], 3
5224 ; CI-NEXT: v_mov_b32_e32 v5, s1
5225 ; CI-NEXT: v_mov_b32_e32 v0, s2
5226 ; CI-NEXT: v_mov_b32_e32 v1, s3
5227 ; CI-NEXT: v_mov_b32_e32 v2, s6
5228 ; CI-NEXT: v_mov_b32_e32 v3, s7
5229 ; CI-NEXT: v_mov_b32_e32 v4, s0
5230 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5231 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[8:11], 0 addr64
5232 ; CI-NEXT: s_waitcnt vmcnt(0)
5233 ; CI-NEXT: buffer_wbinvl1_vol
5236 ; VI-LABEL: atomic_cmpxchg_i64_addr64:
5237 ; VI: ; %bb.0: ; %entry
5238 ; VI-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5239 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5240 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5241 ; VI-NEXT: s_add_u32 s0, s0, s4
5242 ; VI-NEXT: s_addc_u32 s1, s1, s5
5243 ; VI-NEXT: v_mov_b32_e32 v5, s1
5244 ; VI-NEXT: v_mov_b32_e32 v0, s2
5245 ; VI-NEXT: v_mov_b32_e32 v1, s3
5246 ; VI-NEXT: v_mov_b32_e32 v2, s6
5247 ; VI-NEXT: v_mov_b32_e32 v3, s7
5248 ; VI-NEXT: v_mov_b32_e32 v4, s0
5249 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5250 ; VI-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3]
5251 ; VI-NEXT: s_waitcnt vmcnt(0)
5252 ; VI-NEXT: buffer_wbinvl1_vol
5255 ; GFX9-LABEL: atomic_cmpxchg_i64_addr64:
5256 ; GFX9: ; %bb.0: ; %entry
5257 ; GFX9-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24
5258 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5259 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5260 ; GFX9-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5261 ; GFX9-NEXT: s_add_u32 s0, s0, s4
5262 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
5263 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
5264 ; GFX9-NEXT: v_mov_b32_e32 v2, s6
5265 ; GFX9-NEXT: s_addc_u32 s1, s1, s5
5266 ; GFX9-NEXT: v_mov_b32_e32 v3, s7
5267 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5268 ; GFX9-NEXT: global_atomic_cmpswap_x2 v4, v[0:3], s[0:1]
5269 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5270 ; GFX9-NEXT: buffer_wbinvl1_vol
5271 ; GFX9-NEXT: s_endpgm
5273 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5274 %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
5278 define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in, i64 %index, i64 %old) {
5279 ; CI-LABEL: atomic_cmpxchg_i64_ret_addr64:
5280 ; CI: ; %bb.0: ; %entry
5281 ; CI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x9
5282 ; CI-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x11
5283 ; CI-NEXT: s_mov_b32 s3, 0xf000
5284 ; CI-NEXT: s_mov_b32 s2, -1
5285 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5286 ; CI-NEXT: s_lshl_b64 s[10:11], s[10:11], 3
5287 ; CI-NEXT: v_mov_b32_e32 v4, s10
5288 ; CI-NEXT: s_mov_b32 s0, s6
5289 ; CI-NEXT: s_mov_b32 s1, s7
5290 ; CI-NEXT: s_mov_b32 s6, 0
5291 ; CI-NEXT: s_mov_b32 s7, s3
5292 ; CI-NEXT: v_mov_b32_e32 v0, s8
5293 ; CI-NEXT: v_mov_b32_e32 v1, s9
5294 ; CI-NEXT: v_mov_b32_e32 v2, s12
5295 ; CI-NEXT: v_mov_b32_e32 v3, s13
5296 ; CI-NEXT: v_mov_b32_e32 v5, s11
5297 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5298 ; CI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v[4:5], s[4:7], 0 addr64 glc
5299 ; CI-NEXT: s_waitcnt vmcnt(0)
5300 ; CI-NEXT: buffer_wbinvl1_vol
5301 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5304 ; VI-LABEL: atomic_cmpxchg_i64_ret_addr64:
5305 ; VI: ; %bb.0: ; %entry
5306 ; VI-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
5307 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
5308 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5309 ; VI-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
5310 ; VI-NEXT: s_add_u32 s2, s4, s2
5311 ; VI-NEXT: s_addc_u32 s3, s5, s3
5312 ; VI-NEXT: v_mov_b32_e32 v5, s3
5313 ; VI-NEXT: v_mov_b32_e32 v0, s8
5314 ; VI-NEXT: v_mov_b32_e32 v1, s9
5315 ; VI-NEXT: v_mov_b32_e32 v2, s0
5316 ; VI-NEXT: v_mov_b32_e32 v3, s1
5317 ; VI-NEXT: v_mov_b32_e32 v4, s2
5318 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5319 ; VI-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
5320 ; VI-NEXT: s_waitcnt vmcnt(0)
5321 ; VI-NEXT: buffer_wbinvl1_vol
5322 ; VI-NEXT: s_mov_b32 s3, 0xf000
5323 ; VI-NEXT: s_mov_b32 s2, -1
5324 ; VI-NEXT: s_mov_b32 s0, s6
5325 ; VI-NEXT: s_mov_b32 s1, s7
5326 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5329 ; GFX9-LABEL: atomic_cmpxchg_i64_ret_addr64:
5330 ; GFX9: ; %bb.0: ; %entry
5331 ; GFX9-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x24
5332 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
5333 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x44
5334 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5335 ; GFX9-NEXT: s_lshl_b64 s[2:3], s[10:11], 3
5336 ; GFX9-NEXT: s_add_u32 s2, s4, s2
5337 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
5338 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
5339 ; GFX9-NEXT: s_addc_u32 s3, s5, s3
5340 ; GFX9-NEXT: v_mov_b32_e32 v2, s0
5341 ; GFX9-NEXT: v_mov_b32_e32 v3, s1
5342 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5343 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[2:3] glc
5344 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5345 ; GFX9-NEXT: buffer_wbinvl1_vol
5346 ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[6:7]
5347 ; GFX9-NEXT: s_endpgm
5349 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5350 %val = cmpxchg volatile ptr addrspace(1) %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst
5351 %extract0 = extractvalue { i64, i1 } %val, 0
5352 store i64 %extract0, ptr addrspace(1) %out2
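; Atomic load tests: seq_cst loads are selected as dwordx2 loads with glc set,
; followed by buffer_wbinvl1_vol to invalidate the vector L1 cache; the loaded
; value is then written out with an ordinary store.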
5356 define amdgpu_kernel void @atomic_load_i64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5357 ; CI-LABEL: atomic_load_i64_offset:
5358 ; CI: ; %bb.0: ; %entry
5359 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5360 ; CI-NEXT: s_mov_b32 s7, 0xf000
5361 ; CI-NEXT: s_mov_b32 s6, -1
5362 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5363 ; CI-NEXT: s_mov_b32 s4, s2
5364 ; CI-NEXT: s_mov_b32 s5, s3
5365 ; CI-NEXT: s_mov_b32 s2, s6
5366 ; CI-NEXT: s_mov_b32 s3, s7
5367 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5368 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:32 glc
5369 ; CI-NEXT: s_waitcnt vmcnt(0)
5370 ; CI-NEXT: buffer_wbinvl1_vol
5371 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5374 ; VI-LABEL: atomic_load_i64_offset:
5375 ; VI: ; %bb.0: ; %entry
5376 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5377 ; VI-NEXT: s_mov_b32 s7, 0xf000
5378 ; VI-NEXT: s_mov_b32 s6, -1
5379 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5380 ; VI-NEXT: s_add_u32 s0, s0, 32
5381 ; VI-NEXT: s_addc_u32 s1, s1, 0
5382 ; VI-NEXT: v_mov_b32_e32 v0, s0
5383 ; VI-NEXT: v_mov_b32_e32 v1, s1
5384 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5385 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
5386 ; VI-NEXT: s_waitcnt vmcnt(0)
5387 ; VI-NEXT: buffer_wbinvl1_vol
5388 ; VI-NEXT: s_mov_b32 s4, s2
5389 ; VI-NEXT: s_mov_b32 s5, s3
5390 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5393 ; GFX9-LABEL: atomic_load_i64_offset:
5394 ; GFX9: ; %bb.0: ; %entry
5395 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5396 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5397 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5398 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
5399 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5400 ; GFX9-NEXT: buffer_wbinvl1_vol
5401 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5402 ; GFX9-NEXT: s_endpgm
5404 %gep = getelementptr i64, ptr addrspace(1) %in, i64 4
5405 %val = load atomic i64, ptr addrspace(1) %gep seq_cst, align 8
5406 store i64 %val, ptr addrspace(1) %out
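; The -32 offset cannot be encoded as an unsigned MUBUF immediate, so CI
; materializes it in a VGPR address pair (0xffffffe0, -1) and uses addr64;
; VI adds it into the flat address; GFX9 encodes the signed offset:-32
; directly on the global load.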
5410 define amdgpu_kernel void @atomic_load_i64_neg_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5411 ; CI-LABEL: atomic_load_i64_neg_offset:
5412 ; CI: ; %bb.0: ; %entry
5413 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5414 ; CI-NEXT: s_mov_b32 s7, 0xf000
5415 ; CI-NEXT: v_mov_b32_e32 v0, 0xffffffe0
5416 ; CI-NEXT: v_mov_b32_e32 v1, -1
5417 ; CI-NEXT: s_mov_b32 s6, -1
5418 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5419 ; CI-NEXT: s_mov_b32 s4, s2
5420 ; CI-NEXT: s_mov_b32 s5, s3
5421 ; CI-NEXT: s_mov_b32 s2, 0
5422 ; CI-NEXT: s_mov_b32 s3, s7
5423 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5424 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], 0 addr64 glc
5425 ; CI-NEXT: s_waitcnt vmcnt(0)
5426 ; CI-NEXT: buffer_wbinvl1_vol
5427 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5430 ; VI-LABEL: atomic_load_i64_neg_offset:
5431 ; VI: ; %bb.0: ; %entry
5432 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5433 ; VI-NEXT: s_mov_b32 s7, 0xf000
5434 ; VI-NEXT: s_mov_b32 s6, -1
5435 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5436 ; VI-NEXT: s_add_u32 s0, s0, 0xffffffe0
5437 ; VI-NEXT: s_addc_u32 s1, s1, -1
5438 ; VI-NEXT: v_mov_b32_e32 v0, s0
5439 ; VI-NEXT: v_mov_b32_e32 v1, s1
5440 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5441 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
5442 ; VI-NEXT: s_waitcnt vmcnt(0)
5443 ; VI-NEXT: buffer_wbinvl1_vol
5444 ; VI-NEXT: s_mov_b32 s4, s2
5445 ; VI-NEXT: s_mov_b32 s5, s3
5446 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5449 ; GFX9-LABEL: atomic_load_i64_neg_offset:
5450 ; GFX9: ; %bb.0: ; %entry
5451 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5452 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5453 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5454 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:-32 glc
5455 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5456 ; GFX9-NEXT: buffer_wbinvl1_vol
5457 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5458 ; GFX9-NEXT: s_endpgm
5460 %gep = getelementptr i64, ptr addrspace(1) %in, i64 -4
5461 %val = load atomic i64, ptr addrspace(1) %gep seq_cst, align 8
5462 store i64 %val, ptr addrspace(1) %out
5466 define amdgpu_kernel void @atomic_load_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5467 ; CI-LABEL: atomic_load_i64:
5468 ; CI: ; %bb.0: ; %entry
5469 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5470 ; CI-NEXT: s_mov_b32 s7, 0xf000
5471 ; CI-NEXT: s_mov_b32 s6, -1
5472 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5473 ; CI-NEXT: s_mov_b32 s4, s0
5474 ; CI-NEXT: s_mov_b32 s5, s1
5475 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5476 ; CI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 glc
5477 ; CI-NEXT: s_waitcnt vmcnt(0)
5478 ; CI-NEXT: buffer_wbinvl1_vol
5479 ; CI-NEXT: s_mov_b32 s4, s2
5480 ; CI-NEXT: s_mov_b32 s5, s3
5481 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5484 ; VI-LABEL: atomic_load_i64:
5485 ; VI: ; %bb.0: ; %entry
5486 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5487 ; VI-NEXT: s_mov_b32 s7, 0xf000
5488 ; VI-NEXT: s_mov_b32 s6, -1
5489 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5490 ; VI-NEXT: v_mov_b32_e32 v0, s0
5491 ; VI-NEXT: v_mov_b32_e32 v1, s1
5492 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5493 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
5494 ; VI-NEXT: s_waitcnt vmcnt(0)
5495 ; VI-NEXT: buffer_wbinvl1_vol
5496 ; VI-NEXT: s_mov_b32 s4, s2
5497 ; VI-NEXT: s_mov_b32 s5, s3
5498 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5501 ; GFX9-LABEL: atomic_load_i64:
5502 ; GFX9: ; %bb.0: ; %entry
5503 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5504 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5505 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5506 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc
5507 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5508 ; GFX9-NEXT: buffer_wbinvl1_vol
5509 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5510 ; GFX9-NEXT: s_endpgm
5512 %val = load atomic i64, ptr addrspace(1) %in syncscope("agent") seq_cst, align 8
5513 store i64 %val, ptr addrspace(1) %out
5517 define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
5518 ; CI-LABEL: atomic_load_i64_addr64_offset:
5519 ; CI: ; %bb.0: ; %entry
5520 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5521 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5522 ; CI-NEXT: s_mov_b32 s3, 0xf000
5523 ; CI-NEXT: s_mov_b32 s2, -1
5524 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5525 ; CI-NEXT: s_mov_b32 s0, s6
5526 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3
5527 ; CI-NEXT: v_mov_b32_e32 v0, s8
5528 ; CI-NEXT: s_mov_b32 s1, s7
5529 ; CI-NEXT: s_mov_b32 s6, 0
5530 ; CI-NEXT: s_mov_b32 s7, s3
5531 ; CI-NEXT: v_mov_b32_e32 v1, s9
5532 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5533 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32 glc
5534 ; CI-NEXT: s_waitcnt vmcnt(0)
5535 ; CI-NEXT: buffer_wbinvl1_vol
5536 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5539 ; VI-LABEL: atomic_load_i64_addr64_offset:
5540 ; VI: ; %bb.0: ; %entry
5541 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5542 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5543 ; VI-NEXT: s_mov_b32 s7, 0xf000
5544 ; VI-NEXT: s_mov_b32 s6, -1
5545 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5546 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5547 ; VI-NEXT: s_add_u32 s0, s0, s4
5548 ; VI-NEXT: s_addc_u32 s1, s1, s5
5549 ; VI-NEXT: s_add_u32 s0, s0, 32
5550 ; VI-NEXT: s_addc_u32 s1, s1, 0
5551 ; VI-NEXT: v_mov_b32_e32 v0, s0
5552 ; VI-NEXT: v_mov_b32_e32 v1, s1
5553 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5554 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
5555 ; VI-NEXT: s_waitcnt vmcnt(0)
5556 ; VI-NEXT: buffer_wbinvl1_vol
5557 ; VI-NEXT: s_mov_b32 s4, s2
5558 ; VI-NEXT: s_mov_b32 s5, s3
5559 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5562 ; GFX9-LABEL: atomic_load_i64_addr64_offset:
5563 ; GFX9: ; %bb.0: ; %entry
5564 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5565 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5566 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5567 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5568 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5569 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5570 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5571 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5572 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
5573 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5574 ; GFX9-NEXT: buffer_wbinvl1_vol
5575 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
5576 ; GFX9-NEXT: s_endpgm
5578 %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
5579 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
5580 %val = load atomic i64, ptr addrspace(1) %gep seq_cst, align 8
5581 store i64 %val, ptr addrspace(1) %out
5585 define amdgpu_kernel void @atomic_load_i64_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
5586 ; CI-LABEL: atomic_load_i64_addr64:
5587 ; CI: ; %bb.0: ; %entry
5588 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5589 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5590 ; CI-NEXT: s_mov_b32 s3, 0xf000
5591 ; CI-NEXT: s_mov_b32 s2, -1
5592 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5593 ; CI-NEXT: s_mov_b32 s0, s6
5594 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3
5595 ; CI-NEXT: v_mov_b32_e32 v0, s8
5596 ; CI-NEXT: s_mov_b32 s1, s7
5597 ; CI-NEXT: s_mov_b32 s6, 0
5598 ; CI-NEXT: s_mov_b32 s7, s3
5599 ; CI-NEXT: v_mov_b32_e32 v1, s9
5600 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5601 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 glc
5602 ; CI-NEXT: s_waitcnt vmcnt(0)
5603 ; CI-NEXT: buffer_wbinvl1_vol
5604 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5607 ; VI-LABEL: atomic_load_i64_addr64:
5608 ; VI: ; %bb.0: ; %entry
5609 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5610 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5611 ; VI-NEXT: s_mov_b32 s7, 0xf000
5612 ; VI-NEXT: s_mov_b32 s6, -1
5613 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5614 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5615 ; VI-NEXT: s_add_u32 s0, s0, s4
5616 ; VI-NEXT: s_addc_u32 s1, s1, s5
5617 ; VI-NEXT: v_mov_b32_e32 v0, s0
5618 ; VI-NEXT: v_mov_b32_e32 v1, s1
5619 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5620 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
5621 ; VI-NEXT: s_waitcnt vmcnt(0)
5622 ; VI-NEXT: buffer_wbinvl1_vol
5623 ; VI-NEXT: s_mov_b32 s4, s2
5624 ; VI-NEXT: s_mov_b32 s5, s3
5625 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5628 ; GFX9-LABEL: atomic_load_i64_addr64:
5629 ; GFX9: ; %bb.0: ; %entry
5630 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5631 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5632 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5633 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5634 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5635 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5636 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5637 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5638 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] glc
5639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5640 ; GFX9-NEXT: buffer_wbinvl1_vol
5641 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
5642 ; GFX9-NEXT: s_endpgm
5644 %ptr = getelementptr i64, ptr addrspace(1) %in, i64 %index
5645 %val = load atomic i64, ptr addrspace(1) %ptr seq_cst, align 8
5646 store i64 %val, ptr addrspace(1) %out
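; The f64 atomic load generates the same code as the i64 addr64+offset
; variant above; only the IR types differ.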
5650 define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
5651 ; CI-LABEL: atomic_load_f64_addr64_offset:
5652 ; CI: ; %bb.0: ; %entry
5653 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5654 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5655 ; CI-NEXT: s_mov_b32 s3, 0xf000
5656 ; CI-NEXT: s_mov_b32 s2, -1
5657 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5658 ; CI-NEXT: s_mov_b32 s0, s6
5659 ; CI-NEXT: s_lshl_b64 s[8:9], s[8:9], 3
5660 ; CI-NEXT: v_mov_b32_e32 v0, s8
5661 ; CI-NEXT: s_mov_b32 s1, s7
5662 ; CI-NEXT: s_mov_b32 s6, 0
5663 ; CI-NEXT: s_mov_b32 s7, s3
5664 ; CI-NEXT: v_mov_b32_e32 v1, s9
5665 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5666 ; CI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[4:7], 0 addr64 offset:32 glc
5667 ; CI-NEXT: s_waitcnt vmcnt(0)
5668 ; CI-NEXT: buffer_wbinvl1_vol
5669 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
5672 ; VI-LABEL: atomic_load_f64_addr64_offset:
5673 ; VI: ; %bb.0: ; %entry
5674 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5675 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5676 ; VI-NEXT: s_mov_b32 s7, 0xf000
5677 ; VI-NEXT: s_mov_b32 s6, -1
5678 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5679 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
5680 ; VI-NEXT: s_add_u32 s0, s0, s4
5681 ; VI-NEXT: s_addc_u32 s1, s1, s5
5682 ; VI-NEXT: s_add_u32 s0, s0, 32
5683 ; VI-NEXT: s_addc_u32 s1, s1, 0
5684 ; VI-NEXT: v_mov_b32_e32 v0, s0
5685 ; VI-NEXT: v_mov_b32_e32 v1, s1
5686 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5687 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
5688 ; VI-NEXT: s_waitcnt vmcnt(0)
5689 ; VI-NEXT: buffer_wbinvl1_vol
5690 ; VI-NEXT: s_mov_b32 s4, s2
5691 ; VI-NEXT: s_mov_b32 s5, s3
5692 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5695 ; GFX9-LABEL: atomic_load_f64_addr64_offset:
5696 ; GFX9: ; %bb.0: ; %entry
5697 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5698 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5699 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5700 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5701 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5702 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5703 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5704 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5705 ; GFX9-NEXT: global_load_dwordx2 v[0:1], v2, s[0:1] offset:32 glc
5706 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5707 ; GFX9-NEXT: buffer_wbinvl1_vol
5708 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
5709 ; GFX9-NEXT: s_endpgm
5711 %ptr = getelementptr double, ptr addrspace(1) %in, i64 %index
5712 %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4
5713 %val = load atomic double, ptr addrspace(1) %gep seq_cst, align 8
5714 store double %val, ptr addrspace(1) %out
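; Atomic store tests: seq_cst stores lower to plain dwordx2 stores after an
; s_waitcnt; no glc bit or cache invalidate is emitted on the store side in
; these checks.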
5718 define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr addrspace(1) %out) {
5719 ; CI-LABEL: atomic_store_i64_offset:
5720 ; CI: ; %bb.0: ; %entry
5721 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5722 ; CI-NEXT: s_mov_b32 s7, 0xf000
5723 ; CI-NEXT: s_mov_b32 s6, -1
5724 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5725 ; CI-NEXT: v_mov_b32_e32 v0, s0
5726 ; CI-NEXT: v_mov_b32_e32 v1, s1
5727 ; CI-NEXT: s_mov_b32 s4, s2
5728 ; CI-NEXT: s_mov_b32 s5, s3
5729 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5730 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:32
5733 ; VI-LABEL: atomic_store_i64_offset:
5734 ; VI: ; %bb.0: ; %entry
5735 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5736 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5737 ; VI-NEXT: v_mov_b32_e32 v0, s0
5738 ; VI-NEXT: s_add_u32 s0, s2, 32
5739 ; VI-NEXT: v_mov_b32_e32 v1, s1
5740 ; VI-NEXT: s_addc_u32 s1, s3, 0
5741 ; VI-NEXT: v_mov_b32_e32 v3, s1
5742 ; VI-NEXT: v_mov_b32_e32 v2, s0
5743 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5744 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5747 ; GFX9-LABEL: atomic_store_i64_offset:
5748 ; GFX9: ; %bb.0: ; %entry
5749 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5750 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5751 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5752 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5753 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5754 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5755 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3] offset:32
5756 ; GFX9-NEXT: s_endpgm
5758 %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
5759 store atomic i64 %in, ptr addrspace(1) %gep seq_cst, align 8
5763 define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr addrspace(1) %out) {
5764 ; CI-LABEL: atomic_store_i64:
5765 ; CI: ; %bb.0: ; %entry
5766 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5767 ; CI-NEXT: s_mov_b32 s7, 0xf000
5768 ; CI-NEXT: s_mov_b32 s6, -1
5769 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5770 ; CI-NEXT: v_mov_b32_e32 v0, s0
5771 ; CI-NEXT: v_mov_b32_e32 v1, s1
5772 ; CI-NEXT: s_mov_b32 s4, s2
5773 ; CI-NEXT: s_mov_b32 s5, s3
5774 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5775 ; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
5778 ; VI-LABEL: atomic_store_i64:
5779 ; VI: ; %bb.0: ; %entry
5780 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5781 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5782 ; VI-NEXT: v_mov_b32_e32 v0, s0
5783 ; VI-NEXT: v_mov_b32_e32 v1, s1
5784 ; VI-NEXT: v_mov_b32_e32 v2, s2
5785 ; VI-NEXT: v_mov_b32_e32 v3, s3
5786 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5787 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5790 ; GFX9-LABEL: atomic_store_i64:
5791 ; GFX9: ; %bb.0: ; %entry
5792 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5793 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5794 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5795 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
5796 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
5797 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5798 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[2:3]
5799 ; GFX9-NEXT: s_endpgm
5801 store atomic i64 %in, ptr addrspace(1) %out seq_cst, align 8
5805 define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr addrspace(1) %out, i64 %index) {
5806 ; CI-LABEL: atomic_store_i64_addr64_offset:
5807 ; CI: ; %bb.0: ; %entry
5808 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5809 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
5810 ; CI-NEXT: s_mov_b32 s3, 0xf000
5811 ; CI-NEXT: s_mov_b32 s2, 0
5812 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5813 ; CI-NEXT: v_mov_b32_e32 v0, s4
5814 ; CI-NEXT: v_mov_b32_e32 v1, s5
5815 ; CI-NEXT: s_lshl_b64 s[4:5], s[0:1], 3
5816 ; CI-NEXT: v_mov_b32_e32 v2, s4
5817 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
5818 ; CI-NEXT: v_mov_b32_e32 v3, s5
5819 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5820 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
5823 ; VI-LABEL: atomic_store_i64_addr64_offset:
5824 ; VI: ; %bb.0: ; %entry
5825 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5826 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
5827 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5828 ; VI-NEXT: v_mov_b32_e32 v0, s4
5829 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
5830 ; VI-NEXT: s_add_u32 s0, s6, s0
5831 ; VI-NEXT: s_addc_u32 s1, s7, s1
5832 ; VI-NEXT: s_add_u32 s0, s0, 32
5833 ; VI-NEXT: s_addc_u32 s1, s1, 0
5834 ; VI-NEXT: v_mov_b32_e32 v3, s1
5835 ; VI-NEXT: v_mov_b32_e32 v1, s5
5836 ; VI-NEXT: v_mov_b32_e32 v2, s0
5837 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5838 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5841 ; GFX9-LABEL: atomic_store_i64_addr64_offset:
5842 ; GFX9: ; %bb.0: ; %entry
5843 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5844 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5845 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5846 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5847 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5848 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5849 ; GFX9-NEXT: s_add_u32 s0, s6, s0
5850 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5851 ; GFX9-NEXT: s_addc_u32 s1, s7, s1
5852 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5853 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] offset:32
5854 ; GFX9-NEXT: s_endpgm
5856 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5857 %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
5858 store atomic i64 %in, ptr addrspace(1) %gep seq_cst, align 8
5862 define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr addrspace(1) %out, i64 %index) {
5863 ; CI-LABEL: atomic_store_i64_addr64:
5864 ; CI: ; %bb.0: ; %entry
5865 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5866 ; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5867 ; CI-NEXT: s_mov_b32 s3, 0xf000
5868 ; CI-NEXT: s_mov_b32 s2, 0
5869 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5870 ; CI-NEXT: v_mov_b32_e32 v0, s4
5871 ; CI-NEXT: v_mov_b32_e32 v1, s5
5872 ; CI-NEXT: s_lshl_b64 s[4:5], s[8:9], 3
5873 ; CI-NEXT: v_mov_b32_e32 v2, s4
5874 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
5875 ; CI-NEXT: v_mov_b32_e32 v3, s5
5876 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5877 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64
5880 ; VI-LABEL: atomic_store_i64_addr64:
5881 ; VI: ; %bb.0: ; %entry
5882 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5883 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
5884 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5885 ; VI-NEXT: v_mov_b32_e32 v0, s4
5886 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
5887 ; VI-NEXT: s_add_u32 s0, s6, s0
5888 ; VI-NEXT: s_addc_u32 s1, s7, s1
5889 ; VI-NEXT: v_mov_b32_e32 v3, s1
5890 ; VI-NEXT: v_mov_b32_e32 v1, s5
5891 ; VI-NEXT: v_mov_b32_e32 v2, s0
5892 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5893 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5896 ; GFX9-LABEL: atomic_store_i64_addr64:
5897 ; GFX9: ; %bb.0: ; %entry
5898 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5899 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5900 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5901 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5902 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5903 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5904 ; GFX9-NEXT: s_add_u32 s0, s6, s0
5905 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5906 ; GFX9-NEXT: s_addc_u32 s1, s7, s1
5907 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5908 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
5909 ; GFX9-NEXT: s_endpgm
5911 %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
5912 store atomic i64 %in, ptr addrspace(1) %ptr seq_cst, align 8
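; The f64 atomic store mirrors @atomic_store_i64_addr64_offset above.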
5916 define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr addrspace(1) %out, i64 %index) {
5917 ; CI-LABEL: atomic_store_f64_addr64_offset:
5918 ; CI: ; %bb.0: ; %entry
5919 ; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5920 ; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
5921 ; CI-NEXT: s_mov_b32 s3, 0xf000
5922 ; CI-NEXT: s_mov_b32 s2, 0
5923 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5924 ; CI-NEXT: v_mov_b32_e32 v0, s4
5925 ; CI-NEXT: v_mov_b32_e32 v1, s5
5926 ; CI-NEXT: s_lshl_b64 s[4:5], s[0:1], 3
5927 ; CI-NEXT: v_mov_b32_e32 v2, s4
5928 ; CI-NEXT: s_mov_b64 s[0:1], s[6:7]
5929 ; CI-NEXT: v_mov_b32_e32 v3, s5
5930 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5931 ; CI-NEXT: buffer_store_dwordx2 v[0:1], v[2:3], s[0:3], 0 addr64 offset:32
5934 ; VI-LABEL: atomic_store_f64_addr64_offset:
5935 ; VI: ; %bb.0: ; %entry
5936 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5937 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
5938 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5939 ; VI-NEXT: v_mov_b32_e32 v0, s4
5940 ; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
5941 ; VI-NEXT: s_add_u32 s0, s6, s0
5942 ; VI-NEXT: s_addc_u32 s1, s7, s1
5943 ; VI-NEXT: s_add_u32 s0, s0, 32
5944 ; VI-NEXT: s_addc_u32 s1, s1, 0
5945 ; VI-NEXT: v_mov_b32_e32 v3, s1
5946 ; VI-NEXT: v_mov_b32_e32 v1, s5
5947 ; VI-NEXT: v_mov_b32_e32 v2, s0
5948 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5949 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
5952 ; GFX9-LABEL: atomic_store_f64_addr64_offset:
5953 ; GFX9: ; %bb.0: ; %entry
5954 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5955 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5956 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
5957 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5958 ; GFX9-NEXT: v_mov_b32_e32 v0, s4
5959 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
5960 ; GFX9-NEXT: s_add_u32 s0, s6, s0
5961 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
5962 ; GFX9-NEXT: s_addc_u32 s1, s7, s1
5963 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5964 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] offset:32
5965 ; GFX9-NEXT: s_endpgm
5967 %ptr = getelementptr double, ptr addrspace(1) %out, i64 %index
5968 %gep = getelementptr double, ptr addrspace(1) %ptr, i64 4
5969 store atomic double %in, ptr addrspace(1) %gep seq_cst, align 8
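; atomicrmw uinc_wrap selects the buffer/flat/global_atomic_inc_x2
; instructions.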
5973 define amdgpu_kernel void @atomic_inc_i64_offset(ptr addrspace(1) %out, i64 %in) {
5974 ; CI-LABEL: atomic_inc_i64_offset:
5975 ; CI: ; %bb.0: ; %entry
5976 ; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5977 ; CI-NEXT: s_waitcnt lgkmcnt(0)
5978 ; CI-NEXT: v_mov_b32_e32 v0, s2
5979 ; CI-NEXT: v_mov_b32_e32 v1, s3
5980 ; CI-NEXT: s_mov_b32 s3, 0xf000
5981 ; CI-NEXT: s_mov_b32 s2, -1
5982 ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5983 ; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32
5984 ; CI-NEXT: s_waitcnt vmcnt(0)
5985 ; CI-NEXT: buffer_wbinvl1_vol
5988 ; VI-LABEL: atomic_inc_i64_offset:
5989 ; VI: ; %bb.0: ; %entry
5990 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5991 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5992 ; VI-NEXT: v_mov_b32_e32 v0, s2
5993 ; VI-NEXT: v_mov_b32_e32 v1, s3
5994 ; VI-NEXT: s_mov_b32 s3, 0xf000
5995 ; VI-NEXT: s_mov_b32 s2, -1
5996 ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5997 ; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:32
5998 ; VI-NEXT: s_waitcnt vmcnt(0)
5999 ; VI-NEXT: buffer_wbinvl1_vol
; GFX9-LABEL: atomic_inc_i64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
  ret void
}

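; As above, but the old value returned by uinc_wrap is stored to %out2.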
define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
; CI-LABEL: atomic_inc_i64_ret_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s0, s6
; CI-NEXT: s_mov_b32 s1, s7
; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: s_mov_b32 s6, s2
; CI-NEXT: s_mov_b32 s7, s3
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: atomic_inc_i64_ret_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_inc_i64_ret_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr addrspace(1) %out2
  ret void
}

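; uinc_wrap through a global pointer indexed by %index plus a constant 32-byte offset; the result is unused.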
define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
; CI-LABEL: atomic_inc_i64_incr64_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s6
; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v1, s7
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_atomic_inc_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: atomic_inc_i64_incr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_inc_i64_incr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_inc_x2 v2, v[0:1], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
  %tmp0 = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
  ret void
}

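; 64-bit udec_wrap at a constant 32-byte offset from %out; the result is unused.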
define amdgpu_kernel void @atomic_dec_i64_offset(ptr addrspace(1) %out, i64 %in) {
; CI-LABEL: atomic_dec_i64_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: atomic_dec_i64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:32
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_dec_i64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
  ret void
}

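; As above, but the old value returned by udec_wrap is stored to %out2.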
define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i64 %in) {
; CI-LABEL: atomic_dec_i64_ret_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_mov_b32 s0, s6
; CI-NEXT: s_mov_b32 s1, s7
; CI-NEXT: v_mov_b32_e32 v0, s8
; CI-NEXT: v_mov_b32_e32 v1, s9
; CI-NEXT: s_mov_b32 s6, s2
; CI-NEXT: s_mov_b32 s7, s3
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; VI-LABEL: atomic_dec_i64_ret_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
; VI-NEXT: s_mov_b32 s3, 0xf000
; VI-NEXT: s_mov_b32 s2, -1
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_mov_b32 s0, s6
; VI-NEXT: s_mov_b32 s1, s7
; VI-NEXT: v_mov_b32_e32 v0, s8
; VI-NEXT: v_mov_b32_e32 v1, s9
; VI-NEXT: s_mov_b32 s6, s2
; VI-NEXT: s_mov_b32 s7, s3
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_dec_i64_ret_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_dec_x2 v[0:1], v2, v[0:1], s[4:5] offset:32 glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
; GFX9-NEXT: s_endpgm
entry:
  %gep = getelementptr i64, ptr addrspace(1) %out, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst
  store i64 %tmp0, ptr addrspace(1) %out2
  ret void
}

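; udec_wrap through a global pointer indexed by %index plus a constant 32-byte offset; the result is unused.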
define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr addrspace(1) %out, i64 %in, i64 %index) {
; CI-LABEL: atomic_dec_i64_decr64_offset:
; CI: ; %bb.0: ; %entry
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s6
; CI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; CI-NEXT: v_mov_b32_e32 v3, s1
; CI-NEXT: v_mov_b32_e32 v1, s7
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_atomic_dec_x2 v[0:1], v[2:3], s[4:7], 0 addr64 offset:32
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
; VI-LABEL: atomic_dec_i64_decr64_offset:
; VI: ; %bb.0: ; %entry
; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s6
; VI-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
; VI-NEXT: s_add_u32 s0, s4, s0
; VI-NEXT: s_addc_u32 s1, s5, s1
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v1, s7
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: atomic_dec_i64_decr64_offset:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 3
; GFX9-NEXT: s_add_u32 s0, s4, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s7
; GFX9-NEXT: s_addc_u32 s1, s5, s1
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_atomic_dec_x2 v2, v[0:1], s[0:1] offset:32
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
entry:
  %ptr = getelementptr i64, ptr addrspace(1) %out, i64 %index
  %gep = getelementptr i64, ptr addrspace(1) %ptr, i64 4
  %tmp0 = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i64 %in syncscope("agent") seq_cst