1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -amdgpu-atomic-optimizer-strategy=None -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=None -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
6 define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
7 ; SI-LABEL: atomic_add_i32_offset:
8 ; SI: ; %bb.0: ; %entry
9 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
10 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
11 ; SI-NEXT: s_mov_b32 s3, 0xf000
12 ; SI-NEXT: s_mov_b32 s2, -1
13 ; SI-NEXT: s_waitcnt lgkmcnt(0)
14 ; SI-NEXT: v_mov_b32_e32 v0, s4
15 ; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16
16 ; SI-NEXT: s_waitcnt vmcnt(0)
17 ; SI-NEXT: buffer_wbinvl1
20 ; VI-LABEL: atomic_add_i32_offset:
21 ; VI: ; %bb.0: ; %entry
22 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
23 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
24 ; VI-NEXT: s_mov_b32 s3, 0xf000
25 ; VI-NEXT: s_mov_b32 s2, -1
26 ; VI-NEXT: s_waitcnt lgkmcnt(0)
27 ; VI-NEXT: v_mov_b32_e32 v0, s4
28 ; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 offset:16
29 ; VI-NEXT: s_waitcnt vmcnt(0)
30 ; VI-NEXT: buffer_wbinvl1_vol
33 ; GFX9-LABEL: atomic_add_i32_offset:
34 ; GFX9: ; %bb.0: ; %entry
35 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
36 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
37 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
38 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
40 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:16
41 ; GFX9-NEXT: s_waitcnt vmcnt(0)
42 ; GFX9-NEXT: buffer_wbinvl1_vol
45 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
46 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
50 define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
51 ; SI-LABEL: atomic_add_i32_max_neg_offset:
52 ; SI: ; %bb.0: ; %entry
53 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
54 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
55 ; SI-NEXT: s_mov_b32 s3, 0xf000
56 ; SI-NEXT: s_mov_b32 s2, 0
57 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000
58 ; SI-NEXT: v_mov_b32_e32 v1, -1
59 ; SI-NEXT: s_waitcnt lgkmcnt(0)
60 ; SI-NEXT: v_mov_b32_e32 v2, s4
61 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64
62 ; SI-NEXT: s_waitcnt vmcnt(0)
63 ; SI-NEXT: buffer_wbinvl1
66 ; VI-LABEL: atomic_add_i32_max_neg_offset:
67 ; VI: ; %bb.0: ; %entry
68 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
69 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
70 ; VI-NEXT: s_waitcnt lgkmcnt(0)
71 ; VI-NEXT: s_add_u32 s0, s2, 0xfffff000
72 ; VI-NEXT: s_addc_u32 s1, s3, -1
73 ; VI-NEXT: v_mov_b32_e32 v0, s0
74 ; VI-NEXT: v_mov_b32_e32 v1, s1
75 ; VI-NEXT: v_mov_b32_e32 v2, s4
76 ; VI-NEXT: flat_atomic_add v[0:1], v2
77 ; VI-NEXT: s_waitcnt vmcnt(0)
78 ; VI-NEXT: buffer_wbinvl1_vol
81 ; GFX9-LABEL: atomic_add_i32_max_neg_offset:
82 ; GFX9: ; %bb.0: ; %entry
83 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
84 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
85 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
86 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
87 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
88 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-4096
89 ; GFX9-NEXT: s_waitcnt vmcnt(0)
90 ; GFX9-NEXT: buffer_wbinvl1_vol
93 %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
94 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
98 define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
99 ; SI-LABEL: atomic_add_i32_soffset:
100 ; SI: ; %bb.0: ; %entry
101 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
102 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
103 ; SI-NEXT: s_mov_b32 s3, 0xf000
104 ; SI-NEXT: s_mov_b32 s2, -1
105 ; SI-NEXT: s_mov_b32 s5, 0x8ca0
106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
107 ; SI-NEXT: v_mov_b32_e32 v0, s4
108 ; SI-NEXT: buffer_atomic_add v0, off, s[0:3], s5
109 ; SI-NEXT: s_waitcnt vmcnt(0)
110 ; SI-NEXT: buffer_wbinvl1
113 ; VI-LABEL: atomic_add_i32_soffset:
114 ; VI: ; %bb.0: ; %entry
115 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
116 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
117 ; VI-NEXT: s_mov_b32 s3, 0xf000
118 ; VI-NEXT: s_mov_b32 s2, -1
119 ; VI-NEXT: s_mov_b32 s5, 0x8ca0
120 ; VI-NEXT: s_waitcnt lgkmcnt(0)
121 ; VI-NEXT: v_mov_b32_e32 v0, s4
122 ; VI-NEXT: buffer_atomic_add v0, off, s[0:3], s5
123 ; VI-NEXT: s_waitcnt vmcnt(0)
124 ; VI-NEXT: buffer_wbinvl1_vol
127 ; GFX9-LABEL: atomic_add_i32_soffset:
128 ; GFX9: ; %bb.0: ; %entry
129 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
130 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
131 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000
132 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
133 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
134 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:3232
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: buffer_wbinvl1_vol
137 ; GFX9-NEXT: s_endpgm
139 %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
140 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
144 define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
145 ; SI-LABEL: atomic_add_i32_huge_offset:
146 ; SI: ; %bb.0: ; %entry
147 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
148 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb
149 ; SI-NEXT: s_mov_b32 s7, 0xf000
150 ; SI-NEXT: s_mov_b32 s6, 0
151 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac
152 ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd
153 ; SI-NEXT: s_waitcnt lgkmcnt(0)
154 ; SI-NEXT: v_mov_b32_e32 v2, s0
155 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64
156 ; SI-NEXT: s_waitcnt vmcnt(0)
157 ; SI-NEXT: buffer_wbinvl1
160 ; VI-LABEL: atomic_add_i32_huge_offset:
161 ; VI: ; %bb.0: ; %entry
162 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
163 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
164 ; VI-NEXT: s_waitcnt lgkmcnt(0)
165 ; VI-NEXT: s_add_u32 s0, s2, 0xdeac
166 ; VI-NEXT: s_addc_u32 s1, s3, 0xabcd
167 ; VI-NEXT: v_mov_b32_e32 v0, s0
168 ; VI-NEXT: v_mov_b32_e32 v1, s1
169 ; VI-NEXT: v_mov_b32_e32 v2, s4
170 ; VI-NEXT: flat_atomic_add v[0:1], v2
171 ; VI-NEXT: s_waitcnt vmcnt(0)
172 ; VI-NEXT: buffer_wbinvl1_vol
175 ; GFX9-LABEL: atomic_add_i32_huge_offset:
176 ; GFX9: ; %bb.0: ; %entry
177 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
178 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
179 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
180 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac
182 ; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd
183 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
184 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1]
185 ; GFX9-NEXT: s_waitcnt vmcnt(0)
186 ; GFX9-NEXT: buffer_wbinvl1_vol
187 ; GFX9-NEXT: s_endpgm
189 %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
191 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
195 define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
196 ; SI-LABEL: atomic_add_i32_ret_offset:
197 ; SI: ; %bb.0: ; %entry
198 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
199 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
200 ; SI-NEXT: s_mov_b32 s3, 0xf000
201 ; SI-NEXT: s_mov_b32 s2, -1
202 ; SI-NEXT: s_waitcnt lgkmcnt(0)
203 ; SI-NEXT: s_mov_b32 s0, s6
204 ; SI-NEXT: s_mov_b32 s1, s7
205 ; SI-NEXT: s_mov_b32 s6, s2
206 ; SI-NEXT: s_mov_b32 s7, s3
207 ; SI-NEXT: v_mov_b32_e32 v0, s8
208 ; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc
209 ; SI-NEXT: s_waitcnt vmcnt(0)
210 ; SI-NEXT: buffer_wbinvl1
211 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
214 ; VI-LABEL: atomic_add_i32_ret_offset:
215 ; VI: ; %bb.0: ; %entry
216 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
217 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
218 ; VI-NEXT: s_mov_b32 s3, 0xf000
219 ; VI-NEXT: s_mov_b32 s2, -1
220 ; VI-NEXT: s_waitcnt lgkmcnt(0)
221 ; VI-NEXT: s_mov_b32 s0, s6
222 ; VI-NEXT: s_mov_b32 s1, s7
223 ; VI-NEXT: s_mov_b32 s6, s2
224 ; VI-NEXT: s_mov_b32 s7, s3
225 ; VI-NEXT: v_mov_b32_e32 v0, s8
226 ; VI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 glc
227 ; VI-NEXT: s_waitcnt vmcnt(0)
228 ; VI-NEXT: buffer_wbinvl1_vol
229 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
232 ; GFX9-LABEL: atomic_add_i32_ret_offset:
233 ; GFX9: ; %bb.0: ; %entry
234 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
235 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
236 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
237 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
238 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
239 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] offset:16 glc
240 ; GFX9-NEXT: s_waitcnt vmcnt(0)
241 ; GFX9-NEXT: buffer_wbinvl1_vol
242 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
243 ; GFX9-NEXT: s_endpgm
245 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
246 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
247 store i32 %val, ptr addrspace(1) %out2
251 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
252 ; SI-LABEL: atomic_add_i32_addr64_offset:
253 ; SI: ; %bb.0: ; %entry
254 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
255 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
256 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
257 ; SI-NEXT: s_mov_b32 s3, 0xf000
258 ; SI-NEXT: s_waitcnt lgkmcnt(0)
259 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
260 ; SI-NEXT: s_mov_b32 s2, 0
261 ; SI-NEXT: v_mov_b32_e32 v2, s6
262 ; SI-NEXT: v_mov_b32_e32 v0, s4
263 ; SI-NEXT: v_mov_b32_e32 v1, s5
264 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64 offset:16
265 ; SI-NEXT: s_waitcnt vmcnt(0)
266 ; SI-NEXT: buffer_wbinvl1
269 ; VI-LABEL: atomic_add_i32_addr64_offset:
270 ; VI: ; %bb.0: ; %entry
271 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
272 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
273 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
274 ; VI-NEXT: s_waitcnt lgkmcnt(0)
275 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
276 ; VI-NEXT: s_add_u32 s0, s4, s0
277 ; VI-NEXT: s_addc_u32 s1, s5, s1
278 ; VI-NEXT: s_add_u32 s0, s0, 16
279 ; VI-NEXT: s_addc_u32 s1, s1, 0
280 ; VI-NEXT: v_mov_b32_e32 v0, s0
281 ; VI-NEXT: v_mov_b32_e32 v1, s1
282 ; VI-NEXT: v_mov_b32_e32 v2, s6
283 ; VI-NEXT: flat_atomic_add v[0:1], v2
284 ; VI-NEXT: s_waitcnt vmcnt(0)
285 ; VI-NEXT: buffer_wbinvl1_vol
288 ; GFX9-LABEL: atomic_add_i32_addr64_offset:
289 ; GFX9: ; %bb.0: ; %entry
290 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
291 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
292 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
293 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
294 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
295 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
296 ; GFX9-NEXT: s_add_u32 s0, s4, s0
297 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
298 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
299 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1] offset:16
300 ; GFX9-NEXT: s_waitcnt vmcnt(0)
301 ; GFX9-NEXT: buffer_wbinvl1_vol
302 ; GFX9-NEXT: s_endpgm
304 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
305 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
306 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
310 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
311 ; SI-LABEL: atomic_add_i32_ret_addr64_offset:
312 ; SI: ; %bb.0: ; %entry
313 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
314 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
315 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
316 ; SI-NEXT: s_mov_b32 s3, 0xf000
317 ; SI-NEXT: s_waitcnt lgkmcnt(0)
318 ; SI-NEXT: s_mov_b32 s0, s6
319 ; SI-NEXT: s_mov_b32 s1, s7
320 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
321 ; SI-NEXT: s_mov_b32 s6, 0
322 ; SI-NEXT: s_mov_b32 s7, s3
323 ; SI-NEXT: v_mov_b32_e32 v2, s2
324 ; SI-NEXT: v_mov_b32_e32 v0, s8
325 ; SI-NEXT: v_mov_b32_e32 v1, s9
326 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
327 ; SI-NEXT: s_waitcnt vmcnt(0)
328 ; SI-NEXT: buffer_wbinvl1
329 ; SI-NEXT: s_mov_b32 s2, -1
330 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
333 ; VI-LABEL: atomic_add_i32_ret_addr64_offset:
334 ; VI: ; %bb.0: ; %entry
335 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
336 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
337 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
338 ; VI-NEXT: s_waitcnt lgkmcnt(0)
339 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
340 ; VI-NEXT: s_add_u32 s0, s4, s0
341 ; VI-NEXT: s_addc_u32 s1, s5, s1
342 ; VI-NEXT: s_add_u32 s0, s0, 16
343 ; VI-NEXT: s_addc_u32 s1, s1, 0
344 ; VI-NEXT: v_mov_b32_e32 v0, s0
345 ; VI-NEXT: v_mov_b32_e32 v1, s1
346 ; VI-NEXT: v_mov_b32_e32 v2, s8
347 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
348 ; VI-NEXT: s_waitcnt vmcnt(0)
349 ; VI-NEXT: buffer_wbinvl1_vol
350 ; VI-NEXT: s_mov_b32 s3, 0xf000
351 ; VI-NEXT: s_mov_b32 s2, -1
352 ; VI-NEXT: s_mov_b32 s0, s6
353 ; VI-NEXT: s_mov_b32 s1, s7
354 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
357 ; GFX9-LABEL: atomic_add_i32_ret_addr64_offset:
358 ; GFX9: ; %bb.0: ; %entry
359 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
360 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
361 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
362 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
363 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
364 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
365 ; GFX9-NEXT: s_add_u32 s0, s4, s0
366 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
367 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
368 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[0:1] offset:16 glc
369 ; GFX9-NEXT: s_waitcnt vmcnt(0)
370 ; GFX9-NEXT: buffer_wbinvl1_vol
371 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
372 ; GFX9-NEXT: s_endpgm
374 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
375 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
376 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
377 store i32 %val, ptr addrspace(1) %out2
381 define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
382 ; SI-LABEL: atomic_add_i32:
383 ; SI: ; %bb.0: ; %entry
384 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
385 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
386 ; SI-NEXT: s_mov_b32 s3, 0xf000
387 ; SI-NEXT: s_mov_b32 s2, -1
388 ; SI-NEXT: s_waitcnt lgkmcnt(0)
389 ; SI-NEXT: v_mov_b32_e32 v0, s4
390 ; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0
391 ; SI-NEXT: s_waitcnt vmcnt(0)
392 ; SI-NEXT: buffer_wbinvl1
395 ; VI-LABEL: atomic_add_i32:
396 ; VI: ; %bb.0: ; %entry
397 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
398 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
399 ; VI-NEXT: s_mov_b32 s3, 0xf000
400 ; VI-NEXT: s_mov_b32 s2, -1
401 ; VI-NEXT: s_waitcnt lgkmcnt(0)
402 ; VI-NEXT: v_mov_b32_e32 v0, s4
403 ; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0
404 ; VI-NEXT: s_waitcnt vmcnt(0)
405 ; VI-NEXT: buffer_wbinvl1_vol
408 ; GFX9-LABEL: atomic_add_i32:
409 ; GFX9: ; %bb.0: ; %entry
410 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
411 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
412 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
413 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
414 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
415 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3]
416 ; GFX9-NEXT: s_waitcnt vmcnt(0)
417 ; GFX9-NEXT: buffer_wbinvl1_vol
418 ; GFX9-NEXT: s_endpgm
420 %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
424 define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
425 ; SI-LABEL: atomic_add_i32_ret:
426 ; SI: ; %bb.0: ; %entry
427 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
428 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
429 ; SI-NEXT: s_mov_b32 s3, 0xf000
430 ; SI-NEXT: s_mov_b32 s2, -1
431 ; SI-NEXT: s_waitcnt lgkmcnt(0)
432 ; SI-NEXT: s_mov_b32 s0, s4
433 ; SI-NEXT: s_mov_b32 s1, s5
434 ; SI-NEXT: v_mov_b32_e32 v0, s8
435 ; SI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc
436 ; SI-NEXT: s_waitcnt vmcnt(0)
437 ; SI-NEXT: buffer_wbinvl1
438 ; SI-NEXT: s_mov_b32 s0, s6
439 ; SI-NEXT: s_mov_b32 s1, s7
440 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
443 ; VI-LABEL: atomic_add_i32_ret:
444 ; VI: ; %bb.0: ; %entry
445 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
446 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
447 ; VI-NEXT: s_mov_b32 s3, 0xf000
448 ; VI-NEXT: s_mov_b32 s2, -1
449 ; VI-NEXT: s_waitcnt lgkmcnt(0)
450 ; VI-NEXT: s_mov_b32 s0, s4
451 ; VI-NEXT: s_mov_b32 s1, s5
452 ; VI-NEXT: v_mov_b32_e32 v0, s8
453 ; VI-NEXT: buffer_atomic_add v0, off, s[0:3], 0 glc
454 ; VI-NEXT: s_waitcnt vmcnt(0)
455 ; VI-NEXT: buffer_wbinvl1_vol
456 ; VI-NEXT: s_mov_b32 s0, s6
457 ; VI-NEXT: s_mov_b32 s1, s7
458 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
461 ; GFX9-LABEL: atomic_add_i32_ret:
462 ; GFX9: ; %bb.0: ; %entry
463 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
464 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
465 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
466 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
468 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[4:5] glc
469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
470 ; GFX9-NEXT: buffer_wbinvl1_vol
471 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
472 ; GFX9-NEXT: s_endpgm
474 %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
475 store i32 %val, ptr addrspace(1) %out2
479 define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
480 ; SI-LABEL: atomic_add_i32_addr64:
481 ; SI: ; %bb.0: ; %entry
482 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
483 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
484 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
485 ; SI-NEXT: s_mov_b32 s3, 0xf000
486 ; SI-NEXT: s_waitcnt lgkmcnt(0)
487 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
488 ; SI-NEXT: s_mov_b32 s2, 0
489 ; SI-NEXT: v_mov_b32_e32 v2, s6
490 ; SI-NEXT: v_mov_b32_e32 v0, s4
491 ; SI-NEXT: v_mov_b32_e32 v1, s5
492 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[0:3], 0 addr64
493 ; SI-NEXT: s_waitcnt vmcnt(0)
494 ; SI-NEXT: buffer_wbinvl1
497 ; VI-LABEL: atomic_add_i32_addr64:
498 ; VI: ; %bb.0: ; %entry
499 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
500 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
501 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
502 ; VI-NEXT: s_waitcnt lgkmcnt(0)
503 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
504 ; VI-NEXT: s_add_u32 s0, s4, s0
505 ; VI-NEXT: s_addc_u32 s1, s5, s1
506 ; VI-NEXT: v_mov_b32_e32 v0, s0
507 ; VI-NEXT: v_mov_b32_e32 v1, s1
508 ; VI-NEXT: v_mov_b32_e32 v2, s6
509 ; VI-NEXT: flat_atomic_add v[0:1], v2
510 ; VI-NEXT: s_waitcnt vmcnt(0)
511 ; VI-NEXT: buffer_wbinvl1_vol
514 ; GFX9-LABEL: atomic_add_i32_addr64:
515 ; GFX9: ; %bb.0: ; %entry
516 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
517 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
518 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
519 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
520 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
522 ; GFX9-NEXT: s_add_u32 s0, s4, s0
523 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
524 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
525 ; GFX9-NEXT: global_atomic_add v0, v1, s[0:1]
526 ; GFX9-NEXT: s_waitcnt vmcnt(0)
527 ; GFX9-NEXT: buffer_wbinvl1_vol
528 ; GFX9-NEXT: s_endpgm
530 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
531 %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
535 define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
536 ; SI-LABEL: atomic_add_i32_ret_addr64:
537 ; SI: ; %bb.0: ; %entry
538 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
539 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
540 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
541 ; SI-NEXT: s_mov_b32 s3, 0xf000
542 ; SI-NEXT: s_waitcnt lgkmcnt(0)
543 ; SI-NEXT: s_mov_b32 s0, s6
544 ; SI-NEXT: s_mov_b32 s1, s7
545 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
546 ; SI-NEXT: s_mov_b32 s6, 0
547 ; SI-NEXT: s_mov_b32 s7, s3
548 ; SI-NEXT: v_mov_b32_e32 v2, s2
549 ; SI-NEXT: v_mov_b32_e32 v0, s8
550 ; SI-NEXT: v_mov_b32_e32 v1, s9
551 ; SI-NEXT: buffer_atomic_add v2, v[0:1], s[4:7], 0 addr64 glc
552 ; SI-NEXT: s_waitcnt vmcnt(0)
553 ; SI-NEXT: buffer_wbinvl1
554 ; SI-NEXT: s_mov_b32 s2, -1
555 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
558 ; VI-LABEL: atomic_add_i32_ret_addr64:
559 ; VI: ; %bb.0: ; %entry
560 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
561 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
562 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
563 ; VI-NEXT: s_waitcnt lgkmcnt(0)
564 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
565 ; VI-NEXT: s_add_u32 s0, s4, s0
566 ; VI-NEXT: s_addc_u32 s1, s5, s1
567 ; VI-NEXT: v_mov_b32_e32 v0, s0
568 ; VI-NEXT: v_mov_b32_e32 v1, s1
569 ; VI-NEXT: v_mov_b32_e32 v2, s8
570 ; VI-NEXT: flat_atomic_add v0, v[0:1], v2 glc
571 ; VI-NEXT: s_waitcnt vmcnt(0)
572 ; VI-NEXT: buffer_wbinvl1_vol
573 ; VI-NEXT: s_mov_b32 s3, 0xf000
574 ; VI-NEXT: s_mov_b32 s2, -1
575 ; VI-NEXT: s_mov_b32 s0, s6
576 ; VI-NEXT: s_mov_b32 s1, s7
577 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
580 ; GFX9-LABEL: atomic_add_i32_ret_addr64:
581 ; GFX9: ; %bb.0: ; %entry
582 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
583 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
584 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
585 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
586 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
587 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
588 ; GFX9-NEXT: s_add_u32 s0, s4, s0
589 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
590 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
591 ; GFX9-NEXT: global_atomic_add v1, v0, v1, s[0:1] glc
592 ; GFX9-NEXT: s_waitcnt vmcnt(0)
593 ; GFX9-NEXT: buffer_wbinvl1_vol
594 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
595 ; GFX9-NEXT: s_endpgm
597 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
598 %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
599 store i32 %val, ptr addrspace(1) %out2
603 define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
604 ; SI-LABEL: atomic_and_i32_offset:
605 ; SI: ; %bb.0: ; %entry
606 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
607 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
608 ; SI-NEXT: s_mov_b32 s3, 0xf000
609 ; SI-NEXT: s_mov_b32 s2, -1
610 ; SI-NEXT: s_waitcnt lgkmcnt(0)
611 ; SI-NEXT: v_mov_b32_e32 v0, s4
612 ; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16
613 ; SI-NEXT: s_waitcnt vmcnt(0)
614 ; SI-NEXT: buffer_wbinvl1
617 ; VI-LABEL: atomic_and_i32_offset:
618 ; VI: ; %bb.0: ; %entry
619 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
620 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
621 ; VI-NEXT: s_mov_b32 s3, 0xf000
622 ; VI-NEXT: s_mov_b32 s2, -1
623 ; VI-NEXT: s_waitcnt lgkmcnt(0)
624 ; VI-NEXT: v_mov_b32_e32 v0, s4
625 ; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 offset:16
626 ; VI-NEXT: s_waitcnt vmcnt(0)
627 ; VI-NEXT: buffer_wbinvl1_vol
630 ; GFX9-LABEL: atomic_and_i32_offset:
631 ; GFX9: ; %bb.0: ; %entry
632 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
633 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
634 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
635 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
636 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
637 ; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:16
638 ; GFX9-NEXT: s_waitcnt vmcnt(0)
639 ; GFX9-NEXT: buffer_wbinvl1_vol
640 ; GFX9-NEXT: s_endpgm
642 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
643 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
647 define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
648 ; SI-LABEL: atomic_and_i32_ret_offset:
649 ; SI: ; %bb.0: ; %entry
650 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
651 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
652 ; SI-NEXT: s_mov_b32 s3, 0xf000
653 ; SI-NEXT: s_mov_b32 s2, -1
654 ; SI-NEXT: s_waitcnt lgkmcnt(0)
655 ; SI-NEXT: s_mov_b32 s0, s6
656 ; SI-NEXT: s_mov_b32 s1, s7
657 ; SI-NEXT: s_mov_b32 s6, s2
658 ; SI-NEXT: s_mov_b32 s7, s3
659 ; SI-NEXT: v_mov_b32_e32 v0, s8
660 ; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc
661 ; SI-NEXT: s_waitcnt vmcnt(0)
662 ; SI-NEXT: buffer_wbinvl1
663 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
666 ; VI-LABEL: atomic_and_i32_ret_offset:
667 ; VI: ; %bb.0: ; %entry
668 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
669 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
670 ; VI-NEXT: s_mov_b32 s3, 0xf000
671 ; VI-NEXT: s_mov_b32 s2, -1
672 ; VI-NEXT: s_waitcnt lgkmcnt(0)
673 ; VI-NEXT: s_mov_b32 s0, s6
674 ; VI-NEXT: s_mov_b32 s1, s7
675 ; VI-NEXT: s_mov_b32 s6, s2
676 ; VI-NEXT: s_mov_b32 s7, s3
677 ; VI-NEXT: v_mov_b32_e32 v0, s8
678 ; VI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 glc
679 ; VI-NEXT: s_waitcnt vmcnt(0)
680 ; VI-NEXT: buffer_wbinvl1_vol
681 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
684 ; GFX9-LABEL: atomic_and_i32_ret_offset:
685 ; GFX9: ; %bb.0: ; %entry
686 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
687 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
688 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
689 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
690 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
691 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] offset:16 glc
692 ; GFX9-NEXT: s_waitcnt vmcnt(0)
693 ; GFX9-NEXT: buffer_wbinvl1_vol
694 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
695 ; GFX9-NEXT: s_endpgm
697 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
698 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
699 store i32 %val, ptr addrspace(1) %out2
703 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
704 ; SI-LABEL: atomic_and_i32_addr64_offset:
705 ; SI: ; %bb.0: ; %entry
706 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
707 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
708 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
709 ; SI-NEXT: s_mov_b32 s3, 0xf000
710 ; SI-NEXT: s_waitcnt lgkmcnt(0)
711 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
712 ; SI-NEXT: s_mov_b32 s2, 0
713 ; SI-NEXT: v_mov_b32_e32 v2, s6
714 ; SI-NEXT: v_mov_b32_e32 v0, s4
715 ; SI-NEXT: v_mov_b32_e32 v1, s5
716 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64 offset:16
717 ; SI-NEXT: s_waitcnt vmcnt(0)
718 ; SI-NEXT: buffer_wbinvl1
721 ; VI-LABEL: atomic_and_i32_addr64_offset:
722 ; VI: ; %bb.0: ; %entry
723 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
724 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
725 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
726 ; VI-NEXT: s_waitcnt lgkmcnt(0)
727 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
728 ; VI-NEXT: s_add_u32 s0, s4, s0
729 ; VI-NEXT: s_addc_u32 s1, s5, s1
730 ; VI-NEXT: s_add_u32 s0, s0, 16
731 ; VI-NEXT: s_addc_u32 s1, s1, 0
732 ; VI-NEXT: v_mov_b32_e32 v0, s0
733 ; VI-NEXT: v_mov_b32_e32 v1, s1
734 ; VI-NEXT: v_mov_b32_e32 v2, s6
735 ; VI-NEXT: flat_atomic_and v[0:1], v2
736 ; VI-NEXT: s_waitcnt vmcnt(0)
737 ; VI-NEXT: buffer_wbinvl1_vol
740 ; GFX9-LABEL: atomic_and_i32_addr64_offset:
741 ; GFX9: ; %bb.0: ; %entry
742 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
743 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
744 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
745 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
746 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
747 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
748 ; GFX9-NEXT: s_add_u32 s0, s4, s0
749 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
750 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
751 ; GFX9-NEXT: global_atomic_and v0, v1, s[0:1] offset:16
752 ; GFX9-NEXT: s_waitcnt vmcnt(0)
753 ; GFX9-NEXT: buffer_wbinvl1_vol
754 ; GFX9-NEXT: s_endpgm
756 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
757 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
758 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
762 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
763 ; SI-LABEL: atomic_and_i32_ret_addr64_offset:
764 ; SI: ; %bb.0: ; %entry
765 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
766 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
767 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
768 ; SI-NEXT: s_mov_b32 s3, 0xf000
769 ; SI-NEXT: s_waitcnt lgkmcnt(0)
770 ; SI-NEXT: s_mov_b32 s0, s6
771 ; SI-NEXT: s_mov_b32 s1, s7
772 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
773 ; SI-NEXT: s_mov_b32 s6, 0
774 ; SI-NEXT: s_mov_b32 s7, s3
775 ; SI-NEXT: v_mov_b32_e32 v2, s2
776 ; SI-NEXT: v_mov_b32_e32 v0, s8
777 ; SI-NEXT: v_mov_b32_e32 v1, s9
778 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
779 ; SI-NEXT: s_waitcnt vmcnt(0)
780 ; SI-NEXT: buffer_wbinvl1
781 ; SI-NEXT: s_mov_b32 s2, -1
782 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
785 ; VI-LABEL: atomic_and_i32_ret_addr64_offset:
786 ; VI: ; %bb.0: ; %entry
787 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
788 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
789 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
790 ; VI-NEXT: s_waitcnt lgkmcnt(0)
791 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
792 ; VI-NEXT: s_add_u32 s0, s4, s0
793 ; VI-NEXT: s_addc_u32 s1, s5, s1
794 ; VI-NEXT: s_add_u32 s0, s0, 16
795 ; VI-NEXT: s_addc_u32 s1, s1, 0
796 ; VI-NEXT: v_mov_b32_e32 v0, s0
797 ; VI-NEXT: v_mov_b32_e32 v1, s1
798 ; VI-NEXT: v_mov_b32_e32 v2, s8
799 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
800 ; VI-NEXT: s_waitcnt vmcnt(0)
801 ; VI-NEXT: buffer_wbinvl1_vol
802 ; VI-NEXT: s_mov_b32 s3, 0xf000
803 ; VI-NEXT: s_mov_b32 s2, -1
804 ; VI-NEXT: s_mov_b32 s0, s6
805 ; VI-NEXT: s_mov_b32 s1, s7
806 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
809 ; GFX9-LABEL: atomic_and_i32_ret_addr64_offset:
810 ; GFX9: ; %bb.0: ; %entry
811 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
812 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
813 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
814 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
815 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
816 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
817 ; GFX9-NEXT: s_add_u32 s0, s4, s0
818 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
819 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
820 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[0:1] offset:16 glc
821 ; GFX9-NEXT: s_waitcnt vmcnt(0)
822 ; GFX9-NEXT: buffer_wbinvl1_vol
823 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
824 ; GFX9-NEXT: s_endpgm
826 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
827 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
828 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
829 store i32 %val, ptr addrspace(1) %out2
833 define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
834 ; SI-LABEL: atomic_and_i32:
835 ; SI: ; %bb.0: ; %entry
836 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
837 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
838 ; SI-NEXT: s_mov_b32 s3, 0xf000
839 ; SI-NEXT: s_mov_b32 s2, -1
840 ; SI-NEXT: s_waitcnt lgkmcnt(0)
841 ; SI-NEXT: v_mov_b32_e32 v0, s4
842 ; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0
843 ; SI-NEXT: s_waitcnt vmcnt(0)
844 ; SI-NEXT: buffer_wbinvl1
847 ; VI-LABEL: atomic_and_i32:
848 ; VI: ; %bb.0: ; %entry
849 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
850 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
851 ; VI-NEXT: s_mov_b32 s3, 0xf000
852 ; VI-NEXT: s_mov_b32 s2, -1
853 ; VI-NEXT: s_waitcnt lgkmcnt(0)
854 ; VI-NEXT: v_mov_b32_e32 v0, s4
855 ; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0
856 ; VI-NEXT: s_waitcnt vmcnt(0)
857 ; VI-NEXT: buffer_wbinvl1_vol
860 ; GFX9-LABEL: atomic_and_i32:
861 ; GFX9: ; %bb.0: ; %entry
862 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
863 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
864 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
865 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
866 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
867 ; GFX9-NEXT: global_atomic_and v0, v1, s[2:3]
868 ; GFX9-NEXT: s_waitcnt vmcnt(0)
869 ; GFX9-NEXT: buffer_wbinvl1_vol
870 ; GFX9-NEXT: s_endpgm
872 %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
; 32-bit atomic AND whose old value is used (stored to %out2): the atomic must
; carry glc (SI/VI buffer_atomic_and ... glc; GFX9 global_atomic_and ... glc)
; so the pre-op value is returned, then stored to the second pointer.
876 define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
877 ; SI-LABEL: atomic_and_i32_ret:
878 ; SI: ; %bb.0: ; %entry
879 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
880 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
881 ; SI-NEXT: s_mov_b32 s3, 0xf000
882 ; SI-NEXT: s_mov_b32 s2, -1
883 ; SI-NEXT: s_waitcnt lgkmcnt(0)
884 ; SI-NEXT: s_mov_b32 s0, s4
885 ; SI-NEXT: s_mov_b32 s1, s5
886 ; SI-NEXT: v_mov_b32_e32 v0, s8
887 ; SI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 glc
888 ; SI-NEXT: s_waitcnt vmcnt(0)
889 ; SI-NEXT: buffer_wbinvl1
890 ; SI-NEXT: s_mov_b32 s0, s6
891 ; SI-NEXT: s_mov_b32 s1, s7
892 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
895 ; VI-LABEL: atomic_and_i32_ret:
896 ; VI: ; %bb.0: ; %entry
897 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
898 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
899 ; VI-NEXT: s_mov_b32 s3, 0xf000
900 ; VI-NEXT: s_mov_b32 s2, -1
901 ; VI-NEXT: s_waitcnt lgkmcnt(0)
902 ; VI-NEXT: s_mov_b32 s0, s4
903 ; VI-NEXT: s_mov_b32 s1, s5
904 ; VI-NEXT: v_mov_b32_e32 v0, s8
905 ; VI-NEXT: buffer_atomic_and v0, off, s[0:3], 0 glc
906 ; VI-NEXT: s_waitcnt vmcnt(0)
907 ; VI-NEXT: buffer_wbinvl1_vol
908 ; VI-NEXT: s_mov_b32 s0, s6
909 ; VI-NEXT: s_mov_b32 s1, s7
910 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
913 ; GFX9-LABEL: atomic_and_i32_ret:
914 ; GFX9: ; %bb.0: ; %entry
915 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
916 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
917 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
918 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
919 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
920 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[4:5] glc
921 ; GFX9-NEXT: s_waitcnt vmcnt(0)
922 ; GFX9-NEXT: buffer_wbinvl1_vol
923 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
924 ; GFX9-NEXT: s_endpgm
926 %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
927 store i32 %val, ptr addrspace(1) %out2
; Atomic AND at a 64-bit dynamic index: %index is scaled by 4 (s_lshl_b64 ..., 2).
; SI folds the index into buffer addressing (addr64 mode); VI/GFX9 do the 64-bit
; add in scalar regs (s_add_u32/s_addc_u32) and use flat/global atomics.
931 define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
932 ; SI-LABEL: atomic_and_i32_addr64:
933 ; SI: ; %bb.0: ; %entry
934 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
935 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
936 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
937 ; SI-NEXT: s_mov_b32 s3, 0xf000
938 ; SI-NEXT: s_waitcnt lgkmcnt(0)
939 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
940 ; SI-NEXT: s_mov_b32 s2, 0
941 ; SI-NEXT: v_mov_b32_e32 v2, s6
942 ; SI-NEXT: v_mov_b32_e32 v0, s4
943 ; SI-NEXT: v_mov_b32_e32 v1, s5
944 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[0:3], 0 addr64
945 ; SI-NEXT: s_waitcnt vmcnt(0)
946 ; SI-NEXT: buffer_wbinvl1
949 ; VI-LABEL: atomic_and_i32_addr64:
950 ; VI: ; %bb.0: ; %entry
951 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
952 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
953 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
954 ; VI-NEXT: s_waitcnt lgkmcnt(0)
955 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
956 ; VI-NEXT: s_add_u32 s0, s4, s0
957 ; VI-NEXT: s_addc_u32 s1, s5, s1
958 ; VI-NEXT: v_mov_b32_e32 v0, s0
959 ; VI-NEXT: v_mov_b32_e32 v1, s1
960 ; VI-NEXT: v_mov_b32_e32 v2, s6
961 ; VI-NEXT: flat_atomic_and v[0:1], v2
962 ; VI-NEXT: s_waitcnt vmcnt(0)
963 ; VI-NEXT: buffer_wbinvl1_vol
966 ; GFX9-LABEL: atomic_and_i32_addr64:
967 ; GFX9: ; %bb.0: ; %entry
968 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
969 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
970 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
971 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
972 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
973 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
974 ; GFX9-NEXT: s_add_u32 s0, s4, s0
975 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
976 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
977 ; GFX9-NEXT: global_atomic_and v0, v1, s[0:1]
978 ; GFX9-NEXT: s_waitcnt vmcnt(0)
979 ; GFX9-NEXT: buffer_wbinvl1_vol
980 ; GFX9-NEXT: s_endpgm
982 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
983 %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
; Atomic AND at a 64-bit dynamic index with the old value used: combines the
; addr64 indexing pattern (shift-by-2 + 64-bit add, or SI addr64 mode) with a
; glc atomic, then stores the returned pre-op value to %out2.
987 define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
988 ; SI-LABEL: atomic_and_i32_ret_addr64:
989 ; SI: ; %bb.0: ; %entry
990 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
991 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
992 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
993 ; SI-NEXT: s_mov_b32 s3, 0xf000
994 ; SI-NEXT: s_waitcnt lgkmcnt(0)
995 ; SI-NEXT: s_mov_b32 s0, s6
996 ; SI-NEXT: s_mov_b32 s1, s7
997 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
998 ; SI-NEXT: s_mov_b32 s6, 0
999 ; SI-NEXT: s_mov_b32 s7, s3
1000 ; SI-NEXT: v_mov_b32_e32 v2, s2
1001 ; SI-NEXT: v_mov_b32_e32 v0, s8
1002 ; SI-NEXT: v_mov_b32_e32 v1, s9
1003 ; SI-NEXT: buffer_atomic_and v2, v[0:1], s[4:7], 0 addr64 glc
1004 ; SI-NEXT: s_waitcnt vmcnt(0)
1005 ; SI-NEXT: buffer_wbinvl1
1006 ; SI-NEXT: s_mov_b32 s2, -1
1007 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
1010 ; VI-LABEL: atomic_and_i32_ret_addr64:
1011 ; VI: ; %bb.0: ; %entry
1012 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1013 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1014 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1015 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1016 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1017 ; VI-NEXT: s_add_u32 s0, s4, s0
1018 ; VI-NEXT: s_addc_u32 s1, s5, s1
1019 ; VI-NEXT: v_mov_b32_e32 v0, s0
1020 ; VI-NEXT: v_mov_b32_e32 v1, s1
1021 ; VI-NEXT: v_mov_b32_e32 v2, s8
1022 ; VI-NEXT: flat_atomic_and v0, v[0:1], v2 glc
1023 ; VI-NEXT: s_waitcnt vmcnt(0)
1024 ; VI-NEXT: buffer_wbinvl1_vol
1025 ; VI-NEXT: s_mov_b32 s3, 0xf000
1026 ; VI-NEXT: s_mov_b32 s2, -1
1027 ; VI-NEXT: s_mov_b32 s0, s6
1028 ; VI-NEXT: s_mov_b32 s1, s7
1029 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1032 ; GFX9-LABEL: atomic_and_i32_ret_addr64:
1033 ; GFX9: ; %bb.0: ; %entry
1034 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1035 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1036 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
1037 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1038 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1039 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1040 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1041 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1042 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
1043 ; GFX9-NEXT: global_atomic_and v1, v0, v1, s[0:1] glc
1044 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1045 ; GFX9-NEXT: buffer_wbinvl1_vol
1046 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1047 ; GFX9-NEXT: s_endpgm
1049 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1050 %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
1051 store i32 %val, ptr addrspace(1) %out2
; Atomic SUB with a constant GEP of 4 x i32: the 16-byte displacement should be
; folded into the instruction's immediate (offset:16) on all three targets
; rather than materialized with an add.
1055 define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
1056 ; SI-LABEL: atomic_sub_i32_offset:
1057 ; SI: ; %bb.0: ; %entry
1058 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1059 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1060 ; SI-NEXT: s_mov_b32 s3, 0xf000
1061 ; SI-NEXT: s_mov_b32 s2, -1
1062 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1063 ; SI-NEXT: v_mov_b32_e32 v0, s4
1064 ; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16
1065 ; SI-NEXT: s_waitcnt vmcnt(0)
1066 ; SI-NEXT: buffer_wbinvl1
1069 ; VI-LABEL: atomic_sub_i32_offset:
1070 ; VI: ; %bb.0: ; %entry
1071 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1072 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1073 ; VI-NEXT: s_mov_b32 s3, 0xf000
1074 ; VI-NEXT: s_mov_b32 s2, -1
1075 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1076 ; VI-NEXT: v_mov_b32_e32 v0, s4
1077 ; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 offset:16
1078 ; VI-NEXT: s_waitcnt vmcnt(0)
1079 ; VI-NEXT: buffer_wbinvl1_vol
1082 ; GFX9-LABEL: atomic_sub_i32_offset:
1083 ; GFX9: ; %bb.0: ; %entry
1084 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1085 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1086 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1087 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1088 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1089 ; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:16
1090 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1091 ; GFX9-NEXT: buffer_wbinvl1_vol
1092 ; GFX9-NEXT: s_endpgm
1094 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1095 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Atomic SUB with folded offset:16 and a used return value: expects glc on the
; atomic and a subsequent store of the old value to %out2.
1099 define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1100 ; SI-LABEL: atomic_sub_i32_ret_offset:
1101 ; SI: ; %bb.0: ; %entry
1102 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1103 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
1104 ; SI-NEXT: s_mov_b32 s3, 0xf000
1105 ; SI-NEXT: s_mov_b32 s2, -1
1106 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1107 ; SI-NEXT: s_mov_b32 s0, s6
1108 ; SI-NEXT: s_mov_b32 s1, s7
1109 ; SI-NEXT: s_mov_b32 s6, s2
1110 ; SI-NEXT: s_mov_b32 s7, s3
1111 ; SI-NEXT: v_mov_b32_e32 v0, s8
1112 ; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc
1113 ; SI-NEXT: s_waitcnt vmcnt(0)
1114 ; SI-NEXT: buffer_wbinvl1
1115 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1118 ; VI-LABEL: atomic_sub_i32_ret_offset:
1119 ; VI: ; %bb.0: ; %entry
1120 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1121 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1122 ; VI-NEXT: s_mov_b32 s3, 0xf000
1123 ; VI-NEXT: s_mov_b32 s2, -1
1124 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1125 ; VI-NEXT: s_mov_b32 s0, s6
1126 ; VI-NEXT: s_mov_b32 s1, s7
1127 ; VI-NEXT: s_mov_b32 s6, s2
1128 ; VI-NEXT: s_mov_b32 s7, s3
1129 ; VI-NEXT: v_mov_b32_e32 v0, s8
1130 ; VI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 glc
1131 ; VI-NEXT: s_waitcnt vmcnt(0)
1132 ; VI-NEXT: buffer_wbinvl1_vol
1133 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1136 ; GFX9-LABEL: atomic_sub_i32_ret_offset:
1137 ; GFX9: ; %bb.0: ; %entry
1138 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
1139 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1140 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1141 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1142 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1143 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] offset:16 glc
1144 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1145 ; GFX9-NEXT: buffer_wbinvl1_vol
1146 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1147 ; GFX9-NEXT: s_endpgm
1149 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1150 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
1151 store i32 %val, ptr addrspace(1) %out2
; Atomic SUB at dynamic index + constant offset. Note the target difference:
; SI and GFX9 fold the +16 into the instruction (addr64 offset:16 / offset:16),
; while VI flat atomics have no offset field, so the +16 is added in scalar
; regs (s_add_u32 s0, s0, 16 / s_addc_u32).
1155 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
1156 ; SI-LABEL: atomic_sub_i32_addr64_offset:
1157 ; SI: ; %bb.0: ; %entry
1158 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1159 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
1160 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1161 ; SI-NEXT: s_mov_b32 s3, 0xf000
1162 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1163 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
1164 ; SI-NEXT: s_mov_b32 s2, 0
1165 ; SI-NEXT: v_mov_b32_e32 v2, s6
1166 ; SI-NEXT: v_mov_b32_e32 v0, s4
1167 ; SI-NEXT: v_mov_b32_e32 v1, s5
1168 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64 offset:16
1169 ; SI-NEXT: s_waitcnt vmcnt(0)
1170 ; SI-NEXT: buffer_wbinvl1
1173 ; VI-LABEL: atomic_sub_i32_addr64_offset:
1174 ; VI: ; %bb.0: ; %entry
1175 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1176 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1177 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
1178 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1179 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1180 ; VI-NEXT: s_add_u32 s0, s4, s0
1181 ; VI-NEXT: s_addc_u32 s1, s5, s1
1182 ; VI-NEXT: s_add_u32 s0, s0, 16
1183 ; VI-NEXT: s_addc_u32 s1, s1, 0
1184 ; VI-NEXT: v_mov_b32_e32 v0, s0
1185 ; VI-NEXT: v_mov_b32_e32 v1, s1
1186 ; VI-NEXT: v_mov_b32_e32 v2, s6
1187 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1188 ; VI-NEXT: s_waitcnt vmcnt(0)
1189 ; VI-NEXT: buffer_wbinvl1_vol
1192 ; GFX9-LABEL: atomic_sub_i32_addr64_offset:
1193 ; GFX9: ; %bb.0: ; %entry
1194 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1195 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1196 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
1197 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1198 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1199 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1200 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1201 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1202 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1203 ; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1] offset:16
1204 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1205 ; GFX9-NEXT: buffer_wbinvl1_vol
1206 ; GFX9-NEXT: s_endpgm
1208 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1209 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1210 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Atomic SUB at dynamic index + constant offset, result used: glc atomic plus a
; store of the old value. VI again materializes the +16 with a 64-bit scalar
; add since flat atomics take no immediate offset.
1214 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1215 ; SI-LABEL: atomic_sub_i32_ret_addr64_offset:
1216 ; SI: ; %bb.0: ; %entry
1217 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1218 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
1219 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
1220 ; SI-NEXT: s_mov_b32 s3, 0xf000
1221 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1222 ; SI-NEXT: s_mov_b32 s0, s6
1223 ; SI-NEXT: s_mov_b32 s1, s7
1224 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
1225 ; SI-NEXT: s_mov_b32 s6, 0
1226 ; SI-NEXT: s_mov_b32 s7, s3
1227 ; SI-NEXT: v_mov_b32_e32 v2, s2
1228 ; SI-NEXT: v_mov_b32_e32 v0, s8
1229 ; SI-NEXT: v_mov_b32_e32 v1, s9
1230 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1231 ; SI-NEXT: s_waitcnt vmcnt(0)
1232 ; SI-NEXT: buffer_wbinvl1
1233 ; SI-NEXT: s_mov_b32 s2, -1
1234 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
1237 ; VI-LABEL: atomic_sub_i32_ret_addr64_offset:
1238 ; VI: ; %bb.0: ; %entry
1239 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1240 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1241 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1242 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1243 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1244 ; VI-NEXT: s_add_u32 s0, s4, s0
1245 ; VI-NEXT: s_addc_u32 s1, s5, s1
1246 ; VI-NEXT: s_add_u32 s0, s0, 16
1247 ; VI-NEXT: s_addc_u32 s1, s1, 0
1248 ; VI-NEXT: v_mov_b32_e32 v0, s0
1249 ; VI-NEXT: v_mov_b32_e32 v1, s1
1250 ; VI-NEXT: v_mov_b32_e32 v2, s8
1251 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1252 ; VI-NEXT: s_waitcnt vmcnt(0)
1253 ; VI-NEXT: buffer_wbinvl1_vol
1254 ; VI-NEXT: s_mov_b32 s3, 0xf000
1255 ; VI-NEXT: s_mov_b32 s2, -1
1256 ; VI-NEXT: s_mov_b32 s0, s6
1257 ; VI-NEXT: s_mov_b32 s1, s7
1258 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1261 ; GFX9-LABEL: atomic_sub_i32_ret_addr64_offset:
1262 ; GFX9: ; %bb.0: ; %entry
1263 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1264 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1265 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
1266 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1267 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1268 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1269 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1270 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1271 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
1272 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[0:1] offset:16 glc
1273 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1274 ; GFX9-NEXT: buffer_wbinvl1_vol
1275 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1276 ; GFX9-NEXT: s_endpgm
1278 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1279 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1280 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
1281 store i32 %val, ptr addrspace(1) %out2
; 32-bit atomic SUB, result unused, no offset: baseline selection of
; buffer_atomic_sub (SI/VI) / global_atomic_sub (GFX9) without glc.
1285 define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
1286 ; SI-LABEL: atomic_sub_i32:
1287 ; SI: ; %bb.0: ; %entry
1288 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1289 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1290 ; SI-NEXT: s_mov_b32 s3, 0xf000
1291 ; SI-NEXT: s_mov_b32 s2, -1
1292 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1293 ; SI-NEXT: v_mov_b32_e32 v0, s4
1294 ; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0
1295 ; SI-NEXT: s_waitcnt vmcnt(0)
1296 ; SI-NEXT: buffer_wbinvl1
1299 ; VI-LABEL: atomic_sub_i32:
1300 ; VI: ; %bb.0: ; %entry
1301 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1302 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1303 ; VI-NEXT: s_mov_b32 s3, 0xf000
1304 ; VI-NEXT: s_mov_b32 s2, -1
1305 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1306 ; VI-NEXT: v_mov_b32_e32 v0, s4
1307 ; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0
1308 ; VI-NEXT: s_waitcnt vmcnt(0)
1309 ; VI-NEXT: buffer_wbinvl1_vol
1312 ; GFX9-LABEL: atomic_sub_i32:
1313 ; GFX9: ; %bb.0: ; %entry
1314 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1315 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1316 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1317 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1319 ; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3]
1320 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1321 ; GFX9-NEXT: buffer_wbinvl1_vol
1322 ; GFX9-NEXT: s_endpgm
1324 %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
; Atomic SUB with the old value used: glc on the atomic, then the returned
; value is stored to %out2.
1328 define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1329 ; SI-LABEL: atomic_sub_i32_ret:
1330 ; SI: ; %bb.0: ; %entry
1331 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1332 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
1333 ; SI-NEXT: s_mov_b32 s3, 0xf000
1334 ; SI-NEXT: s_mov_b32 s2, -1
1335 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1336 ; SI-NEXT: s_mov_b32 s0, s4
1337 ; SI-NEXT: s_mov_b32 s1, s5
1338 ; SI-NEXT: v_mov_b32_e32 v0, s8
1339 ; SI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc
1340 ; SI-NEXT: s_waitcnt vmcnt(0)
1341 ; SI-NEXT: buffer_wbinvl1
1342 ; SI-NEXT: s_mov_b32 s0, s6
1343 ; SI-NEXT: s_mov_b32 s1, s7
1344 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1347 ; VI-LABEL: atomic_sub_i32_ret:
1348 ; VI: ; %bb.0: ; %entry
1349 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1350 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1351 ; VI-NEXT: s_mov_b32 s3, 0xf000
1352 ; VI-NEXT: s_mov_b32 s2, -1
1353 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1354 ; VI-NEXT: s_mov_b32 s0, s4
1355 ; VI-NEXT: s_mov_b32 s1, s5
1356 ; VI-NEXT: v_mov_b32_e32 v0, s8
1357 ; VI-NEXT: buffer_atomic_sub v0, off, s[0:3], 0 glc
1358 ; VI-NEXT: s_waitcnt vmcnt(0)
1359 ; VI-NEXT: buffer_wbinvl1_vol
1360 ; VI-NEXT: s_mov_b32 s0, s6
1361 ; VI-NEXT: s_mov_b32 s1, s7
1362 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1365 ; GFX9-LABEL: atomic_sub_i32_ret:
1366 ; GFX9: ; %bb.0: ; %entry
1367 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
1368 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1369 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1370 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1371 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1372 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[4:5] glc
1373 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1374 ; GFX9-NEXT: buffer_wbinvl1_vol
1375 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1376 ; GFX9-NEXT: s_endpgm
1378 %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
1379 store i32 %val, ptr addrspace(1) %out2
; Atomic SUB at a 64-bit dynamic index, no constant offset: index scaled by 4,
; SI uses addr64 buffer addressing; VI/GFX9 add in scalar regs and use
; flat/global atomics.
1383 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
1384 ; SI-LABEL: atomic_sub_i32_addr64:
1385 ; SI: ; %bb.0: ; %entry
1386 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1387 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
1388 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1389 ; SI-NEXT: s_mov_b32 s3, 0xf000
1390 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1391 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
1392 ; SI-NEXT: s_mov_b32 s2, 0
1393 ; SI-NEXT: v_mov_b32_e32 v2, s6
1394 ; SI-NEXT: v_mov_b32_e32 v0, s4
1395 ; SI-NEXT: v_mov_b32_e32 v1, s5
1396 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[0:3], 0 addr64
1397 ; SI-NEXT: s_waitcnt vmcnt(0)
1398 ; SI-NEXT: buffer_wbinvl1
1401 ; VI-LABEL: atomic_sub_i32_addr64:
1402 ; VI: ; %bb.0: ; %entry
1403 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1404 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1405 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
1406 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1407 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1408 ; VI-NEXT: s_add_u32 s0, s4, s0
1409 ; VI-NEXT: s_addc_u32 s1, s5, s1
1410 ; VI-NEXT: v_mov_b32_e32 v0, s0
1411 ; VI-NEXT: v_mov_b32_e32 v1, s1
1412 ; VI-NEXT: v_mov_b32_e32 v2, s6
1413 ; VI-NEXT: flat_atomic_sub v[0:1], v2
1414 ; VI-NEXT: s_waitcnt vmcnt(0)
1415 ; VI-NEXT: buffer_wbinvl1_vol
1418 ; GFX9-LABEL: atomic_sub_i32_addr64:
1419 ; GFX9: ; %bb.0: ; %entry
1420 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1421 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1422 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
1423 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1424 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1425 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1426 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1427 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1428 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1429 ; GFX9-NEXT: global_atomic_sub v0, v1, s[0:1]
1430 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1431 ; GFX9-NEXT: buffer_wbinvl1_vol
1432 ; GFX9-NEXT: s_endpgm
1434 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1435 %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
; Atomic SUB at a 64-bit dynamic index with the old value used: glc atomic plus
; a store of the returned pre-op value to %out2.
1439 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1440 ; SI-LABEL: atomic_sub_i32_ret_addr64:
1441 ; SI: ; %bb.0: ; %entry
1442 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1443 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
1444 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
1445 ; SI-NEXT: s_mov_b32 s3, 0xf000
1446 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1447 ; SI-NEXT: s_mov_b32 s0, s6
1448 ; SI-NEXT: s_mov_b32 s1, s7
1449 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
1450 ; SI-NEXT: s_mov_b32 s6, 0
1451 ; SI-NEXT: s_mov_b32 s7, s3
1452 ; SI-NEXT: v_mov_b32_e32 v2, s2
1453 ; SI-NEXT: v_mov_b32_e32 v0, s8
1454 ; SI-NEXT: v_mov_b32_e32 v1, s9
1455 ; SI-NEXT: buffer_atomic_sub v2, v[0:1], s[4:7], 0 addr64 glc
1456 ; SI-NEXT: s_waitcnt vmcnt(0)
1457 ; SI-NEXT: buffer_wbinvl1
1458 ; SI-NEXT: s_mov_b32 s2, -1
1459 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
1462 ; VI-LABEL: atomic_sub_i32_ret_addr64:
1463 ; VI: ; %bb.0: ; %entry
1464 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1465 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1466 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1467 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1468 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1469 ; VI-NEXT: s_add_u32 s0, s4, s0
1470 ; VI-NEXT: s_addc_u32 s1, s5, s1
1471 ; VI-NEXT: v_mov_b32_e32 v0, s0
1472 ; VI-NEXT: v_mov_b32_e32 v1, s1
1473 ; VI-NEXT: v_mov_b32_e32 v2, s8
1474 ; VI-NEXT: flat_atomic_sub v0, v[0:1], v2 glc
1475 ; VI-NEXT: s_waitcnt vmcnt(0)
1476 ; VI-NEXT: buffer_wbinvl1_vol
1477 ; VI-NEXT: s_mov_b32 s3, 0xf000
1478 ; VI-NEXT: s_mov_b32 s2, -1
1479 ; VI-NEXT: s_mov_b32 s0, s6
1480 ; VI-NEXT: s_mov_b32 s1, s7
1481 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1484 ; GFX9-LABEL: atomic_sub_i32_ret_addr64:
1485 ; GFX9: ; %bb.0: ; %entry
1486 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1487 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1488 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
1489 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1490 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1491 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1492 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1493 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1494 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
1495 ; GFX9-NEXT: global_atomic_sub v1, v0, v1, s[0:1] glc
1496 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1497 ; GFX9-NEXT: buffer_wbinvl1_vol
1498 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1499 ; GFX9-NEXT: s_endpgm
1501 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1502 %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
1503 store i32 %val, ptr addrspace(1) %out2
; Signed atomic MAX (selects *_smax) with folded offset:16, result unused.
; This one is agent scope, so the wbinvl1 cache invalidate is still expected
; (the later max tests switch to workgroup scope and drop it).
1507 define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
1508 ; SI-LABEL: atomic_max_i32_offset:
1509 ; SI: ; %bb.0: ; %entry
1510 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1511 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1512 ; SI-NEXT: s_mov_b32 s3, 0xf000
1513 ; SI-NEXT: s_mov_b32 s2, -1
1514 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1515 ; SI-NEXT: v_mov_b32_e32 v0, s4
1516 ; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16
1517 ; SI-NEXT: s_waitcnt vmcnt(0)
1518 ; SI-NEXT: buffer_wbinvl1
1521 ; VI-LABEL: atomic_max_i32_offset:
1522 ; VI: ; %bb.0: ; %entry
1523 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1524 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1525 ; VI-NEXT: s_mov_b32 s3, 0xf000
1526 ; VI-NEXT: s_mov_b32 s2, -1
1527 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1528 ; VI-NEXT: v_mov_b32_e32 v0, s4
1529 ; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 offset:16
1530 ; VI-NEXT: s_waitcnt vmcnt(0)
1531 ; VI-NEXT: buffer_wbinvl1_vol
1534 ; GFX9-LABEL: atomic_max_i32_offset:
1535 ; GFX9: ; %bb.0: ; %entry
1536 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1537 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1538 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1539 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1540 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1541 ; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:16
1542 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1543 ; GFX9-NEXT: buffer_wbinvl1_vol
1544 ; GFX9-NEXT: s_endpgm
1546 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1547 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Signed atomic MAX with result used, at WORKGROUP scope (intentionally
; different from the agent-scope tests above): no buffer_wbinvl1* cache
; invalidate is expected after the atomic, only the vmcnt wait before the
; dependent store.
1551 define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1552 ; SI-LABEL: atomic_max_i32_ret_offset:
1553 ; SI: ; %bb.0: ; %entry
1554 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1555 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
1556 ; SI-NEXT: s_mov_b32 s3, 0xf000
1557 ; SI-NEXT: s_mov_b32 s2, -1
1558 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1559 ; SI-NEXT: s_mov_b32 s0, s6
1560 ; SI-NEXT: s_mov_b32 s1, s7
1561 ; SI-NEXT: s_mov_b32 s6, s2
1562 ; SI-NEXT: s_mov_b32 s7, s3
1563 ; SI-NEXT: v_mov_b32_e32 v0, s8
1564 ; SI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc
1565 ; SI-NEXT: s_waitcnt vmcnt(0)
1566 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1569 ; VI-LABEL: atomic_max_i32_ret_offset:
1570 ; VI: ; %bb.0: ; %entry
1571 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1572 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1573 ; VI-NEXT: s_mov_b32 s3, 0xf000
1574 ; VI-NEXT: s_mov_b32 s2, -1
1575 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1576 ; VI-NEXT: s_mov_b32 s0, s6
1577 ; VI-NEXT: s_mov_b32 s1, s7
1578 ; VI-NEXT: s_mov_b32 s6, s2
1579 ; VI-NEXT: s_mov_b32 s7, s3
1580 ; VI-NEXT: v_mov_b32_e32 v0, s8
1581 ; VI-NEXT: buffer_atomic_smax v0, off, s[4:7], 0 offset:16 glc
1582 ; VI-NEXT: s_waitcnt vmcnt(0)
1583 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1586 ; GFX9-LABEL: atomic_max_i32_ret_offset:
1587 ; GFX9: ; %bb.0: ; %entry
1588 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
1589 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1590 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1591 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1592 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1593 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] offset:16 glc
1594 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1595 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1596 ; GFX9-NEXT: s_endpgm
1598 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1599 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1600 store i32 %val, ptr addrspace(1) %out2
; Signed atomic MAX at dynamic index + folded offset, workgroup scope and
; result unused: no vmcnt wait or cache invalidate is required after the
; atomic, so the kernel can end immediately.
1604 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
1605 ; SI-LABEL: atomic_max_i32_addr64_offset:
1606 ; SI: ; %bb.0: ; %entry
1607 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1608 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
1609 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1610 ; SI-NEXT: s_mov_b32 s3, 0xf000
1611 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1612 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
1613 ; SI-NEXT: s_mov_b32 s2, 0
1614 ; SI-NEXT: v_mov_b32_e32 v2, s6
1615 ; SI-NEXT: v_mov_b32_e32 v0, s4
1616 ; SI-NEXT: v_mov_b32_e32 v1, s5
1617 ; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64 offset:16
1620 ; VI-LABEL: atomic_max_i32_addr64_offset:
1621 ; VI: ; %bb.0: ; %entry
1622 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1623 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1624 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
1625 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1626 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1627 ; VI-NEXT: s_add_u32 s0, s4, s0
1628 ; VI-NEXT: s_addc_u32 s1, s5, s1
1629 ; VI-NEXT: s_add_u32 s0, s0, 16
1630 ; VI-NEXT: s_addc_u32 s1, s1, 0
1631 ; VI-NEXT: v_mov_b32_e32 v0, s0
1632 ; VI-NEXT: v_mov_b32_e32 v1, s1
1633 ; VI-NEXT: v_mov_b32_e32 v2, s6
1634 ; VI-NEXT: flat_atomic_smax v[0:1], v2
1637 ; GFX9-LABEL: atomic_max_i32_addr64_offset:
1638 ; GFX9: ; %bb.0: ; %entry
1639 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1640 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1641 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
1642 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1643 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1644 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1645 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1646 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1647 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1648 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1] offset:16
1649 ; GFX9-NEXT: s_endpgm
1651 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1652 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1653 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
; Signed atomic MAX at dynamic index + folded offset, workgroup scope, result
; used: glc atomic, vmcnt wait only for the data dependency on the store of
; the old value — no cache invalidate at this scope.
1657 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1658 ; SI-LABEL: atomic_max_i32_ret_addr64_offset:
1659 ; SI: ; %bb.0: ; %entry
1660 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1661 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
1662 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
1663 ; SI-NEXT: s_mov_b32 s3, 0xf000
1664 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1665 ; SI-NEXT: s_mov_b32 s0, s6
1666 ; SI-NEXT: s_mov_b32 s1, s7
1667 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
1668 ; SI-NEXT: s_mov_b32 s6, 0
1669 ; SI-NEXT: s_mov_b32 s7, s3
1670 ; SI-NEXT: v_mov_b32_e32 v2, s2
1671 ; SI-NEXT: v_mov_b32_e32 v0, s8
1672 ; SI-NEXT: v_mov_b32_e32 v1, s9
1673 ; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
1674 ; SI-NEXT: s_mov_b32 s2, -1
1675 ; SI-NEXT: s_waitcnt vmcnt(0)
1676 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
1679 ; VI-LABEL: atomic_max_i32_ret_addr64_offset:
1680 ; VI: ; %bb.0: ; %entry
1681 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1682 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1683 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1684 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1685 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1686 ; VI-NEXT: s_add_u32 s0, s4, s0
1687 ; VI-NEXT: s_addc_u32 s1, s5, s1
1688 ; VI-NEXT: s_add_u32 s0, s0, 16
1689 ; VI-NEXT: s_addc_u32 s1, s1, 0
1690 ; VI-NEXT: v_mov_b32_e32 v0, s0
1691 ; VI-NEXT: v_mov_b32_e32 v1, s1
1692 ; VI-NEXT: v_mov_b32_e32 v2, s8
1693 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc
1694 ; VI-NEXT: s_mov_b32 s3, 0xf000
1695 ; VI-NEXT: s_mov_b32 s2, -1
1696 ; VI-NEXT: s_mov_b32 s0, s6
1697 ; VI-NEXT: s_mov_b32 s1, s7
1698 ; VI-NEXT: s_waitcnt vmcnt(0)
1699 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1702 ; GFX9-LABEL: atomic_max_i32_ret_addr64_offset:
1703 ; GFX9: ; %bb.0: ; %entry
1704 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1705 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1706 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
1707 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1708 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1709 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1710 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1711 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1712 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
1713 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[0:1] offset:16 glc
1714 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1715 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1716 ; GFX9-NEXT: s_endpgm
1718 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1719 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1720 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1721 store i32 %val, ptr addrspace(1) %out2
; Signed atomic MAX, no offset, workgroup scope, result unused: bare
; buffer_atomic_smax / global_atomic_smax with no wait or invalidate after it.
1725 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
1726 ; SI-LABEL: atomic_max_i32:
1727 ; SI: ; %bb.0: ; %entry
1728 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1729 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1730 ; SI-NEXT: s_mov_b32 s3, 0xf000
1731 ; SI-NEXT: s_mov_b32 s2, -1
1732 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1733 ; SI-NEXT: v_mov_b32_e32 v0, s4
1734 ; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0
1737 ; VI-LABEL: atomic_max_i32:
1738 ; VI: ; %bb.0: ; %entry
1739 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1740 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1741 ; VI-NEXT: s_mov_b32 s3, 0xf000
1742 ; VI-NEXT: s_mov_b32 s2, -1
1743 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1744 ; VI-NEXT: v_mov_b32_e32 v0, s4
1745 ; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0
1748 ; GFX9-LABEL: atomic_max_i32:
1749 ; GFX9: ; %bb.0: ; %entry
1750 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1751 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1752 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1753 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1754 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1755 ; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3]
1756 ; GFX9-NEXT: s_endpgm
1758 %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
1762 define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1763 ; SI-LABEL: atomic_max_i32_ret:
1764 ; SI: ; %bb.0: ; %entry
1765 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1766 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
1767 ; SI-NEXT: s_mov_b32 s3, 0xf000
1768 ; SI-NEXT: s_mov_b32 s2, -1
1769 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1770 ; SI-NEXT: s_mov_b32 s0, s4
1771 ; SI-NEXT: s_mov_b32 s1, s5
1772 ; SI-NEXT: v_mov_b32_e32 v0, s8
1773 ; SI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 glc
1774 ; SI-NEXT: s_mov_b32 s0, s6
1775 ; SI-NEXT: s_mov_b32 s1, s7
1776 ; SI-NEXT: s_waitcnt vmcnt(0)
1777 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1780 ; VI-LABEL: atomic_max_i32_ret:
1781 ; VI: ; %bb.0: ; %entry
1782 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1783 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1784 ; VI-NEXT: s_mov_b32 s3, 0xf000
1785 ; VI-NEXT: s_mov_b32 s2, -1
1786 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1787 ; VI-NEXT: s_mov_b32 s0, s4
1788 ; VI-NEXT: s_mov_b32 s1, s5
1789 ; VI-NEXT: v_mov_b32_e32 v0, s8
1790 ; VI-NEXT: buffer_atomic_smax v0, off, s[0:3], 0 glc
1791 ; VI-NEXT: s_mov_b32 s0, s6
1792 ; VI-NEXT: s_mov_b32 s1, s7
1793 ; VI-NEXT: s_waitcnt vmcnt(0)
1794 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1797 ; GFX9-LABEL: atomic_max_i32_ret:
1798 ; GFX9: ; %bb.0: ; %entry
1799 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
1800 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1801 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1802 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1803 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1804 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[4:5] glc
1805 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1806 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1807 ; GFX9-NEXT: s_endpgm
1809 %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
1810 store i32 %val, ptr addrspace(1) %out2
1814 define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
1815 ; SI-LABEL: atomic_max_i32_addr64:
1816 ; SI: ; %bb.0: ; %entry
1817 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
1818 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
1819 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1820 ; SI-NEXT: s_mov_b32 s3, 0xf000
1821 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1822 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
1823 ; SI-NEXT: s_mov_b32 s2, 0
1824 ; SI-NEXT: v_mov_b32_e32 v2, s6
1825 ; SI-NEXT: v_mov_b32_e32 v0, s4
1826 ; SI-NEXT: v_mov_b32_e32 v1, s5
1827 ; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[0:3], 0 addr64
1830 ; VI-LABEL: atomic_max_i32_addr64:
1831 ; VI: ; %bb.0: ; %entry
1832 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1833 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1834 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
1835 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1836 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1837 ; VI-NEXT: s_add_u32 s0, s4, s0
1838 ; VI-NEXT: s_addc_u32 s1, s5, s1
1839 ; VI-NEXT: v_mov_b32_e32 v0, s0
1840 ; VI-NEXT: v_mov_b32_e32 v1, s1
1841 ; VI-NEXT: v_mov_b32_e32 v2, s6
1842 ; VI-NEXT: flat_atomic_smax v[0:1], v2
1845 ; GFX9-LABEL: atomic_max_i32_addr64:
1846 ; GFX9: ; %bb.0: ; %entry
1847 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1848 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1849 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
1850 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1851 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1852 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1853 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1854 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1855 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
1856 ; GFX9-NEXT: global_atomic_smax v0, v1, s[0:1]
1857 ; GFX9-NEXT: s_endpgm
1859 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1860 %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
1864 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1865 ; SI-LABEL: atomic_max_i32_ret_addr64:
1866 ; SI: ; %bb.0: ; %entry
1867 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1868 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
1869 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
1870 ; SI-NEXT: s_mov_b32 s3, 0xf000
1871 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1872 ; SI-NEXT: s_mov_b32 s0, s6
1873 ; SI-NEXT: s_mov_b32 s1, s7
1874 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
1875 ; SI-NEXT: s_mov_b32 s6, 0
1876 ; SI-NEXT: s_mov_b32 s7, s3
1877 ; SI-NEXT: v_mov_b32_e32 v2, s2
1878 ; SI-NEXT: v_mov_b32_e32 v0, s8
1879 ; SI-NEXT: v_mov_b32_e32 v1, s9
1880 ; SI-NEXT: buffer_atomic_smax v2, v[0:1], s[4:7], 0 addr64 glc
1881 ; SI-NEXT: s_mov_b32 s2, -1
1882 ; SI-NEXT: s_waitcnt vmcnt(0)
1883 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
1886 ; VI-LABEL: atomic_max_i32_ret_addr64:
1887 ; VI: ; %bb.0: ; %entry
1888 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1889 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1890 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1891 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1892 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1893 ; VI-NEXT: s_add_u32 s0, s4, s0
1894 ; VI-NEXT: s_addc_u32 s1, s5, s1
1895 ; VI-NEXT: v_mov_b32_e32 v0, s0
1896 ; VI-NEXT: v_mov_b32_e32 v1, s1
1897 ; VI-NEXT: v_mov_b32_e32 v2, s8
1898 ; VI-NEXT: flat_atomic_smax v0, v[0:1], v2 glc
1899 ; VI-NEXT: s_mov_b32 s3, 0xf000
1900 ; VI-NEXT: s_mov_b32 s2, -1
1901 ; VI-NEXT: s_mov_b32 s0, s6
1902 ; VI-NEXT: s_mov_b32 s1, s7
1903 ; VI-NEXT: s_waitcnt vmcnt(0)
1904 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1907 ; GFX9-LABEL: atomic_max_i32_ret_addr64:
1908 ; GFX9: ; %bb.0: ; %entry
1909 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1910 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1911 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
1912 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1913 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1914 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1915 ; GFX9-NEXT: s_add_u32 s0, s4, s0
1916 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
1917 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
1918 ; GFX9-NEXT: global_atomic_smax v1, v0, v1, s[0:1] glc
1919 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1920 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
1921 ; GFX9-NEXT: s_endpgm
1923 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1924 %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
1925 store i32 %val, ptr addrspace(1) %out2
1929 define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
1930 ; SI-LABEL: atomic_umax_i32_offset:
1931 ; SI: ; %bb.0: ; %entry
1932 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
1933 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1934 ; SI-NEXT: s_mov_b32 s3, 0xf000
1935 ; SI-NEXT: s_mov_b32 s2, -1
1936 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1937 ; SI-NEXT: v_mov_b32_e32 v0, s4
1938 ; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16
1941 ; VI-LABEL: atomic_umax_i32_offset:
1942 ; VI: ; %bb.0: ; %entry
1943 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
1944 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1945 ; VI-NEXT: s_mov_b32 s3, 0xf000
1946 ; VI-NEXT: s_mov_b32 s2, -1
1947 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1948 ; VI-NEXT: v_mov_b32_e32 v0, s4
1949 ; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 offset:16
1952 ; GFX9-LABEL: atomic_umax_i32_offset:
1953 ; GFX9: ; %bb.0: ; %entry
1954 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
1955 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1956 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1957 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1958 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1959 ; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:16
1960 ; GFX9-NEXT: s_endpgm
1962 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1963 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1967 define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1968 ; SI-LABEL: atomic_umax_i32_ret_offset:
1969 ; SI: ; %bb.0: ; %entry
1970 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1971 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
1972 ; SI-NEXT: s_mov_b32 s3, 0xf000
1973 ; SI-NEXT: s_mov_b32 s2, -1
1974 ; SI-NEXT: s_waitcnt lgkmcnt(0)
1975 ; SI-NEXT: s_mov_b32 s0, s6
1976 ; SI-NEXT: s_mov_b32 s1, s7
1977 ; SI-NEXT: s_mov_b32 s6, s2
1978 ; SI-NEXT: s_mov_b32 s7, s3
1979 ; SI-NEXT: v_mov_b32_e32 v0, s8
1980 ; SI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc
1981 ; SI-NEXT: s_waitcnt vmcnt(0)
1982 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
1985 ; VI-LABEL: atomic_umax_i32_ret_offset:
1986 ; VI: ; %bb.0: ; %entry
1987 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1988 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
1989 ; VI-NEXT: s_mov_b32 s3, 0xf000
1990 ; VI-NEXT: s_mov_b32 s2, -1
1991 ; VI-NEXT: s_waitcnt lgkmcnt(0)
1992 ; VI-NEXT: s_mov_b32 s0, s6
1993 ; VI-NEXT: s_mov_b32 s1, s7
1994 ; VI-NEXT: s_mov_b32 s6, s2
1995 ; VI-NEXT: s_mov_b32 s7, s3
1996 ; VI-NEXT: v_mov_b32_e32 v0, s8
1997 ; VI-NEXT: buffer_atomic_umax v0, off, s[4:7], 0 offset:16 glc
1998 ; VI-NEXT: s_waitcnt vmcnt(0)
1999 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2002 ; GFX9-LABEL: atomic_umax_i32_ret_offset:
2003 ; GFX9: ; %bb.0: ; %entry
2004 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
2005 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2006 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2007 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2008 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2009 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] offset:16 glc
2010 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2011 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2012 ; GFX9-NEXT: s_endpgm
2014 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2015 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2016 store i32 %val, ptr addrspace(1) %out2
2020 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
2021 ; SI-LABEL: atomic_umax_i32_addr64_offset:
2022 ; SI: ; %bb.0: ; %entry
2023 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
2024 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
2025 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2026 ; SI-NEXT: s_mov_b32 s3, 0xf000
2027 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2028 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
2029 ; SI-NEXT: s_mov_b32 s2, 0
2030 ; SI-NEXT: v_mov_b32_e32 v2, s6
2031 ; SI-NEXT: v_mov_b32_e32 v0, s4
2032 ; SI-NEXT: v_mov_b32_e32 v1, s5
2033 ; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64 offset:16
2036 ; VI-LABEL: atomic_umax_i32_addr64_offset:
2037 ; VI: ; %bb.0: ; %entry
2038 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2039 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2040 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
2041 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2042 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2043 ; VI-NEXT: s_add_u32 s0, s4, s0
2044 ; VI-NEXT: s_addc_u32 s1, s5, s1
2045 ; VI-NEXT: s_add_u32 s0, s0, 16
2046 ; VI-NEXT: s_addc_u32 s1, s1, 0
2047 ; VI-NEXT: v_mov_b32_e32 v0, s0
2048 ; VI-NEXT: v_mov_b32_e32 v1, s1
2049 ; VI-NEXT: v_mov_b32_e32 v2, s6
2050 ; VI-NEXT: flat_atomic_umax v[0:1], v2
2053 ; GFX9-LABEL: atomic_umax_i32_addr64_offset:
2054 ; GFX9: ; %bb.0: ; %entry
2055 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2056 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2057 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
2058 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2059 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2060 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2061 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2062 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2063 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2064 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1] offset:16
2065 ; GFX9-NEXT: s_endpgm
2067 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2068 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
2069 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2073 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
2074 ; SI-LABEL: atomic_umax_i32_ret_addr64_offset:
2075 ; SI: ; %bb.0: ; %entry
2076 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2077 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
2078 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2079 ; SI-NEXT: s_mov_b32 s3, 0xf000
2080 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2081 ; SI-NEXT: s_mov_b32 s0, s6
2082 ; SI-NEXT: s_mov_b32 s1, s7
2083 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
2084 ; SI-NEXT: s_mov_b32 s6, 0
2085 ; SI-NEXT: s_mov_b32 s7, s3
2086 ; SI-NEXT: v_mov_b32_e32 v2, s2
2087 ; SI-NEXT: v_mov_b32_e32 v0, s8
2088 ; SI-NEXT: v_mov_b32_e32 v1, s9
2089 ; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
2090 ; SI-NEXT: s_mov_b32 s2, -1
2091 ; SI-NEXT: s_waitcnt vmcnt(0)
2092 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
2095 ; VI-LABEL: atomic_umax_i32_ret_addr64_offset:
2096 ; VI: ; %bb.0: ; %entry
2097 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2098 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2099 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2100 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2101 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2102 ; VI-NEXT: s_add_u32 s0, s4, s0
2103 ; VI-NEXT: s_addc_u32 s1, s5, s1
2104 ; VI-NEXT: s_add_u32 s0, s0, 16
2105 ; VI-NEXT: s_addc_u32 s1, s1, 0
2106 ; VI-NEXT: v_mov_b32_e32 v0, s0
2107 ; VI-NEXT: v_mov_b32_e32 v1, s1
2108 ; VI-NEXT: v_mov_b32_e32 v2, s8
2109 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc
2110 ; VI-NEXT: s_mov_b32 s3, 0xf000
2111 ; VI-NEXT: s_mov_b32 s2, -1
2112 ; VI-NEXT: s_mov_b32 s0, s6
2113 ; VI-NEXT: s_mov_b32 s1, s7
2114 ; VI-NEXT: s_waitcnt vmcnt(0)
2115 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2118 ; GFX9-LABEL: atomic_umax_i32_ret_addr64_offset:
2119 ; GFX9: ; %bb.0: ; %entry
2120 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2121 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2122 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
2123 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2124 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2125 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2126 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2127 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2128 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2129 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[0:1] offset:16 glc
2130 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2131 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2132 ; GFX9-NEXT: s_endpgm
2134 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2135 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
2136 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2137 store i32 %val, ptr addrspace(1) %out2
2141 define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
2142 ; SI-LABEL: atomic_umax_i32:
2143 ; SI: ; %bb.0: ; %entry
2144 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
2145 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2146 ; SI-NEXT: s_mov_b32 s3, 0xf000
2147 ; SI-NEXT: s_mov_b32 s2, -1
2148 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2149 ; SI-NEXT: v_mov_b32_e32 v0, s4
2150 ; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0
2153 ; VI-LABEL: atomic_umax_i32:
2154 ; VI: ; %bb.0: ; %entry
2155 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
2156 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2157 ; VI-NEXT: s_mov_b32 s3, 0xf000
2158 ; VI-NEXT: s_mov_b32 s2, -1
2159 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2160 ; VI-NEXT: v_mov_b32_e32 v0, s4
2161 ; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0
2164 ; GFX9-LABEL: atomic_umax_i32:
2165 ; GFX9: ; %bb.0: ; %entry
2166 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
2167 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2168 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2169 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2170 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2171 ; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3]
2172 ; GFX9-NEXT: s_endpgm
2174 %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
2178 define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
2179 ; SI-LABEL: atomic_umax_i32_ret:
2180 ; SI: ; %bb.0: ; %entry
2181 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2182 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
2183 ; SI-NEXT: s_mov_b32 s3, 0xf000
2184 ; SI-NEXT: s_mov_b32 s2, -1
2185 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2186 ; SI-NEXT: s_mov_b32 s0, s4
2187 ; SI-NEXT: s_mov_b32 s1, s5
2188 ; SI-NEXT: v_mov_b32_e32 v0, s8
2189 ; SI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 glc
2190 ; SI-NEXT: s_mov_b32 s0, s6
2191 ; SI-NEXT: s_mov_b32 s1, s7
2192 ; SI-NEXT: s_waitcnt vmcnt(0)
2193 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2196 ; VI-LABEL: atomic_umax_i32_ret:
2197 ; VI: ; %bb.0: ; %entry
2198 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2199 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2200 ; VI-NEXT: s_mov_b32 s3, 0xf000
2201 ; VI-NEXT: s_mov_b32 s2, -1
2202 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2203 ; VI-NEXT: s_mov_b32 s0, s4
2204 ; VI-NEXT: s_mov_b32 s1, s5
2205 ; VI-NEXT: v_mov_b32_e32 v0, s8
2206 ; VI-NEXT: buffer_atomic_umax v0, off, s[0:3], 0 glc
2207 ; VI-NEXT: s_mov_b32 s0, s6
2208 ; VI-NEXT: s_mov_b32 s1, s7
2209 ; VI-NEXT: s_waitcnt vmcnt(0)
2210 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2213 ; GFX9-LABEL: atomic_umax_i32_ret:
2214 ; GFX9: ; %bb.0: ; %entry
2215 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
2216 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2217 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2218 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2219 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2220 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[4:5] glc
2221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2222 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2223 ; GFX9-NEXT: s_endpgm
2225 %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
2226 store i32 %val, ptr addrspace(1) %out2
2230 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
2231 ; SI-LABEL: atomic_umax_i32_addr64:
2232 ; SI: ; %bb.0: ; %entry
2233 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
2234 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
2235 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2236 ; SI-NEXT: s_mov_b32 s3, 0xf000
2237 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2238 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
2239 ; SI-NEXT: s_mov_b32 s2, 0
2240 ; SI-NEXT: v_mov_b32_e32 v2, s6
2241 ; SI-NEXT: v_mov_b32_e32 v0, s4
2242 ; SI-NEXT: v_mov_b32_e32 v1, s5
2243 ; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[0:3], 0 addr64
2246 ; VI-LABEL: atomic_umax_i32_addr64:
2247 ; VI: ; %bb.0: ; %entry
2248 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2249 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2250 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
2251 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2252 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2253 ; VI-NEXT: s_add_u32 s0, s4, s0
2254 ; VI-NEXT: s_addc_u32 s1, s5, s1
2255 ; VI-NEXT: v_mov_b32_e32 v0, s0
2256 ; VI-NEXT: v_mov_b32_e32 v1, s1
2257 ; VI-NEXT: v_mov_b32_e32 v2, s6
2258 ; VI-NEXT: flat_atomic_umax v[0:1], v2
2261 ; GFX9-LABEL: atomic_umax_i32_addr64:
2262 ; GFX9: ; %bb.0: ; %entry
2263 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2264 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2265 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
2266 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2267 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2268 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2269 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2270 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2271 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2272 ; GFX9-NEXT: global_atomic_umax v0, v1, s[0:1]
2273 ; GFX9-NEXT: s_endpgm
2275 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2276 %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
2280 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
2281 ; SI-LABEL: atomic_umax_i32_ret_addr64:
2282 ; SI: ; %bb.0: ; %entry
2283 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2284 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
2285 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2286 ; SI-NEXT: s_mov_b32 s3, 0xf000
2287 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2288 ; SI-NEXT: s_mov_b32 s0, s6
2289 ; SI-NEXT: s_mov_b32 s1, s7
2290 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
2291 ; SI-NEXT: s_mov_b32 s6, 0
2292 ; SI-NEXT: s_mov_b32 s7, s3
2293 ; SI-NEXT: v_mov_b32_e32 v2, s2
2294 ; SI-NEXT: v_mov_b32_e32 v0, s8
2295 ; SI-NEXT: v_mov_b32_e32 v1, s9
2296 ; SI-NEXT: buffer_atomic_umax v2, v[0:1], s[4:7], 0 addr64 glc
2297 ; SI-NEXT: s_mov_b32 s2, -1
2298 ; SI-NEXT: s_waitcnt vmcnt(0)
2299 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
2302 ; VI-LABEL: atomic_umax_i32_ret_addr64:
2303 ; VI: ; %bb.0: ; %entry
2304 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2305 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2306 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2307 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2308 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2309 ; VI-NEXT: s_add_u32 s0, s4, s0
2310 ; VI-NEXT: s_addc_u32 s1, s5, s1
2311 ; VI-NEXT: v_mov_b32_e32 v0, s0
2312 ; VI-NEXT: v_mov_b32_e32 v1, s1
2313 ; VI-NEXT: v_mov_b32_e32 v2, s8
2314 ; VI-NEXT: flat_atomic_umax v0, v[0:1], v2 glc
2315 ; VI-NEXT: s_mov_b32 s3, 0xf000
2316 ; VI-NEXT: s_mov_b32 s2, -1
2317 ; VI-NEXT: s_mov_b32 s0, s6
2318 ; VI-NEXT: s_mov_b32 s1, s7
2319 ; VI-NEXT: s_waitcnt vmcnt(0)
2320 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2323 ; GFX9-LABEL: atomic_umax_i32_ret_addr64:
2324 ; GFX9: ; %bb.0: ; %entry
2325 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2326 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2327 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
2328 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2329 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2330 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2331 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2332 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2333 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2334 ; GFX9-NEXT: global_atomic_umax v1, v0, v1, s[0:1] glc
2335 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2336 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2337 ; GFX9-NEXT: s_endpgm
2339 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2340 %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
2341 store i32 %val, ptr addrspace(1) %out2
2345 define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
2346 ; SI-LABEL: atomic_min_i32_offset:
2347 ; SI: ; %bb.0: ; %entry
2348 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
2349 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2350 ; SI-NEXT: s_mov_b32 s3, 0xf000
2351 ; SI-NEXT: s_mov_b32 s2, -1
2352 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2353 ; SI-NEXT: v_mov_b32_e32 v0, s4
2354 ; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16
2357 ; VI-LABEL: atomic_min_i32_offset:
2358 ; VI: ; %bb.0: ; %entry
2359 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
2360 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2361 ; VI-NEXT: s_mov_b32 s3, 0xf000
2362 ; VI-NEXT: s_mov_b32 s2, -1
2363 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2364 ; VI-NEXT: v_mov_b32_e32 v0, s4
2365 ; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 offset:16
2368 ; GFX9-LABEL: atomic_min_i32_offset:
2369 ; GFX9: ; %bb.0: ; %entry
2370 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
2371 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2372 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2373 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2374 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2375 ; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:16
2376 ; GFX9-NEXT: s_endpgm
2378 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2379 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2383 define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
2384 ; SI-LABEL: atomic_min_i32_ret_offset:
2385 ; SI: ; %bb.0: ; %entry
2386 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2387 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
2388 ; SI-NEXT: s_mov_b32 s3, 0xf000
2389 ; SI-NEXT: s_mov_b32 s2, -1
2390 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2391 ; SI-NEXT: s_mov_b32 s0, s6
2392 ; SI-NEXT: s_mov_b32 s1, s7
2393 ; SI-NEXT: s_mov_b32 s6, s2
2394 ; SI-NEXT: s_mov_b32 s7, s3
2395 ; SI-NEXT: v_mov_b32_e32 v0, s8
2396 ; SI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc
2397 ; SI-NEXT: s_waitcnt vmcnt(0)
2398 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2401 ; VI-LABEL: atomic_min_i32_ret_offset:
2402 ; VI: ; %bb.0: ; %entry
2403 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2404 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2405 ; VI-NEXT: s_mov_b32 s3, 0xf000
2406 ; VI-NEXT: s_mov_b32 s2, -1
2407 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2408 ; VI-NEXT: s_mov_b32 s0, s6
2409 ; VI-NEXT: s_mov_b32 s1, s7
2410 ; VI-NEXT: s_mov_b32 s6, s2
2411 ; VI-NEXT: s_mov_b32 s7, s3
2412 ; VI-NEXT: v_mov_b32_e32 v0, s8
2413 ; VI-NEXT: buffer_atomic_smin v0, off, s[4:7], 0 offset:16 glc
2414 ; VI-NEXT: s_waitcnt vmcnt(0)
2415 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2418 ; GFX9-LABEL: atomic_min_i32_ret_offset:
2419 ; GFX9: ; %bb.0: ; %entry
2420 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
2421 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2422 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2423 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2424 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2425 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] offset:16 glc
2426 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2427 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2428 ; GFX9-NEXT: s_endpgm
2430 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2431 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2432 store i32 %val, ptr addrspace(1) %out2
2436 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
2437 ; SI-LABEL: atomic_min_i32_addr64_offset:
2438 ; SI: ; %bb.0: ; %entry
2439 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
2440 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
2441 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2442 ; SI-NEXT: s_mov_b32 s3, 0xf000
2443 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2444 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
2445 ; SI-NEXT: s_mov_b32 s2, 0
2446 ; SI-NEXT: v_mov_b32_e32 v2, s6
2447 ; SI-NEXT: v_mov_b32_e32 v0, s4
2448 ; SI-NEXT: v_mov_b32_e32 v1, s5
2449 ; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64 offset:16
2452 ; VI-LABEL: atomic_min_i32_addr64_offset:
2453 ; VI: ; %bb.0: ; %entry
2454 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2455 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2456 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
2457 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2458 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2459 ; VI-NEXT: s_add_u32 s0, s4, s0
2460 ; VI-NEXT: s_addc_u32 s1, s5, s1
2461 ; VI-NEXT: s_add_u32 s0, s0, 16
2462 ; VI-NEXT: s_addc_u32 s1, s1, 0
2463 ; VI-NEXT: v_mov_b32_e32 v0, s0
2464 ; VI-NEXT: v_mov_b32_e32 v1, s1
2465 ; VI-NEXT: v_mov_b32_e32 v2, s6
2466 ; VI-NEXT: flat_atomic_smin v[0:1], v2
2469 ; GFX9-LABEL: atomic_min_i32_addr64_offset:
2470 ; GFX9: ; %bb.0: ; %entry
2471 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2472 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2473 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
2474 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2475 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2476 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2477 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2478 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2479 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2480 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1] offset:16
2481 ; GFX9-NEXT: s_endpgm
2483 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2484 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
2485 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2489 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
2490 ; SI-LABEL: atomic_min_i32_ret_addr64_offset:
2491 ; SI: ; %bb.0: ; %entry
2492 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2493 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
2494 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2495 ; SI-NEXT: s_mov_b32 s3, 0xf000
2496 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2497 ; SI-NEXT: s_mov_b32 s0, s6
2498 ; SI-NEXT: s_mov_b32 s1, s7
2499 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
2500 ; SI-NEXT: s_mov_b32 s6, 0
2501 ; SI-NEXT: s_mov_b32 s7, s3
2502 ; SI-NEXT: v_mov_b32_e32 v2, s2
2503 ; SI-NEXT: v_mov_b32_e32 v0, s8
2504 ; SI-NEXT: v_mov_b32_e32 v1, s9
2505 ; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
2506 ; SI-NEXT: s_mov_b32 s2, -1
2507 ; SI-NEXT: s_waitcnt vmcnt(0)
2508 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
2511 ; VI-LABEL: atomic_min_i32_ret_addr64_offset:
2512 ; VI: ; %bb.0: ; %entry
2513 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2514 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2515 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2516 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2517 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2518 ; VI-NEXT: s_add_u32 s0, s4, s0
2519 ; VI-NEXT: s_addc_u32 s1, s5, s1
2520 ; VI-NEXT: s_add_u32 s0, s0, 16
2521 ; VI-NEXT: s_addc_u32 s1, s1, 0
2522 ; VI-NEXT: v_mov_b32_e32 v0, s0
2523 ; VI-NEXT: v_mov_b32_e32 v1, s1
2524 ; VI-NEXT: v_mov_b32_e32 v2, s8
2525 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
2526 ; VI-NEXT: s_mov_b32 s3, 0xf000
2527 ; VI-NEXT: s_mov_b32 s2, -1
2528 ; VI-NEXT: s_mov_b32 s0, s6
2529 ; VI-NEXT: s_mov_b32 s1, s7
2530 ; VI-NEXT: s_waitcnt vmcnt(0)
2531 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2534 ; GFX9-LABEL: atomic_min_i32_ret_addr64_offset:
2535 ; GFX9: ; %bb.0: ; %entry
2536 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2537 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2538 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
2539 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2540 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2541 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2542 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2543 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2544 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2545 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[0:1] offset:16 glc
2546 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2547 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2548 ; GFX9-NEXT: s_endpgm
2550 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2551 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
2552 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2553 store i32 %val, ptr addrspace(1) %out2
2557 define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
2558 ; SI-LABEL: atomic_min_i32:
2559 ; SI: ; %bb.0: ; %entry
2560 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
2561 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2562 ; SI-NEXT: s_mov_b32 s3, 0xf000
2563 ; SI-NEXT: s_mov_b32 s2, -1
2564 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2565 ; SI-NEXT: v_mov_b32_e32 v0, s4
2566 ; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0
2569 ; VI-LABEL: atomic_min_i32:
2570 ; VI: ; %bb.0: ; %entry
2571 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
2572 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2573 ; VI-NEXT: s_mov_b32 s3, 0xf000
2574 ; VI-NEXT: s_mov_b32 s2, -1
2575 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2576 ; VI-NEXT: v_mov_b32_e32 v0, s4
2577 ; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0
2580 ; GFX9-LABEL: atomic_min_i32:
2581 ; GFX9: ; %bb.0: ; %entry
2582 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
2583 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2584 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2586 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2587 ; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3]
2588 ; GFX9-NEXT: s_endpgm
2590 %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
2594 define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
2595 ; SI-LABEL: atomic_min_i32_ret:
2596 ; SI: ; %bb.0: ; %entry
2597 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2598 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
2599 ; SI-NEXT: s_mov_b32 s3, 0xf000
2600 ; SI-NEXT: s_mov_b32 s2, -1
2601 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2602 ; SI-NEXT: s_mov_b32 s0, s4
2603 ; SI-NEXT: s_mov_b32 s1, s5
2604 ; SI-NEXT: v_mov_b32_e32 v0, s8
2605 ; SI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 glc
2606 ; SI-NEXT: s_mov_b32 s0, s6
2607 ; SI-NEXT: s_mov_b32 s1, s7
2608 ; SI-NEXT: s_waitcnt vmcnt(0)
2609 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2612 ; VI-LABEL: atomic_min_i32_ret:
2613 ; VI: ; %bb.0: ; %entry
2614 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2615 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2616 ; VI-NEXT: s_mov_b32 s3, 0xf000
2617 ; VI-NEXT: s_mov_b32 s2, -1
2618 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2619 ; VI-NEXT: s_mov_b32 s0, s4
2620 ; VI-NEXT: s_mov_b32 s1, s5
2621 ; VI-NEXT: v_mov_b32_e32 v0, s8
2622 ; VI-NEXT: buffer_atomic_smin v0, off, s[0:3], 0 glc
2623 ; VI-NEXT: s_mov_b32 s0, s6
2624 ; VI-NEXT: s_mov_b32 s1, s7
2625 ; VI-NEXT: s_waitcnt vmcnt(0)
2626 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2629 ; GFX9-LABEL: atomic_min_i32_ret:
2630 ; GFX9: ; %bb.0: ; %entry
2631 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
2632 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2633 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2634 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2635 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2636 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[4:5] glc
2637 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2638 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2639 ; GFX9-NEXT: s_endpgm
2641 %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
2642 store i32 %val, ptr addrspace(1) %out2
2646 define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
2647 ; SI-LABEL: atomic_min_i32_addr64:
2648 ; SI: ; %bb.0: ; %entry
2649 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
2650 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
2651 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2652 ; SI-NEXT: s_mov_b32 s3, 0xf000
2653 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2654 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
2655 ; SI-NEXT: s_mov_b32 s2, 0
2656 ; SI-NEXT: v_mov_b32_e32 v2, s6
2657 ; SI-NEXT: v_mov_b32_e32 v0, s4
2658 ; SI-NEXT: v_mov_b32_e32 v1, s5
2659 ; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[0:3], 0 addr64
2662 ; VI-LABEL: atomic_min_i32_addr64:
2663 ; VI: ; %bb.0: ; %entry
2664 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2665 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2666 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
2667 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2668 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2669 ; VI-NEXT: s_add_u32 s0, s4, s0
2670 ; VI-NEXT: s_addc_u32 s1, s5, s1
2671 ; VI-NEXT: v_mov_b32_e32 v0, s0
2672 ; VI-NEXT: v_mov_b32_e32 v1, s1
2673 ; VI-NEXT: v_mov_b32_e32 v2, s6
2674 ; VI-NEXT: flat_atomic_smin v[0:1], v2
2677 ; GFX9-LABEL: atomic_min_i32_addr64:
2678 ; GFX9: ; %bb.0: ; %entry
2679 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2680 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2681 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
2682 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2683 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2684 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2685 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2686 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2687 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2688 ; GFX9-NEXT: global_atomic_smin v0, v1, s[0:1]
2689 ; GFX9-NEXT: s_endpgm
2691 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2692 %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
2696 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
2697 ; SI-LABEL: atomic_min_i32_ret_addr64:
2698 ; SI: ; %bb.0: ; %entry
2699 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2700 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
2701 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2702 ; SI-NEXT: s_mov_b32 s3, 0xf000
2703 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2704 ; SI-NEXT: s_mov_b32 s0, s6
2705 ; SI-NEXT: s_mov_b32 s1, s7
2706 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
2707 ; SI-NEXT: s_mov_b32 s6, 0
2708 ; SI-NEXT: s_mov_b32 s7, s3
2709 ; SI-NEXT: v_mov_b32_e32 v2, s2
2710 ; SI-NEXT: v_mov_b32_e32 v0, s8
2711 ; SI-NEXT: v_mov_b32_e32 v1, s9
2712 ; SI-NEXT: buffer_atomic_smin v2, v[0:1], s[4:7], 0 addr64 glc
2713 ; SI-NEXT: s_mov_b32 s2, -1
2714 ; SI-NEXT: s_waitcnt vmcnt(0)
2715 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
2718 ; VI-LABEL: atomic_min_i32_ret_addr64:
2719 ; VI: ; %bb.0: ; %entry
2720 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2721 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2722 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2723 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2724 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2725 ; VI-NEXT: s_add_u32 s0, s4, s0
2726 ; VI-NEXT: s_addc_u32 s1, s5, s1
2727 ; VI-NEXT: v_mov_b32_e32 v0, s0
2728 ; VI-NEXT: v_mov_b32_e32 v1, s1
2729 ; VI-NEXT: v_mov_b32_e32 v2, s8
2730 ; VI-NEXT: flat_atomic_smin v0, v[0:1], v2 glc
2731 ; VI-NEXT: s_mov_b32 s3, 0xf000
2732 ; VI-NEXT: s_mov_b32 s2, -1
2733 ; VI-NEXT: s_mov_b32 s0, s6
2734 ; VI-NEXT: s_mov_b32 s1, s7
2735 ; VI-NEXT: s_waitcnt vmcnt(0)
2736 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2739 ; GFX9-LABEL: atomic_min_i32_ret_addr64:
2740 ; GFX9: ; %bb.0: ; %entry
2741 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2742 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2743 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
2744 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2745 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2746 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2747 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2748 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2749 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2750 ; GFX9-NEXT: global_atomic_smin v1, v0, v1, s[0:1] glc
2751 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2752 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2753 ; GFX9-NEXT: s_endpgm
2755 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2756 %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
2757 store i32 %val, ptr addrspace(1) %out2
2761 define amdgpu_kernel void @atomic_umin_i32_offset(ptr addrspace(1) %out, i32 %in) {
2762 ; SI-LABEL: atomic_umin_i32_offset:
2763 ; SI: ; %bb.0: ; %entry
2764 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
2765 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2766 ; SI-NEXT: s_mov_b32 s3, 0xf000
2767 ; SI-NEXT: s_mov_b32 s2, -1
2768 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2769 ; SI-NEXT: v_mov_b32_e32 v0, s4
2770 ; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16
2773 ; VI-LABEL: atomic_umin_i32_offset:
2774 ; VI: ; %bb.0: ; %entry
2775 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
2776 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2777 ; VI-NEXT: s_mov_b32 s3, 0xf000
2778 ; VI-NEXT: s_mov_b32 s2, -1
2779 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2780 ; VI-NEXT: v_mov_b32_e32 v0, s4
2781 ; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 offset:16
2784 ; GFX9-LABEL: atomic_umin_i32_offset:
2785 ; GFX9: ; %bb.0: ; %entry
2786 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
2787 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2788 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2789 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2790 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2791 ; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:16
2792 ; GFX9-NEXT: s_endpgm
2794 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2795 %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2799 define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
2800 ; SI-LABEL: atomic_umin_i32_ret_offset:
2801 ; SI: ; %bb.0: ; %entry
2802 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2803 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
2804 ; SI-NEXT: s_mov_b32 s3, 0xf000
2805 ; SI-NEXT: s_mov_b32 s2, -1
2806 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2807 ; SI-NEXT: s_mov_b32 s0, s6
2808 ; SI-NEXT: s_mov_b32 s1, s7
2809 ; SI-NEXT: s_mov_b32 s6, s2
2810 ; SI-NEXT: s_mov_b32 s7, s3
2811 ; SI-NEXT: v_mov_b32_e32 v0, s8
2812 ; SI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc
2813 ; SI-NEXT: s_waitcnt vmcnt(0)
2814 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2817 ; VI-LABEL: atomic_umin_i32_ret_offset:
2818 ; VI: ; %bb.0: ; %entry
2819 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2820 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2821 ; VI-NEXT: s_mov_b32 s3, 0xf000
2822 ; VI-NEXT: s_mov_b32 s2, -1
2823 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2824 ; VI-NEXT: s_mov_b32 s0, s6
2825 ; VI-NEXT: s_mov_b32 s1, s7
2826 ; VI-NEXT: s_mov_b32 s6, s2
2827 ; VI-NEXT: s_mov_b32 s7, s3
2828 ; VI-NEXT: v_mov_b32_e32 v0, s8
2829 ; VI-NEXT: buffer_atomic_umin v0, off, s[4:7], 0 offset:16 glc
2830 ; VI-NEXT: s_waitcnt vmcnt(0)
2831 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2834 ; GFX9-LABEL: atomic_umin_i32_ret_offset:
2835 ; GFX9: ; %bb.0: ; %entry
2836 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
2837 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2838 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2839 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2840 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
2841 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] offset:16 glc
2842 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2843 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2844 ; GFX9-NEXT: s_endpgm
2846 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
2847 %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2848 store i32 %val, ptr addrspace(1) %out2
2852 define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
2853 ; SI-LABEL: atomic_umin_i32_addr64_offset:
2854 ; SI: ; %bb.0: ; %entry
2855 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
2856 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
2857 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2858 ; SI-NEXT: s_mov_b32 s3, 0xf000
2859 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2860 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
2861 ; SI-NEXT: s_mov_b32 s2, 0
2862 ; SI-NEXT: v_mov_b32_e32 v2, s6
2863 ; SI-NEXT: v_mov_b32_e32 v0, s4
2864 ; SI-NEXT: v_mov_b32_e32 v1, s5
2865 ; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64 offset:16
2868 ; VI-LABEL: atomic_umin_i32_addr64_offset:
2869 ; VI: ; %bb.0: ; %entry
2870 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2871 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2872 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
2873 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2874 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2875 ; VI-NEXT: s_add_u32 s0, s4, s0
2876 ; VI-NEXT: s_addc_u32 s1, s5, s1
2877 ; VI-NEXT: s_add_u32 s0, s0, 16
2878 ; VI-NEXT: s_addc_u32 s1, s1, 0
2879 ; VI-NEXT: v_mov_b32_e32 v0, s0
2880 ; VI-NEXT: v_mov_b32_e32 v1, s1
2881 ; VI-NEXT: v_mov_b32_e32 v2, s6
2882 ; VI-NEXT: flat_atomic_umin v[0:1], v2
2885 ; GFX9-LABEL: atomic_umin_i32_addr64_offset:
2886 ; GFX9: ; %bb.0: ; %entry
2887 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2888 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2889 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
2890 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2891 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2892 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2893 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2894 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2895 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
2896 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1] offset:16
2897 ; GFX9-NEXT: s_endpgm
2899 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2900 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
2901 %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2905 define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
2906 ; SI-LABEL: atomic_umin_i32_ret_addr64_offset:
2907 ; SI: ; %bb.0: ; %entry
2908 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2909 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
2910 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
2911 ; SI-NEXT: s_mov_b32 s3, 0xf000
2912 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2913 ; SI-NEXT: s_mov_b32 s0, s6
2914 ; SI-NEXT: s_mov_b32 s1, s7
2915 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
2916 ; SI-NEXT: s_mov_b32 s6, 0
2917 ; SI-NEXT: s_mov_b32 s7, s3
2918 ; SI-NEXT: v_mov_b32_e32 v2, s2
2919 ; SI-NEXT: v_mov_b32_e32 v0, s8
2920 ; SI-NEXT: v_mov_b32_e32 v1, s9
2921 ; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
2922 ; SI-NEXT: s_mov_b32 s2, -1
2923 ; SI-NEXT: s_waitcnt vmcnt(0)
2924 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
2927 ; VI-LABEL: atomic_umin_i32_ret_addr64_offset:
2928 ; VI: ; %bb.0: ; %entry
2929 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2930 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2931 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
2932 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2933 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2934 ; VI-NEXT: s_add_u32 s0, s4, s0
2935 ; VI-NEXT: s_addc_u32 s1, s5, s1
2936 ; VI-NEXT: s_add_u32 s0, s0, 16
2937 ; VI-NEXT: s_addc_u32 s1, s1, 0
2938 ; VI-NEXT: v_mov_b32_e32 v0, s0
2939 ; VI-NEXT: v_mov_b32_e32 v1, s1
2940 ; VI-NEXT: v_mov_b32_e32 v2, s8
2941 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
2942 ; VI-NEXT: s_mov_b32 s3, 0xf000
2943 ; VI-NEXT: s_mov_b32 s2, -1
2944 ; VI-NEXT: s_mov_b32 s0, s6
2945 ; VI-NEXT: s_mov_b32 s1, s7
2946 ; VI-NEXT: s_waitcnt vmcnt(0)
2947 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
2950 ; GFX9-LABEL: atomic_umin_i32_ret_addr64_offset:
2951 ; GFX9: ; %bb.0: ; %entry
2952 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2953 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2954 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
2955 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2956 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2957 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2958 ; GFX9-NEXT: s_add_u32 s0, s4, s0
2959 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
2960 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
2961 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[0:1] offset:16 glc
2962 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2963 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
2964 ; GFX9-NEXT: s_endpgm
2966 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
2967 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
2968 %val = atomicrmw volatile umin ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
2969 store i32 %val, ptr addrspace(1) %out2
2973 define amdgpu_kernel void @atomic_umin_i32(ptr addrspace(1) %out, i32 %in) {
2974 ; SI-LABEL: atomic_umin_i32:
2975 ; SI: ; %bb.0: ; %entry
2976 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
2977 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2978 ; SI-NEXT: s_mov_b32 s3, 0xf000
2979 ; SI-NEXT: s_mov_b32 s2, -1
2980 ; SI-NEXT: s_waitcnt lgkmcnt(0)
2981 ; SI-NEXT: v_mov_b32_e32 v0, s4
2982 ; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0
2985 ; VI-LABEL: atomic_umin_i32:
2986 ; VI: ; %bb.0: ; %entry
2987 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
2988 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2989 ; VI-NEXT: s_mov_b32 s3, 0xf000
2990 ; VI-NEXT: s_mov_b32 s2, -1
2991 ; VI-NEXT: s_waitcnt lgkmcnt(0)
2992 ; VI-NEXT: v_mov_b32_e32 v0, s4
2993 ; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0
2996 ; GFX9-LABEL: atomic_umin_i32:
2997 ; GFX9: ; %bb.0: ; %entry
2998 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
2999 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3001 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3002 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3003 ; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3]
3004 ; GFX9-NEXT: s_endpgm
3006 %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
3010 define amdgpu_kernel void @atomic_umin_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
3011 ; SI-LABEL: atomic_umin_i32_ret:
3012 ; SI: ; %bb.0: ; %entry
3013 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3014 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
3015 ; SI-NEXT: s_mov_b32 s3, 0xf000
3016 ; SI-NEXT: s_mov_b32 s2, -1
3017 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3018 ; SI-NEXT: s_mov_b32 s0, s4
3019 ; SI-NEXT: s_mov_b32 s1, s5
3020 ; SI-NEXT: v_mov_b32_e32 v0, s8
3021 ; SI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 glc
3022 ; SI-NEXT: s_mov_b32 s0, s6
3023 ; SI-NEXT: s_mov_b32 s1, s7
3024 ; SI-NEXT: s_waitcnt vmcnt(0)
3025 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3028 ; VI-LABEL: atomic_umin_i32_ret:
3029 ; VI: ; %bb.0: ; %entry
3030 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3031 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3032 ; VI-NEXT: s_mov_b32 s3, 0xf000
3033 ; VI-NEXT: s_mov_b32 s2, -1
3034 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3035 ; VI-NEXT: s_mov_b32 s0, s4
3036 ; VI-NEXT: s_mov_b32 s1, s5
3037 ; VI-NEXT: v_mov_b32_e32 v0, s8
3038 ; VI-NEXT: buffer_atomic_umin v0, off, s[0:3], 0 glc
3039 ; VI-NEXT: s_mov_b32 s0, s6
3040 ; VI-NEXT: s_mov_b32 s1, s7
3041 ; VI-NEXT: s_waitcnt vmcnt(0)
3042 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3045 ; GFX9-LABEL: atomic_umin_i32_ret:
3046 ; GFX9: ; %bb.0: ; %entry
3047 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
3048 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3049 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3050 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3051 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3052 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[4:5] glc
3053 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3054 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3055 ; GFX9-NEXT: s_endpgm
3057 %val = atomicrmw volatile umin ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
3058 store i32 %val, ptr addrspace(1) %out2
3062 define amdgpu_kernel void @atomic_umin_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
3063 ; SI-LABEL: atomic_umin_i32_addr64:
3064 ; SI: ; %bb.0: ; %entry
3065 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
3066 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
3067 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3068 ; SI-NEXT: s_mov_b32 s3, 0xf000
3069 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3070 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3071 ; SI-NEXT: s_mov_b32 s2, 0
3072 ; SI-NEXT: v_mov_b32_e32 v2, s6
3073 ; SI-NEXT: v_mov_b32_e32 v0, s4
3074 ; SI-NEXT: v_mov_b32_e32 v1, s5
3075 ; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[0:3], 0 addr64
3078 ; VI-LABEL: atomic_umin_i32_addr64:
3079 ; VI: ; %bb.0: ; %entry
3080 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3081 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3082 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
3083 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3084 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3085 ; VI-NEXT: s_add_u32 s0, s4, s0
3086 ; VI-NEXT: s_addc_u32 s1, s5, s1
3087 ; VI-NEXT: v_mov_b32_e32 v0, s0
3088 ; VI-NEXT: v_mov_b32_e32 v1, s1
3089 ; VI-NEXT: v_mov_b32_e32 v2, s6
3090 ; VI-NEXT: flat_atomic_umin v[0:1], v2
3093 ; GFX9-LABEL: atomic_umin_i32_addr64:
3094 ; GFX9: ; %bb.0: ; %entry
3095 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3096 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3097 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
3098 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3099 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3100 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3101 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3102 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3103 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3104 ; GFX9-NEXT: global_atomic_umin v0, v1, s[0:1]
3105 ; GFX9-NEXT: s_endpgm
3107 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3108 %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
3112 define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
3113 ; SI-LABEL: atomic_umin_i32_ret_addr64:
3114 ; SI: ; %bb.0: ; %entry
3115 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3116 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
3117 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
3118 ; SI-NEXT: s_mov_b32 s3, 0xf000
3119 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3120 ; SI-NEXT: s_mov_b32 s0, s6
3121 ; SI-NEXT: s_mov_b32 s1, s7
3122 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
3123 ; SI-NEXT: s_mov_b32 s6, 0
3124 ; SI-NEXT: s_mov_b32 s7, s3
3125 ; SI-NEXT: v_mov_b32_e32 v2, s2
3126 ; SI-NEXT: v_mov_b32_e32 v0, s8
3127 ; SI-NEXT: v_mov_b32_e32 v1, s9
3128 ; SI-NEXT: buffer_atomic_umin v2, v[0:1], s[4:7], 0 addr64 glc
3129 ; SI-NEXT: s_mov_b32 s2, -1
3130 ; SI-NEXT: s_waitcnt vmcnt(0)
3131 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
3134 ; VI-LABEL: atomic_umin_i32_ret_addr64:
3135 ; VI: ; %bb.0: ; %entry
3136 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3137 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3138 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3139 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3140 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3141 ; VI-NEXT: s_add_u32 s0, s4, s0
3142 ; VI-NEXT: s_addc_u32 s1, s5, s1
3143 ; VI-NEXT: v_mov_b32_e32 v0, s0
3144 ; VI-NEXT: v_mov_b32_e32 v1, s1
3145 ; VI-NEXT: v_mov_b32_e32 v2, s8
3146 ; VI-NEXT: flat_atomic_umin v0, v[0:1], v2 glc
3147 ; VI-NEXT: s_mov_b32 s3, 0xf000
3148 ; VI-NEXT: s_mov_b32 s2, -1
3149 ; VI-NEXT: s_mov_b32 s0, s6
3150 ; VI-NEXT: s_mov_b32 s1, s7
3151 ; VI-NEXT: s_waitcnt vmcnt(0)
3152 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3155 ; GFX9-LABEL: atomic_umin_i32_ret_addr64:
3156 ; GFX9: ; %bb.0: ; %entry
3157 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3158 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3159 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
3160 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3161 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3162 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3163 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3164 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3165 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
3166 ; GFX9-NEXT: global_atomic_umin v1, v0, v1, s[0:1] glc
3167 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3168 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3169 ; GFX9-NEXT: s_endpgm
3171 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3172 %val = atomicrmw volatile umin ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
3173 store i32 %val, ptr addrspace(1) %out2
3177 define amdgpu_kernel void @atomic_or_i32_offset(ptr addrspace(1) %out, i32 %in) {
3178 ; SI-LABEL: atomic_or_i32_offset:
3179 ; SI: ; %bb.0: ; %entry
3180 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
3181 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3182 ; SI-NEXT: s_mov_b32 s3, 0xf000
3183 ; SI-NEXT: s_mov_b32 s2, -1
3184 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3185 ; SI-NEXT: v_mov_b32_e32 v0, s4
3186 ; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16
3187 ; SI-NEXT: s_waitcnt vmcnt(0)
3188 ; SI-NEXT: buffer_wbinvl1
3191 ; VI-LABEL: atomic_or_i32_offset:
3192 ; VI: ; %bb.0: ; %entry
3193 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
3194 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3195 ; VI-NEXT: s_mov_b32 s3, 0xf000
3196 ; VI-NEXT: s_mov_b32 s2, -1
3197 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3198 ; VI-NEXT: v_mov_b32_e32 v0, s4
3199 ; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 offset:16
3200 ; VI-NEXT: s_waitcnt vmcnt(0)
3201 ; VI-NEXT: buffer_wbinvl1_vol
3204 ; GFX9-LABEL: atomic_or_i32_offset:
3205 ; GFX9: ; %bb.0: ; %entry
3206 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
3207 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3208 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3209 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3210 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3211 ; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:16
3212 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3213 ; GFX9-NEXT: buffer_wbinvl1_vol
3214 ; GFX9-NEXT: s_endpgm
3216 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3217 %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
3221 define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
3222 ; SI-LABEL: atomic_or_i32_ret_offset:
3223 ; SI: ; %bb.0: ; %entry
3224 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3225 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
3226 ; SI-NEXT: s_mov_b32 s3, 0xf000
3227 ; SI-NEXT: s_mov_b32 s2, -1
3228 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3229 ; SI-NEXT: s_mov_b32 s0, s6
3230 ; SI-NEXT: s_mov_b32 s1, s7
3231 ; SI-NEXT: s_mov_b32 s6, s2
3232 ; SI-NEXT: s_mov_b32 s7, s3
3233 ; SI-NEXT: v_mov_b32_e32 v0, s8
3234 ; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
3235 ; SI-NEXT: s_waitcnt vmcnt(0)
3236 ; SI-NEXT: buffer_wbinvl1
3237 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3240 ; VI-LABEL: atomic_or_i32_ret_offset:
3241 ; VI: ; %bb.0: ; %entry
3242 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3243 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3244 ; VI-NEXT: s_mov_b32 s3, 0xf000
3245 ; VI-NEXT: s_mov_b32 s2, -1
3246 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3247 ; VI-NEXT: s_mov_b32 s0, s6
3248 ; VI-NEXT: s_mov_b32 s1, s7
3249 ; VI-NEXT: s_mov_b32 s6, s2
3250 ; VI-NEXT: s_mov_b32 s7, s3
3251 ; VI-NEXT: v_mov_b32_e32 v0, s8
3252 ; VI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16 glc
3253 ; VI-NEXT: s_waitcnt vmcnt(0)
3254 ; VI-NEXT: buffer_wbinvl1_vol
3255 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3258 ; GFX9-LABEL: atomic_or_i32_ret_offset:
3259 ; GFX9: ; %bb.0: ; %entry
3260 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
3261 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3262 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3263 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3264 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3265 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] offset:16 glc
3266 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3267 ; GFX9-NEXT: buffer_wbinvl1_vol
3268 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3269 ; GFX9-NEXT: s_endpgm
3271 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3272 %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
3273 store i32 %val, ptr addrspace(1) %out2
3277 define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
3278 ; SI-LABEL: atomic_or_i32_addr64_offset:
3279 ; SI: ; %bb.0: ; %entry
3280 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
3281 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
3282 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3283 ; SI-NEXT: s_mov_b32 s3, 0xf000
3284 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3285 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3286 ; SI-NEXT: s_mov_b32 s2, 0
3287 ; SI-NEXT: v_mov_b32_e32 v2, s6
3288 ; SI-NEXT: v_mov_b32_e32 v0, s4
3289 ; SI-NEXT: v_mov_b32_e32 v1, s5
3290 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64 offset:16
3291 ; SI-NEXT: s_waitcnt vmcnt(0)
3292 ; SI-NEXT: buffer_wbinvl1
3295 ; VI-LABEL: atomic_or_i32_addr64_offset:
3296 ; VI: ; %bb.0: ; %entry
3297 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3298 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3299 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
3300 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3301 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3302 ; VI-NEXT: s_add_u32 s0, s4, s0
3303 ; VI-NEXT: s_addc_u32 s1, s5, s1
3304 ; VI-NEXT: s_add_u32 s0, s0, 16
3305 ; VI-NEXT: s_addc_u32 s1, s1, 0
3306 ; VI-NEXT: v_mov_b32_e32 v0, s0
3307 ; VI-NEXT: v_mov_b32_e32 v1, s1
3308 ; VI-NEXT: v_mov_b32_e32 v2, s6
3309 ; VI-NEXT: flat_atomic_or v[0:1], v2
3310 ; VI-NEXT: s_waitcnt vmcnt(0)
3311 ; VI-NEXT: buffer_wbinvl1_vol
3314 ; GFX9-LABEL: atomic_or_i32_addr64_offset:
3315 ; GFX9: ; %bb.0: ; %entry
3316 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3317 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3318 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
3319 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3320 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3321 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3322 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3323 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3324 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3325 ; GFX9-NEXT: global_atomic_or v0, v1, s[0:1] offset:16
3326 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3327 ; GFX9-NEXT: buffer_wbinvl1_vol
3328 ; GFX9-NEXT: s_endpgm
3330 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3331 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
3332 %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
3336 define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
3337 ; SI-LABEL: atomic_or_i32_ret_addr64_offset:
3338 ; SI: ; %bb.0: ; %entry
3339 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3340 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
3341 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
3342 ; SI-NEXT: s_mov_b32 s3, 0xf000
3343 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3344 ; SI-NEXT: s_mov_b32 s0, s6
3345 ; SI-NEXT: s_mov_b32 s1, s7
3346 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
3347 ; SI-NEXT: s_mov_b32 s6, 0
3348 ; SI-NEXT: s_mov_b32 s7, s3
3349 ; SI-NEXT: v_mov_b32_e32 v2, s2
3350 ; SI-NEXT: v_mov_b32_e32 v0, s8
3351 ; SI-NEXT: v_mov_b32_e32 v1, s9
3352 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3353 ; SI-NEXT: s_waitcnt vmcnt(0)
3354 ; SI-NEXT: buffer_wbinvl1
3355 ; SI-NEXT: s_mov_b32 s2, -1
3356 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
3359 ; VI-LABEL: atomic_or_i32_ret_addr64_offset:
3360 ; VI: ; %bb.0: ; %entry
3361 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3362 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3363 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3364 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3365 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3366 ; VI-NEXT: s_add_u32 s0, s4, s0
3367 ; VI-NEXT: s_addc_u32 s1, s5, s1
3368 ; VI-NEXT: s_add_u32 s0, s0, 16
3369 ; VI-NEXT: s_addc_u32 s1, s1, 0
3370 ; VI-NEXT: v_mov_b32_e32 v0, s0
3371 ; VI-NEXT: v_mov_b32_e32 v1, s1
3372 ; VI-NEXT: v_mov_b32_e32 v2, s8
3373 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3374 ; VI-NEXT: s_waitcnt vmcnt(0)
3375 ; VI-NEXT: buffer_wbinvl1_vol
3376 ; VI-NEXT: s_mov_b32 s3, 0xf000
3377 ; VI-NEXT: s_mov_b32 s2, -1
3378 ; VI-NEXT: s_mov_b32 s0, s6
3379 ; VI-NEXT: s_mov_b32 s1, s7
3380 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3383 ; GFX9-LABEL: atomic_or_i32_ret_addr64_offset:
3384 ; GFX9: ; %bb.0: ; %entry
3385 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3386 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3387 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
3388 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3389 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3390 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3391 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3392 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3393 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
3394 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[0:1] offset:16 glc
3395 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3396 ; GFX9-NEXT: buffer_wbinvl1_vol
3397 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3398 ; GFX9-NEXT: s_endpgm
3400 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3401 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
3402 %val = atomicrmw volatile or ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
3403 store i32 %val, ptr addrspace(1) %out2
; No-return global i32 atomicrmw 'or' on the base pointer (no offset), agent scope seq_cst:
; SI/VI select buffer_atomic_or; GFX9 selects global_atomic_or; all invalidate L1 after the op.
3407 define amdgpu_kernel void @atomic_or_i32(ptr addrspace(1) %out, i32 %in) {
3408 ; SI-LABEL: atomic_or_i32:
3409 ; SI: ; %bb.0: ; %entry
3410 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
3411 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3412 ; SI-NEXT: s_mov_b32 s3, 0xf000
3413 ; SI-NEXT: s_mov_b32 s2, -1
3414 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3415 ; SI-NEXT: v_mov_b32_e32 v0, s4
3416 ; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0
3417 ; SI-NEXT: s_waitcnt vmcnt(0)
3418 ; SI-NEXT: buffer_wbinvl1
3421 ; VI-LABEL: atomic_or_i32:
3422 ; VI: ; %bb.0: ; %entry
3423 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
3424 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3425 ; VI-NEXT: s_mov_b32 s3, 0xf000
3426 ; VI-NEXT: s_mov_b32 s2, -1
3427 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3428 ; VI-NEXT: v_mov_b32_e32 v0, s4
3429 ; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0
3430 ; VI-NEXT: s_waitcnt vmcnt(0)
3431 ; VI-NEXT: buffer_wbinvl1_vol
3434 ; GFX9-LABEL: atomic_or_i32:
3435 ; GFX9: ; %bb.0: ; %entry
3436 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
3437 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3438 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3439 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3440 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3441 ; GFX9-NEXT: global_atomic_or v0, v1, s[2:3]
3442 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3443 ; GFX9-NEXT: buffer_wbinvl1_vol
3444 ; GFX9-NEXT: s_endpgm
3446 %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
; Returning variant: the atomic 'or' result is used (glc bit on the atomic) and then
; stored to %out2 via buffer_store_dword (SI/VI) or global_store_dword (GFX9).
3450 define amdgpu_kernel void @atomic_or_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
3451 ; SI-LABEL: atomic_or_i32_ret:
3452 ; SI: ; %bb.0: ; %entry
3453 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3454 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
3455 ; SI-NEXT: s_mov_b32 s3, 0xf000
3456 ; SI-NEXT: s_mov_b32 s2, -1
3457 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3458 ; SI-NEXT: s_mov_b32 s0, s4
3459 ; SI-NEXT: s_mov_b32 s1, s5
3460 ; SI-NEXT: v_mov_b32_e32 v0, s8
3461 ; SI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 glc
3462 ; SI-NEXT: s_waitcnt vmcnt(0)
3463 ; SI-NEXT: buffer_wbinvl1
3464 ; SI-NEXT: s_mov_b32 s0, s6
3465 ; SI-NEXT: s_mov_b32 s1, s7
3466 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3469 ; VI-LABEL: atomic_or_i32_ret:
3470 ; VI: ; %bb.0: ; %entry
3471 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3472 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3473 ; VI-NEXT: s_mov_b32 s3, 0xf000
3474 ; VI-NEXT: s_mov_b32 s2, -1
3475 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3476 ; VI-NEXT: s_mov_b32 s0, s4
3477 ; VI-NEXT: s_mov_b32 s1, s5
3478 ; VI-NEXT: v_mov_b32_e32 v0, s8
3479 ; VI-NEXT: buffer_atomic_or v0, off, s[0:3], 0 glc
3480 ; VI-NEXT: s_waitcnt vmcnt(0)
3481 ; VI-NEXT: buffer_wbinvl1_vol
3482 ; VI-NEXT: s_mov_b32 s0, s6
3483 ; VI-NEXT: s_mov_b32 s1, s7
3484 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3487 ; GFX9-LABEL: atomic_or_i32_ret:
3488 ; GFX9: ; %bb.0: ; %entry
3489 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
3490 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3491 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3492 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3493 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3494 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[4:5] glc
3495 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3496 ; GFX9-NEXT: buffer_wbinvl1_vol
3497 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3498 ; GFX9-NEXT: s_endpgm
3500 %val = atomicrmw volatile or ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
3501 store i32 %val, ptr addrspace(1) %out2
; Indexed (addr64) variant: GEP by a 64-bit %index becomes s_lshl_b64 by 2 and, on VI/GFX9,
; a scalar 64-bit add into the pointer; SI instead uses buffer addr64 addressing.
3505 define amdgpu_kernel void @atomic_or_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
3506 ; SI-LABEL: atomic_or_i32_addr64:
3507 ; SI: ; %bb.0: ; %entry
3508 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
3509 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
3510 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3511 ; SI-NEXT: s_mov_b32 s3, 0xf000
3512 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3513 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3514 ; SI-NEXT: s_mov_b32 s2, 0
3515 ; SI-NEXT: v_mov_b32_e32 v2, s6
3516 ; SI-NEXT: v_mov_b32_e32 v0, s4
3517 ; SI-NEXT: v_mov_b32_e32 v1, s5
3518 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[0:3], 0 addr64
3519 ; SI-NEXT: s_waitcnt vmcnt(0)
3520 ; SI-NEXT: buffer_wbinvl1
3523 ; VI-LABEL: atomic_or_i32_addr64:
3524 ; VI: ; %bb.0: ; %entry
3525 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3526 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3527 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
3528 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3529 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3530 ; VI-NEXT: s_add_u32 s0, s4, s0
3531 ; VI-NEXT: s_addc_u32 s1, s5, s1
3532 ; VI-NEXT: v_mov_b32_e32 v0, s0
3533 ; VI-NEXT: v_mov_b32_e32 v1, s1
3534 ; VI-NEXT: v_mov_b32_e32 v2, s6
3535 ; VI-NEXT: flat_atomic_or v[0:1], v2
3536 ; VI-NEXT: s_waitcnt vmcnt(0)
3537 ; VI-NEXT: buffer_wbinvl1_vol
3540 ; GFX9-LABEL: atomic_or_i32_addr64:
3541 ; GFX9: ; %bb.0: ; %entry
3542 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3543 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3544 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
3545 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3546 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3547 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3548 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3549 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3550 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3551 ; GFX9-NEXT: global_atomic_or v0, v1, s[0:1]
3552 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3553 ; GFX9-NEXT: buffer_wbinvl1_vol
3554 ; GFX9-NEXT: s_endpgm
3556 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3557 %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
; Indexed + returning variant: 'or' at %out[%index] with glc, result stored to %out2.
3561 define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
3562 ; SI-LABEL: atomic_or_i32_ret_addr64:
3563 ; SI: ; %bb.0: ; %entry
3564 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3565 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
3566 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
3567 ; SI-NEXT: s_mov_b32 s3, 0xf000
3568 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3569 ; SI-NEXT: s_mov_b32 s0, s6
3570 ; SI-NEXT: s_mov_b32 s1, s7
3571 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
3572 ; SI-NEXT: s_mov_b32 s6, 0
3573 ; SI-NEXT: s_mov_b32 s7, s3
3574 ; SI-NEXT: v_mov_b32_e32 v2, s2
3575 ; SI-NEXT: v_mov_b32_e32 v0, s8
3576 ; SI-NEXT: v_mov_b32_e32 v1, s9
3577 ; SI-NEXT: buffer_atomic_or v2, v[0:1], s[4:7], 0 addr64 glc
3578 ; SI-NEXT: s_waitcnt vmcnt(0)
3579 ; SI-NEXT: buffer_wbinvl1
3580 ; SI-NEXT: s_mov_b32 s2, -1
3581 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
3584 ; VI-LABEL: atomic_or_i32_ret_addr64:
3585 ; VI: ; %bb.0: ; %entry
3586 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3587 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3588 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3589 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3590 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3591 ; VI-NEXT: s_add_u32 s0, s4, s0
3592 ; VI-NEXT: s_addc_u32 s1, s5, s1
3593 ; VI-NEXT: v_mov_b32_e32 v0, s0
3594 ; VI-NEXT: v_mov_b32_e32 v1, s1
3595 ; VI-NEXT: v_mov_b32_e32 v2, s8
3596 ; VI-NEXT: flat_atomic_or v0, v[0:1], v2 glc
3597 ; VI-NEXT: s_waitcnt vmcnt(0)
3598 ; VI-NEXT: buffer_wbinvl1_vol
3599 ; VI-NEXT: s_mov_b32 s3, 0xf000
3600 ; VI-NEXT: s_mov_b32 s2, -1
3601 ; VI-NEXT: s_mov_b32 s0, s6
3602 ; VI-NEXT: s_mov_b32 s1, s7
3603 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3606 ; GFX9-LABEL: atomic_or_i32_ret_addr64:
3607 ; GFX9: ; %bb.0: ; %entry
3608 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3609 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3610 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
3611 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3612 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3613 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3614 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3615 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3616 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
3617 ; GFX9-NEXT: global_atomic_or v1, v0, v1, s[0:1] glc
3618 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3619 ; GFX9-NEXT: buffer_wbinvl1_vol
3620 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3621 ; GFX9-NEXT: s_endpgm
3623 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3624 %val = atomicrmw volatile or ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
3625 store i32 %val, ptr addrspace(1) %out2
; i32 xchg at a constant GEP of +4 elements: folded into the immediate 'offset:16' on all targets.
3629 define amdgpu_kernel void @atomic_xchg_i32_offset(ptr addrspace(1) %out, i32 %in) {
3630 ; SI-LABEL: atomic_xchg_i32_offset:
3631 ; SI: ; %bb.0: ; %entry
3632 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
3633 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3634 ; SI-NEXT: s_mov_b32 s3, 0xf000
3635 ; SI-NEXT: s_mov_b32 s2, -1
3636 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3637 ; SI-NEXT: v_mov_b32_e32 v0, s4
3638 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
3639 ; SI-NEXT: s_waitcnt vmcnt(0)
3640 ; SI-NEXT: buffer_wbinvl1
3643 ; VI-LABEL: atomic_xchg_i32_offset:
3644 ; VI: ; %bb.0: ; %entry
3645 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
3646 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3647 ; VI-NEXT: s_mov_b32 s3, 0xf000
3648 ; VI-NEXT: s_mov_b32 s2, -1
3649 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3650 ; VI-NEXT: v_mov_b32_e32 v0, s4
3651 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
3652 ; VI-NEXT: s_waitcnt vmcnt(0)
3653 ; VI-NEXT: buffer_wbinvl1_vol
3656 ; GFX9-LABEL: atomic_xchg_i32_offset:
3657 ; GFX9: ; %bb.0: ; %entry
3658 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
3659 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3660 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3661 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3662 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3663 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16
3664 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3665 ; GFX9-NEXT: buffer_wbinvl1_vol
3666 ; GFX9-NEXT: s_endpgm
3668 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3669 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; float xchg with constant offset: lowers to the same integer swap instructions as the i32 case.
3673 define amdgpu_kernel void @atomic_xchg_f32_offset(ptr addrspace(1) %out, float %in) {
3674 ; SI-LABEL: atomic_xchg_f32_offset:
3675 ; SI: ; %bb.0: ; %entry
3676 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
3677 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3678 ; SI-NEXT: s_mov_b32 s3, 0xf000
3679 ; SI-NEXT: s_mov_b32 s2, -1
3680 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3681 ; SI-NEXT: v_mov_b32_e32 v0, s4
3682 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
3683 ; SI-NEXT: s_waitcnt vmcnt(0)
3684 ; SI-NEXT: buffer_wbinvl1
3687 ; VI-LABEL: atomic_xchg_f32_offset:
3688 ; VI: ; %bb.0: ; %entry
3689 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
3690 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3691 ; VI-NEXT: s_mov_b32 s3, 0xf000
3692 ; VI-NEXT: s_mov_b32 s2, -1
3693 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3694 ; VI-NEXT: v_mov_b32_e32 v0, s4
3695 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 offset:16
3696 ; VI-NEXT: s_waitcnt vmcnt(0)
3697 ; VI-NEXT: buffer_wbinvl1_vol
3700 ; GFX9-LABEL: atomic_xchg_f32_offset:
3701 ; GFX9: ; %bb.0: ; %entry
3702 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
3703 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3704 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3705 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3706 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3707 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:16
3708 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3709 ; GFX9-NEXT: buffer_wbinvl1_vol
3710 ; GFX9-NEXT: s_endpgm
3712 %gep = getelementptr float, ptr addrspace(1) %out, i64 4
3713 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, float %in syncscope("agent") seq_cst
; Returning xchg with constant offset: swap with glc, then store the old value to %out2.
3717 define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
3718 ; SI-LABEL: atomic_xchg_i32_ret_offset:
3719 ; SI: ; %bb.0: ; %entry
3720 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3721 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
3722 ; SI-NEXT: s_mov_b32 s3, 0xf000
3723 ; SI-NEXT: s_mov_b32 s2, -1
3724 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3725 ; SI-NEXT: s_mov_b32 s0, s6
3726 ; SI-NEXT: s_mov_b32 s1, s7
3727 ; SI-NEXT: s_mov_b32 s6, s2
3728 ; SI-NEXT: s_mov_b32 s7, s3
3729 ; SI-NEXT: v_mov_b32_e32 v0, s8
3730 ; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
3731 ; SI-NEXT: s_waitcnt vmcnt(0)
3732 ; SI-NEXT: buffer_wbinvl1
3733 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3736 ; VI-LABEL: atomic_xchg_i32_ret_offset:
3737 ; VI: ; %bb.0: ; %entry
3738 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3739 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3740 ; VI-NEXT: s_mov_b32 s3, 0xf000
3741 ; VI-NEXT: s_mov_b32 s2, -1
3742 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3743 ; VI-NEXT: s_mov_b32 s0, s6
3744 ; VI-NEXT: s_mov_b32 s1, s7
3745 ; VI-NEXT: s_mov_b32 s6, s2
3746 ; VI-NEXT: s_mov_b32 s7, s3
3747 ; VI-NEXT: v_mov_b32_e32 v0, s8
3748 ; VI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 glc
3749 ; VI-NEXT: s_waitcnt vmcnt(0)
3750 ; VI-NEXT: buffer_wbinvl1_vol
3751 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3754 ; GFX9-LABEL: atomic_xchg_i32_ret_offset:
3755 ; GFX9: ; %bb.0: ; %entry
3756 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
3757 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3758 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3759 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3760 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3761 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] offset:16 glc
3762 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3763 ; GFX9-NEXT: buffer_wbinvl1_vol
3764 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3765 ; GFX9-NEXT: s_endpgm
3767 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
3768 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
3769 store i32 %val, ptr addrspace(1) %out2
; Indexed xchg plus constant offset. Note VI must add the +16 into the scalar pointer
; (flat atomics on VI have no immediate offset), while SI/GFX9 fold it as 'offset:16'.
3773 define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
3774 ; SI-LABEL: atomic_xchg_i32_addr64_offset:
3775 ; SI: ; %bb.0: ; %entry
3776 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
3777 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
3778 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3779 ; SI-NEXT: s_mov_b32 s3, 0xf000
3780 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3781 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
3782 ; SI-NEXT: s_mov_b32 s2, 0
3783 ; SI-NEXT: v_mov_b32_e32 v2, s6
3784 ; SI-NEXT: v_mov_b32_e32 v0, s4
3785 ; SI-NEXT: v_mov_b32_e32 v1, s5
3786 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64 offset:16
3787 ; SI-NEXT: s_waitcnt vmcnt(0)
3788 ; SI-NEXT: buffer_wbinvl1
3791 ; VI-LABEL: atomic_xchg_i32_addr64_offset:
3792 ; VI: ; %bb.0: ; %entry
3793 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3794 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3795 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
3796 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3797 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3798 ; VI-NEXT: s_add_u32 s0, s4, s0
3799 ; VI-NEXT: s_addc_u32 s1, s5, s1
3800 ; VI-NEXT: s_add_u32 s0, s0, 16
3801 ; VI-NEXT: s_addc_u32 s1, s1, 0
3802 ; VI-NEXT: v_mov_b32_e32 v0, s0
3803 ; VI-NEXT: v_mov_b32_e32 v1, s1
3804 ; VI-NEXT: v_mov_b32_e32 v2, s6
3805 ; VI-NEXT: flat_atomic_swap v[0:1], v2
3806 ; VI-NEXT: s_waitcnt vmcnt(0)
3807 ; VI-NEXT: buffer_wbinvl1_vol
3810 ; GFX9-LABEL: atomic_xchg_i32_addr64_offset:
3811 ; GFX9: ; %bb.0: ; %entry
3812 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3813 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3814 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
3815 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3816 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3817 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3818 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3819 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3820 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
3821 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:16
3822 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3823 ; GFX9-NEXT: buffer_wbinvl1_vol
3824 ; GFX9-NEXT: s_endpgm
3826 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3827 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
3828 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Indexed + offset + returning xchg: result is stored to %out2 after the cache invalidate.
3832 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
3833 ; SI-LABEL: atomic_xchg_i32_ret_addr64_offset:
3834 ; SI: ; %bb.0: ; %entry
3835 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3836 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
3837 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
3838 ; SI-NEXT: s_mov_b32 s3, 0xf000
3839 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3840 ; SI-NEXT: s_mov_b32 s0, s6
3841 ; SI-NEXT: s_mov_b32 s1, s7
3842 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
3843 ; SI-NEXT: s_mov_b32 s6, 0
3844 ; SI-NEXT: s_mov_b32 s7, s3
3845 ; SI-NEXT: v_mov_b32_e32 v2, s2
3846 ; SI-NEXT: v_mov_b32_e32 v0, s8
3847 ; SI-NEXT: v_mov_b32_e32 v1, s9
3848 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
3849 ; SI-NEXT: s_waitcnt vmcnt(0)
3850 ; SI-NEXT: buffer_wbinvl1
3851 ; SI-NEXT: s_mov_b32 s2, -1
3852 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
3855 ; VI-LABEL: atomic_xchg_i32_ret_addr64_offset:
3856 ; VI: ; %bb.0: ; %entry
3857 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3858 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3859 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3860 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3861 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3862 ; VI-NEXT: s_add_u32 s0, s4, s0
3863 ; VI-NEXT: s_addc_u32 s1, s5, s1
3864 ; VI-NEXT: s_add_u32 s0, s0, 16
3865 ; VI-NEXT: s_addc_u32 s1, s1, 0
3866 ; VI-NEXT: v_mov_b32_e32 v0, s0
3867 ; VI-NEXT: v_mov_b32_e32 v1, s1
3868 ; VI-NEXT: v_mov_b32_e32 v2, s8
3869 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
3870 ; VI-NEXT: s_waitcnt vmcnt(0)
3871 ; VI-NEXT: buffer_wbinvl1_vol
3872 ; VI-NEXT: s_mov_b32 s3, 0xf000
3873 ; VI-NEXT: s_mov_b32 s2, -1
3874 ; VI-NEXT: s_mov_b32 s0, s6
3875 ; VI-NEXT: s_mov_b32 s1, s7
3876 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3879 ; GFX9-LABEL: atomic_xchg_i32_ret_addr64_offset:
3880 ; GFX9: ; %bb.0: ; %entry
3881 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3882 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3883 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
3884 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3885 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3886 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3887 ; GFX9-NEXT: s_add_u32 s0, s4, s0
3888 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
3889 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
3890 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[0:1] offset:16 glc
3891 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3892 ; GFX9-NEXT: buffer_wbinvl1_vol
3893 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3894 ; GFX9-NEXT: s_endpgm
3896 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
3897 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
3898 %val = atomicrmw volatile xchg ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
3899 store i32 %val, ptr addrspace(1) %out2
; Base-pointer i32 xchg, no return value: buffer_atomic_swap (SI/VI), global_atomic_swap (GFX9).
3903 define amdgpu_kernel void @atomic_xchg_i32(ptr addrspace(1) %out, i32 %in) {
3904 ; SI-LABEL: atomic_xchg_i32:
3905 ; SI: ; %bb.0: ; %entry
3906 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
3907 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
3908 ; SI-NEXT: s_mov_b32 s3, 0xf000
3909 ; SI-NEXT: s_mov_b32 s2, -1
3910 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3911 ; SI-NEXT: v_mov_b32_e32 v0, s4
3912 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
3913 ; SI-NEXT: s_waitcnt vmcnt(0)
3914 ; SI-NEXT: buffer_wbinvl1
3917 ; VI-LABEL: atomic_xchg_i32:
3918 ; VI: ; %bb.0: ; %entry
3919 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
3920 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
3921 ; VI-NEXT: s_mov_b32 s3, 0xf000
3922 ; VI-NEXT: s_mov_b32 s2, -1
3923 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3924 ; VI-NEXT: v_mov_b32_e32 v0, s4
3925 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0
3926 ; VI-NEXT: s_waitcnt vmcnt(0)
3927 ; VI-NEXT: buffer_wbinvl1_vol
3930 ; GFX9-LABEL: atomic_xchg_i32:
3931 ; GFX9: ; %bb.0: ; %entry
3932 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
3933 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3934 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3935 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3936 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
3937 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3]
3938 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3939 ; GFX9-NEXT: buffer_wbinvl1_vol
3940 ; GFX9-NEXT: s_endpgm
3942 %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
; Returning base-pointer xchg: glc atomic swap, old value stored to %out2.
3946 define amdgpu_kernel void @atomic_xchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
3947 ; SI-LABEL: atomic_xchg_i32_ret:
3948 ; SI: ; %bb.0: ; %entry
3949 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3950 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
3951 ; SI-NEXT: s_mov_b32 s3, 0xf000
3952 ; SI-NEXT: s_mov_b32 s2, -1
3953 ; SI-NEXT: s_waitcnt lgkmcnt(0)
3954 ; SI-NEXT: s_mov_b32 s0, s4
3955 ; SI-NEXT: s_mov_b32 s1, s5
3956 ; SI-NEXT: v_mov_b32_e32 v0, s8
3957 ; SI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
3958 ; SI-NEXT: s_waitcnt vmcnt(0)
3959 ; SI-NEXT: buffer_wbinvl1
3960 ; SI-NEXT: s_mov_b32 s0, s6
3961 ; SI-NEXT: s_mov_b32 s1, s7
3962 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3965 ; VI-LABEL: atomic_xchg_i32_ret:
3966 ; VI: ; %bb.0: ; %entry
3967 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3968 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
3969 ; VI-NEXT: s_mov_b32 s3, 0xf000
3970 ; VI-NEXT: s_mov_b32 s2, -1
3971 ; VI-NEXT: s_waitcnt lgkmcnt(0)
3972 ; VI-NEXT: s_mov_b32 s0, s4
3973 ; VI-NEXT: s_mov_b32 s1, s5
3974 ; VI-NEXT: v_mov_b32_e32 v0, s8
3975 ; VI-NEXT: buffer_atomic_swap v0, off, s[0:3], 0 glc
3976 ; VI-NEXT: s_waitcnt vmcnt(0)
3977 ; VI-NEXT: buffer_wbinvl1_vol
3978 ; VI-NEXT: s_mov_b32 s0, s6
3979 ; VI-NEXT: s_mov_b32 s1, s7
3980 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
3983 ; GFX9-LABEL: atomic_xchg_i32_ret:
3984 ; GFX9: ; %bb.0: ; %entry
3985 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
3986 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3987 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3988 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3989 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
3990 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[4:5] glc
3991 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3992 ; GFX9-NEXT: buffer_wbinvl1_vol
3993 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
3994 ; GFX9-NEXT: s_endpgm
3996 %val = atomicrmw volatile xchg ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
3997 store i32 %val, ptr addrspace(1) %out2
; Indexed xchg, no return: SI uses buffer addr64; VI uses flat_atomic_swap with a scalar
; pointer add; GFX9 uses global_atomic_swap with the computed saddr base.
4001 define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
4002 ; SI-LABEL: atomic_xchg_i32_addr64:
4003 ; SI: ; %bb.0: ; %entry
4004 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4005 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
4006 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4007 ; SI-NEXT: s_mov_b32 s3, 0xf000
4008 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4009 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4010 ; SI-NEXT: s_mov_b32 s2, 0
4011 ; SI-NEXT: v_mov_b32_e32 v2, s6
4012 ; SI-NEXT: v_mov_b32_e32 v0, s4
4013 ; SI-NEXT: v_mov_b32_e32 v1, s5
4014 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[0:3], 0 addr64
4015 ; SI-NEXT: s_waitcnt vmcnt(0)
4016 ; SI-NEXT: buffer_wbinvl1
4019 ; VI-LABEL: atomic_xchg_i32_addr64:
4020 ; VI: ; %bb.0: ; %entry
4021 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4022 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4023 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
4024 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4025 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4026 ; VI-NEXT: s_add_u32 s0, s4, s0
4027 ; VI-NEXT: s_addc_u32 s1, s5, s1
4028 ; VI-NEXT: v_mov_b32_e32 v0, s0
4029 ; VI-NEXT: v_mov_b32_e32 v1, s1
4030 ; VI-NEXT: v_mov_b32_e32 v2, s6
4031 ; VI-NEXT: flat_atomic_swap v[0:1], v2
4032 ; VI-NEXT: s_waitcnt vmcnt(0)
4033 ; VI-NEXT: buffer_wbinvl1_vol
4036 ; GFX9-LABEL: atomic_xchg_i32_addr64:
4037 ; GFX9: ; %bb.0: ; %entry
4038 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4039 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4040 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
4041 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4042 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4043 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4044 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4045 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4046 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
4047 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1]
4048 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4049 ; GFX9-NEXT: buffer_wbinvl1_vol
4050 ; GFX9-NEXT: s_endpgm
4052 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4053 %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
; Indexed + returning xchg: glc swap at %out[%index], old value stored to %out2.
4057 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
4058 ; SI-LABEL: atomic_xchg_i32_ret_addr64:
4059 ; SI: ; %bb.0: ; %entry
4060 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4061 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
4062 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
4063 ; SI-NEXT: s_mov_b32 s3, 0xf000
4064 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4065 ; SI-NEXT: s_mov_b32 s0, s6
4066 ; SI-NEXT: s_mov_b32 s1, s7
4067 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
4068 ; SI-NEXT: s_mov_b32 s6, 0
4069 ; SI-NEXT: s_mov_b32 s7, s3
4070 ; SI-NEXT: v_mov_b32_e32 v2, s2
4071 ; SI-NEXT: v_mov_b32_e32 v0, s8
4072 ; SI-NEXT: v_mov_b32_e32 v1, s9
4073 ; SI-NEXT: buffer_atomic_swap v2, v[0:1], s[4:7], 0 addr64 glc
4074 ; SI-NEXT: s_waitcnt vmcnt(0)
4075 ; SI-NEXT: buffer_wbinvl1
4076 ; SI-NEXT: s_mov_b32 s2, -1
4077 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
4080 ; VI-LABEL: atomic_xchg_i32_ret_addr64:
4081 ; VI: ; %bb.0: ; %entry
4082 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4083 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4084 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
4085 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4086 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4087 ; VI-NEXT: s_add_u32 s0, s4, s0
4088 ; VI-NEXT: s_addc_u32 s1, s5, s1
4089 ; VI-NEXT: v_mov_b32_e32 v0, s0
4090 ; VI-NEXT: v_mov_b32_e32 v1, s1
4091 ; VI-NEXT: v_mov_b32_e32 v2, s8
4092 ; VI-NEXT: flat_atomic_swap v0, v[0:1], v2 glc
4093 ; VI-NEXT: s_waitcnt vmcnt(0)
4094 ; VI-NEXT: buffer_wbinvl1_vol
4095 ; VI-NEXT: s_mov_b32 s3, 0xf000
4096 ; VI-NEXT: s_mov_b32 s2, -1
4097 ; VI-NEXT: s_mov_b32 s0, s6
4098 ; VI-NEXT: s_mov_b32 s1, s7
4099 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4102 ; GFX9-LABEL: atomic_xchg_i32_ret_addr64:
4103 ; GFX9: ; %bb.0: ; %entry
4104 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4105 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4106 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
4107 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4108 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4109 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4110 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4111 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4112 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
4113 ; GFX9-NEXT: global_atomic_swap v1, v0, v1, s[0:1] glc
4114 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4115 ; GFX9-NEXT: buffer_wbinvl1_vol
4116 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
4117 ; GFX9-NEXT: s_endpgm
4119 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4120 %val = atomicrmw volatile xchg ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
4121 store i32 %val, ptr addrspace(1) %out2
; cmpxchg with constant offset, result unused: cmpswap takes the data/compare pair in
; v[0:1] (data first, compare second) with the immediate 'offset:16'.
4125 define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr addrspace(1) %out, i32 %in, i32 %old) {
4126 ; SI-LABEL: atomic_cmpxchg_i32_offset:
4127 ; SI: ; %bb.0: ; %entry
4128 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4129 ; SI-NEXT: s_mov_b32 s7, 0xf000
4130 ; SI-NEXT: s_mov_b32 s6, -1
4131 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4132 ; SI-NEXT: s_mov_b32 s4, s0
4133 ; SI-NEXT: s_mov_b32 s5, s1
4134 ; SI-NEXT: v_mov_b32_e32 v0, s2
4135 ; SI-NEXT: v_mov_b32_e32 v1, s3
4136 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4137 ; SI-NEXT: s_waitcnt vmcnt(0)
4138 ; SI-NEXT: buffer_wbinvl1
4141 ; VI-LABEL: atomic_cmpxchg_i32_offset:
4142 ; VI: ; %bb.0: ; %entry
4143 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4144 ; VI-NEXT: s_mov_b32 s7, 0xf000
4145 ; VI-NEXT: s_mov_b32 s6, -1
4146 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4147 ; VI-NEXT: v_mov_b32_e32 v0, s2
4148 ; VI-NEXT: s_mov_b32 s4, s0
4149 ; VI-NEXT: s_mov_b32 s5, s1
4150 ; VI-NEXT: v_mov_b32_e32 v1, s3
4151 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16
4152 ; VI-NEXT: s_waitcnt vmcnt(0)
4153 ; VI-NEXT: buffer_wbinvl1_vol
4156 ; GFX9-LABEL: atomic_cmpxchg_i32_offset:
4157 ; GFX9: ; %bb.0: ; %entry
4158 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4159 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4160 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4161 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4162 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4163 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
4164 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4165 ; GFX9-NEXT: buffer_wbinvl1_vol
4166 ; GFX9-NEXT: s_endpgm
4168 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
4169 %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
; Returning cmpxchg with constant offset: the loaded (old) value — extractvalue field 0 —
; is returned in v0 (glc) and stored to %out2.
4173 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) {
4174 ; SI-LABEL: atomic_cmpxchg_i32_ret_offset:
4175 ; SI: ; %bb.0: ; %entry
4176 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4177 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4178 ; SI-NEXT: s_mov_b32 s3, 0xf000
4179 ; SI-NEXT: s_mov_b32 s2, -1
4180 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4181 ; SI-NEXT: s_mov_b32 s0, s6
4182 ; SI-NEXT: s_mov_b32 s1, s7
4183 ; SI-NEXT: s_mov_b32 s6, s2
4184 ; SI-NEXT: s_mov_b32 s7, s3
4185 ; SI-NEXT: v_mov_b32_e32 v0, s8
4186 ; SI-NEXT: v_mov_b32_e32 v1, s9
4187 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4188 ; SI-NEXT: s_waitcnt vmcnt(0)
4189 ; SI-NEXT: buffer_wbinvl1
4190 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4193 ; VI-LABEL: atomic_cmpxchg_i32_ret_offset:
4194 ; VI: ; %bb.0: ; %entry
4195 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4196 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4197 ; VI-NEXT: s_mov_b32 s3, 0xf000
4198 ; VI-NEXT: s_mov_b32 s2, -1
4199 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4200 ; VI-NEXT: s_mov_b32 s0, s6
4201 ; VI-NEXT: v_mov_b32_e32 v0, s8
4202 ; VI-NEXT: s_mov_b32 s1, s7
4203 ; VI-NEXT: s_mov_b32 s6, s2
4204 ; VI-NEXT: s_mov_b32 s7, s3
4205 ; VI-NEXT: v_mov_b32_e32 v1, s9
4206 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
4207 ; VI-NEXT: s_waitcnt vmcnt(0)
4208 ; VI-NEXT: buffer_wbinvl1_vol
4209 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4212 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_offset:
4213 ; GFX9: ; %bb.0: ; %entry
4214 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4215 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4216 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4217 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4218 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4219 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4220 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] offset:16 glc
4221 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4222 ; GFX9-NEXT: buffer_wbinvl1_vol
4223 ; GFX9-NEXT: global_store_dword v2, v0, s[6:7]
4224 ; GFX9-NEXT: s_endpgm
4226 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
4227 %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4228 %extract0 = extractvalue { i32, i1 } %val, 0
4229 store i32 %extract0, ptr addrspace(1) %out2
; Indexed cmpxchg plus constant offset, result unused; VI again folds the +16 into the
; scalar pointer add because flat cmpswap has no immediate offset on that target.
4233 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
4234 ; SI-LABEL: atomic_cmpxchg_i32_addr64_offset:
4235 ; SI: ; %bb.0: ; %entry
4236 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
4237 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4238 ; SI-NEXT: s_load_dword s7, s[0:1], 0xf
4239 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4240 ; SI-NEXT: s_mov_b32 s3, 0xf000
4241 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4242 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4243 ; SI-NEXT: s_mov_b32 s2, 0
4244 ; SI-NEXT: v_mov_b32_e32 v0, s6
4245 ; SI-NEXT: v_mov_b32_e32 v1, s7
4246 ; SI-NEXT: v_mov_b32_e32 v2, s4
4247 ; SI-NEXT: v_mov_b32_e32 v3, s5
4248 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64 offset:16
4249 ; SI-NEXT: s_waitcnt vmcnt(0)
4250 ; SI-NEXT: buffer_wbinvl1
4253 ; VI-LABEL: atomic_cmpxchg_i32_addr64_offset:
4254 ; VI: ; %bb.0: ; %entry
4255 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4256 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
4257 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4258 ; VI-NEXT: s_load_dword s7, s[0:1], 0x3c
4259 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4260 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4261 ; VI-NEXT: v_mov_b32_e32 v0, s6
4262 ; VI-NEXT: s_add_u32 s0, s4, s0
4263 ; VI-NEXT: s_addc_u32 s1, s5, s1
4264 ; VI-NEXT: s_add_u32 s0, s0, 16
4265 ; VI-NEXT: s_addc_u32 s1, s1, 0
4266 ; VI-NEXT: v_mov_b32_e32 v3, s1
4267 ; VI-NEXT: v_mov_b32_e32 v1, s7
4268 ; VI-NEXT: v_mov_b32_e32 v2, s0
4269 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4270 ; VI-NEXT: s_waitcnt vmcnt(0)
4271 ; VI-NEXT: buffer_wbinvl1_vol
4274 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64_offset:
4275 ; GFX9: ; %bb.0: ; %entry
4276 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4277 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4278 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
4279 ; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c
4280 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4281 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4282 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4283 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4284 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4285 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4286 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4287 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16
4288 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4289 ; GFX9-NEXT: buffer_wbinvl1_vol
4290 ; GFX9-NEXT: s_endpgm
4292 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4293 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
4294 %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4298 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) {
4299 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4300 ; SI: ; %bb.0: ; %entry
4301 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4302 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
4303 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
4304 ; SI-NEXT: s_load_dword s10, s[0:1], 0x11
4305 ; SI-NEXT: s_mov_b32 s3, 0xf000
4306 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4307 ; SI-NEXT: s_mov_b32 s0, s6
4308 ; SI-NEXT: s_mov_b32 s1, s7
4309 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
4310 ; SI-NEXT: s_mov_b32 s6, 0
4311 ; SI-NEXT: s_mov_b32 s7, s3
4312 ; SI-NEXT: v_mov_b32_e32 v0, s2
4313 ; SI-NEXT: v_mov_b32_e32 v1, s10
4314 ; SI-NEXT: v_mov_b32_e32 v2, s8
4315 ; SI-NEXT: v_mov_b32_e32 v3, s9
4316 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 offset:16 glc
4317 ; SI-NEXT: s_waitcnt vmcnt(0)
4318 ; SI-NEXT: buffer_wbinvl1
4319 ; SI-NEXT: s_mov_b32 s2, -1
4320 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4323 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4324 ; VI: ; %bb.0: ; %entry
4325 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4326 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
4327 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4328 ; VI-NEXT: s_load_dword s9, s[0:1], 0x44
4329 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4330 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4331 ; VI-NEXT: v_mov_b32_e32 v0, s8
4332 ; VI-NEXT: s_add_u32 s0, s4, s0
4333 ; VI-NEXT: s_addc_u32 s1, s5, s1
4334 ; VI-NEXT: s_add_u32 s0, s0, 16
4335 ; VI-NEXT: s_addc_u32 s1, s1, 0
4336 ; VI-NEXT: v_mov_b32_e32 v3, s1
4337 ; VI-NEXT: v_mov_b32_e32 v1, s9
4338 ; VI-NEXT: v_mov_b32_e32 v2, s0
4339 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4340 ; VI-NEXT: s_waitcnt vmcnt(0)
4341 ; VI-NEXT: buffer_wbinvl1_vol
4342 ; VI-NEXT: s_mov_b32 s3, 0xf000
4343 ; VI-NEXT: s_mov_b32 s2, -1
4344 ; VI-NEXT: s_mov_b32 s0, s6
4345 ; VI-NEXT: s_mov_b32 s1, s7
4346 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4349 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4350 ; GFX9: ; %bb.0: ; %entry
4351 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4352 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4353 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
4354 ; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44
4355 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4356 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4357 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4358 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4359 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4360 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
4361 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
4362 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc
4363 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4364 ; GFX9-NEXT: buffer_wbinvl1_vol
4365 ; GFX9-NEXT: global_store_dword v2, v0, s[6:7]
4366 ; GFX9-NEXT: s_endpgm
4368 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4369 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
4370 %val = cmpxchg volatile ptr addrspace(1) %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4371 %extract0 = extractvalue { i32, i1 } %val, 0
4372 store i32 %extract0, ptr addrspace(1) %out2
4376 define amdgpu_kernel void @atomic_cmpxchg_i32(ptr addrspace(1) %out, i32 %in, i32 %old) {
4377 ; SI-LABEL: atomic_cmpxchg_i32:
4378 ; SI: ; %bb.0: ; %entry
4379 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4380 ; SI-NEXT: s_mov_b32 s7, 0xf000
4381 ; SI-NEXT: s_mov_b32 s6, -1
4382 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4383 ; SI-NEXT: s_mov_b32 s4, s0
4384 ; SI-NEXT: s_mov_b32 s5, s1
4385 ; SI-NEXT: v_mov_b32_e32 v0, s2
4386 ; SI-NEXT: v_mov_b32_e32 v1, s3
4387 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0
4388 ; SI-NEXT: s_waitcnt vmcnt(0)
4389 ; SI-NEXT: buffer_wbinvl1
4392 ; VI-LABEL: atomic_cmpxchg_i32:
4393 ; VI: ; %bb.0: ; %entry
4394 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4395 ; VI-NEXT: s_mov_b32 s7, 0xf000
4396 ; VI-NEXT: s_mov_b32 s6, -1
4397 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4398 ; VI-NEXT: v_mov_b32_e32 v0, s2
4399 ; VI-NEXT: s_mov_b32 s4, s0
4400 ; VI-NEXT: s_mov_b32 s5, s1
4401 ; VI-NEXT: v_mov_b32_e32 v1, s3
4402 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0
4403 ; VI-NEXT: s_waitcnt vmcnt(0)
4404 ; VI-NEXT: buffer_wbinvl1_vol
4407 ; GFX9-LABEL: atomic_cmpxchg_i32:
4408 ; GFX9: ; %bb.0: ; %entry
4409 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4410 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4411 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4412 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4413 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4414 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1]
4415 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4416 ; GFX9-NEXT: buffer_wbinvl1_vol
4417 ; GFX9-NEXT: s_endpgm
4419 %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4423 define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i32 %old) {
4424 ; SI-LABEL: atomic_cmpxchg_i32_ret:
4425 ; SI: ; %bb.0: ; %entry
4426 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4427 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
4428 ; SI-NEXT: s_mov_b32 s3, 0xf000
4429 ; SI-NEXT: s_mov_b32 s2, -1
4430 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4431 ; SI-NEXT: s_mov_b32 s0, s4
4432 ; SI-NEXT: s_mov_b32 s1, s5
4433 ; SI-NEXT: v_mov_b32_e32 v0, s8
4434 ; SI-NEXT: v_mov_b32_e32 v1, s9
4435 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
4436 ; SI-NEXT: s_waitcnt vmcnt(0)
4437 ; SI-NEXT: buffer_wbinvl1
4438 ; SI-NEXT: s_mov_b32 s0, s6
4439 ; SI-NEXT: s_mov_b32 s1, s7
4440 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4443 ; VI-LABEL: atomic_cmpxchg_i32_ret:
4444 ; VI: ; %bb.0: ; %entry
4445 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4446 ; VI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x34
4447 ; VI-NEXT: s_mov_b32 s3, 0xf000
4448 ; VI-NEXT: s_mov_b32 s2, -1
4449 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4450 ; VI-NEXT: s_mov_b32 s0, s4
4451 ; VI-NEXT: v_mov_b32_e32 v0, s8
4452 ; VI-NEXT: s_mov_b32 s1, s5
4453 ; VI-NEXT: v_mov_b32_e32 v1, s9
4454 ; VI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 glc
4455 ; VI-NEXT: s_waitcnt vmcnt(0)
4456 ; VI-NEXT: buffer_wbinvl1_vol
4457 ; VI-NEXT: s_mov_b32 s0, s6
4458 ; VI-NEXT: s_mov_b32 s1, s7
4459 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4462 ; GFX9-LABEL: atomic_cmpxchg_i32_ret:
4463 ; GFX9: ; %bb.0: ; %entry
4464 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4465 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4466 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4467 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4468 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
4469 ; GFX9-NEXT: v_mov_b32_e32 v1, s3
4470 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[4:5] glc
4471 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4472 ; GFX9-NEXT: buffer_wbinvl1_vol
4473 ; GFX9-NEXT: global_store_dword v2, v0, s[6:7]
4474 ; GFX9-NEXT: s_endpgm
4476 %val = cmpxchg volatile ptr addrspace(1) %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4477 %extract0 = extractvalue { i32, i1 } %val, 0
4478 store i32 %extract0, ptr addrspace(1) %out2
4482 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index, i32 %old) {
4483 ; SI-LABEL: atomic_cmpxchg_i32_addr64:
4484 ; SI: ; %bb.0: ; %entry
4485 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
4486 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4487 ; SI-NEXT: s_load_dword s7, s[0:1], 0xf
4488 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4489 ; SI-NEXT: s_mov_b32 s3, 0xf000
4490 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4491 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4492 ; SI-NEXT: s_mov_b32 s2, 0
4493 ; SI-NEXT: v_mov_b32_e32 v0, s6
4494 ; SI-NEXT: v_mov_b32_e32 v1, s7
4495 ; SI-NEXT: v_mov_b32_e32 v2, s4
4496 ; SI-NEXT: v_mov_b32_e32 v3, s5
4497 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[0:3], 0 addr64
4498 ; SI-NEXT: s_waitcnt vmcnt(0)
4499 ; SI-NEXT: buffer_wbinvl1
4502 ; VI-LABEL: atomic_cmpxchg_i32_addr64:
4503 ; VI: ; %bb.0: ; %entry
4504 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4505 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
4506 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4507 ; VI-NEXT: s_load_dword s7, s[0:1], 0x3c
4508 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4509 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4510 ; VI-NEXT: v_mov_b32_e32 v0, s6
4511 ; VI-NEXT: s_add_u32 s0, s4, s0
4512 ; VI-NEXT: s_addc_u32 s1, s5, s1
4513 ; VI-NEXT: v_mov_b32_e32 v3, s1
4514 ; VI-NEXT: v_mov_b32_e32 v1, s7
4515 ; VI-NEXT: v_mov_b32_e32 v2, s0
4516 ; VI-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4517 ; VI-NEXT: s_waitcnt vmcnt(0)
4518 ; VI-NEXT: buffer_wbinvl1_vol
4521 ; GFX9-LABEL: atomic_cmpxchg_i32_addr64:
4522 ; GFX9: ; %bb.0: ; %entry
4523 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4524 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4525 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
4526 ; GFX9-NEXT: s_load_dword s7, s[0:1], 0x3c
4527 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4528 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4529 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4530 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4531 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4532 ; GFX9-NEXT: v_mov_b32_e32 v0, s6
4533 ; GFX9-NEXT: v_mov_b32_e32 v1, s7
4534 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1]
4535 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4536 ; GFX9-NEXT: buffer_wbinvl1_vol
4537 ; GFX9-NEXT: s_endpgm
4539 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4540 %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4544 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index, i32 %old) {
4545 ; SI-LABEL: atomic_cmpxchg_i32_ret_addr64:
4546 ; SI: ; %bb.0: ; %entry
4547 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4548 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
4549 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
4550 ; SI-NEXT: s_load_dword s10, s[0:1], 0x11
4551 ; SI-NEXT: s_mov_b32 s3, 0xf000
4552 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4553 ; SI-NEXT: s_mov_b32 s0, s6
4554 ; SI-NEXT: s_mov_b32 s1, s7
4555 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
4556 ; SI-NEXT: s_mov_b32 s6, 0
4557 ; SI-NEXT: s_mov_b32 s7, s3
4558 ; SI-NEXT: v_mov_b32_e32 v0, s2
4559 ; SI-NEXT: v_mov_b32_e32 v1, s10
4560 ; SI-NEXT: v_mov_b32_e32 v2, s8
4561 ; SI-NEXT: v_mov_b32_e32 v3, s9
4562 ; SI-NEXT: buffer_atomic_cmpswap v[0:1], v[2:3], s[4:7], 0 addr64 glc
4563 ; SI-NEXT: s_waitcnt vmcnt(0)
4564 ; SI-NEXT: buffer_wbinvl1
4565 ; SI-NEXT: s_mov_b32 s2, -1
4566 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4569 ; VI-LABEL: atomic_cmpxchg_i32_ret_addr64:
4570 ; VI: ; %bb.0: ; %entry
4571 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4572 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
4573 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4574 ; VI-NEXT: s_load_dword s9, s[0:1], 0x44
4575 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4576 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4577 ; VI-NEXT: v_mov_b32_e32 v0, s8
4578 ; VI-NEXT: s_add_u32 s0, s4, s0
4579 ; VI-NEXT: s_addc_u32 s1, s5, s1
4580 ; VI-NEXT: v_mov_b32_e32 v3, s1
4581 ; VI-NEXT: v_mov_b32_e32 v1, s9
4582 ; VI-NEXT: v_mov_b32_e32 v2, s0
4583 ; VI-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
4584 ; VI-NEXT: s_waitcnt vmcnt(0)
4585 ; VI-NEXT: buffer_wbinvl1_vol
4586 ; VI-NEXT: s_mov_b32 s3, 0xf000
4587 ; VI-NEXT: s_mov_b32 s2, -1
4588 ; VI-NEXT: s_mov_b32 s0, s6
4589 ; VI-NEXT: s_mov_b32 s1, s7
4590 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4593 ; GFX9-LABEL: atomic_cmpxchg_i32_ret_addr64:
4594 ; GFX9: ; %bb.0: ; %entry
4595 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4596 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4597 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
4598 ; GFX9-NEXT: s_load_dword s9, s[0:1], 0x44
4599 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
4600 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4601 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4602 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4603 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4604 ; GFX9-NEXT: v_mov_b32_e32 v0, s8
4605 ; GFX9-NEXT: v_mov_b32_e32 v1, s9
4606 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
4607 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4608 ; GFX9-NEXT: buffer_wbinvl1_vol
4609 ; GFX9-NEXT: global_store_dword v2, v0, s[6:7]
4610 ; GFX9-NEXT: s_endpgm
4612 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4613 %val = cmpxchg volatile ptr addrspace(1) %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4614 %extract0 = extractvalue { i32, i1 } %val, 0
4615 store i32 %extract0, ptr addrspace(1) %out2
4619 define amdgpu_kernel void @atomic_xor_i32_offset(ptr addrspace(1) %out, i32 %in) {
4620 ; SI-LABEL: atomic_xor_i32_offset:
4621 ; SI: ; %bb.0: ; %entry
4622 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
4623 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4624 ; SI-NEXT: s_mov_b32 s3, 0xf000
4625 ; SI-NEXT: s_mov_b32 s2, -1
4626 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4627 ; SI-NEXT: v_mov_b32_e32 v0, s4
4628 ; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16
4629 ; SI-NEXT: s_waitcnt vmcnt(0)
4630 ; SI-NEXT: buffer_wbinvl1
4633 ; VI-LABEL: atomic_xor_i32_offset:
4634 ; VI: ; %bb.0: ; %entry
4635 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
4636 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4637 ; VI-NEXT: s_mov_b32 s3, 0xf000
4638 ; VI-NEXT: s_mov_b32 s2, -1
4639 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4640 ; VI-NEXT: v_mov_b32_e32 v0, s4
4641 ; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 offset:16
4642 ; VI-NEXT: s_waitcnt vmcnt(0)
4643 ; VI-NEXT: buffer_wbinvl1_vol
4646 ; GFX9-LABEL: atomic_xor_i32_offset:
4647 ; GFX9: ; %bb.0: ; %entry
4648 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
4649 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
4650 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4651 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4652 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
4653 ; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:16
4654 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4655 ; GFX9-NEXT: buffer_wbinvl1_vol
4656 ; GFX9-NEXT: s_endpgm
4658 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
4659 %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
4663 define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
4664 ; SI-LABEL: atomic_xor_i32_ret_offset:
4665 ; SI: ; %bb.0: ; %entry
4666 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4667 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
4668 ; SI-NEXT: s_mov_b32 s3, 0xf000
4669 ; SI-NEXT: s_mov_b32 s2, -1
4670 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4671 ; SI-NEXT: s_mov_b32 s0, s6
4672 ; SI-NEXT: s_mov_b32 s1, s7
4673 ; SI-NEXT: s_mov_b32 s6, s2
4674 ; SI-NEXT: s_mov_b32 s7, s3
4675 ; SI-NEXT: v_mov_b32_e32 v0, s8
4676 ; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
4677 ; SI-NEXT: s_waitcnt vmcnt(0)
4678 ; SI-NEXT: buffer_wbinvl1
4679 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4682 ; VI-LABEL: atomic_xor_i32_ret_offset:
4683 ; VI: ; %bb.0: ; %entry
4684 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4685 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
4686 ; VI-NEXT: s_mov_b32 s3, 0xf000
4687 ; VI-NEXT: s_mov_b32 s2, -1
4688 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4689 ; VI-NEXT: s_mov_b32 s0, s6
4690 ; VI-NEXT: s_mov_b32 s1, s7
4691 ; VI-NEXT: s_mov_b32 s6, s2
4692 ; VI-NEXT: s_mov_b32 s7, s3
4693 ; VI-NEXT: v_mov_b32_e32 v0, s8
4694 ; VI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16 glc
4695 ; VI-NEXT: s_waitcnt vmcnt(0)
4696 ; VI-NEXT: buffer_wbinvl1_vol
4697 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4700 ; GFX9-LABEL: atomic_xor_i32_ret_offset:
4701 ; GFX9: ; %bb.0: ; %entry
4702 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
4703 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4704 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4705 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4706 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4707 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] offset:16 glc
4708 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4709 ; GFX9-NEXT: buffer_wbinvl1_vol
4710 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
4711 ; GFX9-NEXT: s_endpgm
4713 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
4714 %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
4715 store i32 %val, ptr addrspace(1) %out2
4719 define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
4720 ; SI-LABEL: atomic_xor_i32_addr64_offset:
4721 ; SI: ; %bb.0: ; %entry
4722 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4723 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
4724 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4725 ; SI-NEXT: s_mov_b32 s3, 0xf000
4726 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4727 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4728 ; SI-NEXT: s_mov_b32 s2, 0
4729 ; SI-NEXT: v_mov_b32_e32 v2, s6
4730 ; SI-NEXT: v_mov_b32_e32 v0, s4
4731 ; SI-NEXT: v_mov_b32_e32 v1, s5
4732 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64 offset:16
4733 ; SI-NEXT: s_waitcnt vmcnt(0)
4734 ; SI-NEXT: buffer_wbinvl1
4737 ; VI-LABEL: atomic_xor_i32_addr64_offset:
4738 ; VI: ; %bb.0: ; %entry
4739 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4740 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4741 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
4742 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4743 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4744 ; VI-NEXT: s_add_u32 s0, s4, s0
4745 ; VI-NEXT: s_addc_u32 s1, s5, s1
4746 ; VI-NEXT: s_add_u32 s0, s0, 16
4747 ; VI-NEXT: s_addc_u32 s1, s1, 0
4748 ; VI-NEXT: v_mov_b32_e32 v0, s0
4749 ; VI-NEXT: v_mov_b32_e32 v1, s1
4750 ; VI-NEXT: v_mov_b32_e32 v2, s6
4751 ; VI-NEXT: flat_atomic_xor v[0:1], v2
4752 ; VI-NEXT: s_waitcnt vmcnt(0)
4753 ; VI-NEXT: buffer_wbinvl1_vol
4756 ; GFX9-LABEL: atomic_xor_i32_addr64_offset:
4757 ; GFX9: ; %bb.0: ; %entry
4758 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4759 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4760 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
4761 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4762 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4763 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4764 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4765 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4766 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
4767 ; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1] offset:16
4768 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4769 ; GFX9-NEXT: buffer_wbinvl1_vol
4770 ; GFX9-NEXT: s_endpgm
4772 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4773 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
4774 %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
4778 define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
4779 ; SI-LABEL: atomic_xor_i32_ret_addr64_offset:
4780 ; SI: ; %bb.0: ; %entry
4781 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4782 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
4783 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
4784 ; SI-NEXT: s_mov_b32 s3, 0xf000
4785 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4786 ; SI-NEXT: s_mov_b32 s0, s6
4787 ; SI-NEXT: s_mov_b32 s1, s7
4788 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
4789 ; SI-NEXT: s_mov_b32 s6, 0
4790 ; SI-NEXT: s_mov_b32 s7, s3
4791 ; SI-NEXT: v_mov_b32_e32 v2, s2
4792 ; SI-NEXT: v_mov_b32_e32 v0, s8
4793 ; SI-NEXT: v_mov_b32_e32 v1, s9
4794 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
4795 ; SI-NEXT: s_waitcnt vmcnt(0)
4796 ; SI-NEXT: buffer_wbinvl1
4797 ; SI-NEXT: s_mov_b32 s2, -1
4798 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
4801 ; VI-LABEL: atomic_xor_i32_ret_addr64_offset:
4802 ; VI: ; %bb.0: ; %entry
4803 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4804 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4805 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
4806 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4807 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4808 ; VI-NEXT: s_add_u32 s0, s4, s0
4809 ; VI-NEXT: s_addc_u32 s1, s5, s1
4810 ; VI-NEXT: s_add_u32 s0, s0, 16
4811 ; VI-NEXT: s_addc_u32 s1, s1, 0
4812 ; VI-NEXT: v_mov_b32_e32 v0, s0
4813 ; VI-NEXT: v_mov_b32_e32 v1, s1
4814 ; VI-NEXT: v_mov_b32_e32 v2, s8
4815 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
4816 ; VI-NEXT: s_waitcnt vmcnt(0)
4817 ; VI-NEXT: buffer_wbinvl1_vol
4818 ; VI-NEXT: s_mov_b32 s3, 0xf000
4819 ; VI-NEXT: s_mov_b32 s2, -1
4820 ; VI-NEXT: s_mov_b32 s0, s6
4821 ; VI-NEXT: s_mov_b32 s1, s7
4822 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4825 ; GFX9-LABEL: atomic_xor_i32_ret_addr64_offset:
4826 ; GFX9: ; %bb.0: ; %entry
4827 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4828 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4829 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
4830 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4831 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4832 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4833 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4834 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4835 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
4836 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[0:1] offset:16 glc
4837 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4838 ; GFX9-NEXT: buffer_wbinvl1_vol
4839 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
4840 ; GFX9-NEXT: s_endpgm
4842 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4843 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
4844 %val = atomicrmw volatile xor ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
4845 store i32 %val, ptr addrspace(1) %out2
4849 define amdgpu_kernel void @atomic_xor_i32(ptr addrspace(1) %out, i32 %in) {
4850 ; SI-LABEL: atomic_xor_i32:
4851 ; SI: ; %bb.0: ; %entry
4852 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
4853 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4854 ; SI-NEXT: s_mov_b32 s3, 0xf000
4855 ; SI-NEXT: s_mov_b32 s2, -1
4856 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4857 ; SI-NEXT: v_mov_b32_e32 v0, s4
4858 ; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0
4859 ; SI-NEXT: s_waitcnt vmcnt(0)
4860 ; SI-NEXT: buffer_wbinvl1
4863 ; VI-LABEL: atomic_xor_i32:
4864 ; VI: ; %bb.0: ; %entry
4865 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
4866 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
4867 ; VI-NEXT: s_mov_b32 s3, 0xf000
4868 ; VI-NEXT: s_mov_b32 s2, -1
4869 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4870 ; VI-NEXT: v_mov_b32_e32 v0, s4
4871 ; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0
4872 ; VI-NEXT: s_waitcnt vmcnt(0)
4873 ; VI-NEXT: buffer_wbinvl1_vol
4876 ; GFX9-LABEL: atomic_xor_i32:
4877 ; GFX9: ; %bb.0: ; %entry
4878 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
4879 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
4880 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4881 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4882 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
4883 ; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3]
4884 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4885 ; GFX9-NEXT: buffer_wbinvl1_vol
4886 ; GFX9-NEXT: s_endpgm
4888 %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
4892 define amdgpu_kernel void @atomic_xor_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
4893 ; SI-LABEL: atomic_xor_i32_ret:
4894 ; SI: ; %bb.0: ; %entry
4895 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4896 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
4897 ; SI-NEXT: s_mov_b32 s3, 0xf000
4898 ; SI-NEXT: s_mov_b32 s2, -1
4899 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4900 ; SI-NEXT: s_mov_b32 s0, s4
4901 ; SI-NEXT: s_mov_b32 s1, s5
4902 ; SI-NEXT: v_mov_b32_e32 v0, s8
4903 ; SI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 glc
4904 ; SI-NEXT: s_waitcnt vmcnt(0)
4905 ; SI-NEXT: buffer_wbinvl1
4906 ; SI-NEXT: s_mov_b32 s0, s6
4907 ; SI-NEXT: s_mov_b32 s1, s7
4908 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4911 ; VI-LABEL: atomic_xor_i32_ret:
4912 ; VI: ; %bb.0: ; %entry
4913 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4914 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
4915 ; VI-NEXT: s_mov_b32 s3, 0xf000
4916 ; VI-NEXT: s_mov_b32 s2, -1
4917 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4918 ; VI-NEXT: s_mov_b32 s0, s4
4919 ; VI-NEXT: s_mov_b32 s1, s5
4920 ; VI-NEXT: v_mov_b32_e32 v0, s8
4921 ; VI-NEXT: buffer_atomic_xor v0, off, s[0:3], 0 glc
4922 ; VI-NEXT: s_waitcnt vmcnt(0)
4923 ; VI-NEXT: buffer_wbinvl1_vol
4924 ; VI-NEXT: s_mov_b32 s0, s6
4925 ; VI-NEXT: s_mov_b32 s1, s7
4926 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
4929 ; GFX9-LABEL: atomic_xor_i32_ret:
4930 ; GFX9: ; %bb.0: ; %entry
4931 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
4932 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4933 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4934 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4935 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
4936 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[4:5] glc
4937 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4938 ; GFX9-NEXT: buffer_wbinvl1_vol
4939 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
4940 ; GFX9-NEXT: s_endpgm
4942 %val = atomicrmw volatile xor ptr addrspace(1) %out, i32 %in syncscope("agent") seq_cst
4943 store i32 %val, ptr addrspace(1) %out2
4947 define amdgpu_kernel void @atomic_xor_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
4948 ; SI-LABEL: atomic_xor_i32_addr64:
4949 ; SI: ; %bb.0: ; %entry
4950 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
4951 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
4952 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
4953 ; SI-NEXT: s_mov_b32 s3, 0xf000
4954 ; SI-NEXT: s_waitcnt lgkmcnt(0)
4955 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
4956 ; SI-NEXT: s_mov_b32 s2, 0
4957 ; SI-NEXT: v_mov_b32_e32 v2, s6
4958 ; SI-NEXT: v_mov_b32_e32 v0, s4
4959 ; SI-NEXT: v_mov_b32_e32 v1, s5
4960 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[0:3], 0 addr64
4961 ; SI-NEXT: s_waitcnt vmcnt(0)
4962 ; SI-NEXT: buffer_wbinvl1
4965 ; VI-LABEL: atomic_xor_i32_addr64:
4966 ; VI: ; %bb.0: ; %entry
4967 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4968 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4969 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
4970 ; VI-NEXT: s_waitcnt lgkmcnt(0)
4971 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4972 ; VI-NEXT: s_add_u32 s0, s4, s0
4973 ; VI-NEXT: s_addc_u32 s1, s5, s1
4974 ; VI-NEXT: v_mov_b32_e32 v0, s0
4975 ; VI-NEXT: v_mov_b32_e32 v1, s1
4976 ; VI-NEXT: v_mov_b32_e32 v2, s6
4977 ; VI-NEXT: flat_atomic_xor v[0:1], v2
4978 ; VI-NEXT: s_waitcnt vmcnt(0)
4979 ; VI-NEXT: buffer_wbinvl1_vol
4982 ; GFX9-LABEL: atomic_xor_i32_addr64:
4983 ; GFX9: ; %bb.0: ; %entry
4984 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4985 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4986 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
4987 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4988 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4989 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4990 ; GFX9-NEXT: s_add_u32 s0, s4, s0
4991 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
4992 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
4993 ; GFX9-NEXT: global_atomic_xor v0, v1, s[0:1]
4994 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4995 ; GFX9-NEXT: buffer_wbinvl1_vol
4996 ; GFX9-NEXT: s_endpgm
4998 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
4999 %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
5003 define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
5004 ; SI-LABEL: atomic_xor_i32_ret_addr64:
5005 ; SI: ; %bb.0: ; %entry
5006 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5007 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
5008 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
5009 ; SI-NEXT: s_mov_b32 s3, 0xf000
5010 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5011 ; SI-NEXT: s_mov_b32 s0, s6
5012 ; SI-NEXT: s_mov_b32 s1, s7
5013 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
5014 ; SI-NEXT: s_mov_b32 s6, 0
5015 ; SI-NEXT: s_mov_b32 s7, s3
5016 ; SI-NEXT: v_mov_b32_e32 v2, s2
5017 ; SI-NEXT: v_mov_b32_e32 v0, s8
5018 ; SI-NEXT: v_mov_b32_e32 v1, s9
5019 ; SI-NEXT: buffer_atomic_xor v2, v[0:1], s[4:7], 0 addr64 glc
5020 ; SI-NEXT: s_waitcnt vmcnt(0)
5021 ; SI-NEXT: buffer_wbinvl1
5022 ; SI-NEXT: s_mov_b32 s2, -1
5023 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
5026 ; VI-LABEL: atomic_xor_i32_ret_addr64:
5027 ; VI: ; %bb.0: ; %entry
5028 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
5029 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5030 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
5031 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5032 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5033 ; VI-NEXT: s_add_u32 s0, s4, s0
5034 ; VI-NEXT: s_addc_u32 s1, s5, s1
5035 ; VI-NEXT: v_mov_b32_e32 v0, s0
5036 ; VI-NEXT: v_mov_b32_e32 v1, s1
5037 ; VI-NEXT: v_mov_b32_e32 v2, s8
5038 ; VI-NEXT: flat_atomic_xor v0, v[0:1], v2 glc
5039 ; VI-NEXT: s_waitcnt vmcnt(0)
5040 ; VI-NEXT: buffer_wbinvl1_vol
5041 ; VI-NEXT: s_mov_b32 s3, 0xf000
5042 ; VI-NEXT: s_mov_b32 s2, -1
5043 ; VI-NEXT: s_mov_b32 s0, s6
5044 ; VI-NEXT: s_mov_b32 s1, s7
5045 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5048 ; GFX9-LABEL: atomic_xor_i32_ret_addr64:
5049 ; GFX9: ; %bb.0: ; %entry
5050 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
5051 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5052 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
5053 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5054 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5055 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5056 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5057 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5058 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
5059 ; GFX9-NEXT: global_atomic_xor v1, v0, v1, s[0:1] glc
5060 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5061 ; GFX9-NEXT: buffer_wbinvl1_vol
5062 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
5063 ; GFX9-NEXT: s_endpgm
5065 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
5066 %val = atomicrmw volatile xor ptr addrspace(1) %ptr, i32 %in syncscope("agent") seq_cst
5067 store i32 %val, ptr addrspace(1) %out2
5071 define amdgpu_kernel void @atomic_load_i32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5072 ; SI-LABEL: atomic_load_i32_offset:
5073 ; SI: ; %bb.0: ; %entry
5074 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5075 ; SI-NEXT: s_mov_b32 s7, 0xf000
5076 ; SI-NEXT: s_mov_b32 s6, -1
5077 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5078 ; SI-NEXT: s_mov_b32 s4, s2
5079 ; SI-NEXT: s_mov_b32 s5, s3
5080 ; SI-NEXT: s_mov_b32 s2, s6
5081 ; SI-NEXT: s_mov_b32 s3, s7
5082 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:16 glc
5083 ; SI-NEXT: s_waitcnt vmcnt(0)
5084 ; SI-NEXT: buffer_wbinvl1
5085 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5088 ; VI-LABEL: atomic_load_i32_offset:
5089 ; VI: ; %bb.0: ; %entry
5090 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5091 ; VI-NEXT: s_mov_b32 s7, 0xf000
5092 ; VI-NEXT: s_mov_b32 s6, -1
5093 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5094 ; VI-NEXT: s_add_u32 s0, s0, 16
5095 ; VI-NEXT: s_addc_u32 s1, s1, 0
5096 ; VI-NEXT: v_mov_b32_e32 v0, s0
5097 ; VI-NEXT: v_mov_b32_e32 v1, s1
5098 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5099 ; VI-NEXT: s_waitcnt vmcnt(0)
5100 ; VI-NEXT: buffer_wbinvl1_vol
5101 ; VI-NEXT: s_mov_b32 s4, s2
5102 ; VI-NEXT: s_mov_b32 s5, s3
5103 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5106 ; GFX9-LABEL: atomic_load_i32_offset:
5107 ; GFX9: ; %bb.0: ; %entry
5108 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5109 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5110 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5111 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
5112 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5113 ; GFX9-NEXT: buffer_wbinvl1_vol
5114 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5115 ; GFX9-NEXT: s_endpgm
5117 %gep = getelementptr i32, ptr addrspace(1) %in, i64 4
5118 %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
5119 store i32 %val, ptr addrspace(1) %out
; seq_cst atomic i32 load from %in at element -128 (byte offset -512), result stored to %out.
; SI carries the 64-bit negative offset in v[0:1] via buffer addr64; VI folds it into the
; address with s_add_u32/s_addc_u32 before flat_load; GFX9 encodes it as global_load offset:-512.
; Acquire side is visible as s_waitcnt vmcnt(0) + buffer_wbinvl1(_vol) after the glc load.
; NOTE(review): checks are autogenerated — regenerate with update_llc_test_checks.py, do not hand-edit.
5123 define amdgpu_kernel void @atomic_load_i32_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5124 ; SI-LABEL: atomic_load_i32_negoffset:
5125 ; SI: ; %bb.0: ; %entry
5126 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5127 ; SI-NEXT: s_mov_b32 s7, 0xf000
5128 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5129 ; SI-NEXT: s_mov_b32 s4, s2
5130 ; SI-NEXT: s_mov_b32 s5, s3
5131 ; SI-NEXT: s_mov_b32 s2, 0
5132 ; SI-NEXT: s_mov_b32 s3, s7
5133 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00
5134 ; SI-NEXT: v_mov_b32_e32 v1, -1
5135 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
5136 ; SI-NEXT: s_waitcnt vmcnt(0)
5137 ; SI-NEXT: buffer_wbinvl1
5138 ; SI-NEXT: s_mov_b32 s6, -1
5139 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5142 ; VI-LABEL: atomic_load_i32_negoffset:
5143 ; VI: ; %bb.0: ; %entry
5144 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5145 ; VI-NEXT: s_mov_b32 s7, 0xf000
5146 ; VI-NEXT: s_mov_b32 s6, -1
5147 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5148 ; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
5149 ; VI-NEXT: s_addc_u32 s1, s1, -1
5150 ; VI-NEXT: v_mov_b32_e32 v0, s0
5151 ; VI-NEXT: v_mov_b32_e32 v1, s1
5152 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5153 ; VI-NEXT: s_waitcnt vmcnt(0)
5154 ; VI-NEXT: buffer_wbinvl1_vol
5155 ; VI-NEXT: s_mov_b32 s4, s2
5156 ; VI-NEXT: s_mov_b32 s5, s3
5157 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5160 ; GFX9-LABEL: atomic_load_i32_negoffset:
5161 ; GFX9: ; %bb.0: ; %entry
5162 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5163 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5164 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5165 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:-512 glc
5166 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5167 ; GFX9-NEXT: buffer_wbinvl1_vol
5168 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5169 ; GFX9-NEXT: s_endpgm
5171 %gep = getelementptr i32, ptr addrspace(1) %in, i64 -128
5172 %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
5173 store i32 %val, ptr addrspace(1) %out
; seq_cst atomic f32 load at element 4 (byte offset 16), stored to %out.
; Codegen is the same dword sequence as the i32 offset case: SI folds offset:16 into
; buffer_load_dword, VI pre-adds 16 to the SGPR address for flat_load, GFX9 uses
; global_load_dword offset:16, each followed by vmcnt(0) + cache invalidate.
5177 define amdgpu_kernel void @atomic_load_f32_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5178 ; SI-LABEL: atomic_load_f32_offset:
5179 ; SI: ; %bb.0: ; %entry
5180 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5181 ; SI-NEXT: s_mov_b32 s7, 0xf000
5182 ; SI-NEXT: s_mov_b32 s6, -1
5183 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5184 ; SI-NEXT: s_mov_b32 s4, s2
5185 ; SI-NEXT: s_mov_b32 s5, s3
5186 ; SI-NEXT: s_mov_b32 s2, s6
5187 ; SI-NEXT: s_mov_b32 s3, s7
5188 ; SI-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:16 glc
5189 ; SI-NEXT: s_waitcnt vmcnt(0)
5190 ; SI-NEXT: buffer_wbinvl1
5191 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5194 ; VI-LABEL: atomic_load_f32_offset:
5195 ; VI: ; %bb.0: ; %entry
5196 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5197 ; VI-NEXT: s_mov_b32 s7, 0xf000
5198 ; VI-NEXT: s_mov_b32 s6, -1
5199 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5200 ; VI-NEXT: s_add_u32 s0, s0, 16
5201 ; VI-NEXT: s_addc_u32 s1, s1, 0
5202 ; VI-NEXT: v_mov_b32_e32 v0, s0
5203 ; VI-NEXT: v_mov_b32_e32 v1, s1
5204 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5205 ; VI-NEXT: s_waitcnt vmcnt(0)
5206 ; VI-NEXT: buffer_wbinvl1_vol
5207 ; VI-NEXT: s_mov_b32 s4, s2
5208 ; VI-NEXT: s_mov_b32 s5, s3
5209 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5212 ; GFX9-LABEL: atomic_load_f32_offset:
5213 ; GFX9: ; %bb.0: ; %entry
5214 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5215 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5216 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5217 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
5218 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5219 ; GFX9-NEXT: buffer_wbinvl1_vol
5220 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5221 ; GFX9-NEXT: s_endpgm
5223 %gep = getelementptr float, ptr addrspace(1) %in, i64 4
5224 %val = load atomic float, ptr addrspace(1) %gep seq_cst, align 4
5225 store float %val, ptr addrspace(1) %out
; Base case: agent-scope seq_cst atomic i32 load with no offset, result stored to %out.
; All three targets issue the load glc and follow with s_waitcnt vmcnt(0) plus the
; target's L1 invalidate (buffer_wbinvl1 on SI, buffer_wbinvl1_vol on VI/GFX9).
5229 define amdgpu_kernel void @atomic_load_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5230 ; SI-LABEL: atomic_load_i32:
5231 ; SI: ; %bb.0: ; %entry
5232 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5233 ; SI-NEXT: s_mov_b32 s7, 0xf000
5234 ; SI-NEXT: s_mov_b32 s6, -1
5235 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5236 ; SI-NEXT: s_mov_b32 s4, s0
5237 ; SI-NEXT: s_mov_b32 s5, s1
5238 ; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 glc
5239 ; SI-NEXT: s_waitcnt vmcnt(0)
5240 ; SI-NEXT: buffer_wbinvl1
5241 ; SI-NEXT: s_mov_b32 s4, s2
5242 ; SI-NEXT: s_mov_b32 s5, s3
5243 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5246 ; VI-LABEL: atomic_load_i32:
5247 ; VI: ; %bb.0: ; %entry
5248 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5249 ; VI-NEXT: s_mov_b32 s7, 0xf000
5250 ; VI-NEXT: s_mov_b32 s6, -1
5251 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5252 ; VI-NEXT: v_mov_b32_e32 v0, s0
5253 ; VI-NEXT: v_mov_b32_e32 v1, s1
5254 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5255 ; VI-NEXT: s_waitcnt vmcnt(0)
5256 ; VI-NEXT: buffer_wbinvl1_vol
5257 ; VI-NEXT: s_mov_b32 s4, s2
5258 ; VI-NEXT: s_mov_b32 s5, s3
5259 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5262 ; GFX9-LABEL: atomic_load_i32:
5263 ; GFX9: ; %bb.0: ; %entry
5264 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5265 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5266 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5267 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
5268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5269 ; GFX9-NEXT: buffer_wbinvl1_vol
5270 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5271 ; GFX9-NEXT: s_endpgm
5273 %val = load atomic i32, ptr addrspace(1) %in syncscope("agent") seq_cst, align 4
5274 store i32 %val, ptr addrspace(1) %out
; seq_cst atomic i32 load from %in[%index + 4]: %index is scaled by 4 (s_lshl_b64 ..., 2)
; and added to the base; the constant +16 bytes folds as offset:16 on SI (addr64) and
; GFX9 (global), while VI must add both parts into the flat address explicitly.
5278 define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
5279 ; SI-LABEL: atomic_load_i32_addr64_offset:
5280 ; SI: ; %bb.0: ; %entry
5281 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5282 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5283 ; SI-NEXT: s_mov_b32 s3, 0xf000
5284 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5285 ; SI-NEXT: s_mov_b32 s0, s6
5286 ; SI-NEXT: s_mov_b32 s1, s7
5287 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
5288 ; SI-NEXT: s_mov_b32 s6, 0
5289 ; SI-NEXT: s_mov_b32 s7, s3
5290 ; SI-NEXT: v_mov_b32_e32 v0, s8
5291 ; SI-NEXT: v_mov_b32_e32 v1, s9
5292 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:16 glc
5293 ; SI-NEXT: s_waitcnt vmcnt(0)
5294 ; SI-NEXT: buffer_wbinvl1
5295 ; SI-NEXT: s_mov_b32 s2, -1
5296 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5299 ; VI-LABEL: atomic_load_i32_addr64_offset:
5300 ; VI: ; %bb.0: ; %entry
5301 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5302 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5303 ; VI-NEXT: s_mov_b32 s7, 0xf000
5304 ; VI-NEXT: s_mov_b32 s6, -1
5305 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5306 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5307 ; VI-NEXT: s_add_u32 s0, s0, s4
5308 ; VI-NEXT: s_addc_u32 s1, s1, s5
5309 ; VI-NEXT: s_add_u32 s0, s0, 16
5310 ; VI-NEXT: s_addc_u32 s1, s1, 0
5311 ; VI-NEXT: v_mov_b32_e32 v0, s0
5312 ; VI-NEXT: v_mov_b32_e32 v1, s1
5313 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5314 ; VI-NEXT: s_waitcnt vmcnt(0)
5315 ; VI-NEXT: buffer_wbinvl1_vol
5316 ; VI-NEXT: s_mov_b32 s4, s2
5317 ; VI-NEXT: s_mov_b32 s5, s3
5318 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5321 ; GFX9-LABEL: atomic_load_i32_addr64_offset:
5322 ; GFX9: ; %bb.0: ; %entry
5323 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5324 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5325 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5326 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5327 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5328 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5329 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5330 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
5331 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5332 ; GFX9-NEXT: buffer_wbinvl1_vol
5333 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
5334 ; GFX9-NEXT: s_endpgm
5336 %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index
5337 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
5338 %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
5339 store i32 %val, ptr addrspace(1) %out
; Same as atomic_load_i32_addr64_offset but with no constant offset: only the
; scaled %index (s_lshl_b64 ..., 2) is applied — SI via addr64 VGPRs, VI/GFX9 via
; a 64-bit SGPR add into the load address.
5343 define amdgpu_kernel void @atomic_load_i32_addr64(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
5344 ; SI-LABEL: atomic_load_i32_addr64:
5345 ; SI: ; %bb.0: ; %entry
5346 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5347 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5348 ; SI-NEXT: s_mov_b32 s3, 0xf000
5349 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5350 ; SI-NEXT: s_mov_b32 s0, s6
5351 ; SI-NEXT: s_mov_b32 s1, s7
5352 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
5353 ; SI-NEXT: s_mov_b32 s6, 0
5354 ; SI-NEXT: s_mov_b32 s7, s3
5355 ; SI-NEXT: v_mov_b32_e32 v0, s8
5356 ; SI-NEXT: v_mov_b32_e32 v1, s9
5357 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
5358 ; SI-NEXT: s_waitcnt vmcnt(0)
5359 ; SI-NEXT: buffer_wbinvl1
5360 ; SI-NEXT: s_mov_b32 s2, -1
5361 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5364 ; VI-LABEL: atomic_load_i32_addr64:
5365 ; VI: ; %bb.0: ; %entry
5366 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5367 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5368 ; VI-NEXT: s_mov_b32 s7, 0xf000
5369 ; VI-NEXT: s_mov_b32 s6, -1
5370 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5371 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5372 ; VI-NEXT: s_add_u32 s0, s0, s4
5373 ; VI-NEXT: s_addc_u32 s1, s1, s5
5374 ; VI-NEXT: v_mov_b32_e32 v0, s0
5375 ; VI-NEXT: v_mov_b32_e32 v1, s1
5376 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5377 ; VI-NEXT: s_waitcnt vmcnt(0)
5378 ; VI-NEXT: buffer_wbinvl1_vol
5379 ; VI-NEXT: s_mov_b32 s4, s2
5380 ; VI-NEXT: s_mov_b32 s5, s3
5381 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5384 ; GFX9-LABEL: atomic_load_i32_addr64:
5385 ; GFX9: ; %bb.0: ; %entry
5386 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5387 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5388 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5389 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5390 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5391 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5392 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5393 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] glc
5394 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5395 ; GFX9-NEXT: buffer_wbinvl1_vol
5396 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
5397 ; GFX9-NEXT: s_endpgm
5399 %ptr = getelementptr i32, ptr addrspace(1) %in, i64 %index
5400 %val = load atomic i32, ptr addrspace(1) %ptr seq_cst, align 4
5401 store i32 %val, ptr addrspace(1) %out
; float variant of atomic_load_i32_addr64_offset — identical machine code is expected
; (atomic f32 and i32 loads both lower to dword loads with the same addressing).
5405 define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr addrspace(1) %in, ptr addrspace(1) %out, i64 %index) {
5406 ; SI-LABEL: atomic_load_f32_addr64_offset:
5407 ; SI: ; %bb.0: ; %entry
5408 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5409 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd
5410 ; SI-NEXT: s_mov_b32 s3, 0xf000
5411 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5412 ; SI-NEXT: s_mov_b32 s0, s6
5413 ; SI-NEXT: s_mov_b32 s1, s7
5414 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
5415 ; SI-NEXT: s_mov_b32 s6, 0
5416 ; SI-NEXT: s_mov_b32 s7, s3
5417 ; SI-NEXT: v_mov_b32_e32 v0, s8
5418 ; SI-NEXT: v_mov_b32_e32 v1, s9
5419 ; SI-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 offset:16 glc
5420 ; SI-NEXT: s_waitcnt vmcnt(0)
5421 ; SI-NEXT: buffer_wbinvl1
5422 ; SI-NEXT: s_mov_b32 s2, -1
5423 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5426 ; VI-LABEL: atomic_load_f32_addr64_offset:
5427 ; VI: ; %bb.0: ; %entry
5428 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5429 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5430 ; VI-NEXT: s_mov_b32 s7, 0xf000
5431 ; VI-NEXT: s_mov_b32 s6, -1
5432 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5433 ; VI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5434 ; VI-NEXT: s_add_u32 s0, s0, s4
5435 ; VI-NEXT: s_addc_u32 s1, s1, s5
5436 ; VI-NEXT: s_add_u32 s0, s0, 16
5437 ; VI-NEXT: s_addc_u32 s1, s1, 0
5438 ; VI-NEXT: v_mov_b32_e32 v0, s0
5439 ; VI-NEXT: v_mov_b32_e32 v1, s1
5440 ; VI-NEXT: flat_load_dword v0, v[0:1] glc
5441 ; VI-NEXT: s_waitcnt vmcnt(0)
5442 ; VI-NEXT: buffer_wbinvl1_vol
5443 ; VI-NEXT: s_mov_b32 s4, s2
5444 ; VI-NEXT: s_mov_b32 s5, s3
5445 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
5448 ; GFX9-LABEL: atomic_load_f32_addr64_offset:
5449 ; GFX9: ; %bb.0: ; %entry
5450 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5451 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5452 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5453 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5454 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5455 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5456 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5457 ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] offset:16 glc
5458 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5459 ; GFX9-NEXT: buffer_wbinvl1_vol
5460 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
5461 ; GFX9-NEXT: s_endpgm
5463 %ptr = getelementptr float, ptr addrspace(1) %in, i64 %index
5464 %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4
5465 %val = load atomic float, ptr addrspace(1) %gep seq_cst, align 4
5466 store float %val, ptr addrspace(1) %out
; seq_cst atomic i32 store at byte offset 16. Stores emit no wait/invalidate sequence
; here (contrast the load tests): SI folds offset:16 into buffer_store_dword, VI adds
; 16 into the flat address, GFX9 uses global_store_dword offset:16.
5470 define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr addrspace(1) %out) {
5471 ; SI-LABEL: atomic_store_i32_offset:
5472 ; SI: ; %bb.0: ; %entry
5473 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
5474 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
5475 ; SI-NEXT: s_mov_b32 s3, 0xf000
5476 ; SI-NEXT: s_mov_b32 s2, -1
5477 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5478 ; SI-NEXT: v_mov_b32_e32 v0, s4
5479 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:16
5482 ; VI-LABEL: atomic_store_i32_offset:
5483 ; VI: ; %bb.0: ; %entry
5484 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5485 ; VI-NEXT: s_load_dword s4, s[0:1], 0x24
5486 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5487 ; VI-NEXT: s_add_u32 s0, s2, 16
5488 ; VI-NEXT: s_addc_u32 s1, s3, 0
5489 ; VI-NEXT: v_mov_b32_e32 v0, s0
5490 ; VI-NEXT: v_mov_b32_e32 v1, s1
5491 ; VI-NEXT: v_mov_b32_e32 v2, s4
5492 ; VI-NEXT: flat_store_dword v[0:1], v2
5495 ; GFX9-LABEL: atomic_store_i32_offset:
5496 ; GFX9: ; %bb.0: ; %entry
5497 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
5498 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5499 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5500 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5501 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5502 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3] offset:16
5503 ; GFX9-NEXT: s_endpgm
5505 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
5506 store atomic i32 %in, ptr addrspace(1) %gep seq_cst, align 4
; Base case: seq_cst atomic i32 store with no offset — lowers to a plain
; buffer/flat/global dword store on SI/VI/GFX9 respectively.
5510 define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr addrspace(1) %out) {
5511 ; SI-LABEL: atomic_store_i32:
5512 ; SI: ; %bb.0: ; %entry
5513 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
5514 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
5515 ; SI-NEXT: s_mov_b32 s3, 0xf000
5516 ; SI-NEXT: s_mov_b32 s2, -1
5517 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5518 ; SI-NEXT: v_mov_b32_e32 v0, s4
5519 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5522 ; VI-LABEL: atomic_store_i32:
5523 ; VI: ; %bb.0: ; %entry
5524 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5525 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
5526 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5527 ; VI-NEXT: v_mov_b32_e32 v0, s2
5528 ; VI-NEXT: v_mov_b32_e32 v1, s3
5529 ; VI-NEXT: v_mov_b32_e32 v2, s0
5530 ; VI-NEXT: flat_store_dword v[0:1], v2
5533 ; GFX9-LABEL: atomic_store_i32:
5534 ; GFX9: ; %bb.0: ; %entry
5535 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
5536 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5537 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5538 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5539 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5540 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5541 ; GFX9-NEXT: s_endpgm
5543 store atomic i32 %in, ptr addrspace(1) %out seq_cst, align 4
; float variant of atomic_store_i32 — identical dword-store lowering is expected.
5547 define amdgpu_kernel void @atomic_store_f32(float %in, ptr addrspace(1) %out) {
5548 ; SI-LABEL: atomic_store_f32:
5549 ; SI: ; %bb.0: ; %entry
5550 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
5551 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
5552 ; SI-NEXT: s_mov_b32 s3, 0xf000
5553 ; SI-NEXT: s_mov_b32 s2, -1
5554 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5555 ; SI-NEXT: v_mov_b32_e32 v0, s4
5556 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
5559 ; VI-LABEL: atomic_store_f32:
5560 ; VI: ; %bb.0: ; %entry
5561 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5562 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
5563 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5564 ; VI-NEXT: v_mov_b32_e32 v0, s2
5565 ; VI-NEXT: v_mov_b32_e32 v1, s3
5566 ; VI-NEXT: v_mov_b32_e32 v2, s0
5567 ; VI-NEXT: flat_store_dword v[0:1], v2
5570 ; GFX9-LABEL: atomic_store_f32:
5571 ; GFX9: ; %bb.0: ; %entry
5572 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
5573 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5574 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5575 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5576 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5577 ; GFX9-NEXT: global_store_dword v0, v1, s[2:3]
5578 ; GFX9-NEXT: s_endpgm
5580 store atomic float %in, ptr addrspace(1) %out seq_cst, align 4
; seq_cst atomic i32 store to %out[%index + 4]: scaled index (s_lshl_b64 ..., 2) plus a
; +16 byte constant. SI and GFX9 fold the constant as offset:16; VI adds it into the
; flat address with an extra s_add_u32/s_addc_u32 pair.
5584 define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr addrspace(1) %out, i64 %index) {
5585 ; SI-LABEL: atomic_store_i32_addr64_offset:
5586 ; SI: ; %bb.0: ; %entry
5587 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5588 ; SI-NEXT: s_load_dword s2, s[0:1], 0x9
5589 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5590 ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5591 ; SI-NEXT: s_mov_b32 s7, 0xf000
5592 ; SI-NEXT: s_mov_b32 s6, 0
5593 ; SI-NEXT: v_mov_b32_e32 v2, s2
5594 ; SI-NEXT: v_mov_b32_e32 v0, s0
5595 ; SI-NEXT: v_mov_b32_e32 v1, s1
5596 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:16
5599 ; VI-LABEL: atomic_store_i32_addr64_offset:
5600 ; VI: ; %bb.0: ; %entry
5601 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5602 ; VI-NEXT: s_load_dword s2, s[0:1], 0x24
5603 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5604 ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5605 ; VI-NEXT: s_add_u32 s0, s4, s0
5606 ; VI-NEXT: s_addc_u32 s1, s5, s1
5607 ; VI-NEXT: s_add_u32 s0, s0, 16
5608 ; VI-NEXT: s_addc_u32 s1, s1, 0
5609 ; VI-NEXT: v_mov_b32_e32 v0, s0
5610 ; VI-NEXT: v_mov_b32_e32 v1, s1
5611 ; VI-NEXT: v_mov_b32_e32 v2, s2
5612 ; VI-NEXT: flat_store_dword v[0:1], v2
5615 ; GFX9-LABEL: atomic_store_i32_addr64_offset:
5616 ; GFX9: ; %bb.0: ; %entry
5617 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5618 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
5619 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5620 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5621 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5622 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5623 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5624 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5625 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16
5626 ; GFX9-NEXT: s_endpgm
5628 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
5629 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
5630 store atomic i32 %in, ptr addrspace(1) %gep seq_cst, align 4
; float variant of atomic_store_i32_addr64_offset — same dword-store lowering expected.
5634 define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr addrspace(1) %out, i64 %index) {
5635 ; SI-LABEL: atomic_store_f32_addr64_offset:
5636 ; SI: ; %bb.0: ; %entry
5637 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5638 ; SI-NEXT: s_load_dword s2, s[0:1], 0x9
5639 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5640 ; SI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5641 ; SI-NEXT: s_mov_b32 s7, 0xf000
5642 ; SI-NEXT: s_mov_b32 s6, 0
5643 ; SI-NEXT: v_mov_b32_e32 v2, s2
5644 ; SI-NEXT: v_mov_b32_e32 v0, s0
5645 ; SI-NEXT: v_mov_b32_e32 v1, s1
5646 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64 offset:16
5649 ; VI-LABEL: atomic_store_f32_addr64_offset:
5650 ; VI: ; %bb.0: ; %entry
5651 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5652 ; VI-NEXT: s_load_dword s2, s[0:1], 0x24
5653 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5654 ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5655 ; VI-NEXT: s_add_u32 s0, s4, s0
5656 ; VI-NEXT: s_addc_u32 s1, s5, s1
5657 ; VI-NEXT: s_add_u32 s0, s0, 16
5658 ; VI-NEXT: s_addc_u32 s1, s1, 0
5659 ; VI-NEXT: v_mov_b32_e32 v0, s0
5660 ; VI-NEXT: v_mov_b32_e32 v1, s1
5661 ; VI-NEXT: v_mov_b32_e32 v2, s2
5662 ; VI-NEXT: flat_store_dword v[0:1], v2
5665 ; GFX9-LABEL: atomic_store_f32_addr64_offset:
5666 ; GFX9: ; %bb.0: ; %entry
5667 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5668 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
5669 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5670 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5671 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5672 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5673 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5674 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5675 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:16
5676 ; GFX9-NEXT: s_endpgm
5678 %ptr = getelementptr float, ptr addrspace(1) %out, i64 %index
5679 %gep = getelementptr float, ptr addrspace(1) %ptr, i64 4
5680 store atomic float %in, ptr addrspace(1) %gep seq_cst, align 4
; seq_cst atomic i32 store at %out[%index] with no constant offset: SI keeps the scaled
; index in v[0:1] (buffer addr64), VI/GFX9 add it into the SGPR address pair.
5684 define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr addrspace(1) %out, i64 %index) {
5685 ; SI-LABEL: atomic_store_i32_addr64:
5686 ; SI: ; %bb.0: ; %entry
5687 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5688 ; SI-NEXT: s_load_dword s8, s[0:1], 0x9
5689 ; SI-NEXT: s_mov_b32 s3, 0xf000
5690 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5691 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
5692 ; SI-NEXT: s_mov_b32 s2, 0
5693 ; SI-NEXT: s_mov_b64 s[0:1], s[4:5]
5694 ; SI-NEXT: v_mov_b32_e32 v2, s8
5695 ; SI-NEXT: v_mov_b32_e32 v0, s6
5696 ; SI-NEXT: v_mov_b32_e32 v1, s7
5697 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
5700 ; VI-LABEL: atomic_store_i32_addr64:
5701 ; VI: ; %bb.0: ; %entry
5702 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5703 ; VI-NEXT: s_load_dword s2, s[0:1], 0x24
5704 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5705 ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5706 ; VI-NEXT: s_add_u32 s0, s4, s0
5707 ; VI-NEXT: s_addc_u32 s1, s5, s1
5708 ; VI-NEXT: v_mov_b32_e32 v0, s0
5709 ; VI-NEXT: v_mov_b32_e32 v1, s1
5710 ; VI-NEXT: v_mov_b32_e32 v2, s2
5711 ; VI-NEXT: flat_store_dword v[0:1], v2
5714 ; GFX9-LABEL: atomic_store_i32_addr64:
5715 ; GFX9: ; %bb.0: ; %entry
5716 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5717 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
5718 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5719 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5720 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5721 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5722 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5723 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5724 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5725 ; GFX9-NEXT: s_endpgm
5727 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
5728 store atomic i32 %in, ptr addrspace(1) %ptr seq_cst, align 4
; float variant of atomic_store_i32_addr64 — same dword-store lowering expected.
5732 define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr addrspace(1) %out, i64 %index) {
5733 ; SI-LABEL: atomic_store_f32_addr64:
5734 ; SI: ; %bb.0: ; %entry
5735 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5736 ; SI-NEXT: s_load_dword s8, s[0:1], 0x9
5737 ; SI-NEXT: s_mov_b32 s3, 0xf000
5738 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5739 ; SI-NEXT: s_lshl_b64 s[6:7], s[6:7], 2
5740 ; SI-NEXT: s_mov_b32 s2, 0
5741 ; SI-NEXT: s_mov_b64 s[0:1], s[4:5]
5742 ; SI-NEXT: v_mov_b32_e32 v2, s8
5743 ; SI-NEXT: v_mov_b32_e32 v0, s6
5744 ; SI-NEXT: v_mov_b32_e32 v1, s7
5745 ; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
5748 ; VI-LABEL: atomic_store_f32_addr64:
5749 ; VI: ; %bb.0: ; %entry
5750 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5751 ; VI-NEXT: s_load_dword s2, s[0:1], 0x24
5752 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5753 ; VI-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5754 ; VI-NEXT: s_add_u32 s0, s4, s0
5755 ; VI-NEXT: s_addc_u32 s1, s5, s1
5756 ; VI-NEXT: v_mov_b32_e32 v0, s0
5757 ; VI-NEXT: v_mov_b32_e32 v1, s1
5758 ; VI-NEXT: v_mov_b32_e32 v2, s2
5759 ; VI-NEXT: flat_store_dword v[0:1], v2
5762 ; GFX9-LABEL: atomic_store_f32_addr64:
5763 ; GFX9: ; %bb.0: ; %entry
5764 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5765 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x24
5766 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5767 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5768 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5769 ; GFX9-NEXT: s_add_u32 s0, s4, s0
5770 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
5771 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
5772 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1]
5773 ; GFX9-NEXT: s_endpgm
5775 %ptr = getelementptr float, ptr addrspace(1) %out, i64 %index
5776 store atomic float %in, ptr addrspace(1) %ptr seq_cst, align 4
; seq_cst atomic i8 load at byte offset 16. Note VI uses buffer_load_ubyte offset:16
; here (unlike the i32 offset case, which went through flat_load on VI); all targets
; still emit the vmcnt(0) + invalidate acquire sequence after the glc load.
5780 define amdgpu_kernel void @atomic_load_i8_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5781 ; SI-LABEL: atomic_load_i8_offset:
5782 ; SI: ; %bb.0: ; %entry
5783 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5784 ; SI-NEXT: s_mov_b32 s7, 0xf000
5785 ; SI-NEXT: s_mov_b32 s6, -1
5786 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5787 ; SI-NEXT: s_mov_b32 s4, s2
5788 ; SI-NEXT: s_mov_b32 s5, s3
5789 ; SI-NEXT: s_mov_b32 s2, s6
5790 ; SI-NEXT: s_mov_b32 s3, s7
5791 ; SI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 glc
5792 ; SI-NEXT: s_waitcnt vmcnt(0)
5793 ; SI-NEXT: buffer_wbinvl1
5794 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
5797 ; VI-LABEL: atomic_load_i8_offset:
5798 ; VI: ; %bb.0: ; %entry
5799 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5800 ; VI-NEXT: s_mov_b32 s7, 0xf000
5801 ; VI-NEXT: s_mov_b32 s6, -1
5802 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5803 ; VI-NEXT: s_mov_b32 s4, s2
5804 ; VI-NEXT: s_mov_b32 s5, s3
5805 ; VI-NEXT: s_mov_b32 s2, s6
5806 ; VI-NEXT: s_mov_b32 s3, s7
5807 ; VI-NEXT: buffer_load_ubyte v0, off, s[0:3], 0 offset:16 glc
5808 ; VI-NEXT: s_waitcnt vmcnt(0)
5809 ; VI-NEXT: buffer_wbinvl1_vol
5810 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
5813 ; GFX9-LABEL: atomic_load_i8_offset:
5814 ; GFX9: ; %bb.0: ; %entry
5815 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5816 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5817 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5818 ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:16 glc
5819 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5820 ; GFX9-NEXT: buffer_wbinvl1_vol
5821 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
5822 ; GFX9-NEXT: s_endpgm
5824 %gep = getelementptr i8, ptr addrspace(1) %in, i64 16
5825 %val = load atomic i8, ptr addrspace(1) %gep seq_cst, align 1
5826 store i8 %val, ptr addrspace(1) %out
; seq_cst atomic i8 load at byte offset -512: the negative offset cannot be encoded
; on SI/VI (SI uses addr64 VGPRs, VI a 64-bit SGPR add before flat_load), while GFX9
; folds it as global_load_ubyte offset:-512.
5830 define amdgpu_kernel void @atomic_load_i8_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5831 ; SI-LABEL: atomic_load_i8_negoffset:
5832 ; SI: ; %bb.0: ; %entry
5833 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5834 ; SI-NEXT: s_mov_b32 s7, 0xf000
5835 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5836 ; SI-NEXT: s_mov_b32 s4, s2
5837 ; SI-NEXT: s_mov_b32 s5, s3
5838 ; SI-NEXT: s_mov_b32 s2, 0
5839 ; SI-NEXT: s_mov_b32 s3, s7
5840 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00
5841 ; SI-NEXT: v_mov_b32_e32 v1, -1
5842 ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 glc
5843 ; SI-NEXT: s_waitcnt vmcnt(0)
5844 ; SI-NEXT: buffer_wbinvl1
5845 ; SI-NEXT: s_mov_b32 s6, -1
5846 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
5849 ; VI-LABEL: atomic_load_i8_negoffset:
5850 ; VI: ; %bb.0: ; %entry
5851 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5852 ; VI-NEXT: s_mov_b32 s7, 0xf000
5853 ; VI-NEXT: s_mov_b32 s6, -1
5854 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5855 ; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
5856 ; VI-NEXT: s_addc_u32 s1, s1, -1
5857 ; VI-NEXT: v_mov_b32_e32 v0, s0
5858 ; VI-NEXT: v_mov_b32_e32 v1, s1
5859 ; VI-NEXT: flat_load_ubyte v0, v[0:1] glc
5860 ; VI-NEXT: s_waitcnt vmcnt(0)
5861 ; VI-NEXT: buffer_wbinvl1_vol
5862 ; VI-NEXT: s_mov_b32 s4, s2
5863 ; VI-NEXT: s_mov_b32 s5, s3
5864 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0
5867 ; GFX9-LABEL: atomic_load_i8_negoffset:
5868 ; GFX9: ; %bb.0: ; %entry
5869 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5870 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5871 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5872 ; GFX9-NEXT: global_load_ubyte v1, v0, s[0:1] offset:-512 glc
5873 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5874 ; GFX9-NEXT: buffer_wbinvl1_vol
5875 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
5876 ; GFX9-NEXT: s_endpgm
5878 %gep = getelementptr i8, ptr addrspace(1) %in, i64 -512
5879 %val = load atomic i8, ptr addrspace(1) %gep seq_cst, align 1
5880 store i8 %val, ptr addrspace(1) %out
; seq_cst atomic i8 store at byte offset 16: SI folds offset:16 into buffer_store_byte,
; VI adds 16 into the flat address, GFX9 uses global_store_byte offset:16.
5884 define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr addrspace(1) %out) {
5885 ; SI-LABEL: atomic_store_i8_offset:
5886 ; SI: ; %bb.0: ; %entry
5887 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
5888 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
5889 ; SI-NEXT: s_mov_b32 s3, 0xf000
5890 ; SI-NEXT: s_mov_b32 s2, -1
5891 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5892 ; SI-NEXT: v_mov_b32_e32 v0, s4
5893 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 offset:16
5896 ; VI-LABEL: atomic_store_i8_offset:
5897 ; VI: ; %bb.0: ; %entry
5898 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5899 ; VI-NEXT: s_load_dword s4, s[0:1], 0x24
5900 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5901 ; VI-NEXT: s_add_u32 s0, s2, 16
5902 ; VI-NEXT: s_addc_u32 s1, s3, 0
5903 ; VI-NEXT: v_mov_b32_e32 v0, s0
5904 ; VI-NEXT: v_mov_b32_e32 v1, s1
5905 ; VI-NEXT: v_mov_b32_e32 v2, s4
5906 ; VI-NEXT: flat_store_byte v[0:1], v2
5909 ; GFX9-LABEL: atomic_store_i8_offset:
5910 ; GFX9: ; %bb.0: ; %entry
5911 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
5912 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5913 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5914 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5915 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5916 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3] offset:16
5917 ; GFX9-NEXT: s_endpgm
5919 %gep = getelementptr i8, ptr addrspace(1) %out, i64 16
5920 store atomic i8 %in, ptr addrspace(1) %gep seq_cst, align 1
; Base case: seq_cst atomic i8 store with no offset — plain byte store on all targets.
5924 define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr addrspace(1) %out) {
5925 ; SI-LABEL: atomic_store_i8:
5926 ; SI: ; %bb.0: ; %entry
5927 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
5928 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
5929 ; SI-NEXT: s_mov_b32 s3, 0xf000
5930 ; SI-NEXT: s_mov_b32 s2, -1
5931 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5932 ; SI-NEXT: v_mov_b32_e32 v0, s4
5933 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
5936 ; VI-LABEL: atomic_store_i8:
5937 ; VI: ; %bb.0: ; %entry
5938 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5939 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
5940 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5941 ; VI-NEXT: v_mov_b32_e32 v0, s2
5942 ; VI-NEXT: v_mov_b32_e32 v1, s3
5943 ; VI-NEXT: v_mov_b32_e32 v2, s0
5944 ; VI-NEXT: flat_store_byte v[0:1], v2
5947 ; GFX9-LABEL: atomic_store_i8:
5948 ; GFX9: ; %bb.0: ; %entry
5949 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
5950 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5951 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5952 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5953 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
5954 ; GFX9-NEXT: global_store_byte v0, v1, s[2:3]
5955 ; GFX9-NEXT: s_endpgm
5957 store atomic i8 %in, ptr addrspace(1) %out seq_cst, align 1
; seq_cst atomic i16 load at element 8 (byte offset 16): like the i8 offset case,
; VI stays on buffer addressing (buffer_load_ushort offset:16); GFX9 uses
; global_load_ushort offset:16; acquire sequence (vmcnt(0) + invalidate) follows.
5961 define amdgpu_kernel void @atomic_load_i16_offset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
5962 ; SI-LABEL: atomic_load_i16_offset:
5963 ; SI: ; %bb.0: ; %entry
5964 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5965 ; SI-NEXT: s_mov_b32 s7, 0xf000
5966 ; SI-NEXT: s_mov_b32 s6, -1
5967 ; SI-NEXT: s_waitcnt lgkmcnt(0)
5968 ; SI-NEXT: s_mov_b32 s4, s2
5969 ; SI-NEXT: s_mov_b32 s5, s3
5970 ; SI-NEXT: s_mov_b32 s2, s6
5971 ; SI-NEXT: s_mov_b32 s3, s7
5972 ; SI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
5973 ; SI-NEXT: s_waitcnt vmcnt(0)
5974 ; SI-NEXT: buffer_wbinvl1
5975 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
5978 ; VI-LABEL: atomic_load_i16_offset:
5979 ; VI: ; %bb.0: ; %entry
5980 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5981 ; VI-NEXT: s_mov_b32 s7, 0xf000
5982 ; VI-NEXT: s_mov_b32 s6, -1
5983 ; VI-NEXT: s_waitcnt lgkmcnt(0)
5984 ; VI-NEXT: s_mov_b32 s4, s2
5985 ; VI-NEXT: s_mov_b32 s5, s3
5986 ; VI-NEXT: s_mov_b32 s2, s6
5987 ; VI-NEXT: s_mov_b32 s3, s7
5988 ; VI-NEXT: buffer_load_ushort v0, off, s[0:3], 0 offset:16 glc
5989 ; VI-NEXT: s_waitcnt vmcnt(0)
5990 ; VI-NEXT: buffer_wbinvl1_vol
5991 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
5994 ; GFX9-LABEL: atomic_load_i16_offset:
5995 ; GFX9: ; %bb.0: ; %entry
5996 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5997 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
5998 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5999 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:16 glc
6000 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6001 ; GFX9-NEXT: buffer_wbinvl1_vol
6002 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
6003 ; GFX9-NEXT: s_endpgm
6005 %gep = getelementptr i16, ptr addrspace(1) %in, i64 8
6006 %val = load atomic i16, ptr addrspace(1) %gep seq_cst, align 2
6007 store i16 %val, ptr addrspace(1) %out
; seq_cst atomic i16 load at element -256 (byte offset -512): SI uses addr64 VGPRs,
; VI adds the negative offset into the flat address, GFX9 folds it as offset:-512.
6011 define amdgpu_kernel void @atomic_load_i16_negoffset(ptr addrspace(1) %in, ptr addrspace(1) %out) {
6012 ; SI-LABEL: atomic_load_i16_negoffset:
6013 ; SI: ; %bb.0: ; %entry
6014 ; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6015 ; SI-NEXT: s_mov_b32 s7, 0xf000
6016 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6017 ; SI-NEXT: s_mov_b32 s4, s2
6018 ; SI-NEXT: s_mov_b32 s5, s3
6019 ; SI-NEXT: s_mov_b32 s2, 0
6020 ; SI-NEXT: s_mov_b32 s3, s7
6021 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffffe00
6022 ; SI-NEXT: v_mov_b32_e32 v1, -1
6023 ; SI-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64 glc
6024 ; SI-NEXT: s_waitcnt vmcnt(0)
6025 ; SI-NEXT: buffer_wbinvl1
6026 ; SI-NEXT: s_mov_b32 s6, -1
6027 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
6030 ; VI-LABEL: atomic_load_i16_negoffset:
6031 ; VI: ; %bb.0: ; %entry
6032 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6033 ; VI-NEXT: s_mov_b32 s7, 0xf000
6034 ; VI-NEXT: s_mov_b32 s6, -1
6035 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6036 ; VI-NEXT: s_add_u32 s0, s0, 0xfffffe00
6037 ; VI-NEXT: s_addc_u32 s1, s1, -1
6038 ; VI-NEXT: v_mov_b32_e32 v0, s0
6039 ; VI-NEXT: v_mov_b32_e32 v1, s1
6040 ; VI-NEXT: flat_load_ushort v0, v[0:1] glc
6041 ; VI-NEXT: s_waitcnt vmcnt(0)
6042 ; VI-NEXT: buffer_wbinvl1_vol
6043 ; VI-NEXT: s_mov_b32 s4, s2
6044 ; VI-NEXT: s_mov_b32 s5, s3
6045 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0
6048 ; GFX9-LABEL: atomic_load_i16_negoffset:
6049 ; GFX9: ; %bb.0: ; %entry
6050 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6051 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6052 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6053 ; GFX9-NEXT: global_load_ushort v1, v0, s[0:1] offset:-512 glc
6054 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6055 ; GFX9-NEXT: buffer_wbinvl1_vol
6056 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
6057 ; GFX9-NEXT: s_endpgm
6059 %gep = getelementptr i16, ptr addrspace(1) %in, i64 -256
6060 %val = load atomic i16, ptr addrspace(1) %gep seq_cst, align 2
6061 store i16 %val, ptr addrspace(1) %out
; Test: seq_cst atomic i16 store at +16 B. A release-ordered store on these
; targets lowers to a plain short store (no cache-invalidate needed); VI uses
; flat addressing and so must add the offset into the pointer manually.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6065 define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr addrspace(1) %out) {
6066 ; SI-LABEL: atomic_store_i16_offset:
6067 ; SI: ; %bb.0: ; %entry
6068 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
6069 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
6070 ; SI-NEXT: s_mov_b32 s3, 0xf000
6071 ; SI-NEXT: s_mov_b32 s2, -1
6072 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6073 ; SI-NEXT: v_mov_b32_e32 v0, s4
6074 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16
6077 ; VI-LABEL: atomic_store_i16_offset:
6078 ; VI: ; %bb.0: ; %entry
6079 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6080 ; VI-NEXT: s_load_dword s4, s[0:1], 0x24
6081 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6082 ; VI-NEXT: s_add_u32 s0, s2, 16
6083 ; VI-NEXT: s_addc_u32 s1, s3, 0
6084 ; VI-NEXT: v_mov_b32_e32 v0, s0
6085 ; VI-NEXT: v_mov_b32_e32 v1, s1
6086 ; VI-NEXT: v_mov_b32_e32 v2, s4
6087 ; VI-NEXT: flat_store_short v[0:1], v2
6090 ; GFX9-LABEL: atomic_store_i16_offset:
6091 ; GFX9: ; %bb.0: ; %entry
6092 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
6093 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6094 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6095 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6096 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6097 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16
6098 ; GFX9-NEXT: s_endpgm
; i16 element 8 => byte offset 16.
6100 %gep = getelementptr i16, ptr addrspace(1) %out, i64 8
6101 store atomic i16 %in, ptr addrspace(1) %gep seq_cst, align 2
; Test: seq_cst atomic i16 store with no offset — baseline for the
; *_offset variant above; lowers to a plain short store on every target.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6105 define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr addrspace(1) %out) {
6106 ; SI-LABEL: atomic_store_i16:
6107 ; SI: ; %bb.0: ; %entry
6108 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
6109 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
6110 ; SI-NEXT: s_mov_b32 s3, 0xf000
6111 ; SI-NEXT: s_mov_b32 s2, -1
6112 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6113 ; SI-NEXT: v_mov_b32_e32 v0, s4
6114 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
6117 ; VI-LABEL: atomic_store_i16:
6118 ; VI: ; %bb.0: ; %entry
6119 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6120 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
6121 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6122 ; VI-NEXT: v_mov_b32_e32 v0, s2
6123 ; VI-NEXT: v_mov_b32_e32 v1, s3
6124 ; VI-NEXT: v_mov_b32_e32 v2, s0
6125 ; VI-NEXT: flat_store_short v[0:1], v2
6128 ; GFX9-LABEL: atomic_store_i16:
6129 ; GFX9: ; %bb.0: ; %entry
6130 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
6131 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6132 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6133 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6134 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6135 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
6136 ; GFX9-NEXT: s_endpgm
6138 store atomic i16 %in, ptr addrspace(1) %out seq_cst, align 2
; Test: seq_cst atomic half (f16) store at +16 B — codegen should be
; identical to the i16 variant (the bit pattern is simply stored).
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6142 define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr addrspace(1) %out) {
6143 ; SI-LABEL: atomic_store_f16_offset:
6144 ; SI: ; %bb.0: ; %entry
6145 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
6146 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
6147 ; SI-NEXT: s_mov_b32 s3, 0xf000
6148 ; SI-NEXT: s_mov_b32 s2, -1
6149 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6150 ; SI-NEXT: v_mov_b32_e32 v0, s4
6151 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 offset:16
6154 ; VI-LABEL: atomic_store_f16_offset:
6155 ; VI: ; %bb.0: ; %entry
6156 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6157 ; VI-NEXT: s_load_dword s4, s[0:1], 0x24
6158 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6159 ; VI-NEXT: s_add_u32 s0, s2, 16
6160 ; VI-NEXT: s_addc_u32 s1, s3, 0
6161 ; VI-NEXT: v_mov_b32_e32 v0, s0
6162 ; VI-NEXT: v_mov_b32_e32 v1, s1
6163 ; VI-NEXT: v_mov_b32_e32 v2, s4
6164 ; VI-NEXT: flat_store_short v[0:1], v2
6167 ; GFX9-LABEL: atomic_store_f16_offset:
6168 ; GFX9: ; %bb.0: ; %entry
6169 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
6170 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6171 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6172 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6173 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6174 ; GFX9-NEXT: global_store_short v0, v1, s[2:3] offset:16
6175 ; GFX9-NEXT: s_endpgm
; half element 8 => byte offset 16.
6177 %gep = getelementptr half, ptr addrspace(1) %out, i64 8
6178 store atomic half %in, ptr addrspace(1) %gep seq_cst, align 2
; Test: seq_cst atomic half (f16) store, no offset — mirrors atomic_store_i16.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6182 define amdgpu_kernel void @atomic_store_f16(half %in, ptr addrspace(1) %out) {
6183 ; SI-LABEL: atomic_store_f16:
6184 ; SI: ; %bb.0: ; %entry
6185 ; SI-NEXT: s_load_dword s4, s[0:1], 0x9
6186 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
6187 ; SI-NEXT: s_mov_b32 s3, 0xf000
6188 ; SI-NEXT: s_mov_b32 s2, -1
6189 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6190 ; SI-NEXT: v_mov_b32_e32 v0, s4
6191 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
6194 ; VI-LABEL: atomic_store_f16:
6195 ; VI: ; %bb.0: ; %entry
6196 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6197 ; VI-NEXT: s_load_dword s0, s[0:1], 0x24
6198 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6199 ; VI-NEXT: v_mov_b32_e32 v0, s2
6200 ; VI-NEXT: v_mov_b32_e32 v1, s3
6201 ; VI-NEXT: v_mov_b32_e32 v2, s0
6202 ; VI-NEXT: flat_store_short v[0:1], v2
6205 ; GFX9-LABEL: atomic_store_f16:
6206 ; GFX9: ; %bb.0: ; %entry
6207 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x24
6208 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6209 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6210 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6211 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6212 ; GFX9-NEXT: global_store_short v0, v1, s[2:3]
6213 ; GFX9-NEXT: s_endpgm
6215 store atomic half %in, ptr addrspace(1) %out seq_cst, align 2
; Test: agent-scope seq_cst uinc_wrap atomicrmw at +16 B, result unused —
; expects the no-return form (no glc) of buffer/global_atomic_inc, followed
; by a cache invalidate for the seq_cst acquire side.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6219 define amdgpu_kernel void @atomic_inc_i32_offset(ptr addrspace(1) %out, i32 %in) {
6220 ; SI-LABEL: atomic_inc_i32_offset:
6221 ; SI: ; %bb.0: ; %entry
6222 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
6223 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6224 ; SI-NEXT: s_mov_b32 s3, 0xf000
6225 ; SI-NEXT: s_mov_b32 s2, -1
6226 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6227 ; SI-NEXT: v_mov_b32_e32 v0, s4
6228 ; SI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16
6229 ; SI-NEXT: s_waitcnt vmcnt(0)
6230 ; SI-NEXT: buffer_wbinvl1
6233 ; VI-LABEL: atomic_inc_i32_offset:
6234 ; VI: ; %bb.0: ; %entry
6235 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6236 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
6237 ; VI-NEXT: s_mov_b32 s3, 0xf000
6238 ; VI-NEXT: s_mov_b32 s2, -1
6239 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6240 ; VI-NEXT: v_mov_b32_e32 v0, s4
6241 ; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], 0 offset:16
6242 ; VI-NEXT: s_waitcnt vmcnt(0)
6243 ; VI-NEXT: buffer_wbinvl1_vol
6246 ; GFX9-LABEL: atomic_inc_i32_offset:
6247 ; GFX9: ; %bb.0: ; %entry
6248 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6249 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6250 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6251 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6252 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6253 ; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:16
6254 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6255 ; GFX9-NEXT: buffer_wbinvl1_vol
6256 ; GFX9-NEXT: s_endpgm
; i32 element 4 => byte offset 16.
6258 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6259 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: uinc_wrap at the most-negative encodable GFX9 offset (-4096 B).
; GFX9 folds it into offset:-4096; SI/VI must materialize the address
; (addr64 VGPR pair on SI, scalar 64-bit add + flat on VI).
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6263 define amdgpu_kernel void @atomic_inc_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
6264 ; SI-LABEL: atomic_inc_i32_max_neg_offset:
6265 ; SI: ; %bb.0: ; %entry
6266 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
6267 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6268 ; SI-NEXT: s_mov_b32 s3, 0xf000
6269 ; SI-NEXT: s_mov_b32 s2, 0
6270 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000
6271 ; SI-NEXT: v_mov_b32_e32 v1, -1
6272 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6273 ; SI-NEXT: v_mov_b32_e32 v2, s4
6274 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64
6275 ; SI-NEXT: s_waitcnt vmcnt(0)
6276 ; SI-NEXT: buffer_wbinvl1
6279 ; VI-LABEL: atomic_inc_i32_max_neg_offset:
6280 ; VI: ; %bb.0: ; %entry
6281 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6282 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6283 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6284 ; VI-NEXT: s_add_u32 s0, s2, 0xfffff000
6285 ; VI-NEXT: s_addc_u32 s1, s3, -1
6286 ; VI-NEXT: v_mov_b32_e32 v0, s0
6287 ; VI-NEXT: v_mov_b32_e32 v1, s1
6288 ; VI-NEXT: v_mov_b32_e32 v2, s4
6289 ; VI-NEXT: flat_atomic_inc v[0:1], v2
6290 ; VI-NEXT: s_waitcnt vmcnt(0)
6291 ; VI-NEXT: buffer_wbinvl1_vol
6294 ; GFX9-LABEL: atomic_inc_i32_max_neg_offset:
6295 ; GFX9: ; %bb.0: ; %entry
6296 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6297 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6298 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6299 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6300 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6301 ; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-4096
6302 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6303 ; GFX9-NEXT: buffer_wbinvl1_vol
6304 ; GFX9-NEXT: s_endpgm
; i32 element -1024 => byte offset -4096.
6306 %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
6307 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: uinc_wrap at +36000 B — too large for the 12-bit MUBUF immediate,
; so SI/VI put 0x8ca0 in an SGPR soffset; GFX9 splits it into a 0x8000
; VGPR base plus the encodable remainder offset:3232.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6311 define amdgpu_kernel void @atomic_inc_i32_soffset(ptr addrspace(1) %out, i32 %in) {
6312 ; SI-LABEL: atomic_inc_i32_soffset:
6313 ; SI: ; %bb.0: ; %entry
6314 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
6315 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6316 ; SI-NEXT: s_mov_b32 s3, 0xf000
6317 ; SI-NEXT: s_mov_b32 s2, -1
6318 ; SI-NEXT: s_mov_b32 s5, 0x8ca0
6319 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6320 ; SI-NEXT: v_mov_b32_e32 v0, s4
6321 ; SI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5
6322 ; SI-NEXT: s_waitcnt vmcnt(0)
6323 ; SI-NEXT: buffer_wbinvl1
6326 ; VI-LABEL: atomic_inc_i32_soffset:
6327 ; VI: ; %bb.0: ; %entry
6328 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6329 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
6330 ; VI-NEXT: s_mov_b32 s3, 0xf000
6331 ; VI-NEXT: s_mov_b32 s2, -1
6332 ; VI-NEXT: s_mov_b32 s5, 0x8ca0
6333 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6334 ; VI-NEXT: v_mov_b32_e32 v0, s4
6335 ; VI-NEXT: buffer_atomic_inc v0, off, s[0:3], s5
6336 ; VI-NEXT: s_waitcnt vmcnt(0)
6337 ; VI-NEXT: buffer_wbinvl1_vol
6340 ; GFX9-LABEL: atomic_inc_i32_soffset:
6341 ; GFX9: ; %bb.0: ; %entry
6342 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6343 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6344 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000
6345 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6346 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6347 ; GFX9-NEXT: global_atomic_inc v0, v1, s[2:3] offset:3232
6348 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6349 ; GFX9-NEXT: buffer_wbinvl1_vol
6350 ; GFX9-NEXT: s_endpgm
; i32 element 9000 => byte offset 36000 (0x8ca0).
6352 %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
6353 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: uinc_wrap at a >32-bit byte offset (element 47224239175595) — cannot
; be folded anywhere, so every target performs a full 64-bit address add
; (0xabcd_0000deac split across the low/high words).
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6357 define amdgpu_kernel void @atomic_inc_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
6358 ; SI-LABEL: atomic_inc_i32_huge_offset:
6359 ; SI: ; %bb.0: ; %entry
6360 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
6361 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb
6362 ; SI-NEXT: s_mov_b32 s7, 0xf000
6363 ; SI-NEXT: s_mov_b32 s6, 0
6364 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac
6365 ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd
6366 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6367 ; SI-NEXT: v_mov_b32_e32 v2, s0
6368 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64
6369 ; SI-NEXT: s_waitcnt vmcnt(0)
6370 ; SI-NEXT: buffer_wbinvl1
6373 ; VI-LABEL: atomic_inc_i32_huge_offset:
6374 ; VI: ; %bb.0: ; %entry
6375 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6376 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6377 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6378 ; VI-NEXT: s_add_u32 s0, s2, 0xdeac
6379 ; VI-NEXT: s_addc_u32 s1, s3, 0xabcd
6380 ; VI-NEXT: v_mov_b32_e32 v0, s0
6381 ; VI-NEXT: v_mov_b32_e32 v1, s1
6382 ; VI-NEXT: v_mov_b32_e32 v2, s4
6383 ; VI-NEXT: flat_atomic_inc v[0:1], v2
6384 ; VI-NEXT: s_waitcnt vmcnt(0)
6385 ; VI-NEXT: buffer_wbinvl1_vol
6388 ; GFX9-LABEL: atomic_inc_i32_huge_offset:
6389 ; GFX9: ; %bb.0: ; %entry
6390 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6391 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6392 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6393 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6394 ; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac
6395 ; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd
6396 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6397 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1]
6398 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6399 ; GFX9-NEXT: buffer_wbinvl1_vol
6400 ; GFX9-NEXT: s_endpgm
; 47224239175595 * 4 = 0xabcd0000deac bytes.
6402 %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
6403 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: uinc_wrap at +16 B with the result used — expects the returning
; form (glc) of the atomic, then a store of the old value to %out2.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6407 define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
6408 ; SI-LABEL: atomic_inc_i32_ret_offset:
6409 ; SI: ; %bb.0: ; %entry
6410 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
6411 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
6412 ; SI-NEXT: s_mov_b32 s3, 0xf000
6413 ; SI-NEXT: s_mov_b32 s2, -1
6414 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6415 ; SI-NEXT: s_mov_b32 s0, s6
6416 ; SI-NEXT: s_mov_b32 s1, s7
6417 ; SI-NEXT: s_mov_b32 s6, s2
6418 ; SI-NEXT: s_mov_b32 s7, s3
6419 ; SI-NEXT: v_mov_b32_e32 v0, s8
6420 ; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
6421 ; SI-NEXT: s_waitcnt vmcnt(0)
6422 ; SI-NEXT: buffer_wbinvl1
6423 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6426 ; VI-LABEL: atomic_inc_i32_ret_offset:
6427 ; VI: ; %bb.0: ; %entry
6428 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6429 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
6430 ; VI-NEXT: s_mov_b32 s3, 0xf000
6431 ; VI-NEXT: s_mov_b32 s2, -1
6432 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6433 ; VI-NEXT: s_mov_b32 s0, s6
6434 ; VI-NEXT: s_mov_b32 s1, s7
6435 ; VI-NEXT: s_mov_b32 s6, s2
6436 ; VI-NEXT: s_mov_b32 s7, s3
6437 ; VI-NEXT: v_mov_b32_e32 v0, s8
6438 ; VI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16 glc
6439 ; VI-NEXT: s_waitcnt vmcnt(0)
6440 ; VI-NEXT: buffer_wbinvl1_vol
6441 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6444 ; GFX9-LABEL: atomic_inc_i32_ret_offset:
6445 ; GFX9: ; %bb.0: ; %entry
6446 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
6447 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6448 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6449 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6450 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6451 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[4:5] offset:16 glc
6452 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6453 ; GFX9-NEXT: buffer_wbinvl1_vol
6454 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
6455 ; GFX9-NEXT: s_endpgm
6457 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6458 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
6459 store i32 %val, ptr addrspace(1) %out2
; Test: uinc_wrap with a runtime 64-bit %index plus a constant +16 B —
; checks that the scaled index (shl 2) is added to the base and the constant
; part is folded (SI addr64 offset:16, GFX9 offset:16; VI adds both scalars).
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6463 define amdgpu_kernel void @atomic_inc_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
6464 ; SI-LABEL: atomic_inc_i32_addr64_offset:
6465 ; SI: ; %bb.0: ; %entry
6466 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
6467 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
6468 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6469 ; SI-NEXT: s_mov_b32 s3, 0xf000
6470 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6471 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6472 ; SI-NEXT: s_mov_b32 s2, 0
6473 ; SI-NEXT: v_mov_b32_e32 v2, s6
6474 ; SI-NEXT: v_mov_b32_e32 v0, s4
6475 ; SI-NEXT: v_mov_b32_e32 v1, s5
6476 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[0:3], 0 addr64 offset:16
6477 ; SI-NEXT: s_waitcnt vmcnt(0)
6478 ; SI-NEXT: buffer_wbinvl1
6481 ; VI-LABEL: atomic_inc_i32_addr64_offset:
6482 ; VI: ; %bb.0: ; %entry
6483 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6484 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
6485 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
6486 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6487 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6488 ; VI-NEXT: s_add_u32 s0, s4, s0
6489 ; VI-NEXT: s_addc_u32 s1, s5, s1
6490 ; VI-NEXT: s_add_u32 s0, s0, 16
6491 ; VI-NEXT: s_addc_u32 s1, s1, 0
6492 ; VI-NEXT: v_mov_b32_e32 v0, s0
6493 ; VI-NEXT: v_mov_b32_e32 v1, s1
6494 ; VI-NEXT: v_mov_b32_e32 v2, s6
6495 ; VI-NEXT: flat_atomic_inc v[0:1], v2
6496 ; VI-NEXT: s_waitcnt vmcnt(0)
6497 ; VI-NEXT: buffer_wbinvl1_vol
6500 ; GFX9-LABEL: atomic_inc_i32_addr64_offset:
6501 ; GFX9: ; %bb.0: ; %entry
6502 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6503 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
6504 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
6505 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6506 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6507 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6508 ; GFX9-NEXT: s_add_u32 s0, s4, s0
6509 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
6510 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
6511 ; GFX9-NEXT: global_atomic_inc v0, v1, s[0:1] offset:16
6512 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6513 ; GFX9-NEXT: buffer_wbinvl1_vol
6514 ; GFX9-NEXT: s_endpgm
6516 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
6517 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
6518 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: combines the addr64 (runtime index) and ret (result used) cases —
; returning atomic (glc) at base + index*4 + 16, old value stored to %out2.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6522 define amdgpu_kernel void @atomic_inc_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
6523 ; SI-LABEL: atomic_inc_i32_ret_addr64_offset:
6524 ; SI: ; %bb.0: ; %entry
6525 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
6526 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
6527 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
6528 ; SI-NEXT: s_mov_b32 s3, 0xf000
6529 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6530 ; SI-NEXT: s_mov_b32 s0, s6
6531 ; SI-NEXT: s_mov_b32 s1, s7
6532 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
6533 ; SI-NEXT: s_mov_b32 s6, 0
6534 ; SI-NEXT: s_mov_b32 s7, s3
6535 ; SI-NEXT: v_mov_b32_e32 v2, s2
6536 ; SI-NEXT: v_mov_b32_e32 v0, s8
6537 ; SI-NEXT: v_mov_b32_e32 v1, s9
6538 ; SI-NEXT: buffer_atomic_inc v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
6539 ; SI-NEXT: s_waitcnt vmcnt(0)
6540 ; SI-NEXT: buffer_wbinvl1
6541 ; SI-NEXT: s_mov_b32 s2, -1
6542 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
6545 ; VI-LABEL: atomic_inc_i32_ret_addr64_offset:
6546 ; VI: ; %bb.0: ; %entry
6547 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
6548 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6549 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
6550 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6551 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6552 ; VI-NEXT: s_add_u32 s0, s4, s0
6553 ; VI-NEXT: s_addc_u32 s1, s5, s1
6554 ; VI-NEXT: s_add_u32 s0, s0, 16
6555 ; VI-NEXT: s_addc_u32 s1, s1, 0
6556 ; VI-NEXT: v_mov_b32_e32 v0, s0
6557 ; VI-NEXT: v_mov_b32_e32 v1, s1
6558 ; VI-NEXT: v_mov_b32_e32 v2, s8
6559 ; VI-NEXT: flat_atomic_inc v0, v[0:1], v2 glc
6560 ; VI-NEXT: s_waitcnt vmcnt(0)
6561 ; VI-NEXT: buffer_wbinvl1_vol
6562 ; VI-NEXT: s_mov_b32 s3, 0xf000
6563 ; VI-NEXT: s_mov_b32 s2, -1
6564 ; VI-NEXT: s_mov_b32 s0, s6
6565 ; VI-NEXT: s_mov_b32 s1, s7
6566 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6569 ; GFX9-LABEL: atomic_inc_i32_ret_addr64_offset:
6570 ; GFX9: ; %bb.0: ; %entry
6571 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
6572 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6573 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
6574 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6575 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6576 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6577 ; GFX9-NEXT: s_add_u32 s0, s4, s0
6578 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
6579 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
6580 ; GFX9-NEXT: global_atomic_inc v1, v0, v1, s[0:1] offset:16 glc
6581 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6582 ; GFX9-NEXT: buffer_wbinvl1_vol
6583 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
6584 ; GFX9-NEXT: s_endpgm
6586 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
6587 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
6588 %val = atomicrmw volatile uinc_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
6589 store i32 %val, ptr addrspace(1) %out2
; Test: udec_wrap counterpart of atomic_inc_i32_offset — no-return
; buffer/global_atomic_dec at +16 B plus the per-target cache invalidate.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6593 define amdgpu_kernel void @atomic_dec_i32_offset(ptr addrspace(1) %out, i32 %in) {
6594 ; SI-LABEL: atomic_dec_i32_offset:
6595 ; SI: ; %bb.0: ; %entry
6596 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
6597 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6598 ; SI-NEXT: s_mov_b32 s3, 0xf000
6599 ; SI-NEXT: s_mov_b32 s2, -1
6600 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6601 ; SI-NEXT: v_mov_b32_e32 v0, s4
6602 ; SI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16
6603 ; SI-NEXT: s_waitcnt vmcnt(0)
6604 ; SI-NEXT: buffer_wbinvl1
6607 ; VI-LABEL: atomic_dec_i32_offset:
6608 ; VI: ; %bb.0: ; %entry
6609 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6610 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
6611 ; VI-NEXT: s_mov_b32 s3, 0xf000
6612 ; VI-NEXT: s_mov_b32 s2, -1
6613 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6614 ; VI-NEXT: v_mov_b32_e32 v0, s4
6615 ; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], 0 offset:16
6616 ; VI-NEXT: s_waitcnt vmcnt(0)
6617 ; VI-NEXT: buffer_wbinvl1_vol
6620 ; GFX9-LABEL: atomic_dec_i32_offset:
6621 ; GFX9: ; %bb.0: ; %entry
6622 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6623 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6624 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6625 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6626 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6627 ; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:16
6628 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6629 ; GFX9-NEXT: buffer_wbinvl1_vol
6630 ; GFX9-NEXT: s_endpgm
6632 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6633 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: udec_wrap counterpart of atomic_inc_i32_max_neg_offset — -4096 B
; folded on GFX9, materialized on SI (addr64) and VI (scalar add + flat).
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6637 define amdgpu_kernel void @atomic_dec_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
6638 ; SI-LABEL: atomic_dec_i32_max_neg_offset:
6639 ; SI: ; %bb.0: ; %entry
6640 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
6641 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6642 ; SI-NEXT: s_mov_b32 s3, 0xf000
6643 ; SI-NEXT: s_mov_b32 s2, 0
6644 ; SI-NEXT: v_mov_b32_e32 v0, 0xfffff000
6645 ; SI-NEXT: v_mov_b32_e32 v1, -1
6646 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6647 ; SI-NEXT: v_mov_b32_e32 v2, s4
6648 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64
6649 ; SI-NEXT: s_waitcnt vmcnt(0)
6650 ; SI-NEXT: buffer_wbinvl1
6653 ; VI-LABEL: atomic_dec_i32_max_neg_offset:
6654 ; VI: ; %bb.0: ; %entry
6655 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6656 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6657 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6658 ; VI-NEXT: s_add_u32 s0, s2, 0xfffff000
6659 ; VI-NEXT: s_addc_u32 s1, s3, -1
6660 ; VI-NEXT: v_mov_b32_e32 v0, s0
6661 ; VI-NEXT: v_mov_b32_e32 v1, s1
6662 ; VI-NEXT: v_mov_b32_e32 v2, s4
6663 ; VI-NEXT: flat_atomic_dec v[0:1], v2
6664 ; VI-NEXT: s_waitcnt vmcnt(0)
6665 ; VI-NEXT: buffer_wbinvl1_vol
6668 ; GFX9-LABEL: atomic_dec_i32_max_neg_offset:
6669 ; GFX9: ; %bb.0: ; %entry
6670 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6671 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6672 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6673 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6674 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6675 ; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-4096
6676 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6677 ; GFX9-NEXT: buffer_wbinvl1_vol
6678 ; GFX9-NEXT: s_endpgm
6680 %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
6681 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: udec_wrap counterpart of atomic_inc_i32_soffset — +36000 B via SGPR
; soffset (0x8ca0) on SI/VI, split base 0x8000 + offset:3232 on GFX9.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6685 define amdgpu_kernel void @atomic_dec_i32_soffset(ptr addrspace(1) %out, i32 %in) {
6686 ; SI-LABEL: atomic_dec_i32_soffset:
6687 ; SI: ; %bb.0: ; %entry
6688 ; SI-NEXT: s_load_dword s4, s[0:1], 0xb
6689 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6690 ; SI-NEXT: s_mov_b32 s3, 0xf000
6691 ; SI-NEXT: s_mov_b32 s2, -1
6692 ; SI-NEXT: s_mov_b32 s5, 0x8ca0
6693 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6694 ; SI-NEXT: v_mov_b32_e32 v0, s4
6695 ; SI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5
6696 ; SI-NEXT: s_waitcnt vmcnt(0)
6697 ; SI-NEXT: buffer_wbinvl1
6700 ; VI-LABEL: atomic_dec_i32_soffset:
6701 ; VI: ; %bb.0: ; %entry
6702 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6703 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
6704 ; VI-NEXT: s_mov_b32 s3, 0xf000
6705 ; VI-NEXT: s_mov_b32 s2, -1
6706 ; VI-NEXT: s_mov_b32 s5, 0x8ca0
6707 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6708 ; VI-NEXT: v_mov_b32_e32 v0, s4
6709 ; VI-NEXT: buffer_atomic_dec v0, off, s[0:3], s5
6710 ; VI-NEXT: s_waitcnt vmcnt(0)
6711 ; VI-NEXT: buffer_wbinvl1_vol
6714 ; GFX9-LABEL: atomic_dec_i32_soffset:
6715 ; GFX9: ; %bb.0: ; %entry
6716 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6717 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6718 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x8000
6719 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6720 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6721 ; GFX9-NEXT: global_atomic_dec v0, v1, s[2:3] offset:3232
6722 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6723 ; GFX9-NEXT: buffer_wbinvl1_vol
6724 ; GFX9-NEXT: s_endpgm
; i32 element 9000 => byte offset 36000 (0x8ca0).
6726 %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
6727 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: udec_wrap counterpart of atomic_inc_i32_huge_offset — >32-bit byte
; offset (0xabcd_0000deac) forces a full 64-bit address add on all targets.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6731 define amdgpu_kernel void @atomic_dec_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
6732 ; SI-LABEL: atomic_dec_i32_huge_offset:
6733 ; SI: ; %bb.0: ; %entry
6734 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
6735 ; SI-NEXT: s_load_dword s0, s[0:1], 0xb
6736 ; SI-NEXT: s_mov_b32 s7, 0xf000
6737 ; SI-NEXT: s_mov_b32 s6, 0
6738 ; SI-NEXT: v_mov_b32_e32 v0, 0xdeac
6739 ; SI-NEXT: v_mov_b32_e32 v1, 0xabcd
6740 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6741 ; SI-NEXT: v_mov_b32_e32 v2, s0
6742 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64
6743 ; SI-NEXT: s_waitcnt vmcnt(0)
6744 ; SI-NEXT: buffer_wbinvl1
6747 ; VI-LABEL: atomic_dec_i32_huge_offset:
6748 ; VI: ; %bb.0: ; %entry
6749 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6750 ; VI-NEXT: s_load_dword s4, s[0:1], 0x2c
6751 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6752 ; VI-NEXT: s_add_u32 s0, s2, 0xdeac
6753 ; VI-NEXT: s_addc_u32 s1, s3, 0xabcd
6754 ; VI-NEXT: v_mov_b32_e32 v0, s0
6755 ; VI-NEXT: v_mov_b32_e32 v1, s1
6756 ; VI-NEXT: v_mov_b32_e32 v2, s4
6757 ; VI-NEXT: flat_atomic_dec v[0:1], v2
6758 ; VI-NEXT: s_waitcnt vmcnt(0)
6759 ; VI-NEXT: buffer_wbinvl1_vol
6762 ; GFX9-LABEL: atomic_dec_i32_huge_offset:
6763 ; GFX9: ; %bb.0: ; %entry
6764 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6765 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x2c
6766 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6767 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6768 ; GFX9-NEXT: s_add_u32 s0, s2, 0xdeac
6769 ; GFX9-NEXT: s_addc_u32 s1, s3, 0xabcd
6770 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
6771 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1]
6772 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6773 ; GFX9-NEXT: buffer_wbinvl1_vol
6774 ; GFX9-NEXT: s_endpgm
; 47224239175595 * 4 = 0xabcd0000deac bytes.
6776 %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
6777 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
; Test: udec_wrap counterpart of atomic_inc_i32_ret_offset — returning (glc)
; atomic at +16 B, old value stored to %out2.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6781 define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
6782 ; SI-LABEL: atomic_dec_i32_ret_offset:
6783 ; SI: ; %bb.0: ; %entry
6784 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
6785 ; SI-NEXT: s_load_dword s8, s[0:1], 0xd
6786 ; SI-NEXT: s_mov_b32 s3, 0xf000
6787 ; SI-NEXT: s_mov_b32 s2, -1
6788 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6789 ; SI-NEXT: s_mov_b32 s0, s6
6790 ; SI-NEXT: s_mov_b32 s1, s7
6791 ; SI-NEXT: s_mov_b32 s6, s2
6792 ; SI-NEXT: s_mov_b32 s7, s3
6793 ; SI-NEXT: v_mov_b32_e32 v0, s8
6794 ; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
6795 ; SI-NEXT: s_waitcnt vmcnt(0)
6796 ; SI-NEXT: buffer_wbinvl1
6797 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6800 ; VI-LABEL: atomic_dec_i32_ret_offset:
6801 ; VI: ; %bb.0: ; %entry
6802 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6803 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
6804 ; VI-NEXT: s_mov_b32 s3, 0xf000
6805 ; VI-NEXT: s_mov_b32 s2, -1
6806 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6807 ; VI-NEXT: s_mov_b32 s0, s6
6808 ; VI-NEXT: s_mov_b32 s1, s7
6809 ; VI-NEXT: s_mov_b32 s6, s2
6810 ; VI-NEXT: s_mov_b32 s7, s3
6811 ; VI-NEXT: v_mov_b32_e32 v0, s8
6812 ; VI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16 glc
6813 ; VI-NEXT: s_waitcnt vmcnt(0)
6814 ; VI-NEXT: buffer_wbinvl1_vol
6815 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6818 ; GFX9-LABEL: atomic_dec_i32_ret_offset:
6819 ; GFX9: ; %bb.0: ; %entry
6820 ; GFX9-NEXT: s_load_dword s2, s[0:1], 0x34
6821 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6822 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6823 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6824 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
6825 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[4:5] offset:16 glc
6826 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6827 ; GFX9-NEXT: buffer_wbinvl1_vol
6828 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
6829 ; GFX9-NEXT: s_endpgm
6831 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
6832 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
6833 store i32 %val, ptr addrspace(1) %out2
; Test: udec_wrap counterpart of atomic_inc_i32_addr64_offset — runtime
; 64-bit %index (shl 2) plus folded constant +16 B, result unused.
; NOTE(review): CHECK lines below are autogenerated — do not hand-edit.
6837 define amdgpu_kernel void @atomic_dec_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
6838 ; SI-LABEL: atomic_dec_i32_addr64_offset:
6839 ; SI: ; %bb.0: ; %entry
6840 ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
6841 ; SI-NEXT: s_load_dword s6, s[0:1], 0xb
6842 ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
6843 ; SI-NEXT: s_mov_b32 s3, 0xf000
6844 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6845 ; SI-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
6846 ; SI-NEXT: s_mov_b32 s2, 0
6847 ; SI-NEXT: v_mov_b32_e32 v2, s6
6848 ; SI-NEXT: v_mov_b32_e32 v0, s4
6849 ; SI-NEXT: v_mov_b32_e32 v1, s5
6850 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[0:3], 0 addr64 offset:16
6851 ; SI-NEXT: s_waitcnt vmcnt(0)
6852 ; SI-NEXT: buffer_wbinvl1
6855 ; VI-LABEL: atomic_dec_i32_addr64_offset:
6856 ; VI: ; %bb.0: ; %entry
6857 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6858 ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
6859 ; VI-NEXT: s_load_dword s6, s[0:1], 0x2c
6860 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6861 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6862 ; VI-NEXT: s_add_u32 s0, s4, s0
6863 ; VI-NEXT: s_addc_u32 s1, s5, s1
6864 ; VI-NEXT: s_add_u32 s0, s0, 16
6865 ; VI-NEXT: s_addc_u32 s1, s1, 0
6866 ; VI-NEXT: v_mov_b32_e32 v0, s0
6867 ; VI-NEXT: v_mov_b32_e32 v1, s1
6868 ; VI-NEXT: v_mov_b32_e32 v2, s6
6869 ; VI-NEXT: flat_atomic_dec v[0:1], v2
6870 ; VI-NEXT: s_waitcnt vmcnt(0)
6871 ; VI-NEXT: buffer_wbinvl1_vol
6874 ; GFX9-LABEL: atomic_dec_i32_addr64_offset:
6875 ; GFX9: ; %bb.0: ; %entry
6876 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6877 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
6878 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x2c
6879 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6880 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6881 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6882 ; GFX9-NEXT: s_add_u32 s0, s4, s0
6883 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
6884 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
6885 ; GFX9-NEXT: global_atomic_dec v0, v1, s[0:1] offset:16
6886 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6887 ; GFX9-NEXT: buffer_wbinvl1_vol
6888 ; GFX9-NEXT: s_endpgm
6890 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
6891 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
6892 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
6896 define amdgpu_kernel void @atomic_dec_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
6897 ; SI-LABEL: atomic_dec_i32_ret_addr64_offset:
6898 ; SI: ; %bb.0: ; %entry
6899 ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
6900 ; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xf
6901 ; SI-NEXT: s_load_dword s2, s[0:1], 0xd
6902 ; SI-NEXT: s_mov_b32 s3, 0xf000
6903 ; SI-NEXT: s_waitcnt lgkmcnt(0)
6904 ; SI-NEXT: s_mov_b32 s0, s6
6905 ; SI-NEXT: s_mov_b32 s1, s7
6906 ; SI-NEXT: s_lshl_b64 s[8:9], s[8:9], 2
6907 ; SI-NEXT: s_mov_b32 s6, 0
6908 ; SI-NEXT: s_mov_b32 s7, s3
6909 ; SI-NEXT: v_mov_b32_e32 v2, s2
6910 ; SI-NEXT: v_mov_b32_e32 v0, s8
6911 ; SI-NEXT: v_mov_b32_e32 v1, s9
6912 ; SI-NEXT: buffer_atomic_dec v2, v[0:1], s[4:7], 0 addr64 offset:16 glc
6913 ; SI-NEXT: s_waitcnt vmcnt(0)
6914 ; SI-NEXT: buffer_wbinvl1
6915 ; SI-NEXT: s_mov_b32 s2, -1
6916 ; SI-NEXT: buffer_store_dword v2, off, s[0:3], 0
6919 ; VI-LABEL: atomic_dec_i32_ret_addr64_offset:
6920 ; VI: ; %bb.0: ; %entry
6921 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
6922 ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6923 ; VI-NEXT: s_load_dword s8, s[0:1], 0x34
6924 ; VI-NEXT: s_waitcnt lgkmcnt(0)
6925 ; VI-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6926 ; VI-NEXT: s_add_u32 s0, s4, s0
6927 ; VI-NEXT: s_addc_u32 s1, s5, s1
6928 ; VI-NEXT: s_add_u32 s0, s0, 16
6929 ; VI-NEXT: s_addc_u32 s1, s1, 0
6930 ; VI-NEXT: v_mov_b32_e32 v0, s0
6931 ; VI-NEXT: v_mov_b32_e32 v1, s1
6932 ; VI-NEXT: v_mov_b32_e32 v2, s8
6933 ; VI-NEXT: flat_atomic_dec v0, v[0:1], v2 glc
6934 ; VI-NEXT: s_waitcnt vmcnt(0)
6935 ; VI-NEXT: buffer_wbinvl1_vol
6936 ; VI-NEXT: s_mov_b32 s3, 0xf000
6937 ; VI-NEXT: s_mov_b32 s2, -1
6938 ; VI-NEXT: s_mov_b32 s0, s6
6939 ; VI-NEXT: s_mov_b32 s1, s7
6940 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0
6943 ; GFX9-LABEL: atomic_dec_i32_ret_addr64_offset:
6944 ; GFX9: ; %bb.0: ; %entry
6945 ; GFX9-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
6946 ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6947 ; GFX9-NEXT: s_load_dword s8, s[0:1], 0x34
6948 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
6949 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6950 ; GFX9-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6951 ; GFX9-NEXT: s_add_u32 s0, s4, s0
6952 ; GFX9-NEXT: s_addc_u32 s1, s5, s1
6953 ; GFX9-NEXT: v_mov_b32_e32 v1, s8
6954 ; GFX9-NEXT: global_atomic_dec v1, v0, v1, s[0:1] offset:16 glc
6955 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6956 ; GFX9-NEXT: buffer_wbinvl1_vol
6957 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7]
6958 ; GFX9-NEXT: s_endpgm
6960 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
6961 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
6962 %val = atomicrmw volatile udec_wrap ptr addrspace(1) %gep, i32 %in syncscope("agent") seq_cst
6963 store i32 %val, ptr addrspace(1) %out2