1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
6 define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
7 ; GCN1-LABEL: atomic_add_i32_offset:
8 ; GCN1: ; %bb.0: ; %entry
9 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
10 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
11 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
12 ; GCN1-NEXT: s_add_u32 s0, s2, 16
13 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
14 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
15 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
16 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
17 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
18 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19 ; GCN1-NEXT: buffer_wbinvl1_vol
22 ; GCN2-LABEL: atomic_add_i32_offset:
23 ; GCN2: ; %bb.0: ; %entry
24 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
25 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
26 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
27 ; GCN2-NEXT: s_add_u32 s0, s2, 16
28 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
29 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
30 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
31 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
32 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
33 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
34 ; GCN2-NEXT: buffer_wbinvl1_vol
37 ; GCN3-LABEL: atomic_add_i32_offset:
38 ; GCN3: ; %bb.0: ; %entry
39 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
40 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
41 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
42 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
43 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
44 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
45 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
46 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
47 ; GCN3-NEXT: buffer_wbinvl1_vol
50 %gep = getelementptr i32, ptr %out, i32 4
51 %val = atomicrmw add ptr %gep, i32 %in syncscope("agent") seq_cst
55 define amdgpu_kernel void @atomic_add_i32_max_offset(ptr %out, i32 %in) {
56 ; GCN1-LABEL: atomic_add_i32_max_offset:
57 ; GCN1: ; %bb.0: ; %entry
58 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
59 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
60 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
61 ; GCN1-NEXT: s_add_u32 s0, s2, 0xffc
62 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
63 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
64 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
65 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
66 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
67 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
68 ; GCN1-NEXT: buffer_wbinvl1_vol
71 ; GCN2-LABEL: atomic_add_i32_max_offset:
72 ; GCN2: ; %bb.0: ; %entry
73 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
74 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
75 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
76 ; GCN2-NEXT: s_add_u32 s0, s2, 0xffc
77 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
78 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
79 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
80 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
81 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
82 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
83 ; GCN2-NEXT: buffer_wbinvl1_vol
86 ; GCN3-LABEL: atomic_add_i32_max_offset:
87 ; GCN3: ; %bb.0: ; %entry
88 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
89 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
90 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
91 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
92 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
93 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
94 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:4092
95 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
96 ; GCN3-NEXT: buffer_wbinvl1_vol
99 %gep = getelementptr i32, ptr %out, i32 1023
100 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
104 define amdgpu_kernel void @atomic_add_i32_max_offset_p1(ptr %out, i32 %in) {
105 ; GCN1-LABEL: atomic_add_i32_max_offset_p1:
106 ; GCN1: ; %bb.0: ; %entry
107 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
108 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
109 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
110 ; GCN1-NEXT: s_add_u32 s0, s2, 0x1000
111 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
112 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
113 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
114 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
115 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
116 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
117 ; GCN1-NEXT: buffer_wbinvl1_vol
118 ; GCN1-NEXT: s_endpgm
120 ; GCN2-LABEL: atomic_add_i32_max_offset_p1:
121 ; GCN2: ; %bb.0: ; %entry
122 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
123 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
124 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
125 ; GCN2-NEXT: s_add_u32 s0, s2, 0x1000
126 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
127 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
128 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
129 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
130 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
131 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
132 ; GCN2-NEXT: buffer_wbinvl1_vol
133 ; GCN2-NEXT: s_endpgm
135 ; GCN3-LABEL: atomic_add_i32_max_offset_p1:
136 ; GCN3: ; %bb.0: ; %entry
137 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
138 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
139 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
140 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
141 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
142 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
143 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
144 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
145 ; GCN3-NEXT: flat_atomic_add v[0:1], v2
146 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
147 ; GCN3-NEXT: buffer_wbinvl1_vol
148 ; GCN3-NEXT: s_endpgm
150 %gep = getelementptr i32, ptr %out, i32 1024
151 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
155 define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
156 ; GCN1-LABEL: atomic_add_i32_ret_offset:
157 ; GCN1: ; %bb.0: ; %entry
158 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
159 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
160 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
161 ; GCN1-NEXT: s_add_u32 s0, s4, 16
162 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
163 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
164 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
165 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
166 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
167 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
168 ; GCN1-NEXT: buffer_wbinvl1_vol
169 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
170 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
171 ; GCN1-NEXT: flat_store_dword v[0:1], v2
172 ; GCN1-NEXT: s_endpgm
174 ; GCN2-LABEL: atomic_add_i32_ret_offset:
175 ; GCN2: ; %bb.0: ; %entry
176 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
177 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
178 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
179 ; GCN2-NEXT: s_add_u32 s0, s4, 16
180 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
181 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
182 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
183 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
184 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
185 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
186 ; GCN2-NEXT: buffer_wbinvl1_vol
187 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
188 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
189 ; GCN2-NEXT: flat_store_dword v[0:1], v2
190 ; GCN2-NEXT: s_endpgm
192 ; GCN3-LABEL: atomic_add_i32_ret_offset:
193 ; GCN3: ; %bb.0: ; %entry
194 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
195 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
196 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
197 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
198 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
199 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
200 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc
201 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
202 ; GCN3-NEXT: buffer_wbinvl1_vol
203 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
204 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
205 ; GCN3-NEXT: flat_store_dword v[0:1], v2
206 ; GCN3-NEXT: s_endpgm
208 %gep = getelementptr i32, ptr %out, i32 4
209 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
210 store i32 %val, ptr %out2
214 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
215 ; GCN1-LABEL: atomic_add_i32_addr64_offset:
216 ; GCN1: ; %bb.0: ; %entry
217 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
218 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
219 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
220 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
221 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
222 ; GCN1-NEXT: s_add_u32 s0, s4, s0
223 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
224 ; GCN1-NEXT: s_add_u32 s0, s0, 16
225 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
226 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
227 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
228 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
229 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
230 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
231 ; GCN1-NEXT: buffer_wbinvl1_vol
232 ; GCN1-NEXT: s_endpgm
234 ; GCN2-LABEL: atomic_add_i32_addr64_offset:
235 ; GCN2: ; %bb.0: ; %entry
236 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
237 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
238 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
239 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
240 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
241 ; GCN2-NEXT: s_add_u32 s0, s4, s0
242 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
243 ; GCN2-NEXT: s_add_u32 s0, s0, 16
244 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
245 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
246 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
247 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
248 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
249 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
250 ; GCN2-NEXT: buffer_wbinvl1_vol
251 ; GCN2-NEXT: s_endpgm
253 ; GCN3-LABEL: atomic_add_i32_addr64_offset:
254 ; GCN3: ; %bb.0: ; %entry
255 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
256 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
257 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
258 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
259 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
260 ; GCN3-NEXT: s_add_u32 s0, s4, s0
261 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
262 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
263 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
264 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
265 ; GCN3-NEXT: flat_atomic_add v[0:1], v2 offset:16
266 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
267 ; GCN3-NEXT: buffer_wbinvl1_vol
268 ; GCN3-NEXT: s_endpgm
270 %ptr = getelementptr i32, ptr %out, i64 %index
271 %gep = getelementptr i32, ptr %ptr, i32 4
272 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
276 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
277 ; GCN1-LABEL: atomic_add_i32_ret_addr64_offset:
278 ; GCN1: ; %bb.0: ; %entry
279 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
280 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
281 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
282 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
283 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
284 ; GCN1-NEXT: s_add_u32 s0, s4, s0
285 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
286 ; GCN1-NEXT: s_add_u32 s0, s0, 16
287 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
288 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
289 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
290 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
291 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
292 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
293 ; GCN1-NEXT: buffer_wbinvl1_vol
294 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
295 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
296 ; GCN1-NEXT: flat_store_dword v[0:1], v2
297 ; GCN1-NEXT: s_endpgm
299 ; GCN2-LABEL: atomic_add_i32_ret_addr64_offset:
300 ; GCN2: ; %bb.0: ; %entry
301 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
302 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
303 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
304 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
305 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
306 ; GCN2-NEXT: s_add_u32 s0, s4, s0
307 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
308 ; GCN2-NEXT: s_add_u32 s0, s0, 16
309 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
310 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
311 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
312 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
313 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
314 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
315 ; GCN2-NEXT: buffer_wbinvl1_vol
316 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
317 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
318 ; GCN2-NEXT: flat_store_dword v[0:1], v2
319 ; GCN2-NEXT: s_endpgm
321 ; GCN3-LABEL: atomic_add_i32_ret_addr64_offset:
322 ; GCN3: ; %bb.0: ; %entry
323 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
324 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
325 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
326 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
327 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
328 ; GCN3-NEXT: s_add_u32 s0, s4, s0
329 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
330 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
331 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
332 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
333 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 offset:16 glc
334 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
335 ; GCN3-NEXT: buffer_wbinvl1_vol
336 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
337 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
338 ; GCN3-NEXT: flat_store_dword v[0:1], v2
339 ; GCN3-NEXT: s_endpgm
341 %ptr = getelementptr i32, ptr %out, i64 %index
342 %gep = getelementptr i32, ptr %ptr, i32 4
343 %val = atomicrmw volatile add ptr %gep, i32 %in syncscope("agent") seq_cst
344 store i32 %val, ptr %out2
348 define amdgpu_kernel void @atomic_add_i32(ptr %out, i32 %in) {
349 ; GCN1-LABEL: atomic_add_i32:
350 ; GCN1: ; %bb.0: ; %entry
351 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
352 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
353 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
354 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
355 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
356 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
357 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
358 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
359 ; GCN1-NEXT: buffer_wbinvl1_vol
360 ; GCN1-NEXT: s_endpgm
362 ; GCN2-LABEL: atomic_add_i32:
363 ; GCN2: ; %bb.0: ; %entry
364 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
365 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
366 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
367 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
368 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
369 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
370 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
371 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
372 ; GCN2-NEXT: buffer_wbinvl1_vol
373 ; GCN2-NEXT: s_endpgm
375 ; GCN3-LABEL: atomic_add_i32:
376 ; GCN3: ; %bb.0: ; %entry
377 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
378 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
379 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
380 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
381 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
382 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
383 ; GCN3-NEXT: flat_atomic_add v[0:1], v2
384 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
385 ; GCN3-NEXT: buffer_wbinvl1_vol
386 ; GCN3-NEXT: s_endpgm
388 %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
392 define amdgpu_kernel void @atomic_add_i32_ret(ptr %out, ptr %out2, i32 %in) {
393 ; GCN1-LABEL: atomic_add_i32_ret:
394 ; GCN1: ; %bb.0: ; %entry
395 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
396 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
397 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
398 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
399 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
400 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
401 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
402 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
403 ; GCN1-NEXT: buffer_wbinvl1_vol
404 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
405 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
406 ; GCN1-NEXT: flat_store_dword v[0:1], v2
407 ; GCN1-NEXT: s_endpgm
409 ; GCN2-LABEL: atomic_add_i32_ret:
410 ; GCN2: ; %bb.0: ; %entry
411 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
412 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
413 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
414 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
415 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
416 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
417 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
418 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
419 ; GCN2-NEXT: buffer_wbinvl1_vol
420 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
421 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
422 ; GCN2-NEXT: flat_store_dword v[0:1], v2
423 ; GCN2-NEXT: s_endpgm
425 ; GCN3-LABEL: atomic_add_i32_ret:
426 ; GCN3: ; %bb.0: ; %entry
427 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
428 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
429 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
430 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
431 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
432 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
433 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc
434 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
435 ; GCN3-NEXT: buffer_wbinvl1_vol
436 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
437 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
438 ; GCN3-NEXT: flat_store_dword v[0:1], v2
439 ; GCN3-NEXT: s_endpgm
441 %val = atomicrmw volatile add ptr %out, i32 %in syncscope("agent") seq_cst
442 store i32 %val, ptr %out2
446 define amdgpu_kernel void @atomic_add_i32_addr64(ptr %out, i32 %in, i64 %index) {
447 ; GCN1-LABEL: atomic_add_i32_addr64:
448 ; GCN1: ; %bb.0: ; %entry
449 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
450 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
451 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
452 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
453 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
454 ; GCN1-NEXT: s_add_u32 s0, s4, s0
455 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
456 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
457 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
458 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
459 ; GCN1-NEXT: flat_atomic_add v[0:1], v2
460 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
461 ; GCN1-NEXT: buffer_wbinvl1_vol
462 ; GCN1-NEXT: s_endpgm
464 ; GCN2-LABEL: atomic_add_i32_addr64:
465 ; GCN2: ; %bb.0: ; %entry
466 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
467 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
468 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
469 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
470 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
471 ; GCN2-NEXT: s_add_u32 s0, s4, s0
472 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
473 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
474 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
475 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
476 ; GCN2-NEXT: flat_atomic_add v[0:1], v2
477 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
478 ; GCN2-NEXT: buffer_wbinvl1_vol
479 ; GCN2-NEXT: s_endpgm
481 ; GCN3-LABEL: atomic_add_i32_addr64:
482 ; GCN3: ; %bb.0: ; %entry
483 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
484 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
485 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
486 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
487 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
488 ; GCN3-NEXT: s_add_u32 s0, s4, s0
489 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
490 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
491 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
492 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
493 ; GCN3-NEXT: flat_atomic_add v[0:1], v2
494 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
495 ; GCN3-NEXT: buffer_wbinvl1_vol
496 ; GCN3-NEXT: s_endpgm
498 %ptr = getelementptr i32, ptr %out, i64 %index
499 %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
503 define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
504 ; GCN1-LABEL: atomic_add_i32_ret_addr64:
505 ; GCN1: ; %bb.0: ; %entry
506 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
507 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
508 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
509 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
510 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
511 ; GCN1-NEXT: s_add_u32 s0, s4, s0
512 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
513 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
514 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
515 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
516 ; GCN1-NEXT: flat_atomic_add v2, v[0:1], v2 glc
517 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
518 ; GCN1-NEXT: buffer_wbinvl1_vol
519 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
520 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
521 ; GCN1-NEXT: flat_store_dword v[0:1], v2
522 ; GCN1-NEXT: s_endpgm
524 ; GCN2-LABEL: atomic_add_i32_ret_addr64:
525 ; GCN2: ; %bb.0: ; %entry
526 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
527 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
528 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
529 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
530 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
531 ; GCN2-NEXT: s_add_u32 s0, s4, s0
532 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
533 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
534 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
535 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
536 ; GCN2-NEXT: flat_atomic_add v2, v[0:1], v2 glc
537 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
538 ; GCN2-NEXT: buffer_wbinvl1_vol
539 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
540 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
541 ; GCN2-NEXT: flat_store_dword v[0:1], v2
542 ; GCN2-NEXT: s_endpgm
544 ; GCN3-LABEL: atomic_add_i32_ret_addr64:
545 ; GCN3: ; %bb.0: ; %entry
546 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
547 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
548 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
549 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
550 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
551 ; GCN3-NEXT: s_add_u32 s0, s4, s0
552 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
553 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
554 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
555 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
556 ; GCN3-NEXT: flat_atomic_add v2, v[0:1], v2 glc
557 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
558 ; GCN3-NEXT: buffer_wbinvl1_vol
559 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
560 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
561 ; GCN3-NEXT: flat_store_dword v[0:1], v2
562 ; GCN3-NEXT: s_endpgm
564 %ptr = getelementptr i32, ptr %out, i64 %index
565 %val = atomicrmw volatile add ptr %ptr, i32 %in syncscope("agent") seq_cst
566 store i32 %val, ptr %out2
570 define amdgpu_kernel void @atomic_and_i32_offset(ptr %out, i32 %in) {
571 ; GCN1-LABEL: atomic_and_i32_offset:
572 ; GCN1: ; %bb.0: ; %entry
573 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
574 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
575 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
576 ; GCN1-NEXT: s_add_u32 s0, s2, 16
577 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
578 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
579 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
580 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
581 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
582 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
583 ; GCN1-NEXT: buffer_wbinvl1_vol
584 ; GCN1-NEXT: s_endpgm
586 ; GCN2-LABEL: atomic_and_i32_offset:
587 ; GCN2: ; %bb.0: ; %entry
588 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
589 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
590 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
591 ; GCN2-NEXT: s_add_u32 s0, s2, 16
592 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
593 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
594 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
595 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
596 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
597 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
598 ; GCN2-NEXT: buffer_wbinvl1_vol
599 ; GCN2-NEXT: s_endpgm
601 ; GCN3-LABEL: atomic_and_i32_offset:
602 ; GCN3: ; %bb.0: ; %entry
603 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
604 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
605 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
606 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
607 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
608 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
609 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
610 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
611 ; GCN3-NEXT: buffer_wbinvl1_vol
612 ; GCN3-NEXT: s_endpgm
614 %gep = getelementptr i32, ptr %out, i32 4
615 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
619 define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
620 ; GCN1-LABEL: atomic_and_i32_ret_offset:
621 ; GCN1: ; %bb.0: ; %entry
622 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
623 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
624 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
625 ; GCN1-NEXT: s_add_u32 s0, s4, 16
626 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
627 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
628 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
629 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
630 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
631 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
632 ; GCN1-NEXT: buffer_wbinvl1_vol
633 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
634 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
635 ; GCN1-NEXT: flat_store_dword v[0:1], v2
636 ; GCN1-NEXT: s_endpgm
638 ; GCN2-LABEL: atomic_and_i32_ret_offset:
639 ; GCN2: ; %bb.0: ; %entry
640 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
641 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
642 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
643 ; GCN2-NEXT: s_add_u32 s0, s4, 16
644 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
645 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
646 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
647 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
648 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
649 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
650 ; GCN2-NEXT: buffer_wbinvl1_vol
651 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
652 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
653 ; GCN2-NEXT: flat_store_dword v[0:1], v2
654 ; GCN2-NEXT: s_endpgm
656 ; GCN3-LABEL: atomic_and_i32_ret_offset:
657 ; GCN3: ; %bb.0: ; %entry
658 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
659 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
660 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
661 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
662 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
663 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
664 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc
665 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
666 ; GCN3-NEXT: buffer_wbinvl1_vol
667 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
668 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
669 ; GCN3-NEXT: flat_store_dword v[0:1], v2
670 ; GCN3-NEXT: s_endpgm
672 %gep = getelementptr i32, ptr %out, i32 4
673 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
674 store i32 %val, ptr %out2
678 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
679 ; GCN1-LABEL: atomic_and_i32_addr64_offset:
680 ; GCN1: ; %bb.0: ; %entry
681 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
682 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
683 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
684 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
685 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
686 ; GCN1-NEXT: s_add_u32 s0, s4, s0
687 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
688 ; GCN1-NEXT: s_add_u32 s0, s0, 16
689 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
690 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
691 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
692 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
693 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
694 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
695 ; GCN1-NEXT: buffer_wbinvl1_vol
696 ; GCN1-NEXT: s_endpgm
698 ; GCN2-LABEL: atomic_and_i32_addr64_offset:
699 ; GCN2: ; %bb.0: ; %entry
700 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
701 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
702 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
703 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
704 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
705 ; GCN2-NEXT: s_add_u32 s0, s4, s0
706 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
707 ; GCN2-NEXT: s_add_u32 s0, s0, 16
708 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
709 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
710 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
711 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
712 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
713 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
714 ; GCN2-NEXT: buffer_wbinvl1_vol
715 ; GCN2-NEXT: s_endpgm
717 ; GCN3-LABEL: atomic_and_i32_addr64_offset:
718 ; GCN3: ; %bb.0: ; %entry
719 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
720 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
721 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
722 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
723 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
724 ; GCN3-NEXT: s_add_u32 s0, s4, s0
725 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
726 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
727 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
728 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
729 ; GCN3-NEXT: flat_atomic_and v[0:1], v2 offset:16
730 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
731 ; GCN3-NEXT: buffer_wbinvl1_vol
732 ; GCN3-NEXT: s_endpgm
734 %ptr = getelementptr i32, ptr %out, i64 %index
735 %gep = getelementptr i32, ptr %ptr, i32 4
736 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
740 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
741 ; GCN1-LABEL: atomic_and_i32_ret_addr64_offset:
742 ; GCN1: ; %bb.0: ; %entry
743 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
744 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
745 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
746 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
747 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
748 ; GCN1-NEXT: s_add_u32 s0, s4, s0
749 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
750 ; GCN1-NEXT: s_add_u32 s0, s0, 16
751 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
752 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
753 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
754 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
755 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
756 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
757 ; GCN1-NEXT: buffer_wbinvl1_vol
758 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
759 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
760 ; GCN1-NEXT: flat_store_dword v[0:1], v2
761 ; GCN1-NEXT: s_endpgm
763 ; GCN2-LABEL: atomic_and_i32_ret_addr64_offset:
764 ; GCN2: ; %bb.0: ; %entry
765 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
766 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
767 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
768 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
769 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
770 ; GCN2-NEXT: s_add_u32 s0, s4, s0
771 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
772 ; GCN2-NEXT: s_add_u32 s0, s0, 16
773 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
774 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
775 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
776 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
777 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
778 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
779 ; GCN2-NEXT: buffer_wbinvl1_vol
780 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
781 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
782 ; GCN2-NEXT: flat_store_dword v[0:1], v2
783 ; GCN2-NEXT: s_endpgm
785 ; GCN3-LABEL: atomic_and_i32_ret_addr64_offset:
786 ; GCN3: ; %bb.0: ; %entry
787 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
788 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
789 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
790 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
791 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
792 ; GCN3-NEXT: s_add_u32 s0, s4, s0
793 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
794 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
795 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
796 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
797 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 offset:16 glc
798 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
799 ; GCN3-NEXT: buffer_wbinvl1_vol
800 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
801 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
802 ; GCN3-NEXT: flat_store_dword v[0:1], v2
803 ; GCN3-NEXT: s_endpgm
805 %ptr = getelementptr i32, ptr %out, i64 %index
806 %gep = getelementptr i32, ptr %ptr, i32 4
807 %val = atomicrmw volatile and ptr %gep, i32 %in syncscope("agent") seq_cst
808 store i32 %val, ptr %out2
812 define amdgpu_kernel void @atomic_and_i32(ptr %out, i32 %in) {
813 ; GCN1-LABEL: atomic_and_i32:
814 ; GCN1: ; %bb.0: ; %entry
815 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
816 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
817 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
818 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
819 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
820 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
821 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
822 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
823 ; GCN1-NEXT: buffer_wbinvl1_vol
824 ; GCN1-NEXT: s_endpgm
826 ; GCN2-LABEL: atomic_and_i32:
827 ; GCN2: ; %bb.0: ; %entry
828 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
829 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
830 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
831 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
832 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
833 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
834 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
835 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
836 ; GCN2-NEXT: buffer_wbinvl1_vol
837 ; GCN2-NEXT: s_endpgm
839 ; GCN3-LABEL: atomic_and_i32:
840 ; GCN3: ; %bb.0: ; %entry
841 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
842 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
843 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
844 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
845 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
846 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
847 ; GCN3-NEXT: flat_atomic_and v[0:1], v2
848 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
849 ; GCN3-NEXT: buffer_wbinvl1_vol
850 ; GCN3-NEXT: s_endpgm
852 %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
856 define amdgpu_kernel void @atomic_and_i32_ret(ptr %out, ptr %out2, i32 %in) {
857 ; GCN1-LABEL: atomic_and_i32_ret:
858 ; GCN1: ; %bb.0: ; %entry
859 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
860 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
861 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
862 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
863 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
864 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
865 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
866 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
867 ; GCN1-NEXT: buffer_wbinvl1_vol
868 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
869 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
870 ; GCN1-NEXT: flat_store_dword v[0:1], v2
871 ; GCN1-NEXT: s_endpgm
873 ; GCN2-LABEL: atomic_and_i32_ret:
874 ; GCN2: ; %bb.0: ; %entry
875 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
876 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
877 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
878 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
879 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
880 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
881 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
882 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
883 ; GCN2-NEXT: buffer_wbinvl1_vol
884 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
885 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
886 ; GCN2-NEXT: flat_store_dword v[0:1], v2
887 ; GCN2-NEXT: s_endpgm
889 ; GCN3-LABEL: atomic_and_i32_ret:
890 ; GCN3: ; %bb.0: ; %entry
891 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
892 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
893 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
894 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
895 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
896 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
897 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc
898 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
899 ; GCN3-NEXT: buffer_wbinvl1_vol
900 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
901 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
902 ; GCN3-NEXT: flat_store_dword v[0:1], v2
903 ; GCN3-NEXT: s_endpgm
905 %val = atomicrmw volatile and ptr %out, i32 %in syncscope("agent") seq_cst
906 store i32 %val, ptr %out2
910 define amdgpu_kernel void @atomic_and_i32_addr64(ptr %out, i32 %in, i64 %index) {
911 ; GCN1-LABEL: atomic_and_i32_addr64:
912 ; GCN1: ; %bb.0: ; %entry
913 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
914 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
915 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
916 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
917 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
918 ; GCN1-NEXT: s_add_u32 s0, s4, s0
919 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
920 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
921 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
922 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
923 ; GCN1-NEXT: flat_atomic_and v[0:1], v2
924 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
925 ; GCN1-NEXT: buffer_wbinvl1_vol
926 ; GCN1-NEXT: s_endpgm
928 ; GCN2-LABEL: atomic_and_i32_addr64:
929 ; GCN2: ; %bb.0: ; %entry
930 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
931 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
932 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
933 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
934 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
935 ; GCN2-NEXT: s_add_u32 s0, s4, s0
936 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
937 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
938 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
939 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
940 ; GCN2-NEXT: flat_atomic_and v[0:1], v2
941 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
942 ; GCN2-NEXT: buffer_wbinvl1_vol
943 ; GCN2-NEXT: s_endpgm
945 ; GCN3-LABEL: atomic_and_i32_addr64:
946 ; GCN3: ; %bb.0: ; %entry
947 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
948 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
949 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
950 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
951 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
952 ; GCN3-NEXT: s_add_u32 s0, s4, s0
953 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
954 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
955 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
956 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
957 ; GCN3-NEXT: flat_atomic_and v[0:1], v2
958 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
959 ; GCN3-NEXT: buffer_wbinvl1_vol
960 ; GCN3-NEXT: s_endpgm
962 %ptr = getelementptr i32, ptr %out, i64 %index
963 %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
967 define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
968 ; GCN1-LABEL: atomic_and_i32_ret_addr64:
969 ; GCN1: ; %bb.0: ; %entry
970 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
971 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
972 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
973 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
974 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
975 ; GCN1-NEXT: s_add_u32 s0, s4, s0
976 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
977 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
978 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
979 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
980 ; GCN1-NEXT: flat_atomic_and v2, v[0:1], v2 glc
981 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
982 ; GCN1-NEXT: buffer_wbinvl1_vol
983 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
984 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
985 ; GCN1-NEXT: flat_store_dword v[0:1], v2
986 ; GCN1-NEXT: s_endpgm
988 ; GCN2-LABEL: atomic_and_i32_ret_addr64:
989 ; GCN2: ; %bb.0: ; %entry
990 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
991 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
992 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
993 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
994 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
995 ; GCN2-NEXT: s_add_u32 s0, s4, s0
996 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
997 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
998 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
999 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
1000 ; GCN2-NEXT: flat_atomic_and v2, v[0:1], v2 glc
1001 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1002 ; GCN2-NEXT: buffer_wbinvl1_vol
1003 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1004 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1005 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1006 ; GCN2-NEXT: s_endpgm
1008 ; GCN3-LABEL: atomic_and_i32_ret_addr64:
1009 ; GCN3: ; %bb.0: ; %entry
1010 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1011 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1012 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
1013 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1014 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1015 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1016 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1017 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1018 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1019 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
1020 ; GCN3-NEXT: flat_atomic_and v2, v[0:1], v2 glc
1021 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1022 ; GCN3-NEXT: buffer_wbinvl1_vol
1023 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1024 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1025 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1026 ; GCN3-NEXT: s_endpgm
1028 %ptr = getelementptr i32, ptr %out, i64 %index
1029 %val = atomicrmw volatile and ptr %ptr, i32 %in syncscope("agent") seq_cst
1030 store i32 %val, ptr %out2
1034 define amdgpu_kernel void @atomic_sub_i32_offset(ptr %out, i32 %in) {
1035 ; GCN1-LABEL: atomic_sub_i32_offset:
1036 ; GCN1: ; %bb.0: ; %entry
1037 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
1038 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
1039 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1040 ; GCN1-NEXT: s_add_u32 s0, s2, 16
1041 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
1042 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1043 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1044 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1045 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1046 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1047 ; GCN1-NEXT: buffer_wbinvl1_vol
1048 ; GCN1-NEXT: s_endpgm
1050 ; GCN2-LABEL: atomic_sub_i32_offset:
1051 ; GCN2: ; %bb.0: ; %entry
1052 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1053 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
1054 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1055 ; GCN2-NEXT: s_add_u32 s0, s2, 16
1056 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
1057 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1058 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1059 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1060 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1061 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1062 ; GCN2-NEXT: buffer_wbinvl1_vol
1063 ; GCN2-NEXT: s_endpgm
1065 ; GCN3-LABEL: atomic_sub_i32_offset:
1066 ; GCN3: ; %bb.0: ; %entry
1067 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1068 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
1069 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
1071 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
1072 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1073 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
1074 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1075 ; GCN3-NEXT: buffer_wbinvl1_vol
1076 ; GCN3-NEXT: s_endpgm
1078 %gep = getelementptr i32, ptr %out, i32 4
1079 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1083 define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
1084 ; GCN1-LABEL: atomic_sub_i32_ret_offset:
1085 ; GCN1: ; %bb.0: ; %entry
1086 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1087 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
1088 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1089 ; GCN1-NEXT: s_add_u32 s0, s4, 16
1090 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
1091 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1092 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1093 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
1094 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1095 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1096 ; GCN1-NEXT: buffer_wbinvl1_vol
1097 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1098 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1099 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1100 ; GCN1-NEXT: s_endpgm
1102 ; GCN2-LABEL: atomic_sub_i32_ret_offset:
1103 ; GCN2: ; %bb.0: ; %entry
1104 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1105 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
1106 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1107 ; GCN2-NEXT: s_add_u32 s0, s4, 16
1108 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
1109 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1110 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1111 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
1112 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1113 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1114 ; GCN2-NEXT: buffer_wbinvl1_vol
1115 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1116 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1117 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1118 ; GCN2-NEXT: s_endpgm
1120 ; GCN3-LABEL: atomic_sub_i32_ret_offset:
1121 ; GCN3: ; %bb.0: ; %entry
1122 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1123 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
1124 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1125 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1126 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1127 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
1128 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc
1129 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1130 ; GCN3-NEXT: buffer_wbinvl1_vol
1131 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1132 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1133 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1134 ; GCN3-NEXT: s_endpgm
1136 %gep = getelementptr i32, ptr %out, i32 4
1137 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1138 store i32 %val, ptr %out2
1142 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
1143 ; GCN1-LABEL: atomic_sub_i32_addr64_offset:
1144 ; GCN1: ; %bb.0: ; %entry
1145 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
1146 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1147 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
1148 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1149 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1150 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1151 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1152 ; GCN1-NEXT: s_add_u32 s0, s0, 16
1153 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
1154 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1155 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1156 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1157 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1158 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1159 ; GCN1-NEXT: buffer_wbinvl1_vol
1160 ; GCN1-NEXT: s_endpgm
1162 ; GCN2-LABEL: atomic_sub_i32_addr64_offset:
1163 ; GCN2: ; %bb.0: ; %entry
1164 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1165 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1166 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
1167 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1168 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1169 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1170 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1171 ; GCN2-NEXT: s_add_u32 s0, s0, 16
1172 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
1173 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1174 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1175 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1176 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1177 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1178 ; GCN2-NEXT: buffer_wbinvl1_vol
1179 ; GCN2-NEXT: s_endpgm
1181 ; GCN3-LABEL: atomic_sub_i32_addr64_offset:
1182 ; GCN3: ; %bb.0: ; %entry
1183 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1184 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1185 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
1186 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1187 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1188 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1189 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1190 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1191 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1192 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1193 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2 offset:16
1194 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1195 ; GCN3-NEXT: buffer_wbinvl1_vol
1196 ; GCN3-NEXT: s_endpgm
1198 %ptr = getelementptr i32, ptr %out, i64 %index
1199 %gep = getelementptr i32, ptr %ptr, i32 4
1200 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1204 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
1205 ; GCN1-LABEL: atomic_sub_i32_ret_addr64_offset:
1206 ; GCN1: ; %bb.0: ; %entry
1207 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
1208 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1209 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
1210 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1211 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1212 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1213 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1214 ; GCN1-NEXT: s_add_u32 s0, s0, 16
1215 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
1216 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1217 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1218 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
1219 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1220 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1221 ; GCN1-NEXT: buffer_wbinvl1_vol
1222 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1223 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1224 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1225 ; GCN1-NEXT: s_endpgm
1227 ; GCN2-LABEL: atomic_sub_i32_ret_addr64_offset:
1228 ; GCN2: ; %bb.0: ; %entry
1229 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1230 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1231 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
1232 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1233 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1234 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1235 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1236 ; GCN2-NEXT: s_add_u32 s0, s0, 16
1237 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
1238 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1239 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1240 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
1241 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1242 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1243 ; GCN2-NEXT: buffer_wbinvl1_vol
1244 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1245 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1246 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1247 ; GCN2-NEXT: s_endpgm
1249 ; GCN3-LABEL: atomic_sub_i32_ret_addr64_offset:
1250 ; GCN3: ; %bb.0: ; %entry
1251 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1252 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1253 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
1254 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1255 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1256 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1257 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1258 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1259 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1260 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
1261 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 offset:16 glc
1262 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1263 ; GCN3-NEXT: buffer_wbinvl1_vol
1264 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1265 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1266 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1267 ; GCN3-NEXT: s_endpgm
1269 %ptr = getelementptr i32, ptr %out, i64 %index
1270 %gep = getelementptr i32, ptr %ptr, i32 4
1271 %val = atomicrmw volatile sub ptr %gep, i32 %in syncscope("agent") seq_cst
1272 store i32 %val, ptr %out2
1276 define amdgpu_kernel void @atomic_sub_i32(ptr %out, i32 %in) {
1277 ; GCN1-LABEL: atomic_sub_i32:
1278 ; GCN1: ; %bb.0: ; %entry
1279 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
1280 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
1281 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1282 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
1283 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
1284 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
1285 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1286 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1287 ; GCN1-NEXT: buffer_wbinvl1_vol
1288 ; GCN1-NEXT: s_endpgm
1290 ; GCN2-LABEL: atomic_sub_i32:
1291 ; GCN2: ; %bb.0: ; %entry
1292 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1293 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
1294 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1295 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
1296 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
1297 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
1298 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1299 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1300 ; GCN2-NEXT: buffer_wbinvl1_vol
1301 ; GCN2-NEXT: s_endpgm
1303 ; GCN3-LABEL: atomic_sub_i32:
1304 ; GCN3: ; %bb.0: ; %entry
1305 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1306 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
1307 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1308 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
1309 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
1310 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1311 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2
1312 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1313 ; GCN3-NEXT: buffer_wbinvl1_vol
1314 ; GCN3-NEXT: s_endpgm
1316 %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
1320 define amdgpu_kernel void @atomic_sub_i32_ret(ptr %out, ptr %out2, i32 %in) {
1321 ; GCN1-LABEL: atomic_sub_i32_ret:
1322 ; GCN1: ; %bb.0: ; %entry
1323 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1324 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
1325 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1326 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1327 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1328 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
1329 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1330 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1331 ; GCN1-NEXT: buffer_wbinvl1_vol
1332 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1333 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1334 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1335 ; GCN1-NEXT: s_endpgm
1337 ; GCN2-LABEL: atomic_sub_i32_ret:
1338 ; GCN2: ; %bb.0: ; %entry
1339 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1340 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
1341 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1342 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1343 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1344 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
1345 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1346 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1347 ; GCN2-NEXT: buffer_wbinvl1_vol
1348 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1349 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1350 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1351 ; GCN2-NEXT: s_endpgm
1353 ; GCN3-LABEL: atomic_sub_i32_ret:
1354 ; GCN3: ; %bb.0: ; %entry
1355 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1356 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
1357 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1358 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1359 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1360 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
1361 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1362 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1363 ; GCN3-NEXT: buffer_wbinvl1_vol
1364 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1365 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1366 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1367 ; GCN3-NEXT: s_endpgm
1369 %val = atomicrmw volatile sub ptr %out, i32 %in syncscope("agent") seq_cst
1370 store i32 %val, ptr %out2
1374 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr %out, i32 %in, i64 %index) {
1375 ; GCN1-LABEL: atomic_sub_i32_addr64:
1376 ; GCN1: ; %bb.0: ; %entry
1377 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
1378 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1379 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
1380 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1381 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1382 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1383 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1384 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1385 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1386 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1387 ; GCN1-NEXT: flat_atomic_sub v[0:1], v2
1388 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1389 ; GCN1-NEXT: buffer_wbinvl1_vol
1390 ; GCN1-NEXT: s_endpgm
1392 ; GCN2-LABEL: atomic_sub_i32_addr64:
1393 ; GCN2: ; %bb.0: ; %entry
1394 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1395 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1396 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
1397 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1398 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1399 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1400 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1401 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1402 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1403 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1404 ; GCN2-NEXT: flat_atomic_sub v[0:1], v2
1405 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1406 ; GCN2-NEXT: buffer_wbinvl1_vol
1407 ; GCN2-NEXT: s_endpgm
1409 ; GCN3-LABEL: atomic_sub_i32_addr64:
1410 ; GCN3: ; %bb.0: ; %entry
1411 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1412 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1413 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
1414 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1415 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1416 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1417 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1418 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1419 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1420 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1421 ; GCN3-NEXT: flat_atomic_sub v[0:1], v2
1422 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1423 ; GCN3-NEXT: buffer_wbinvl1_vol
1424 ; GCN3-NEXT: s_endpgm
1426 %ptr = getelementptr i32, ptr %out, i64 %index
1427 %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
1431 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
1432 ; GCN1-LABEL: atomic_sub_i32_ret_addr64:
1433 ; GCN1: ; %bb.0: ; %entry
1434 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
1435 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1436 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
1437 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1438 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1439 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1440 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1441 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1442 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1443 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
1444 ; GCN1-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1445 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1446 ; GCN1-NEXT: buffer_wbinvl1_vol
1447 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1448 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1449 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1450 ; GCN1-NEXT: s_endpgm
1452 ; GCN2-LABEL: atomic_sub_i32_ret_addr64:
1453 ; GCN2: ; %bb.0: ; %entry
1454 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1455 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1456 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
1457 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1458 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1459 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1460 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1461 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1462 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1463 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
1464 ; GCN2-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1465 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1466 ; GCN2-NEXT: buffer_wbinvl1_vol
1467 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1468 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1469 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1470 ; GCN2-NEXT: s_endpgm
1472 ; GCN3-LABEL: atomic_sub_i32_ret_addr64:
1473 ; GCN3: ; %bb.0: ; %entry
1474 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1475 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1476 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
1477 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1478 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1479 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1480 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1481 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1482 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1483 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
1484 ; GCN3-NEXT: flat_atomic_sub v2, v[0:1], v2 glc
1485 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1486 ; GCN3-NEXT: buffer_wbinvl1_vol
1487 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1488 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1489 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1490 ; GCN3-NEXT: s_endpgm
1492 %ptr = getelementptr i32, ptr %out, i64 %index
1493 %val = atomicrmw volatile sub ptr %ptr, i32 %in syncscope("agent") seq_cst
1494 store i32 %val, ptr %out2
1498 define amdgpu_kernel void @atomic_max_i32_offset(ptr %out, i32 %in) {
1499 ; GCN1-LABEL: atomic_max_i32_offset:
1500 ; GCN1: ; %bb.0: ; %entry
1501 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
1502 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
1503 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1504 ; GCN1-NEXT: s_add_u32 s0, s2, 16
1505 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
1506 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1507 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1508 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1509 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2
1510 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1511 ; GCN1-NEXT: s_endpgm
1513 ; GCN2-LABEL: atomic_max_i32_offset:
1514 ; GCN2: ; %bb.0: ; %entry
1515 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1516 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
1517 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1518 ; GCN2-NEXT: s_add_u32 s0, s2, 16
1519 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
1520 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1521 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1522 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1523 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2
1524 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1525 ; GCN2-NEXT: s_endpgm
1527 ; GCN3-LABEL: atomic_max_i32_offset:
1528 ; GCN3: ; %bb.0: ; %entry
1529 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1530 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
1531 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1532 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
1533 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
1534 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1535 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
1536 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1537 ; GCN3-NEXT: s_endpgm
1539 %gep = getelementptr i32, ptr %out, i32 4
1540 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1544 define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
1545 ; GCN1-LABEL: atomic_max_i32_ret_offset:
1546 ; GCN1: ; %bb.0: ; %entry
1547 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1548 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
1549 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1550 ; GCN1-NEXT: s_add_u32 s0, s4, 16
1551 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
1552 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1553 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1554 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
1555 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1556 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1557 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1558 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1559 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1560 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1561 ; GCN1-NEXT: s_endpgm
1563 ; GCN2-LABEL: atomic_max_i32_ret_offset:
1564 ; GCN2: ; %bb.0: ; %entry
1565 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1566 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
1567 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1568 ; GCN2-NEXT: s_add_u32 s0, s4, 16
1569 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
1570 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1571 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1572 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
1573 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1574 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1575 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1576 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1577 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1578 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1579 ; GCN2-NEXT: s_endpgm
1581 ; GCN3-LABEL: atomic_max_i32_ret_offset:
1582 ; GCN3: ; %bb.0: ; %entry
1583 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1584 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
1585 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1586 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1587 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1588 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
1589 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc
1590 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1591 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1592 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1593 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1594 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1595 ; GCN3-NEXT: s_endpgm
1597 %gep = getelementptr i32, ptr %out, i32 4
1598 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1599 store i32 %val, ptr %out2
1603 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
1604 ; GCN1-LABEL: atomic_max_i32_addr64_offset:
1605 ; GCN1: ; %bb.0: ; %entry
1606 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
1607 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1608 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
1609 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1610 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1611 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1612 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1613 ; GCN1-NEXT: s_add_u32 s0, s0, 16
1614 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
1615 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1616 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1617 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1618 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2
1619 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1620 ; GCN1-NEXT: s_endpgm
1622 ; GCN2-LABEL: atomic_max_i32_addr64_offset:
1623 ; GCN2: ; %bb.0: ; %entry
1624 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1625 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1626 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
1627 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1628 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1629 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1630 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1631 ; GCN2-NEXT: s_add_u32 s0, s0, 16
1632 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
1633 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1634 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1635 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1636 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2
1637 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1638 ; GCN2-NEXT: s_endpgm
1640 ; GCN3-LABEL: atomic_max_i32_addr64_offset:
1641 ; GCN3: ; %bb.0: ; %entry
1642 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1643 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1644 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
1645 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1646 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1647 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1648 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1649 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1650 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1651 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1652 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2 offset:16
1653 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1654 ; GCN3-NEXT: s_endpgm
1656 %ptr = getelementptr i32, ptr %out, i64 %index
1657 %gep = getelementptr i32, ptr %ptr, i32 4
1658 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1662 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
1663 ; GCN1-LABEL: atomic_max_i32_ret_addr64_offset:
1664 ; GCN1: ; %bb.0: ; %entry
1665 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
1666 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1667 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
1668 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1669 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1670 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1671 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1672 ; GCN1-NEXT: s_add_u32 s0, s0, 16
1673 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
1674 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1675 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1676 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
1677 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1678 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1679 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1680 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1681 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1682 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1683 ; GCN1-NEXT: s_endpgm
1685 ; GCN2-LABEL: atomic_max_i32_ret_addr64_offset:
1686 ; GCN2: ; %bb.0: ; %entry
1687 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1688 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1689 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
1690 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1691 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1692 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1693 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1694 ; GCN2-NEXT: s_add_u32 s0, s0, 16
1695 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
1696 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1697 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1698 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
1699 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1700 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1701 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1702 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1703 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1704 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1705 ; GCN2-NEXT: s_endpgm
1707 ; GCN3-LABEL: atomic_max_i32_ret_addr64_offset:
1708 ; GCN3: ; %bb.0: ; %entry
1709 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1710 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1711 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
1712 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1713 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1714 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1715 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1716 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1717 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1718 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
1719 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 offset:16 glc
1720 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1721 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1722 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1723 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1724 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1725 ; GCN3-NEXT: s_endpgm
1727 %ptr = getelementptr i32, ptr %out, i64 %index
1728 %gep = getelementptr i32, ptr %ptr, i32 4
1729 %val = atomicrmw volatile max ptr %gep, i32 %in syncscope("workgroup") seq_cst
1730 store i32 %val, ptr %out2
1734 define amdgpu_kernel void @atomic_max_i32(ptr %out, i32 %in) {
1735 ; GCN1-LABEL: atomic_max_i32:
1736 ; GCN1: ; %bb.0: ; %entry
1737 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
1738 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
1739 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1740 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
1741 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
1742 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
1743 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2
1744 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1745 ; GCN1-NEXT: s_endpgm
1747 ; GCN2-LABEL: atomic_max_i32:
1748 ; GCN2: ; %bb.0: ; %entry
1749 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1750 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
1751 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1752 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
1753 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
1754 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
1755 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2
1756 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1757 ; GCN2-NEXT: s_endpgm
1759 ; GCN3-LABEL: atomic_max_i32:
1760 ; GCN3: ; %bb.0: ; %entry
1761 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1762 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
1763 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1764 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
1765 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
1766 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1767 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2
1768 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1769 ; GCN3-NEXT: s_endpgm
1771 %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
1775 define amdgpu_kernel void @atomic_max_i32_ret(ptr %out, ptr %out2, i32 %in) {
1776 ; GCN1-LABEL: atomic_max_i32_ret:
1777 ; GCN1: ; %bb.0: ; %entry
1778 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1779 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
1780 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1781 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
1782 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
1783 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
1784 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1785 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1786 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1787 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1788 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1789 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1790 ; GCN1-NEXT: s_endpgm
1792 ; GCN2-LABEL: atomic_max_i32_ret:
1793 ; GCN2: ; %bb.0: ; %entry
1794 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1795 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
1796 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1797 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
1798 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
1799 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
1800 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1801 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1802 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1803 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1804 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1805 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1806 ; GCN2-NEXT: s_endpgm
1808 ; GCN3-LABEL: atomic_max_i32_ret:
1809 ; GCN3: ; %bb.0: ; %entry
1810 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1811 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
1812 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1813 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
1814 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
1815 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
1816 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1817 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1818 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1819 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1820 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1821 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1822 ; GCN3-NEXT: s_endpgm
1824 %val = atomicrmw volatile max ptr %out, i32 %in syncscope("workgroup") seq_cst
1825 store i32 %val, ptr %out2
1829 define amdgpu_kernel void @atomic_max_i32_addr64(ptr %out, i32 %in, i64 %index) {
1830 ; GCN1-LABEL: atomic_max_i32_addr64:
1831 ; GCN1: ; %bb.0: ; %entry
1832 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
1833 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
1834 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
1835 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1836 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1837 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1838 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1839 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1840 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1841 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
1842 ; GCN1-NEXT: flat_atomic_smax v[0:1], v2
1843 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1844 ; GCN1-NEXT: s_endpgm
1846 ; GCN2-LABEL: atomic_max_i32_addr64:
1847 ; GCN2: ; %bb.0: ; %entry
1848 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1849 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1850 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
1851 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1852 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1853 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1854 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1855 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1856 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1857 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
1858 ; GCN2-NEXT: flat_atomic_smax v[0:1], v2
1859 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GCN2-NEXT: s_endpgm
1862 ; GCN3-LABEL: atomic_max_i32_addr64:
1863 ; GCN3: ; %bb.0: ; %entry
1864 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1865 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
1866 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
1867 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1868 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1869 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1870 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1871 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1872 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1873 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
1874 ; GCN3-NEXT: flat_atomic_smax v[0:1], v2
1875 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1876 ; GCN3-NEXT: s_endpgm
1878 %ptr = getelementptr i32, ptr %out, i64 %index
1879 %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
1883 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
1884 ; GCN1-LABEL: atomic_max_i32_ret_addr64:
1885 ; GCN1: ; %bb.0: ; %entry
1886 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
1887 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
1888 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
1889 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1890 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1891 ; GCN1-NEXT: s_add_u32 s0, s4, s0
1892 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
1893 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1894 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1895 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
1896 ; GCN1-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1897 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1898 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
1899 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
1900 ; GCN1-NEXT: s_waitcnt vmcnt(0)
1901 ; GCN1-NEXT: flat_store_dword v[0:1], v2
1902 ; GCN1-NEXT: s_endpgm
1904 ; GCN2-LABEL: atomic_max_i32_ret_addr64:
1905 ; GCN2: ; %bb.0: ; %entry
1906 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1907 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1908 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
1909 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1910 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1911 ; GCN2-NEXT: s_add_u32 s0, s4, s0
1912 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
1913 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1914 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1915 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
1916 ; GCN2-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1917 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1918 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
1919 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
1920 ; GCN2-NEXT: s_waitcnt vmcnt(0)
1921 ; GCN2-NEXT: flat_store_dword v[0:1], v2
1922 ; GCN2-NEXT: s_endpgm
1924 ; GCN3-LABEL: atomic_max_i32_ret_addr64:
1925 ; GCN3: ; %bb.0: ; %entry
1926 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
1927 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1928 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
1929 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1930 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
1931 ; GCN3-NEXT: s_add_u32 s0, s4, s0
1932 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
1933 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
1934 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
1935 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
1936 ; GCN3-NEXT: flat_atomic_smax v2, v[0:1], v2 glc
1937 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1938 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
1939 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
1940 ; GCN3-NEXT: s_waitcnt vmcnt(0)
1941 ; GCN3-NEXT: flat_store_dword v[0:1], v2
1942 ; GCN3-NEXT: s_endpgm
1944 %ptr = getelementptr i32, ptr %out, i64 %index
1945 %val = atomicrmw volatile max ptr %ptr, i32 %in syncscope("workgroup") seq_cst
1946 store i32 %val, ptr %out2
1950 define amdgpu_kernel void @atomic_umax_i32_offset(ptr %out, i32 %in) {
1951 ; GCN1-LABEL: atomic_umax_i32_offset:
1952 ; GCN1: ; %bb.0: ; %entry
1953 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
1954 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
1955 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1956 ; GCN1-NEXT: s_add_u32 s0, s2, 16
1957 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
1958 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
1959 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
1960 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
1961 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2
1962 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
1963 ; GCN1-NEXT: s_endpgm
1965 ; GCN2-LABEL: atomic_umax_i32_offset:
1966 ; GCN2: ; %bb.0: ; %entry
1967 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1968 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
1969 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1970 ; GCN2-NEXT: s_add_u32 s0, s2, 16
1971 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
1972 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
1973 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
1974 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
1975 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2
1976 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
1977 ; GCN2-NEXT: s_endpgm
1979 ; GCN3-LABEL: atomic_umax_i32_offset:
1980 ; GCN3: ; %bb.0: ; %entry
1981 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
1982 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
1983 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1984 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
1985 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
1986 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
1987 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
1988 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
1989 ; GCN3-NEXT: s_endpgm
1991 %gep = getelementptr i32, ptr %out, i32 4
1992 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
1996 define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
1997 ; GCN1-LABEL: atomic_umax_i32_ret_offset:
1998 ; GCN1: ; %bb.0: ; %entry
1999 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2000 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
2001 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2002 ; GCN1-NEXT: s_add_u32 s0, s4, 16
2003 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
2004 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2005 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2006 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
2007 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2008 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2009 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2010 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2011 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2012 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2013 ; GCN1-NEXT: s_endpgm
2015 ; GCN2-LABEL: atomic_umax_i32_ret_offset:
2016 ; GCN2: ; %bb.0: ; %entry
2017 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2018 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
2019 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2020 ; GCN2-NEXT: s_add_u32 s0, s4, 16
2021 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
2022 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2023 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2024 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
2025 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2026 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2027 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2028 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2029 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2030 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2031 ; GCN2-NEXT: s_endpgm
2033 ; GCN3-LABEL: atomic_umax_i32_ret_offset:
2034 ; GCN3: ; %bb.0: ; %entry
2035 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2036 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
2037 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2038 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2039 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2040 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
2041 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc
2042 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2043 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2044 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2045 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2046 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2047 ; GCN3-NEXT: s_endpgm
2049 %gep = getelementptr i32, ptr %out, i32 4
2050 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
2051 store i32 %val, ptr %out2
2055 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
2056 ; GCN1-LABEL: atomic_umax_i32_addr64_offset:
2057 ; GCN1: ; %bb.0: ; %entry
2058 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
2059 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2060 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
2061 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2062 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2063 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2064 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2065 ; GCN1-NEXT: s_add_u32 s0, s0, 16
2066 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
2067 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2068 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2069 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2070 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2
2071 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2072 ; GCN1-NEXT: s_endpgm
2074 ; GCN2-LABEL: atomic_umax_i32_addr64_offset:
2075 ; GCN2: ; %bb.0: ; %entry
2076 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2077 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2078 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
2079 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2080 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2081 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2082 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2083 ; GCN2-NEXT: s_add_u32 s0, s0, 16
2084 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
2085 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2086 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2087 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2088 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2
2089 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2090 ; GCN2-NEXT: s_endpgm
2092 ; GCN3-LABEL: atomic_umax_i32_addr64_offset:
2093 ; GCN3: ; %bb.0: ; %entry
2094 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2095 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2096 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
2097 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2098 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2099 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2100 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2101 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2102 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2103 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2104 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2 offset:16
2105 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2106 ; GCN3-NEXT: s_endpgm
2108 %ptr = getelementptr i32, ptr %out, i64 %index
2109 %gep = getelementptr i32, ptr %ptr, i32 4
2110 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
2114 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
2115 ; GCN1-LABEL: atomic_umax_i32_ret_addr64_offset:
2116 ; GCN1: ; %bb.0: ; %entry
2117 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
2118 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2119 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
2120 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2121 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2122 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2123 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2124 ; GCN1-NEXT: s_add_u32 s0, s0, 16
2125 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
2126 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2127 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2128 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
2129 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2130 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2131 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2132 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2133 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2134 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2135 ; GCN1-NEXT: s_endpgm
2137 ; GCN2-LABEL: atomic_umax_i32_ret_addr64_offset:
2138 ; GCN2: ; %bb.0: ; %entry
2139 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2140 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2141 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
2142 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2143 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2144 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2145 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2146 ; GCN2-NEXT: s_add_u32 s0, s0, 16
2147 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
2148 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2149 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2150 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
2151 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2152 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2153 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2154 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2155 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2156 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2157 ; GCN2-NEXT: s_endpgm
2159 ; GCN3-LABEL: atomic_umax_i32_ret_addr64_offset:
2160 ; GCN3: ; %bb.0: ; %entry
2161 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2162 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2163 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
2164 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2165 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2166 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2167 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2168 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2169 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2170 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
2171 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 offset:16 glc
2172 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2173 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2174 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2175 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2176 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2177 ; GCN3-NEXT: s_endpgm
2179 %ptr = getelementptr i32, ptr %out, i64 %index
2180 %gep = getelementptr i32, ptr %ptr, i32 4
2181 %val = atomicrmw volatile umax ptr %gep, i32 %in syncscope("workgroup") seq_cst
2182 store i32 %val, ptr %out2
2186 define amdgpu_kernel void @atomic_umax_i32(ptr %out, i32 %in) {
2187 ; GCN1-LABEL: atomic_umax_i32:
2188 ; GCN1: ; %bb.0: ; %entry
2189 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
2190 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
2191 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2192 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
2193 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
2194 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
2195 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2
2196 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2197 ; GCN1-NEXT: s_endpgm
2199 ; GCN2-LABEL: atomic_umax_i32:
2200 ; GCN2: ; %bb.0: ; %entry
2201 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2202 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
2203 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2204 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
2205 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
2206 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
2207 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2
2208 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2209 ; GCN2-NEXT: s_endpgm
2211 ; GCN3-LABEL: atomic_umax_i32:
2212 ; GCN3: ; %bb.0: ; %entry
2213 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2214 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
2215 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2216 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
2217 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
2218 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2219 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2
2220 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2221 ; GCN3-NEXT: s_endpgm
2223 %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
2227 define amdgpu_kernel void @atomic_umax_i32_ret(ptr %out, ptr %out2, i32 %in) {
2228 ; GCN1-LABEL: atomic_umax_i32_ret:
2229 ; GCN1: ; %bb.0: ; %entry
2230 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2231 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
2232 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2233 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2234 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2235 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
2236 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2237 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2238 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2239 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2240 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2241 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2242 ; GCN1-NEXT: s_endpgm
2244 ; GCN2-LABEL: atomic_umax_i32_ret:
2245 ; GCN2: ; %bb.0: ; %entry
2246 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2247 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
2248 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2249 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2250 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2251 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
2252 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2253 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2254 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2255 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2256 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2257 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2258 ; GCN2-NEXT: s_endpgm
2260 ; GCN3-LABEL: atomic_umax_i32_ret:
2261 ; GCN3: ; %bb.0: ; %entry
2262 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2263 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
2264 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2265 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2266 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2267 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
2268 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2269 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2270 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2271 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2272 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2273 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2274 ; GCN3-NEXT: s_endpgm
2276 %val = atomicrmw volatile umax ptr %out, i32 %in syncscope("workgroup") seq_cst
2277 store i32 %val, ptr %out2
2281 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr %out, i32 %in, i64 %index) {
2282 ; GCN1-LABEL: atomic_umax_i32_addr64:
2283 ; GCN1: ; %bb.0: ; %entry
2284 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
2285 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2286 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
2287 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2288 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2289 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2290 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2291 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2292 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2293 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2294 ; GCN1-NEXT: flat_atomic_umax v[0:1], v2
2295 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2296 ; GCN1-NEXT: s_endpgm
2298 ; GCN2-LABEL: atomic_umax_i32_addr64:
2299 ; GCN2: ; %bb.0: ; %entry
2300 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2301 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2302 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
2303 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2304 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2305 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2306 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2307 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2308 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2309 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2310 ; GCN2-NEXT: flat_atomic_umax v[0:1], v2
2311 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2312 ; GCN2-NEXT: s_endpgm
2314 ; GCN3-LABEL: atomic_umax_i32_addr64:
2315 ; GCN3: ; %bb.0: ; %entry
2316 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2317 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2318 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
2319 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2320 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2321 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2322 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2323 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2324 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2325 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2326 ; GCN3-NEXT: flat_atomic_umax v[0:1], v2
2327 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2328 ; GCN3-NEXT: s_endpgm
2330 %ptr = getelementptr i32, ptr %out, i64 %index
2331 %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2335 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
2336 ; GCN1-LABEL: atomic_umax_i32_ret_addr64:
2337 ; GCN1: ; %bb.0: ; %entry
2338 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
2339 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2340 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
2341 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2342 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2343 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2344 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2345 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2346 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2347 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
2348 ; GCN1-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2349 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2350 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2351 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2352 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2353 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2354 ; GCN1-NEXT: s_endpgm
2356 ; GCN2-LABEL: atomic_umax_i32_ret_addr64:
2357 ; GCN2: ; %bb.0: ; %entry
2358 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2359 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2360 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
2361 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2362 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2363 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2364 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2365 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2366 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2367 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
2368 ; GCN2-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2369 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2370 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2371 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2372 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2373 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2374 ; GCN2-NEXT: s_endpgm
2376 ; GCN3-LABEL: atomic_umax_i32_ret_addr64:
2377 ; GCN3: ; %bb.0: ; %entry
2378 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2379 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2380 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
2381 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2382 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2383 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2384 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2385 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2386 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2387 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
2388 ; GCN3-NEXT: flat_atomic_umax v2, v[0:1], v2 glc
2389 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2390 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2391 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2392 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2393 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2394 ; GCN3-NEXT: s_endpgm
2396 %ptr = getelementptr i32, ptr %out, i64 %index
2397 %val = atomicrmw volatile umax ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2398 store i32 %val, ptr %out2
2402 define amdgpu_kernel void @atomic_min_i32_offset(ptr %out, i32 %in) {
2403 ; GCN1-LABEL: atomic_min_i32_offset:
2404 ; GCN1: ; %bb.0: ; %entry
2405 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
2406 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
2407 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2408 ; GCN1-NEXT: s_add_u32 s0, s2, 16
2409 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
2410 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2411 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2412 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
2413 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2
2414 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2415 ; GCN1-NEXT: s_endpgm
2417 ; GCN2-LABEL: atomic_min_i32_offset:
2418 ; GCN2: ; %bb.0: ; %entry
2419 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2420 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
2421 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2422 ; GCN2-NEXT: s_add_u32 s0, s2, 16
2423 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
2424 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2425 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2426 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
2427 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2
2428 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2429 ; GCN2-NEXT: s_endpgm
2431 ; GCN3-LABEL: atomic_min_i32_offset:
2432 ; GCN3: ; %bb.0: ; %entry
2433 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2434 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
2435 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2436 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
2437 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
2438 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2439 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
2440 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2441 ; GCN3-NEXT: s_endpgm
2443 %gep = getelementptr i32, ptr %out, i32 4
2444 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2448 define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
2449 ; GCN1-LABEL: atomic_min_i32_ret_offset:
2450 ; GCN1: ; %bb.0: ; %entry
2451 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2452 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
2453 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2454 ; GCN1-NEXT: s_add_u32 s0, s4, 16
2455 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
2456 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2457 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2458 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
2459 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2460 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2461 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2462 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2463 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2464 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2465 ; GCN1-NEXT: s_endpgm
2467 ; GCN2-LABEL: atomic_min_i32_ret_offset:
2468 ; GCN2: ; %bb.0: ; %entry
2469 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2470 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
2471 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2472 ; GCN2-NEXT: s_add_u32 s0, s4, 16
2473 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
2474 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2475 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2476 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
2477 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2478 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2479 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2480 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2481 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2482 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2483 ; GCN2-NEXT: s_endpgm
2485 ; GCN3-LABEL: atomic_min_i32_ret_offset:
2486 ; GCN3: ; %bb.0: ; %entry
2487 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2488 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
2489 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2490 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2491 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2492 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
2493 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
2494 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2495 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2496 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2497 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2498 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2499 ; GCN3-NEXT: s_endpgm
2501 %gep = getelementptr i32, ptr %out, i32 4
2502 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2503 store i32 %val, ptr %out2
2507 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
2508 ; GCN1-LABEL: atomic_min_i32_addr64_offset:
2509 ; GCN1: ; %bb.0: ; %entry
2510 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
2511 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2512 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
2513 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2514 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2515 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2516 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2517 ; GCN1-NEXT: s_add_u32 s0, s0, 16
2518 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
2519 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2520 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2521 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2522 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2
2523 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2524 ; GCN1-NEXT: s_endpgm
2526 ; GCN2-LABEL: atomic_min_i32_addr64_offset:
2527 ; GCN2: ; %bb.0: ; %entry
2528 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2529 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2530 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
2531 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2532 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2533 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2534 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2535 ; GCN2-NEXT: s_add_u32 s0, s0, 16
2536 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
2537 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2538 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2539 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2540 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2
2541 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2542 ; GCN2-NEXT: s_endpgm
2544 ; GCN3-LABEL: atomic_min_i32_addr64_offset:
2545 ; GCN3: ; %bb.0: ; %entry
2546 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2547 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2548 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
2549 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2550 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2551 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2552 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2553 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2554 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2555 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2556 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2 offset:16
2557 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2558 ; GCN3-NEXT: s_endpgm
2560 %ptr = getelementptr i32, ptr %out, i64 %index
2561 %gep = getelementptr i32, ptr %ptr, i32 4
2562 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2566 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
2567 ; GCN1-LABEL: atomic_min_i32_ret_addr64_offset:
2568 ; GCN1: ; %bb.0: ; %entry
2569 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
2570 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2571 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
2572 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2573 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2574 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2575 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2576 ; GCN1-NEXT: s_add_u32 s0, s0, 16
2577 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
2578 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2579 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2580 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
2581 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2582 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2583 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2584 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2585 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2586 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2587 ; GCN1-NEXT: s_endpgm
2589 ; GCN2-LABEL: atomic_min_i32_ret_addr64_offset:
2590 ; GCN2: ; %bb.0: ; %entry
2591 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2592 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2593 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
2594 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2595 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2596 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2597 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2598 ; GCN2-NEXT: s_add_u32 s0, s0, 16
2599 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
2600 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2601 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2602 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
2603 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2604 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2605 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2606 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2607 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2608 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2609 ; GCN2-NEXT: s_endpgm
2611 ; GCN3-LABEL: atomic_min_i32_ret_addr64_offset:
2612 ; GCN3: ; %bb.0: ; %entry
2613 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2614 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2615 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
2616 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2617 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2618 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2619 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2620 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2621 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2622 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
2623 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 offset:16 glc
2624 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2625 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2626 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2627 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2628 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2629 ; GCN3-NEXT: s_endpgm
2631 %ptr = getelementptr i32, ptr %out, i64 %index
2632 %gep = getelementptr i32, ptr %ptr, i32 4
2633 %val = atomicrmw volatile min ptr %gep, i32 %in syncscope("workgroup") seq_cst
2634 store i32 %val, ptr %out2
2638 define amdgpu_kernel void @atomic_min_i32(ptr %out, i32 %in) {
2639 ; GCN1-LABEL: atomic_min_i32:
2640 ; GCN1: ; %bb.0: ; %entry
2641 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
2642 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
2643 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2644 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
2645 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
2646 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
2647 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2
2648 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2649 ; GCN1-NEXT: s_endpgm
2651 ; GCN2-LABEL: atomic_min_i32:
2652 ; GCN2: ; %bb.0: ; %entry
2653 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2654 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
2655 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2656 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
2657 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
2658 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
2659 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2
2660 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2661 ; GCN2-NEXT: s_endpgm
2663 ; GCN3-LABEL: atomic_min_i32:
2664 ; GCN3: ; %bb.0: ; %entry
2665 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2666 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
2667 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2668 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
2669 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
2670 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2671 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2
2672 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2673 ; GCN3-NEXT: s_endpgm
2675 %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
2679 define amdgpu_kernel void @atomic_min_i32_ret(ptr %out, ptr %out2, i32 %in) {
2680 ; GCN1-LABEL: atomic_min_i32_ret:
2681 ; GCN1: ; %bb.0: ; %entry
2682 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2683 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
2684 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2685 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
2686 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
2687 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
2688 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2689 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2690 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2691 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2692 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2693 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2694 ; GCN1-NEXT: s_endpgm
2696 ; GCN2-LABEL: atomic_min_i32_ret:
2697 ; GCN2: ; %bb.0: ; %entry
2698 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2699 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
2700 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2701 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
2702 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
2703 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
2704 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2705 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2706 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2707 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2708 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2709 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2710 ; GCN2-NEXT: s_endpgm
2712 ; GCN3-LABEL: atomic_min_i32_ret:
2713 ; GCN3: ; %bb.0: ; %entry
2714 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2715 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
2716 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2717 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2718 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2719 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
2720 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2721 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2722 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2723 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2724 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2725 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2726 ; GCN3-NEXT: s_endpgm
2728 %val = atomicrmw volatile min ptr %out, i32 %in syncscope("workgroup") seq_cst
2729 store i32 %val, ptr %out2
2733 define amdgpu_kernel void @atomic_min_i32_addr64(ptr %out, i32 %in, i64 %index) {
2734 ; GCN1-LABEL: atomic_min_i32_addr64:
2735 ; GCN1: ; %bb.0: ; %entry
2736 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
2737 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2738 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
2739 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2740 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2741 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2742 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2743 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2744 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2745 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2746 ; GCN1-NEXT: flat_atomic_smin v[0:1], v2
2747 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2748 ; GCN1-NEXT: s_endpgm
2750 ; GCN2-LABEL: atomic_min_i32_addr64:
2751 ; GCN2: ; %bb.0: ; %entry
2752 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2753 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2754 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
2755 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2756 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2757 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2758 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2759 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2760 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2761 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2762 ; GCN2-NEXT: flat_atomic_smin v[0:1], v2
2763 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2764 ; GCN2-NEXT: s_endpgm
2766 ; GCN3-LABEL: atomic_min_i32_addr64:
2767 ; GCN3: ; %bb.0: ; %entry
2768 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2769 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2770 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
2771 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2772 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2773 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2774 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2775 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2776 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2777 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
2778 ; GCN3-NEXT: flat_atomic_smin v[0:1], v2
2779 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2780 ; GCN3-NEXT: s_endpgm
2782 %ptr = getelementptr i32, ptr %out, i64 %index
2783 %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2787 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
2788 ; GCN1-LABEL: atomic_min_i32_ret_addr64:
2789 ; GCN1: ; %bb.0: ; %entry
2790 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
2791 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2792 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
2793 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2794 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2795 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2796 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2797 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2798 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2799 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
2800 ; GCN1-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2801 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2802 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2803 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2804 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2805 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2806 ; GCN1-NEXT: s_endpgm
2808 ; GCN2-LABEL: atomic_min_i32_ret_addr64:
2809 ; GCN2: ; %bb.0: ; %entry
2810 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2811 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2812 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
2813 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2814 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2815 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2816 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2817 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2818 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2819 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
2820 ; GCN2-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2821 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2822 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2823 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2824 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2825 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2826 ; GCN2-NEXT: s_endpgm
2828 ; GCN3-LABEL: atomic_min_i32_ret_addr64:
2829 ; GCN3: ; %bb.0: ; %entry
2830 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
2831 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2832 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
2833 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2834 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2835 ; GCN3-NEXT: s_add_u32 s0, s4, s0
2836 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
2837 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
2838 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
2839 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
2840 ; GCN3-NEXT: flat_atomic_smin v2, v[0:1], v2 glc
2841 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2842 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2843 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2844 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2845 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2846 ; GCN3-NEXT: s_endpgm
2848 %ptr = getelementptr i32, ptr %out, i64 %index
2849 %val = atomicrmw volatile min ptr %ptr, i32 %in syncscope("workgroup") seq_cst
2850 store i32 %val, ptr %out2
2854 define amdgpu_kernel void @atomic_umin_i32_offset(ptr %out, i32 %in) {
2855 ; GCN1-LABEL: atomic_umin_i32_offset:
2856 ; GCN1: ; %bb.0: ; %entry
2857 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
2858 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
2859 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2860 ; GCN1-NEXT: s_add_u32 s0, s2, 16
2861 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
2862 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2863 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2864 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
2865 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2
2866 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2867 ; GCN1-NEXT: s_endpgm
2869 ; GCN2-LABEL: atomic_umin_i32_offset:
2870 ; GCN2: ; %bb.0: ; %entry
2871 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2872 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
2873 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2874 ; GCN2-NEXT: s_add_u32 s0, s2, 16
2875 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
2876 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2877 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2878 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
2879 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2
2880 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2881 ; GCN2-NEXT: s_endpgm
2883 ; GCN3-LABEL: atomic_umin_i32_offset:
2884 ; GCN3: ; %bb.0: ; %entry
2885 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
2886 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
2887 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2888 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
2889 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
2890 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
2891 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
2892 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2893 ; GCN3-NEXT: s_endpgm
2895 %gep = getelementptr i32, ptr %out, i32 4
2896 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
2900 define amdgpu_kernel void @atomic_umin_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
2901 ; GCN1-LABEL: atomic_umin_i32_ret_offset:
2902 ; GCN1: ; %bb.0: ; %entry
2903 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
2904 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
2905 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2906 ; GCN1-NEXT: s_add_u32 s0, s4, 16
2907 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
2908 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2909 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2910 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
2911 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
2912 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2913 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
2914 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
2915 ; GCN1-NEXT: s_waitcnt vmcnt(0)
2916 ; GCN1-NEXT: flat_store_dword v[0:1], v2
2917 ; GCN1-NEXT: s_endpgm
2919 ; GCN2-LABEL: atomic_umin_i32_ret_offset:
2920 ; GCN2: ; %bb.0: ; %entry
2921 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2922 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
2923 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2924 ; GCN2-NEXT: s_add_u32 s0, s4, 16
2925 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
2926 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2927 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2928 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
2929 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
2930 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2931 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
2932 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
2933 ; GCN2-NEXT: s_waitcnt vmcnt(0)
2934 ; GCN2-NEXT: flat_store_dword v[0:1], v2
2935 ; GCN2-NEXT: s_endpgm
2937 ; GCN3-LABEL: atomic_umin_i32_ret_offset:
2938 ; GCN3: ; %bb.0: ; %entry
2939 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
2940 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
2941 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2942 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
2943 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
2944 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
2945 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
2946 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
2947 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
2948 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
2949 ; GCN3-NEXT: s_waitcnt vmcnt(0)
2950 ; GCN3-NEXT: flat_store_dword v[0:1], v2
2951 ; GCN3-NEXT: s_endpgm
2953 %gep = getelementptr i32, ptr %out, i32 4
2954 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
2955 store i32 %val, ptr %out2
2959 define amdgpu_kernel void @atomic_umin_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
2960 ; GCN1-LABEL: atomic_umin_i32_addr64_offset:
2961 ; GCN1: ; %bb.0: ; %entry
2962 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
2963 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
2964 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
2965 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2966 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2967 ; GCN1-NEXT: s_add_u32 s0, s4, s0
2968 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
2969 ; GCN1-NEXT: s_add_u32 s0, s0, 16
2970 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
2971 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
2972 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
2973 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
2974 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2
2975 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
2976 ; GCN1-NEXT: s_endpgm
2978 ; GCN2-LABEL: atomic_umin_i32_addr64_offset:
2979 ; GCN2: ; %bb.0: ; %entry
2980 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2981 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
2982 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
2983 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2984 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
2985 ; GCN2-NEXT: s_add_u32 s0, s4, s0
2986 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
2987 ; GCN2-NEXT: s_add_u32 s0, s0, 16
2988 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
2989 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
2990 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
2991 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
2992 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2
2993 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
2994 ; GCN2-NEXT: s_endpgm
2996 ; GCN3-LABEL: atomic_umin_i32_addr64_offset:
2997 ; GCN3: ; %bb.0: ; %entry
2998 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
2999 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3000 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
3001 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3002 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3003 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3004 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3005 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3006 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3007 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3008 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2 offset:16
3009 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3010 ; GCN3-NEXT: s_endpgm
3012 %ptr = getelementptr i32, ptr %out, i64 %index
3013 %gep = getelementptr i32, ptr %ptr, i32 4
3014 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
3018 define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
3019 ; GCN1-LABEL: atomic_umin_i32_ret_addr64_offset:
3020 ; GCN1: ; %bb.0: ; %entry
3021 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
3022 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3023 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
3024 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3025 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3026 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3027 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3028 ; GCN1-NEXT: s_add_u32 s0, s0, 16
3029 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3030 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3031 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3032 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
3033 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3034 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3035 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3036 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3037 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3038 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3039 ; GCN1-NEXT: s_endpgm
3041 ; GCN2-LABEL: atomic_umin_i32_ret_addr64_offset:
3042 ; GCN2: ; %bb.0: ; %entry
3043 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3044 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3045 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
3046 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3047 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3048 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3049 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3050 ; GCN2-NEXT: s_add_u32 s0, s0, 16
3051 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3052 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3053 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3054 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
3055 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3056 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3057 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3058 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3059 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3060 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3061 ; GCN2-NEXT: s_endpgm
3063 ; GCN3-LABEL: atomic_umin_i32_ret_addr64_offset:
3064 ; GCN3: ; %bb.0: ; %entry
3065 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3066 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3067 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
3068 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3069 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3070 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3071 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3072 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3073 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3074 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
3075 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 offset:16 glc
3076 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3077 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3078 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3079 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3080 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3081 ; GCN3-NEXT: s_endpgm
3083 %ptr = getelementptr i32, ptr %out, i64 %index
3084 %gep = getelementptr i32, ptr %ptr, i32 4
3085 %val = atomicrmw volatile umin ptr %gep, i32 %in syncscope("workgroup") seq_cst
3086 store i32 %val, ptr %out2
3090 define amdgpu_kernel void @atomic_umin_i32(ptr %out, i32 %in) {
3091 ; GCN1-LABEL: atomic_umin_i32:
3092 ; GCN1: ; %bb.0: ; %entry
3093 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
3094 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
3095 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3096 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
3097 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
3098 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
3099 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2
3100 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3101 ; GCN1-NEXT: s_endpgm
3103 ; GCN2-LABEL: atomic_umin_i32:
3104 ; GCN2: ; %bb.0: ; %entry
3105 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3106 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
3107 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3108 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
3109 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
3110 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
3111 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2
3112 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3113 ; GCN2-NEXT: s_endpgm
3115 ; GCN3-LABEL: atomic_umin_i32:
3116 ; GCN3: ; %bb.0: ; %entry
3117 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3118 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
3119 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3120 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
3121 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
3122 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
3123 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2
3124 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3125 ; GCN3-NEXT: s_endpgm
3127 %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
3131 define amdgpu_kernel void @atomic_umin_i32_ret(ptr %out, ptr %out2, i32 %in) {
3132 ; GCN1-LABEL: atomic_umin_i32_ret:
3133 ; GCN1: ; %bb.0: ; %entry
3134 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3135 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
3136 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3137 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3138 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3139 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
3140 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3141 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3142 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3143 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3144 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3145 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3146 ; GCN1-NEXT: s_endpgm
3148 ; GCN2-LABEL: atomic_umin_i32_ret:
3149 ; GCN2: ; %bb.0: ; %entry
3150 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3151 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
3152 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3153 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3154 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3155 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
3156 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3157 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3158 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3159 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3160 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3161 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3162 ; GCN2-NEXT: s_endpgm
3164 ; GCN3-LABEL: atomic_umin_i32_ret:
3165 ; GCN3: ; %bb.0: ; %entry
3166 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3167 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
3168 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3169 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3170 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3171 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
3172 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3173 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3174 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3175 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3176 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3177 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3178 ; GCN3-NEXT: s_endpgm
3180 %val = atomicrmw volatile umin ptr %out, i32 %in syncscope("workgroup") seq_cst
3181 store i32 %val, ptr %out2
3185 define amdgpu_kernel void @atomic_umin_i32_addr64(ptr %out, i32 %in, i64 %index) {
3186 ; GCN1-LABEL: atomic_umin_i32_addr64:
3187 ; GCN1: ; %bb.0: ; %entry
3188 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
3189 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
3190 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
3191 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3192 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3193 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3194 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3195 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3196 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3197 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3198 ; GCN1-NEXT: flat_atomic_umin v[0:1], v2
3199 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3200 ; GCN1-NEXT: s_endpgm
3202 ; GCN2-LABEL: atomic_umin_i32_addr64:
3203 ; GCN2: ; %bb.0: ; %entry
3204 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3205 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3206 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
3207 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3208 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3209 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3210 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3211 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3212 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3213 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3214 ; GCN2-NEXT: flat_atomic_umin v[0:1], v2
3215 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3216 ; GCN2-NEXT: s_endpgm
3218 ; GCN3-LABEL: atomic_umin_i32_addr64:
3219 ; GCN3: ; %bb.0: ; %entry
3220 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3221 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3222 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
3223 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3224 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3225 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3226 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3227 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3228 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3229 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3230 ; GCN3-NEXT: flat_atomic_umin v[0:1], v2
3231 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3232 ; GCN3-NEXT: s_endpgm
3234 %ptr = getelementptr i32, ptr %out, i64 %index
3235 %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
3239 define amdgpu_kernel void @atomic_umin_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
3240 ; GCN1-LABEL: atomic_umin_i32_ret_addr64:
3241 ; GCN1: ; %bb.0: ; %entry
3242 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
3243 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3244 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
3245 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3246 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3247 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3248 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3249 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3250 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3251 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
3252 ; GCN1-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3253 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3254 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3255 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3256 ; GCN1-NEXT: s_waitcnt vmcnt(0)
3257 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3258 ; GCN1-NEXT: s_endpgm
3260 ; GCN2-LABEL: atomic_umin_i32_ret_addr64:
3261 ; GCN2: ; %bb.0: ; %entry
3262 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3263 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3264 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
3265 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3266 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3267 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3268 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3269 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3270 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3271 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
3272 ; GCN2-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3273 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3274 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3275 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3276 ; GCN2-NEXT: s_waitcnt vmcnt(0)
3277 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3278 ; GCN2-NEXT: s_endpgm
3280 ; GCN3-LABEL: atomic_umin_i32_ret_addr64:
3281 ; GCN3: ; %bb.0: ; %entry
3282 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3283 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3284 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
3285 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3286 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3287 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3288 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3289 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3290 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3291 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
3292 ; GCN3-NEXT: flat_atomic_umin v2, v[0:1], v2 glc
3293 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3294 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3295 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3296 ; GCN3-NEXT: s_waitcnt vmcnt(0)
3297 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3298 ; GCN3-NEXT: s_endpgm
3300 %ptr = getelementptr i32, ptr %out, i64 %index
3301 %val = atomicrmw volatile umin ptr %ptr, i32 %in syncscope("workgroup") seq_cst
3302 store i32 %val, ptr %out2
3306 define amdgpu_kernel void @atomic_or_i32_offset(ptr %out, i32 %in) {
3307 ; GCN1-LABEL: atomic_or_i32_offset:
3308 ; GCN1: ; %bb.0: ; %entry
3309 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
3310 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
3311 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3312 ; GCN1-NEXT: s_add_u32 s0, s2, 16
3313 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
3314 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3315 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3316 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
3317 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
3318 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3319 ; GCN1-NEXT: buffer_wbinvl1_vol
3320 ; GCN1-NEXT: s_endpgm
3322 ; GCN2-LABEL: atomic_or_i32_offset:
3323 ; GCN2: ; %bb.0: ; %entry
3324 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3325 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
3326 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3327 ; GCN2-NEXT: s_add_u32 s0, s2, 16
3328 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
3329 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3330 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3331 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
3332 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
3333 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3334 ; GCN2-NEXT: buffer_wbinvl1_vol
3335 ; GCN2-NEXT: s_endpgm
3337 ; GCN3-LABEL: atomic_or_i32_offset:
3338 ; GCN3: ; %bb.0: ; %entry
3339 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3340 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
3341 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3342 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
3343 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
3344 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
3345 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
3346 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3347 ; GCN3-NEXT: buffer_wbinvl1_vol
3348 ; GCN3-NEXT: s_endpgm
3350 %gep = getelementptr i32, ptr %out, i32 4
3351 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3355 define amdgpu_kernel void @atomic_or_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
3356 ; GCN1-LABEL: atomic_or_i32_ret_offset:
3357 ; GCN1: ; %bb.0: ; %entry
3358 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3359 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
3360 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3361 ; GCN1-NEXT: s_add_u32 s0, s4, 16
3362 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
3363 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3364 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3365 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
3366 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3367 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3368 ; GCN1-NEXT: buffer_wbinvl1_vol
3369 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3370 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3371 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3372 ; GCN1-NEXT: s_endpgm
3374 ; GCN2-LABEL: atomic_or_i32_ret_offset:
3375 ; GCN2: ; %bb.0: ; %entry
3376 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3377 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
3378 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3379 ; GCN2-NEXT: s_add_u32 s0, s4, 16
3380 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
3381 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3382 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3383 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
3384 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3385 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3386 ; GCN2-NEXT: buffer_wbinvl1_vol
3387 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3388 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3389 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3390 ; GCN2-NEXT: s_endpgm
3392 ; GCN3-LABEL: atomic_or_i32_ret_offset:
3393 ; GCN3: ; %bb.0: ; %entry
3394 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3395 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
3396 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3397 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3398 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3399 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
3400 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc
3401 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3402 ; GCN3-NEXT: buffer_wbinvl1_vol
3403 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3404 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3405 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3406 ; GCN3-NEXT: s_endpgm
3408 %gep = getelementptr i32, ptr %out, i32 4
3409 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3410 store i32 %val, ptr %out2
3414 define amdgpu_kernel void @atomic_or_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
3415 ; GCN1-LABEL: atomic_or_i32_addr64_offset:
3416 ; GCN1: ; %bb.0: ; %entry
3417 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
3418 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
3419 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
3420 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3421 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3422 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3423 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3424 ; GCN1-NEXT: s_add_u32 s0, s0, 16
3425 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3426 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3427 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3428 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3429 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
3430 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3431 ; GCN1-NEXT: buffer_wbinvl1_vol
3432 ; GCN1-NEXT: s_endpgm
3434 ; GCN2-LABEL: atomic_or_i32_addr64_offset:
3435 ; GCN2: ; %bb.0: ; %entry
3436 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3437 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3438 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
3439 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3440 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3441 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3442 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3443 ; GCN2-NEXT: s_add_u32 s0, s0, 16
3444 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3445 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3446 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3447 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3448 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
3449 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3450 ; GCN2-NEXT: buffer_wbinvl1_vol
3451 ; GCN2-NEXT: s_endpgm
3453 ; GCN3-LABEL: atomic_or_i32_addr64_offset:
3454 ; GCN3: ; %bb.0: ; %entry
3455 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3456 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3457 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
3458 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3459 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3460 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3461 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3462 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3463 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3464 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3465 ; GCN3-NEXT: flat_atomic_or v[0:1], v2 offset:16
3466 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3467 ; GCN3-NEXT: buffer_wbinvl1_vol
3468 ; GCN3-NEXT: s_endpgm
3470 %ptr = getelementptr i32, ptr %out, i64 %index
3471 %gep = getelementptr i32, ptr %ptr, i32 4
3472 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3476 define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
3477 ; GCN1-LABEL: atomic_or_i32_ret_addr64_offset:
3478 ; GCN1: ; %bb.0: ; %entry
3479 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
3480 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3481 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
3482 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3483 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3484 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3485 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3486 ; GCN1-NEXT: s_add_u32 s0, s0, 16
3487 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3488 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3489 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3490 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
3491 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3492 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3493 ; GCN1-NEXT: buffer_wbinvl1_vol
3494 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3495 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3496 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3497 ; GCN1-NEXT: s_endpgm
3499 ; GCN2-LABEL: atomic_or_i32_ret_addr64_offset:
3500 ; GCN2: ; %bb.0: ; %entry
3501 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3502 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3503 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
3504 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3505 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3506 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3507 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3508 ; GCN2-NEXT: s_add_u32 s0, s0, 16
3509 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3510 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3511 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3512 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
3513 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3514 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3515 ; GCN2-NEXT: buffer_wbinvl1_vol
3516 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3517 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3518 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3519 ; GCN2-NEXT: s_endpgm
3521 ; GCN3-LABEL: atomic_or_i32_ret_addr64_offset:
3522 ; GCN3: ; %bb.0: ; %entry
3523 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3524 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3525 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
3526 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3527 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3528 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3529 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3530 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3531 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3532 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
3533 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 offset:16 glc
3534 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3535 ; GCN3-NEXT: buffer_wbinvl1_vol
3536 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3537 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3538 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3539 ; GCN3-NEXT: s_endpgm
3541 %ptr = getelementptr i32, ptr %out, i64 %index
3542 %gep = getelementptr i32, ptr %ptr, i32 4
3543 %val = atomicrmw volatile or ptr %gep, i32 %in syncscope("agent") seq_cst
3544 store i32 %val, ptr %out2
3548 define amdgpu_kernel void @atomic_or_i32(ptr %out, i32 %in) {
3549 ; GCN1-LABEL: atomic_or_i32:
3550 ; GCN1: ; %bb.0: ; %entry
3551 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
3552 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
3553 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3554 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
3555 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
3556 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
3557 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
3558 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3559 ; GCN1-NEXT: buffer_wbinvl1_vol
3560 ; GCN1-NEXT: s_endpgm
3562 ; GCN2-LABEL: atomic_or_i32:
3563 ; GCN2: ; %bb.0: ; %entry
3564 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3565 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
3566 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3567 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
3568 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
3569 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
3570 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
3571 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3572 ; GCN2-NEXT: buffer_wbinvl1_vol
3573 ; GCN2-NEXT: s_endpgm
3575 ; GCN3-LABEL: atomic_or_i32:
3576 ; GCN3: ; %bb.0: ; %entry
3577 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3578 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
3579 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3580 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
3581 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
3582 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
3583 ; GCN3-NEXT: flat_atomic_or v[0:1], v2
3584 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3585 ; GCN3-NEXT: buffer_wbinvl1_vol
3586 ; GCN3-NEXT: s_endpgm
3588 %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
3592 define amdgpu_kernel void @atomic_or_i32_ret(ptr %out, ptr %out2, i32 %in) {
3593 ; GCN1-LABEL: atomic_or_i32_ret:
3594 ; GCN1: ; %bb.0: ; %entry
3595 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3596 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
3597 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3598 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
3599 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
3600 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
3601 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3602 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3603 ; GCN1-NEXT: buffer_wbinvl1_vol
3604 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3605 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3606 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3607 ; GCN1-NEXT: s_endpgm
3609 ; GCN2-LABEL: atomic_or_i32_ret:
3610 ; GCN2: ; %bb.0: ; %entry
3611 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3612 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
3613 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3614 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
3615 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
3616 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
3617 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3618 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3619 ; GCN2-NEXT: buffer_wbinvl1_vol
3620 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3621 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3622 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3623 ; GCN2-NEXT: s_endpgm
3625 ; GCN3-LABEL: atomic_or_i32_ret:
3626 ; GCN3: ; %bb.0: ; %entry
3627 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3628 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
3629 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3630 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3631 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3632 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
3633 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3634 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3635 ; GCN3-NEXT: buffer_wbinvl1_vol
3636 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3637 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3638 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3639 ; GCN3-NEXT: s_endpgm
3641 %val = atomicrmw volatile or ptr %out, i32 %in syncscope("agent") seq_cst
3642 store i32 %val, ptr %out2
3646 define amdgpu_kernel void @atomic_or_i32_addr64(ptr %out, i32 %in, i64 %index) {
3647 ; GCN1-LABEL: atomic_or_i32_addr64:
3648 ; GCN1: ; %bb.0: ; %entry
3649 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
3650 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
3651 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
3652 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3653 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3654 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3655 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3656 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3657 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3658 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3659 ; GCN1-NEXT: flat_atomic_or v[0:1], v2
3660 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3661 ; GCN1-NEXT: buffer_wbinvl1_vol
3662 ; GCN1-NEXT: s_endpgm
3664 ; GCN2-LABEL: atomic_or_i32_addr64:
3665 ; GCN2: ; %bb.0: ; %entry
3666 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3667 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3668 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
3669 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3670 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3671 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3672 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3673 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3674 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3675 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3676 ; GCN2-NEXT: flat_atomic_or v[0:1], v2
3677 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3678 ; GCN2-NEXT: buffer_wbinvl1_vol
3679 ; GCN2-NEXT: s_endpgm
3681 ; GCN3-LABEL: atomic_or_i32_addr64:
3682 ; GCN3: ; %bb.0: ; %entry
3683 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3684 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3685 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
3686 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3687 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3688 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3689 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3690 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3691 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3692 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3693 ; GCN3-NEXT: flat_atomic_or v[0:1], v2
3694 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3695 ; GCN3-NEXT: buffer_wbinvl1_vol
3696 ; GCN3-NEXT: s_endpgm
3698 %ptr = getelementptr i32, ptr %out, i64 %index
3699 %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
3703 define amdgpu_kernel void @atomic_or_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
3704 ; GCN1-LABEL: atomic_or_i32_ret_addr64:
3705 ; GCN1: ; %bb.0: ; %entry
3706 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
3707 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3708 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
3709 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3710 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3711 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3712 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3713 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3714 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3715 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
3716 ; GCN1-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3717 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3718 ; GCN1-NEXT: buffer_wbinvl1_vol
3719 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3720 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3721 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3722 ; GCN1-NEXT: s_endpgm
3724 ; GCN2-LABEL: atomic_or_i32_ret_addr64:
3725 ; GCN2: ; %bb.0: ; %entry
3726 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3727 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3728 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
3729 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3730 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3731 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3732 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3733 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3734 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3735 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
3736 ; GCN2-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3737 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3738 ; GCN2-NEXT: buffer_wbinvl1_vol
3739 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3740 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3741 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3742 ; GCN2-NEXT: s_endpgm
3744 ; GCN3-LABEL: atomic_or_i32_ret_addr64:
3745 ; GCN3: ; %bb.0: ; %entry
3746 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
3747 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3748 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
3749 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3750 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3751 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3752 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3753 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3754 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3755 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
3756 ; GCN3-NEXT: flat_atomic_or v2, v[0:1], v2 glc
3757 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3758 ; GCN3-NEXT: buffer_wbinvl1_vol
3759 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3760 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3761 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3762 ; GCN3-NEXT: s_endpgm
3764 %ptr = getelementptr i32, ptr %out, i64 %index
3765 %val = atomicrmw volatile or ptr %ptr, i32 %in syncscope("agent") seq_cst
3766 store i32 %val, ptr %out2
3770 define amdgpu_kernel void @atomic_xchg_i32_offset(ptr %out, i32 %in) {
3771 ; GCN1-LABEL: atomic_xchg_i32_offset:
3772 ; GCN1: ; %bb.0: ; %entry
3773 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
3774 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
3775 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3776 ; GCN1-NEXT: s_add_u32 s0, s2, 16
3777 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
3778 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3779 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3780 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
3781 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
3782 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3783 ; GCN1-NEXT: buffer_wbinvl1_vol
3784 ; GCN1-NEXT: s_endpgm
3786 ; GCN2-LABEL: atomic_xchg_i32_offset:
3787 ; GCN2: ; %bb.0: ; %entry
3788 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3789 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
3790 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3791 ; GCN2-NEXT: s_add_u32 s0, s2, 16
3792 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
3793 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3794 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3795 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
3796 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
3797 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3798 ; GCN2-NEXT: buffer_wbinvl1_vol
3799 ; GCN2-NEXT: s_endpgm
3801 ; GCN3-LABEL: atomic_xchg_i32_offset:
3802 ; GCN3: ; %bb.0: ; %entry
3803 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3804 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
3805 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3806 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
3807 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
3808 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
3809 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
3810 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3811 ; GCN3-NEXT: buffer_wbinvl1_vol
3812 ; GCN3-NEXT: s_endpgm
3814 %gep = getelementptr i32, ptr %out, i32 4
3815 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
3819 define amdgpu_kernel void @atomic_xchg_f32_offset(ptr %out, float %in) {
3820 ; GCN1-LABEL: atomic_xchg_f32_offset:
3821 ; GCN1: ; %bb.0: ; %entry
3822 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
3823 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
3824 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3825 ; GCN1-NEXT: s_add_u32 s0, s2, 16
3826 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
3827 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3828 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3829 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
3830 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
3831 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3832 ; GCN1-NEXT: buffer_wbinvl1_vol
3833 ; GCN1-NEXT: s_endpgm
3835 ; GCN2-LABEL: atomic_xchg_f32_offset:
3836 ; GCN2: ; %bb.0: ; %entry
3837 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3838 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
3839 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3840 ; GCN2-NEXT: s_add_u32 s0, s2, 16
3841 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
3842 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3843 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3844 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
3845 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
3846 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3847 ; GCN2-NEXT: buffer_wbinvl1_vol
3848 ; GCN2-NEXT: s_endpgm
3850 ; GCN3-LABEL: atomic_xchg_f32_offset:
3851 ; GCN3: ; %bb.0: ; %entry
3852 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
3853 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
3854 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3855 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
3856 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
3857 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
3858 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
3859 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3860 ; GCN3-NEXT: buffer_wbinvl1_vol
3861 ; GCN3-NEXT: s_endpgm
3863 %gep = getelementptr float, ptr %out, i32 4
3864 %val = atomicrmw volatile xchg ptr %gep, float %in syncscope("agent") seq_cst
3868 define amdgpu_kernel void @atomic_xchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
3869 ; GCN1-LABEL: atomic_xchg_i32_ret_offset:
3870 ; GCN1: ; %bb.0: ; %entry
3871 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3872 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
3873 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3874 ; GCN1-NEXT: s_add_u32 s0, s4, 16
3875 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
3876 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3877 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3878 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
3879 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
3880 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3881 ; GCN1-NEXT: buffer_wbinvl1_vol
3882 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
3883 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
3884 ; GCN1-NEXT: flat_store_dword v[0:1], v2
3885 ; GCN1-NEXT: s_endpgm
3887 ; GCN2-LABEL: atomic_xchg_i32_ret_offset:
3888 ; GCN2: ; %bb.0: ; %entry
3889 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3890 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
3891 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3892 ; GCN2-NEXT: s_add_u32 s0, s4, 16
3893 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
3894 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3895 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3896 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
3897 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
3898 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3899 ; GCN2-NEXT: buffer_wbinvl1_vol
3900 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
3901 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
3902 ; GCN2-NEXT: flat_store_dword v[0:1], v2
3903 ; GCN2-NEXT: s_endpgm
3905 ; GCN3-LABEL: atomic_xchg_i32_ret_offset:
3906 ; GCN3: ; %bb.0: ; %entry
3907 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
3908 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
3909 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3910 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
3911 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
3912 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
3913 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc
3914 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3915 ; GCN3-NEXT: buffer_wbinvl1_vol
3916 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
3917 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
3918 ; GCN3-NEXT: flat_store_dword v[0:1], v2
3919 ; GCN3-NEXT: s_endpgm
3921 %gep = getelementptr i32, ptr %out, i32 4
3922 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
3923 store i32 %val, ptr %out2
3927 define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
3928 ; GCN1-LABEL: atomic_xchg_i32_addr64_offset:
3929 ; GCN1: ; %bb.0: ; %entry
3930 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
3931 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
3932 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
3933 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3934 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3935 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3936 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3937 ; GCN1-NEXT: s_add_u32 s0, s0, 16
3938 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
3939 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
3940 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
3941 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
3942 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
3943 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3944 ; GCN1-NEXT: buffer_wbinvl1_vol
3945 ; GCN1-NEXT: s_endpgm
3947 ; GCN2-LABEL: atomic_xchg_i32_addr64_offset:
3948 ; GCN2: ; %bb.0: ; %entry
3949 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3950 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3951 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
3952 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
3953 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3954 ; GCN2-NEXT: s_add_u32 s0, s4, s0
3955 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
3956 ; GCN2-NEXT: s_add_u32 s0, s0, 16
3957 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
3958 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
3959 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
3960 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
3961 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
3962 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3963 ; GCN2-NEXT: buffer_wbinvl1_vol
3964 ; GCN2-NEXT: s_endpgm
3966 ; GCN3-LABEL: atomic_xchg_i32_addr64_offset:
3967 ; GCN3: ; %bb.0: ; %entry
3968 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
3969 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
3970 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
3971 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
3972 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3973 ; GCN3-NEXT: s_add_u32 s0, s4, s0
3974 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
3975 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
3976 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
3977 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
3978 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2 offset:16
3979 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3980 ; GCN3-NEXT: buffer_wbinvl1_vol
3981 ; GCN3-NEXT: s_endpgm
3983 %ptr = getelementptr i32, ptr %out, i64 %index
3984 %gep = getelementptr i32, ptr %ptr, i32 4
3985 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
3989 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
3990 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64_offset:
3991 ; GCN1: ; %bb.0: ; %entry
3992 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
3993 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
3994 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
3995 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
3996 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
3997 ; GCN1-NEXT: s_add_u32 s0, s4, s0
3998 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
3999 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4000 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4001 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4002 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4003 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
4004 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4005 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4006 ; GCN1-NEXT: buffer_wbinvl1_vol
4007 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4008 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4009 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4010 ; GCN1-NEXT: s_endpgm
4012 ; GCN2-LABEL: atomic_xchg_i32_ret_addr64_offset:
4013 ; GCN2: ; %bb.0: ; %entry
4014 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4015 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4016 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
4017 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4018 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4019 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4020 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4021 ; GCN2-NEXT: s_add_u32 s0, s0, 16
4022 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
4023 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4024 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4025 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
4026 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4027 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4028 ; GCN2-NEXT: buffer_wbinvl1_vol
4029 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4030 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4031 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4032 ; GCN2-NEXT: s_endpgm
4034 ; GCN3-LABEL: atomic_xchg_i32_ret_addr64_offset:
4035 ; GCN3: ; %bb.0: ; %entry
4036 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4037 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4038 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
4039 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4040 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4041 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4042 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4043 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4044 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4045 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
4046 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 offset:16 glc
4047 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4048 ; GCN3-NEXT: buffer_wbinvl1_vol
4049 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4050 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4051 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4052 ; GCN3-NEXT: s_endpgm
4054 %ptr = getelementptr i32, ptr %out, i64 %index
4055 %gep = getelementptr i32, ptr %ptr, i32 4
4056 %val = atomicrmw volatile xchg ptr %gep, i32 %in syncscope("agent") seq_cst
4057 store i32 %val, ptr %out2
4061 define amdgpu_kernel void @atomic_xchg_i32(ptr %out, i32 %in) {
4062 ; GCN1-LABEL: atomic_xchg_i32:
4063 ; GCN1: ; %bb.0: ; %entry
4064 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
4065 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
4066 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4067 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
4068 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
4069 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4070 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
4071 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4072 ; GCN1-NEXT: buffer_wbinvl1_vol
4073 ; GCN1-NEXT: s_endpgm
4075 ; GCN2-LABEL: atomic_xchg_i32:
4076 ; GCN2: ; %bb.0: ; %entry
4077 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
4078 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
4079 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4080 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
4081 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
4082 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4083 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
4084 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4085 ; GCN2-NEXT: buffer_wbinvl1_vol
4086 ; GCN2-NEXT: s_endpgm
4088 ; GCN3-LABEL: atomic_xchg_i32:
4089 ; GCN3: ; %bb.0: ; %entry
4090 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
4091 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
4092 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4093 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
4094 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
4095 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
4096 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2
4097 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4098 ; GCN3-NEXT: buffer_wbinvl1_vol
4099 ; GCN3-NEXT: s_endpgm
4101 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
4105 define amdgpu_kernel void @atomic_xchg_i32_ret(ptr %out, ptr %out2, i32 %in) {
4106 ; GCN1-LABEL: atomic_xchg_i32_ret:
4107 ; GCN1: ; %bb.0: ; %entry
4108 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4109 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
4110 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4111 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4112 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4113 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4114 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4115 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4116 ; GCN1-NEXT: buffer_wbinvl1_vol
4117 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4118 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4119 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4120 ; GCN1-NEXT: s_endpgm
4122 ; GCN2-LABEL: atomic_xchg_i32_ret:
4123 ; GCN2: ; %bb.0: ; %entry
4124 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4125 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
4126 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4127 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4128 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4129 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4130 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4131 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4132 ; GCN2-NEXT: buffer_wbinvl1_vol
4133 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4134 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4135 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4136 ; GCN2-NEXT: s_endpgm
4138 ; GCN3-LABEL: atomic_xchg_i32_ret:
4139 ; GCN3: ; %bb.0: ; %entry
4140 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4141 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
4142 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4143 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4144 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4145 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4146 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4147 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4148 ; GCN3-NEXT: buffer_wbinvl1_vol
4149 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4150 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4151 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4152 ; GCN3-NEXT: s_endpgm
4154 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("agent") seq_cst
4155 store i32 %val, ptr %out2
4159 define amdgpu_kernel void @atomic_xchg_i32_addr64(ptr %out, i32 %in, i64 %index) {
4160 ; GCN1-LABEL: atomic_xchg_i32_addr64:
4161 ; GCN1: ; %bb.0: ; %entry
4162 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
4163 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
4164 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
4165 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4166 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4167 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4168 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4169 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4170 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4171 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
4172 ; GCN1-NEXT: flat_atomic_swap v[0:1], v2
4173 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4174 ; GCN1-NEXT: buffer_wbinvl1_vol
4175 ; GCN1-NEXT: s_endpgm
4177 ; GCN2-LABEL: atomic_xchg_i32_addr64:
4178 ; GCN2: ; %bb.0: ; %entry
4179 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4180 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4181 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
4182 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4183 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4184 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4185 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4186 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4187 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4188 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
4189 ; GCN2-NEXT: flat_atomic_swap v[0:1], v2
4190 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4191 ; GCN2-NEXT: buffer_wbinvl1_vol
4192 ; GCN2-NEXT: s_endpgm
4194 ; GCN3-LABEL: atomic_xchg_i32_addr64:
4195 ; GCN3: ; %bb.0: ; %entry
4196 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4197 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4198 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
4199 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4200 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4201 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4202 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4203 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4204 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4205 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
4206 ; GCN3-NEXT: flat_atomic_swap v[0:1], v2
4207 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4208 ; GCN3-NEXT: buffer_wbinvl1_vol
4209 ; GCN3-NEXT: s_endpgm
4211 %ptr = getelementptr i32, ptr %out, i64 %index
4212 %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
4216 define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
4217 ; GCN1-LABEL: atomic_xchg_i32_ret_addr64:
4218 ; GCN1: ; %bb.0: ; %entry
4219 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
4220 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4221 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
4222 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4223 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4224 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4225 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4226 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4227 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4228 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
4229 ; GCN1-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4230 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4231 ; GCN1-NEXT: buffer_wbinvl1_vol
4232 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4233 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4234 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4235 ; GCN1-NEXT: s_endpgm
4237 ; GCN2-LABEL: atomic_xchg_i32_ret_addr64:
4238 ; GCN2: ; %bb.0: ; %entry
4239 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4240 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4241 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
4242 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4243 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4244 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4245 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4246 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4247 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4248 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
4249 ; GCN2-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4250 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4251 ; GCN2-NEXT: buffer_wbinvl1_vol
4252 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4253 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4254 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4255 ; GCN2-NEXT: s_endpgm
4257 ; GCN3-LABEL: atomic_xchg_i32_ret_addr64:
4258 ; GCN3: ; %bb.0: ; %entry
4259 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4260 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4261 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
4262 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4263 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4264 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4265 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4266 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4267 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4268 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
4269 ; GCN3-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
4270 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4271 ; GCN3-NEXT: buffer_wbinvl1_vol
4272 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4273 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4274 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4275 ; GCN3-NEXT: s_endpgm
4277 %ptr = getelementptr i32, ptr %out, i64 %index
4278 %val = atomicrmw volatile xchg ptr %ptr, i32 %in syncscope("agent") seq_cst
4279 store i32 %val, ptr %out2
4285 define amdgpu_kernel void @atomic_cmpxchg_i32_offset(ptr %out, i32 %in, i32 %old) {
4286 ; GCN1-LABEL: atomic_cmpxchg_i32_offset:
4287 ; GCN1: ; %bb.0: ; %entry
4288 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4289 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4290 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4291 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4292 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4293 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
4294 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4295 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
4296 ; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4297 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4298 ; GCN1-NEXT: buffer_wbinvl1_vol
4299 ; GCN1-NEXT: s_endpgm
4301 ; GCN2-LABEL: atomic_cmpxchg_i32_offset:
4302 ; GCN2: ; %bb.0: ; %entry
4303 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4304 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4305 ; GCN2-NEXT: s_add_u32 s0, s0, 16
4306 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
4307 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4308 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
4309 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4310 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
4311 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4312 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4313 ; GCN2-NEXT: buffer_wbinvl1_vol
4314 ; GCN2-NEXT: s_endpgm
4316 ; GCN3-LABEL: atomic_cmpxchg_i32_offset:
4317 ; GCN3: ; %bb.0: ; %entry
4318 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4319 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4320 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4321 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4322 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4323 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
4324 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4325 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4326 ; GCN3-NEXT: buffer_wbinvl1_vol
4327 ; GCN3-NEXT: s_endpgm
4329 %gep = getelementptr i32, ptr %out, i32 4
4330 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4334 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(ptr %out, ptr %out2, i32 %in, i32 %old) {
4335 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_offset:
4336 ; GCN1: ; %bb.0: ; %entry
4337 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4338 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
4339 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4340 ; GCN1-NEXT: s_add_u32 s2, s4, 16
4341 ; GCN1-NEXT: s_addc_u32 s3, s5, 0
4342 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
4343 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
4344 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
4345 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4346 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4347 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4348 ; GCN1-NEXT: buffer_wbinvl1_vol
4349 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4350 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4351 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4352 ; GCN1-NEXT: s_endpgm
4354 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_offset:
4355 ; GCN2: ; %bb.0: ; %entry
4356 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4357 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
4358 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4359 ; GCN2-NEXT: s_add_u32 s2, s4, 16
4360 ; GCN2-NEXT: s_addc_u32 s3, s5, 0
4361 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
4362 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
4363 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
4364 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4365 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4366 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4367 ; GCN2-NEXT: buffer_wbinvl1_vol
4368 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4369 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4370 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4371 ; GCN2-NEXT: s_endpgm
4373 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_offset:
4374 ; GCN3: ; %bb.0: ; %entry
4375 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4376 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4377 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4378 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4379 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4380 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4381 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
4382 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
4383 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4384 ; GCN3-NEXT: buffer_wbinvl1_vol
4385 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4386 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4387 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4388 ; GCN3-NEXT: s_endpgm
4390 %gep = getelementptr i32, ptr %out, i32 4
4391 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4392 %flag = extractvalue { i32, i1 } %val, 0
4393 store i32 %flag, ptr %out2
4397 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(ptr %out, i32 %in, i64 %index, i32 %old) {
4398 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64_offset:
4399 ; GCN1: ; %bb.0: ; %entry
4400 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
4401 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
4402 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
4403 ; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf
4404 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4405 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4406 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4407 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4408 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4409 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4410 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4411 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
4412 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4413 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4414 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4415 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4416 ; GCN1-NEXT: buffer_wbinvl1_vol
4417 ; GCN1-NEXT: s_endpgm
4419 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64_offset:
4420 ; GCN2: ; %bb.0: ; %entry
4421 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4422 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
4423 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4424 ; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c
4425 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4426 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4427 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4428 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4429 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4430 ; GCN2-NEXT: s_add_u32 s0, s0, 16
4431 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
4432 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
4433 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4434 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4435 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4436 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4437 ; GCN2-NEXT: buffer_wbinvl1_vol
4438 ; GCN2-NEXT: s_endpgm
4440 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64_offset:
4441 ; GCN3: ; %bb.0: ; %entry
4442 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4443 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
4444 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4445 ; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c
4446 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4447 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4448 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4449 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4450 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4451 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
4452 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4453 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
4454 ; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1] offset:16
4455 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4456 ; GCN3-NEXT: buffer_wbinvl1_vol
4457 ; GCN3-NEXT: s_endpgm
4459 %ptr = getelementptr i32, ptr %out, i64 %index
4460 %gep = getelementptr i32, ptr %ptr, i32 4
4461 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4465 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
4466 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4467 ; GCN1: ; %bb.0: ; %entry
4468 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
4469 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
4470 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4471 ; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11
4472 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4473 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4474 ; GCN1-NEXT: v_mov_b32_e32 v0, s8
4475 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4476 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4477 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4478 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4479 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
4480 ; GCN1-NEXT: v_mov_b32_e32 v1, s9
4481 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4482 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4483 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4484 ; GCN1-NEXT: buffer_wbinvl1_vol
4485 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4486 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4487 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4488 ; GCN1-NEXT: s_endpgm
4490 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4491 ; GCN2: ; %bb.0: ; %entry
4492 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4493 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
4494 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4495 ; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44
4496 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4497 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4498 ; GCN2-NEXT: v_mov_b32_e32 v0, s8
4499 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4500 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4501 ; GCN2-NEXT: s_add_u32 s0, s0, 16
4502 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
4503 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
4504 ; GCN2-NEXT: v_mov_b32_e32 v1, s9
4505 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4506 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4507 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4508 ; GCN2-NEXT: buffer_wbinvl1_vol
4509 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4510 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4511 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4512 ; GCN2-NEXT: s_endpgm
4514 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64_offset:
4515 ; GCN3: ; %bb.0: ; %entry
4516 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4517 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
4518 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4519 ; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44
4520 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4521 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4522 ; GCN3-NEXT: v_mov_b32_e32 v0, s8
4523 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4524 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4525 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
4526 ; GCN3-NEXT: v_mov_b32_e32 v1, s9
4527 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
4528 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] offset:16 glc
4529 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4530 ; GCN3-NEXT: buffer_wbinvl1_vol
4531 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4532 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4533 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4534 ; GCN3-NEXT: s_endpgm
4536 %ptr = getelementptr i32, ptr %out, i64 %index
4537 %gep = getelementptr i32, ptr %ptr, i32 4
4538 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4539 %flag = extractvalue { i32, i1 } %val, 0
4540 store i32 %flag, ptr %out2
4544 define amdgpu_kernel void @atomic_cmpxchg_i32(ptr %out, i32 %in, i32 %old) {
4545 ; GCN1-LABEL: atomic_cmpxchg_i32:
4546 ; GCN1: ; %bb.0: ; %entry
4547 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
4548 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4549 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4550 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
4551 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4552 ; GCN1-NEXT: v_mov_b32_e32 v3, s3
4553 ; GCN1-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4554 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4555 ; GCN1-NEXT: buffer_wbinvl1_vol
4556 ; GCN1-NEXT: s_endpgm
4558 ; GCN2-LABEL: atomic_cmpxchg_i32:
4559 ; GCN2: ; %bb.0: ; %entry
4560 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4561 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4562 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4563 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
4564 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4565 ; GCN2-NEXT: v_mov_b32_e32 v3, s3
4566 ; GCN2-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4567 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4568 ; GCN2-NEXT: buffer_wbinvl1_vol
4569 ; GCN2-NEXT: s_endpgm
4571 ; GCN3-LABEL: atomic_cmpxchg_i32:
4572 ; GCN3: ; %bb.0: ; %entry
4573 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
4574 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4575 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4576 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4577 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4578 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
4579 ; GCN3-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4580 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4581 ; GCN3-NEXT: buffer_wbinvl1_vol
4582 ; GCN3-NEXT: s_endpgm
4584 %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4588 define amdgpu_kernel void @atomic_cmpxchg_i32_ret(ptr %out, ptr %out2, i32 %in, i32 %old) {
4589 ; GCN1-LABEL: atomic_cmpxchg_i32_ret:
4590 ; GCN1: ; %bb.0: ; %entry
4591 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4592 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
4593 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4594 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
4595 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
4596 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
4597 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4598 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4599 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4600 ; GCN1-NEXT: buffer_wbinvl1_vol
4601 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4602 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4603 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4604 ; GCN1-NEXT: s_endpgm
4606 ; GCN2-LABEL: atomic_cmpxchg_i32_ret:
4607 ; GCN2: ; %bb.0: ; %entry
4608 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4609 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
4610 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4611 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
4612 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
4613 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
4614 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4615 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4616 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4617 ; GCN2-NEXT: buffer_wbinvl1_vol
4618 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4619 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4620 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4621 ; GCN2-NEXT: s_endpgm
4623 ; GCN3-LABEL: atomic_cmpxchg_i32_ret:
4624 ; GCN3: ; %bb.0: ; %entry
4625 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4626 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4627 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4628 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4629 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4630 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4631 ; GCN3-NEXT: v_mov_b32_e32 v3, s3
4632 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
4633 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4634 ; GCN3-NEXT: buffer_wbinvl1_vol
4635 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4636 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4637 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4638 ; GCN3-NEXT: s_endpgm
4640 %val = cmpxchg volatile ptr %out, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4641 %flag = extractvalue { i32, i1 } %val, 0
4642 store i32 %flag, ptr %out2
4646 define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(ptr %out, i32 %in, i64 %index, i32 %old) {
4647 ; GCN1-LABEL: atomic_cmpxchg_i32_addr64:
4648 ; GCN1: ; %bb.0: ; %entry
4649 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
4650 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
4651 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
4652 ; GCN1-NEXT: s_load_dword s7, s[0:1], 0xf
4653 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4654 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4655 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4656 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4657 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4658 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
4659 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4660 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4661 ; GCN1-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4662 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4663 ; GCN1-NEXT: buffer_wbinvl1_vol
4664 ; GCN1-NEXT: s_endpgm
4666 ; GCN2-LABEL: atomic_cmpxchg_i32_addr64:
4667 ; GCN2: ; %bb.0: ; %entry
4668 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4669 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
4670 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4671 ; GCN2-NEXT: s_load_dword s7, s[0:1], 0x3c
4672 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4673 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4674 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4675 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4676 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4677 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
4678 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4679 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4680 ; GCN2-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4681 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4682 ; GCN2-NEXT: buffer_wbinvl1_vol
4683 ; GCN2-NEXT: s_endpgm
4685 ; GCN3-LABEL: atomic_cmpxchg_i32_addr64:
4686 ; GCN3: ; %bb.0: ; %entry
4687 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4688 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
4689 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4690 ; GCN3-NEXT: s_load_dword s7, s[0:1], 0x3c
4691 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4692 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4693 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4694 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4695 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4696 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
4697 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4698 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
4699 ; GCN3-NEXT: flat_atomic_cmpswap v[2:3], v[0:1]
4700 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4701 ; GCN3-NEXT: buffer_wbinvl1_vol
4702 ; GCN3-NEXT: s_endpgm
4704 %ptr = getelementptr i32, ptr %out, i64 %index
4705 %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4709 define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index, i32 %old) {
4710 ; GCN1-LABEL: atomic_cmpxchg_i32_ret_addr64:
4711 ; GCN1: ; %bb.0: ; %entry
4712 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
4713 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
4714 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4715 ; GCN1-NEXT: s_load_dword s9, s[0:1], 0x11
4716 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4717 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4718 ; GCN1-NEXT: v_mov_b32_e32 v0, s8
4719 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4720 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4721 ; GCN1-NEXT: v_mov_b32_e32 v3, s1
4722 ; GCN1-NEXT: v_mov_b32_e32 v1, s9
4723 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
4724 ; GCN1-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4725 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4726 ; GCN1-NEXT: buffer_wbinvl1_vol
4727 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4728 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4729 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4730 ; GCN1-NEXT: s_endpgm
4732 ; GCN2-LABEL: atomic_cmpxchg_i32_ret_addr64:
4733 ; GCN2: ; %bb.0: ; %entry
4734 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4735 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
4736 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4737 ; GCN2-NEXT: s_load_dword s9, s[0:1], 0x44
4738 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4739 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4740 ; GCN2-NEXT: v_mov_b32_e32 v0, s8
4741 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4742 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4743 ; GCN2-NEXT: v_mov_b32_e32 v3, s1
4744 ; GCN2-NEXT: v_mov_b32_e32 v1, s9
4745 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
4746 ; GCN2-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4747 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4748 ; GCN2-NEXT: buffer_wbinvl1_vol
4749 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4750 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4751 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4752 ; GCN2-NEXT: s_endpgm
4754 ; GCN3-LABEL: atomic_cmpxchg_i32_ret_addr64:
4755 ; GCN3: ; %bb.0: ; %entry
4756 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4757 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
4758 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4759 ; GCN3-NEXT: s_load_dword s9, s[0:1], 0x44
4760 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4761 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4762 ; GCN3-NEXT: v_mov_b32_e32 v0, s8
4763 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4764 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4765 ; GCN3-NEXT: v_mov_b32_e32 v3, s1
4766 ; GCN3-NEXT: v_mov_b32_e32 v1, s9
4767 ; GCN3-NEXT: v_mov_b32_e32 v2, s0
4768 ; GCN3-NEXT: flat_atomic_cmpswap v2, v[2:3], v[0:1] glc
4769 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4770 ; GCN3-NEXT: buffer_wbinvl1_vol
4771 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4772 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4773 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4774 ; GCN3-NEXT: s_endpgm
4776 %ptr = getelementptr i32, ptr %out, i64 %index
4777 %val = cmpxchg volatile ptr %ptr, i32 %old, i32 %in syncscope("agent") seq_cst seq_cst
4778 %flag = extractvalue { i32, i1 } %val, 0
4779 store i32 %flag, ptr %out2
4783 define amdgpu_kernel void @atomic_xor_i32_offset(ptr %out, i32 %in) {
4784 ; GCN1-LABEL: atomic_xor_i32_offset:
4785 ; GCN1: ; %bb.0: ; %entry
4786 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
4787 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
4788 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4789 ; GCN1-NEXT: s_add_u32 s0, s2, 16
4790 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
4791 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4792 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4793 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
4794 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
4795 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4796 ; GCN1-NEXT: buffer_wbinvl1_vol
4797 ; GCN1-NEXT: s_endpgm
4799 ; GCN2-LABEL: atomic_xor_i32_offset:
4800 ; GCN2: ; %bb.0: ; %entry
4801 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
4802 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
4803 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4804 ; GCN2-NEXT: s_add_u32 s0, s2, 16
4805 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
4806 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4807 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4808 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
4809 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
4810 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4811 ; GCN2-NEXT: buffer_wbinvl1_vol
4812 ; GCN2-NEXT: s_endpgm
4814 ; GCN3-LABEL: atomic_xor_i32_offset:
4815 ; GCN3: ; %bb.0: ; %entry
4816 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
4817 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
4818 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4819 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
4820 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
4821 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
4822 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
4823 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4824 ; GCN3-NEXT: buffer_wbinvl1_vol
4825 ; GCN3-NEXT: s_endpgm
4827 %gep = getelementptr i32, ptr %out, i32 4
4828 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
4832 define amdgpu_kernel void @atomic_xor_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
4833 ; GCN1-LABEL: atomic_xor_i32_ret_offset:
4834 ; GCN1: ; %bb.0: ; %entry
4835 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4836 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
4837 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4838 ; GCN1-NEXT: s_add_u32 s0, s4, 16
4839 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
4840 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4841 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4842 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
4843 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
4844 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4845 ; GCN1-NEXT: buffer_wbinvl1_vol
4846 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4847 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4848 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4849 ; GCN1-NEXT: s_endpgm
4851 ; GCN2-LABEL: atomic_xor_i32_ret_offset:
4852 ; GCN2: ; %bb.0: ; %entry
4853 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4854 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
4855 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4856 ; GCN2-NEXT: s_add_u32 s0, s4, 16
4857 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
4858 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4859 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4860 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
4861 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
4862 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4863 ; GCN2-NEXT: buffer_wbinvl1_vol
4864 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4865 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4866 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4867 ; GCN2-NEXT: s_endpgm
4869 ; GCN3-LABEL: atomic_xor_i32_ret_offset:
4870 ; GCN3: ; %bb.0: ; %entry
4871 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4872 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
4873 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4874 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
4875 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
4876 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
4877 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc
4878 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4879 ; GCN3-NEXT: buffer_wbinvl1_vol
4880 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
4881 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
4882 ; GCN3-NEXT: flat_store_dword v[0:1], v2
4883 ; GCN3-NEXT: s_endpgm
4885 %gep = getelementptr i32, ptr %out, i32 4
4886 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
4887 store i32 %val, ptr %out2
4891 define amdgpu_kernel void @atomic_xor_i32_addr64_offset(ptr %out, i32 %in, i64 %index) {
4892 ; GCN1-LABEL: atomic_xor_i32_addr64_offset:
4893 ; GCN1: ; %bb.0: ; %entry
4894 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
4895 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
4896 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
4897 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4898 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4899 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4900 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4901 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4902 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4903 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4904 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4905 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
4906 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
4907 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4908 ; GCN1-NEXT: buffer_wbinvl1_vol
4909 ; GCN1-NEXT: s_endpgm
4911 ; GCN2-LABEL: atomic_xor_i32_addr64_offset:
4912 ; GCN2: ; %bb.0: ; %entry
4913 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4914 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4915 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
4916 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4917 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4918 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4919 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4920 ; GCN2-NEXT: s_add_u32 s0, s0, 16
4921 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
4922 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4923 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4924 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
4925 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
4926 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4927 ; GCN2-NEXT: buffer_wbinvl1_vol
4928 ; GCN2-NEXT: s_endpgm
4930 ; GCN3-LABEL: atomic_xor_i32_addr64_offset:
4931 ; GCN3: ; %bb.0: ; %entry
4932 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
4933 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
4934 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
4935 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
4936 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4937 ; GCN3-NEXT: s_add_u32 s0, s4, s0
4938 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
4939 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
4940 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
4941 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
4942 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2 offset:16
4943 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4944 ; GCN3-NEXT: buffer_wbinvl1_vol
4945 ; GCN3-NEXT: s_endpgm
4947 %ptr = getelementptr i32, ptr %out, i64 %index
4948 %gep = getelementptr i32, ptr %ptr, i32 4
4949 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
4953 define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
4954 ; GCN1-LABEL: atomic_xor_i32_ret_addr64_offset:
4955 ; GCN1: ; %bb.0: ; %entry
4956 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
4957 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
4958 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
4959 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
4960 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4961 ; GCN1-NEXT: s_add_u32 s0, s4, s0
4962 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
4963 ; GCN1-NEXT: s_add_u32 s0, s0, 16
4964 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
4965 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
4966 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
4967 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
4968 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
4969 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4970 ; GCN1-NEXT: buffer_wbinvl1_vol
4971 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
4972 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
4973 ; GCN1-NEXT: flat_store_dword v[0:1], v2
4974 ; GCN1-NEXT: s_endpgm
4976 ; GCN2-LABEL: atomic_xor_i32_ret_addr64_offset:
4977 ; GCN2: ; %bb.0: ; %entry
4978 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
4979 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
4980 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
4981 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
4982 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
4983 ; GCN2-NEXT: s_add_u32 s0, s4, s0
4984 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
4985 ; GCN2-NEXT: s_add_u32 s0, s0, 16
4986 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
4987 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
4988 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
4989 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
4990 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
4991 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4992 ; GCN2-NEXT: buffer_wbinvl1_vol
4993 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
4994 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
4995 ; GCN2-NEXT: flat_store_dword v[0:1], v2
4996 ; GCN2-NEXT: s_endpgm
4998 ; GCN3-LABEL: atomic_xor_i32_ret_addr64_offset:
4999 ; GCN3: ; %bb.0: ; %entry
5000 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
5001 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5002 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
5003 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5004 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5005 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5006 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5007 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5008 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5009 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
5010 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 offset:16 glc
5011 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5012 ; GCN3-NEXT: buffer_wbinvl1_vol
5013 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5014 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5015 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5016 ; GCN3-NEXT: s_endpgm
5018 %ptr = getelementptr i32, ptr %out, i64 %index
5019 %gep = getelementptr i32, ptr %ptr, i32 4
5020 %val = atomicrmw volatile xor ptr %gep, i32 %in syncscope("agent") seq_cst
5021 store i32 %val, ptr %out2
5025 define amdgpu_kernel void @atomic_xor_i32(ptr %out, i32 %in) {
5026 ; GCN1-LABEL: atomic_xor_i32:
5027 ; GCN1: ; %bb.0: ; %entry
5028 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
5029 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
5030 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5031 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5032 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5033 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
5034 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
5035 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5036 ; GCN1-NEXT: buffer_wbinvl1_vol
5037 ; GCN1-NEXT: s_endpgm
5039 ; GCN2-LABEL: atomic_xor_i32:
5040 ; GCN2: ; %bb.0: ; %entry
5041 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
5042 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
5043 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5044 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5045 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5046 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
5047 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
5048 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5049 ; GCN2-NEXT: buffer_wbinvl1_vol
5050 ; GCN2-NEXT: s_endpgm
5052 ; GCN3-LABEL: atomic_xor_i32:
5053 ; GCN3: ; %bb.0: ; %entry
5054 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
5055 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
5056 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5057 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5058 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5059 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
5060 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2
5061 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5062 ; GCN3-NEXT: buffer_wbinvl1_vol
5063 ; GCN3-NEXT: s_endpgm
5065 %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
5069 define amdgpu_kernel void @atomic_xor_i32_ret(ptr %out, ptr %out2, i32 %in) {
5070 ; GCN1-LABEL: atomic_xor_i32_ret:
5071 ; GCN1: ; %bb.0: ; %entry
5072 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5073 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
5074 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5075 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
5076 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
5077 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
5078 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
5079 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5080 ; GCN1-NEXT: buffer_wbinvl1_vol
5081 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
5082 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
5083 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5084 ; GCN1-NEXT: s_endpgm
5086 ; GCN2-LABEL: atomic_xor_i32_ret:
5087 ; GCN2: ; %bb.0: ; %entry
5088 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5089 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
5090 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5091 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
5092 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
5093 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
5094 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
5095 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5096 ; GCN2-NEXT: buffer_wbinvl1_vol
5097 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
5098 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
5099 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5100 ; GCN2-NEXT: s_endpgm
5102 ; GCN3-LABEL: atomic_xor_i32_ret:
5103 ; GCN3: ; %bb.0: ; %entry
5104 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5105 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
5106 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5107 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
5108 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
5109 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
5110 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
5111 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5112 ; GCN3-NEXT: buffer_wbinvl1_vol
5113 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5114 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5115 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5116 ; GCN3-NEXT: s_endpgm
5118 %val = atomicrmw volatile xor ptr %out, i32 %in syncscope("agent") seq_cst
5119 store i32 %val, ptr %out2
5123 define amdgpu_kernel void @atomic_xor_i32_addr64(ptr %out, i32 %in, i64 %index) {
5124 ; GCN1-LABEL: atomic_xor_i32_addr64:
5125 ; GCN1: ; %bb.0: ; %entry
5126 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
5127 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
5128 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
5129 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5130 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5131 ; GCN1-NEXT: s_add_u32 s0, s4, s0
5132 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
5133 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5134 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5135 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
5136 ; GCN1-NEXT: flat_atomic_xor v[0:1], v2
5137 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5138 ; GCN1-NEXT: buffer_wbinvl1_vol
5139 ; GCN1-NEXT: s_endpgm
5141 ; GCN2-LABEL: atomic_xor_i32_addr64:
5142 ; GCN2: ; %bb.0: ; %entry
5143 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5144 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
5145 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
5146 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5147 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5148 ; GCN2-NEXT: s_add_u32 s0, s4, s0
5149 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
5150 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5151 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5152 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
5153 ; GCN2-NEXT: flat_atomic_xor v[0:1], v2
5154 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5155 ; GCN2-NEXT: buffer_wbinvl1_vol
5156 ; GCN2-NEXT: s_endpgm
5158 ; GCN3-LABEL: atomic_xor_i32_addr64:
5159 ; GCN3: ; %bb.0: ; %entry
5160 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5161 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
5162 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
5163 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5164 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5165 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5166 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5167 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5168 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5169 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
5170 ; GCN3-NEXT: flat_atomic_xor v[0:1], v2
5171 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5172 ; GCN3-NEXT: buffer_wbinvl1_vol
5173 ; GCN3-NEXT: s_endpgm
5175 %ptr = getelementptr i32, ptr %out, i64 %index
5176 %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
5180 define amdgpu_kernel void @atomic_xor_i32_ret_addr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
5181 ; GCN1-LABEL: atomic_xor_i32_ret_addr64:
5182 ; GCN1: ; %bb.0: ; %entry
5183 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
5184 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
5185 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
5186 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5187 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5188 ; GCN1-NEXT: s_add_u32 s0, s4, s0
5189 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
5190 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5191 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5192 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
5193 ; GCN1-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
5194 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5195 ; GCN1-NEXT: buffer_wbinvl1_vol
5196 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
5197 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
5198 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5199 ; GCN1-NEXT: s_endpgm
5201 ; GCN2-LABEL: atomic_xor_i32_ret_addr64:
5202 ; GCN2: ; %bb.0: ; %entry
5203 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
5204 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5205 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
5206 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5207 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5208 ; GCN2-NEXT: s_add_u32 s0, s4, s0
5209 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
5210 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5211 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5212 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
5213 ; GCN2-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
5214 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5215 ; GCN2-NEXT: buffer_wbinvl1_vol
5216 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
5217 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
5218 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5219 ; GCN2-NEXT: s_endpgm
5221 ; GCN3-LABEL: atomic_xor_i32_ret_addr64:
5222 ; GCN3: ; %bb.0: ; %entry
5223 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
5224 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5225 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
5226 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5227 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5228 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5229 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5230 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5231 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5232 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
5233 ; GCN3-NEXT: flat_atomic_xor v2, v[0:1], v2 glc
5234 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5235 ; GCN3-NEXT: buffer_wbinvl1_vol
5236 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5237 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5238 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5239 ; GCN3-NEXT: s_endpgm
5241 %ptr = getelementptr i32, ptr %out, i64 %index
5242 %val = atomicrmw volatile xor ptr %ptr, i32 %in syncscope("agent") seq_cst
5243 store i32 %val, ptr %out2
5247 define amdgpu_kernel void @atomic_load_i32_offset(ptr %in, ptr %out) {
5248 ; GCN1-LABEL: atomic_load_i32_offset:
5249 ; GCN1: ; %bb.0: ; %entry
5250 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5251 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5252 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5253 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5254 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5255 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5256 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5257 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5258 ; GCN1-NEXT: buffer_wbinvl1_vol
5259 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5260 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5261 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5262 ; GCN1-NEXT: s_endpgm
5264 ; GCN2-LABEL: atomic_load_i32_offset:
5265 ; GCN2: ; %bb.0: ; %entry
5266 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5267 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5268 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5269 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5270 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5271 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5272 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5273 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5274 ; GCN2-NEXT: buffer_wbinvl1_vol
5275 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5276 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5277 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5278 ; GCN2-NEXT: s_endpgm
5280 ; GCN3-LABEL: atomic_load_i32_offset:
5281 ; GCN3: ; %bb.0: ; %entry
5282 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5283 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5284 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5285 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5286 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
5287 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5288 ; GCN3-NEXT: buffer_wbinvl1_vol
5289 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5290 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5291 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5292 ; GCN3-NEXT: s_endpgm
5294 %gep = getelementptr i32, ptr %in, i32 4
5295 %val = load atomic i32, ptr %gep seq_cst, align 4
5296 store i32 %val, ptr %out
5300 define amdgpu_kernel void @atomic_load_i32(ptr %in, ptr %out) {
5301 ; GCN1-LABEL: atomic_load_i32:
5302 ; GCN1: ; %bb.0: ; %entry
5303 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5304 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5305 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5306 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5307 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5308 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5309 ; GCN1-NEXT: buffer_wbinvl1_vol
5310 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5311 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5312 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5313 ; GCN1-NEXT: s_endpgm
5315 ; GCN2-LABEL: atomic_load_i32:
5316 ; GCN2: ; %bb.0: ; %entry
5317 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5318 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5319 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5320 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5321 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5322 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5323 ; GCN2-NEXT: buffer_wbinvl1_vol
5324 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5325 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5326 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5327 ; GCN2-NEXT: s_endpgm
5329 ; GCN3-LABEL: atomic_load_i32:
5330 ; GCN3: ; %bb.0: ; %entry
5331 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5332 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5333 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5334 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5335 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
5336 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5337 ; GCN3-NEXT: buffer_wbinvl1_vol
5338 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5339 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5340 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5341 ; GCN3-NEXT: s_endpgm
5343 %val = load atomic i32, ptr %in seq_cst, align 4
5344 store i32 %val, ptr %out
5348 define amdgpu_kernel void @atomic_load_i32_addr64_offset(ptr %in, ptr %out, i64 %index) {
5349 ; GCN1-LABEL: atomic_load_i32_addr64_offset:
5350 ; GCN1: ; %bb.0: ; %entry
5351 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
5352 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5353 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5354 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5355 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5356 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5357 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5358 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5359 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5360 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5361 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5362 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5363 ; GCN1-NEXT: buffer_wbinvl1_vol
5364 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5365 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5366 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5367 ; GCN1-NEXT: s_endpgm
5369 ; GCN2-LABEL: atomic_load_i32_addr64_offset:
5370 ; GCN2: ; %bb.0: ; %entry
5371 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5372 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5373 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5374 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5375 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5376 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5377 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5378 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5379 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5380 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5381 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5382 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5383 ; GCN2-NEXT: buffer_wbinvl1_vol
5384 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5385 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5386 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5387 ; GCN2-NEXT: s_endpgm
5389 ; GCN3-LABEL: atomic_load_i32_addr64_offset:
5390 ; GCN3: ; %bb.0: ; %entry
5391 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5392 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5393 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5394 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5395 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5396 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5397 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5398 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5399 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
5400 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5401 ; GCN3-NEXT: buffer_wbinvl1_vol
5402 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5403 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5404 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5405 ; GCN3-NEXT: s_endpgm
5407 %ptr = getelementptr i32, ptr %in, i64 %index
5408 %gep = getelementptr i32, ptr %ptr, i32 4
5409 %val = load atomic i32, ptr %gep seq_cst, align 4
5410 store i32 %val, ptr %out
5414 define amdgpu_kernel void @atomic_load_i32_addr64(ptr %in, ptr %out, i64 %index) {
5415 ; GCN1-LABEL: atomic_load_i32_addr64:
5416 ; GCN1: ; %bb.0: ; %entry
5417 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
5418 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5419 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5420 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5421 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5422 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5423 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5424 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5425 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5426 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5427 ; GCN1-NEXT: buffer_wbinvl1_vol
5428 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5429 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5430 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5431 ; GCN1-NEXT: s_endpgm
5433 ; GCN2-LABEL: atomic_load_i32_addr64:
5434 ; GCN2: ; %bb.0: ; %entry
5435 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5436 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5437 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5438 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5439 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5440 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5441 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5442 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5443 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5444 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5445 ; GCN2-NEXT: buffer_wbinvl1_vol
5446 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5447 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5448 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5449 ; GCN2-NEXT: s_endpgm
5451 ; GCN3-LABEL: atomic_load_i32_addr64:
5452 ; GCN3: ; %bb.0: ; %entry
5453 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5454 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5455 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5456 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5457 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5458 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5459 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5460 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5461 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
5462 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5463 ; GCN3-NEXT: buffer_wbinvl1_vol
5464 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5465 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5466 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5467 ; GCN3-NEXT: s_endpgm
5469 %ptr = getelementptr i32, ptr %in, i64 %index
5470 %val = load atomic i32, ptr %ptr seq_cst, align 4
5471 store i32 %val, ptr %out
5475 define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, ptr %out) {
5476 ; GCN1-LABEL: atomic_store_i32_offset:
5477 ; GCN1: ; %bb.0: ; %entry
5478 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
5479 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9
5480 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5481 ; GCN1-NEXT: s_add_u32 s0, s2, 16
5482 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
5483 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5484 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5485 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
5486 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5487 ; GCN1-NEXT: s_endpgm
5489 ; GCN2-LABEL: atomic_store_i32_offset:
5490 ; GCN2: ; %bb.0: ; %entry
5491 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5492 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24
5493 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5494 ; GCN2-NEXT: s_add_u32 s0, s2, 16
5495 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
5496 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5497 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5498 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
5499 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5500 ; GCN2-NEXT: s_endpgm
5502 ; GCN3-LABEL: atomic_store_i32_offset:
5503 ; GCN3: ; %bb.0: ; %entry
5504 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5505 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
5506 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5507 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5508 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5509 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
5510 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
5511 ; GCN3-NEXT: s_endpgm
5513 %gep = getelementptr i32, ptr %out, i32 4
5514 store atomic i32 %in, ptr %gep seq_cst, align 4
5518 define amdgpu_kernel void @atomic_store_i32(i32 %in, ptr %out) {
5519 ; GCN1-LABEL: atomic_store_i32:
5520 ; GCN1: ; %bb.0: ; %entry
5521 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
5522 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9
5523 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5524 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5525 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5526 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
5527 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5528 ; GCN1-NEXT: s_endpgm
5530 ; GCN2-LABEL: atomic_store_i32:
5531 ; GCN2: ; %bb.0: ; %entry
5532 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5533 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24
5534 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5535 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5536 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5537 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
5538 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5539 ; GCN2-NEXT: s_endpgm
5541 ; GCN3-LABEL: atomic_store_i32:
5542 ; GCN3: ; %bb.0: ; %entry
5543 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5544 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
5545 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5546 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5547 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5548 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
5549 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5550 ; GCN3-NEXT: s_endpgm
5552 store atomic i32 %in, ptr %out seq_cst, align 4
5556 define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, ptr %out, i64 %index) {
5557 ; GCN1-LABEL: atomic_store_i32_addr64_offset:
5558 ; GCN1: ; %bb.0: ; %entry
5559 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5560 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9
5561 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5562 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5563 ; GCN1-NEXT: s_add_u32 s0, s4, s0
5564 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
5565 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5566 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5567 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5568 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5569 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
5570 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5571 ; GCN1-NEXT: s_endpgm
5573 ; GCN2-LABEL: atomic_store_i32_addr64_offset:
5574 ; GCN2: ; %bb.0: ; %entry
5575 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5576 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24
5577 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5578 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5579 ; GCN2-NEXT: s_add_u32 s0, s4, s0
5580 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
5581 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5582 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5583 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5584 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5585 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
5586 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5587 ; GCN2-NEXT: s_endpgm
5589 ; GCN3-LABEL: atomic_store_i32_addr64_offset:
5590 ; GCN3: ; %bb.0: ; %entry
5591 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5592 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24
5593 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5594 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5595 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5596 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5597 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5598 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5599 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
5600 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
5601 ; GCN3-NEXT: s_endpgm
5603 %ptr = getelementptr i32, ptr %out, i64 %index
5604 %gep = getelementptr i32, ptr %ptr, i32 4
5605 store atomic i32 %in, ptr %gep seq_cst, align 4
5609 define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, ptr %out, i64 %index) {
5610 ; GCN1-LABEL: atomic_store_i32_addr64:
5611 ; GCN1: ; %bb.0: ; %entry
5612 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5613 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9
5614 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5615 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5616 ; GCN1-NEXT: s_add_u32 s0, s4, s0
5617 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
5618 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5619 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5620 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
5621 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5622 ; GCN1-NEXT: s_endpgm
5624 ; GCN2-LABEL: atomic_store_i32_addr64:
5625 ; GCN2: ; %bb.0: ; %entry
5626 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5627 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24
5628 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5629 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5630 ; GCN2-NEXT: s_add_u32 s0, s4, s0
5631 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
5632 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5633 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5634 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
5635 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5636 ; GCN2-NEXT: s_endpgm
5638 ; GCN3-LABEL: atomic_store_i32_addr64:
5639 ; GCN3: ; %bb.0: ; %entry
5640 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5641 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24
5642 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5643 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5644 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5645 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5646 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5647 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5648 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
5649 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5650 ; GCN3-NEXT: s_endpgm
5652 %ptr = getelementptr i32, ptr %out, i64 %index
5653 store atomic i32 %in, ptr %ptr seq_cst, align 4
5657 define amdgpu_kernel void @atomic_load_f32_offset(ptr %in, ptr %out) {
5658 ; GCN1-LABEL: atomic_load_f32_offset:
5659 ; GCN1: ; %bb.0: ; %entry
5660 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5661 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5662 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5663 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5664 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5665 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5666 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5667 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5668 ; GCN1-NEXT: buffer_wbinvl1_vol
5669 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5670 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5671 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5672 ; GCN1-NEXT: s_endpgm
5674 ; GCN2-LABEL: atomic_load_f32_offset:
5675 ; GCN2: ; %bb.0: ; %entry
5676 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5677 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5678 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5679 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5680 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5681 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5682 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5683 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5684 ; GCN2-NEXT: buffer_wbinvl1_vol
5685 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5686 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5687 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5688 ; GCN2-NEXT: s_endpgm
5690 ; GCN3-LABEL: atomic_load_f32_offset:
5691 ; GCN3: ; %bb.0: ; %entry
5692 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5693 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5694 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5695 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5696 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
5697 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5698 ; GCN3-NEXT: buffer_wbinvl1_vol
5699 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5700 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5701 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5702 ; GCN3-NEXT: s_endpgm
5704 %gep = getelementptr float, ptr %in, i32 4
5705 %val = load atomic float, ptr %gep seq_cst, align 4
5706 store float %val, ptr %out
5710 define amdgpu_kernel void @atomic_load_f32(ptr %in, ptr %out) {
5711 ; GCN1-LABEL: atomic_load_f32:
5712 ; GCN1: ; %bb.0: ; %entry
5713 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5714 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5715 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5716 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5717 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5718 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5719 ; GCN1-NEXT: buffer_wbinvl1_vol
5720 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5721 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5722 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5723 ; GCN1-NEXT: s_endpgm
5725 ; GCN2-LABEL: atomic_load_f32:
5726 ; GCN2: ; %bb.0: ; %entry
5727 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5728 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5729 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5730 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5731 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5732 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5733 ; GCN2-NEXT: buffer_wbinvl1_vol
5734 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5735 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5736 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5737 ; GCN2-NEXT: s_endpgm
5739 ; GCN3-LABEL: atomic_load_f32:
5740 ; GCN3: ; %bb.0: ; %entry
5741 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5742 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5743 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5744 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5745 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
5746 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5747 ; GCN3-NEXT: buffer_wbinvl1_vol
5748 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5749 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5750 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5751 ; GCN3-NEXT: s_endpgm
5753 %val = load atomic float, ptr %in seq_cst, align 4
5754 store float %val, ptr %out
5758 define amdgpu_kernel void @atomic_load_f32_addr64_offset(ptr %in, ptr %out, i64 %index) {
5759 ; GCN1-LABEL: atomic_load_f32_addr64_offset:
5760 ; GCN1: ; %bb.0: ; %entry
5761 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
5762 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5763 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5764 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5765 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5766 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5767 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5768 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5769 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5770 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5771 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5772 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5773 ; GCN1-NEXT: buffer_wbinvl1_vol
5774 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5775 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5776 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5777 ; GCN1-NEXT: s_endpgm
5779 ; GCN2-LABEL: atomic_load_f32_addr64_offset:
5780 ; GCN2: ; %bb.0: ; %entry
5781 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5782 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5783 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5784 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5785 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5786 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5787 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5788 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5789 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5790 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5791 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5792 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5793 ; GCN2-NEXT: buffer_wbinvl1_vol
5794 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5795 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5796 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5797 ; GCN2-NEXT: s_endpgm
5799 ; GCN3-LABEL: atomic_load_f32_addr64_offset:
5800 ; GCN3: ; %bb.0: ; %entry
5801 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5802 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5803 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5804 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5805 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5806 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5807 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5808 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5809 ; GCN3-NEXT: flat_load_dword v2, v[0:1] offset:16 glc
5810 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5811 ; GCN3-NEXT: buffer_wbinvl1_vol
5812 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5813 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5814 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5815 ; GCN3-NEXT: s_endpgm
5817 %ptr = getelementptr float, ptr %in, i64 %index
5818 %gep = getelementptr float, ptr %ptr, i32 4
5819 %val = load atomic float, ptr %gep seq_cst, align 4
5820 store float %val, ptr %out
5824 define amdgpu_kernel void @atomic_load_f32_addr64(ptr %in, ptr %out, i64 %index) {
5825 ; GCN1-LABEL: atomic_load_f32_addr64:
5826 ; GCN1: ; %bb.0: ; %entry
5827 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
5828 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
5829 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5830 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5831 ; GCN1-NEXT: s_add_u32 s0, s0, s4
5832 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
5833 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5834 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5835 ; GCN1-NEXT: flat_load_dword v2, v[0:1] glc
5836 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5837 ; GCN1-NEXT: buffer_wbinvl1_vol
5838 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5839 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5840 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5841 ; GCN1-NEXT: s_endpgm
5843 ; GCN2-LABEL: atomic_load_f32_addr64:
5844 ; GCN2: ; %bb.0: ; %entry
5845 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
5846 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
5847 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5848 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
5849 ; GCN2-NEXT: s_add_u32 s0, s0, s4
5850 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
5851 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5852 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5853 ; GCN2-NEXT: flat_load_dword v2, v[0:1] glc
5854 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5855 ; GCN2-NEXT: buffer_wbinvl1_vol
5856 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5857 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5858 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5859 ; GCN2-NEXT: s_endpgm
5861 ; GCN3-LABEL: atomic_load_f32_addr64:
5862 ; GCN3: ; %bb.0: ; %entry
5863 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
5864 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
5865 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5866 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
5867 ; GCN3-NEXT: s_add_u32 s0, s4, s0
5868 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
5869 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
5870 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
5871 ; GCN3-NEXT: flat_load_dword v2, v[0:1] glc
5872 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5873 ; GCN3-NEXT: buffer_wbinvl1_vol
5874 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
5875 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
5876 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5877 ; GCN3-NEXT: s_endpgm
5879 %ptr = getelementptr float, ptr %in, i64 %index
5880 %val = load atomic float, ptr %ptr seq_cst, align 4
5881 store float %val, ptr %out
5885 define amdgpu_kernel void @atomic_store_f32_offset(float %in, ptr %out) {
5886 ; GCN1-LABEL: atomic_store_f32_offset:
5887 ; GCN1: ; %bb.0: ; %entry
5888 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
5889 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9
5890 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5891 ; GCN1-NEXT: s_add_u32 s0, s2, 16
5892 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
5893 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5894 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5895 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
5896 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5897 ; GCN1-NEXT: s_endpgm
5899 ; GCN2-LABEL: atomic_store_f32_offset:
5900 ; GCN2: ; %bb.0: ; %entry
5901 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5902 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24
5903 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5904 ; GCN2-NEXT: s_add_u32 s0, s2, 16
5905 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
5906 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5907 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5908 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
5909 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5910 ; GCN2-NEXT: s_endpgm
5912 ; GCN3-LABEL: atomic_store_f32_offset:
5913 ; GCN3: ; %bb.0: ; %entry
5914 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5915 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
5916 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5917 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5918 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5919 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
5920 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
5921 ; GCN3-NEXT: s_endpgm
5923 %gep = getelementptr float, ptr %out, i32 4
5924 store atomic float %in, ptr %gep seq_cst, align 4
5928 define amdgpu_kernel void @atomic_store_f32(float %in, ptr %out) {
5929 ; GCN1-LABEL: atomic_store_f32:
5930 ; GCN1: ; %bb.0: ; %entry
5931 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
5932 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9
5933 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5934 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
5935 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
5936 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
5937 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5938 ; GCN1-NEXT: s_endpgm
5940 ; GCN2-LABEL: atomic_store_f32:
5941 ; GCN2: ; %bb.0: ; %entry
5942 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5943 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24
5944 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5945 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
5946 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
5947 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
5948 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5949 ; GCN2-NEXT: s_endpgm
5951 ; GCN3-LABEL: atomic_store_f32:
5952 ; GCN3: ; %bb.0: ; %entry
5953 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
5954 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
5955 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
5956 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
5957 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
5958 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
5959 ; GCN3-NEXT: flat_store_dword v[0:1], v2
5960 ; GCN3-NEXT: s_endpgm
5962 store atomic float %in, ptr %out seq_cst, align 4
5966 define amdgpu_kernel void @atomic_store_f32_addr64_offset(float %in, ptr %out, i64 %index) {
5967 ; GCN1-LABEL: atomic_store_f32_addr64_offset:
5968 ; GCN1: ; %bb.0: ; %entry
5969 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
5970 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9
5971 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
5972 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5973 ; GCN1-NEXT: s_add_u32 s0, s4, s0
5974 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
5975 ; GCN1-NEXT: s_add_u32 s0, s0, 16
5976 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
5977 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
5978 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
5979 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
5980 ; GCN1-NEXT: flat_store_dword v[0:1], v2
5981 ; GCN1-NEXT: s_endpgm
5983 ; GCN2-LABEL: atomic_store_f32_addr64_offset:
5984 ; GCN2: ; %bb.0: ; %entry
5985 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
5986 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24
5987 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
5988 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
5989 ; GCN2-NEXT: s_add_u32 s0, s4, s0
5990 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
5991 ; GCN2-NEXT: s_add_u32 s0, s0, 16
5992 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
5993 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
5994 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
5995 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
5996 ; GCN2-NEXT: flat_store_dword v[0:1], v2
5997 ; GCN2-NEXT: s_endpgm
5999 ; GCN3-LABEL: atomic_store_f32_addr64_offset:
6000 ; GCN3: ; %bb.0: ; %entry
6001 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6002 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24
6003 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6004 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
6005 ; GCN3-NEXT: s_add_u32 s0, s4, s0
6006 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
6007 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6008 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6009 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
6010 ; GCN3-NEXT: flat_store_dword v[0:1], v2 offset:16
6011 ; GCN3-NEXT: s_endpgm
6013 %ptr = getelementptr float, ptr %out, i64 %index
6014 %gep = getelementptr float, ptr %ptr, i32 4
6015 store atomic float %in, ptr %gep seq_cst, align 4
6019 define amdgpu_kernel void @atomic_store_f32_addr64(float %in, ptr %out, i64 %index) {
6020 ; GCN1-LABEL: atomic_store_f32_addr64:
6021 ; GCN1: ; %bb.0: ; %entry
6022 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
6023 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9
6024 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6025 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
6026 ; GCN1-NEXT: s_add_u32 s0, s4, s0
6027 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
6028 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6029 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6030 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
6031 ; GCN1-NEXT: flat_store_dword v[0:1], v2
6032 ; GCN1-NEXT: s_endpgm
6034 ; GCN2-LABEL: atomic_store_f32_addr64:
6035 ; GCN2: ; %bb.0: ; %entry
6036 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6037 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24
6038 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6039 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
6040 ; GCN2-NEXT: s_add_u32 s0, s4, s0
6041 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
6042 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6043 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6044 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
6045 ; GCN2-NEXT: flat_store_dword v[0:1], v2
6046 ; GCN2-NEXT: s_endpgm
6048 ; GCN3-LABEL: atomic_store_f32_addr64:
6049 ; GCN3: ; %bb.0: ; %entry
6050 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6051 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24
6052 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6053 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 2
6054 ; GCN3-NEXT: s_add_u32 s0, s4, s0
6055 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
6056 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6057 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6058 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
6059 ; GCN3-NEXT: flat_store_dword v[0:1], v2
6060 ; GCN3-NEXT: s_endpgm
6062 %ptr = getelementptr float, ptr %out, i64 %index
6063 store atomic float %in, ptr %ptr seq_cst, align 4
6067 define amdgpu_kernel void @atomic_load_i8_offset(ptr %in, ptr %out) {
6068 ; GCN1-LABEL: atomic_load_i8_offset:
6069 ; GCN1: ; %bb.0: ; %entry
6070 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6071 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6072 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6073 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6074 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6075 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6076 ; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc
6077 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6078 ; GCN1-NEXT: buffer_wbinvl1_vol
6079 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6080 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6081 ; GCN1-NEXT: flat_store_byte v[0:1], v2
6082 ; GCN1-NEXT: s_endpgm
6084 ; GCN2-LABEL: atomic_load_i8_offset:
6085 ; GCN2: ; %bb.0: ; %entry
6086 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6087 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6088 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6089 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6090 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6091 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6092 ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
6093 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6094 ; GCN2-NEXT: buffer_wbinvl1_vol
6095 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6096 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6097 ; GCN2-NEXT: flat_store_byte v[0:1], v2
6098 ; GCN2-NEXT: s_endpgm
6100 ; GCN3-LABEL: atomic_load_i8_offset:
6101 ; GCN3: ; %bb.0: ; %entry
6102 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6103 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6104 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6105 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6106 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
6107 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6108 ; GCN3-NEXT: buffer_wbinvl1_vol
6109 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6110 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6111 ; GCN3-NEXT: flat_store_byte v[0:1], v2
6112 ; GCN3-NEXT: s_endpgm
6114 %gep = getelementptr i8, ptr %in, i64 16
6115 %val = load atomic i8, ptr %gep seq_cst, align 1
6116 store i8 %val, ptr %out
6120 define amdgpu_kernel void @atomic_load_i8(ptr %in, ptr %out) {
6121 ; GCN1-LABEL: atomic_load_i8:
6122 ; GCN1: ; %bb.0: ; %entry
6123 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6124 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6125 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6126 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6127 ; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc
6128 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6129 ; GCN1-NEXT: buffer_wbinvl1_vol
6130 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6131 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6132 ; GCN1-NEXT: flat_store_byte v[0:1], v2
6133 ; GCN1-NEXT: s_endpgm
6135 ; GCN2-LABEL: atomic_load_i8:
6136 ; GCN2: ; %bb.0: ; %entry
6137 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6138 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6139 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6140 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6141 ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
6142 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6143 ; GCN2-NEXT: buffer_wbinvl1_vol
6144 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6145 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6146 ; GCN2-NEXT: flat_store_byte v[0:1], v2
6147 ; GCN2-NEXT: s_endpgm
6149 ; GCN3-LABEL: atomic_load_i8:
6150 ; GCN3: ; %bb.0: ; %entry
6151 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6152 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6153 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6154 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6155 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] glc
6156 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6157 ; GCN3-NEXT: buffer_wbinvl1_vol
6158 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6159 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6160 ; GCN3-NEXT: flat_store_byte v[0:1], v2
6161 ; GCN3-NEXT: s_endpgm
6163 %val = load atomic i8, ptr %in seq_cst, align 1
6164 store i8 %val, ptr %out
6168 define amdgpu_kernel void @atomic_load_i8_addr64_offset(ptr %in, ptr %out, i64 %index) {
6169 ; GCN1-LABEL: atomic_load_i8_addr64_offset:
6170 ; GCN1: ; %bb.0: ; %entry
6171 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
6172 ; GCN1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd
6173 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6174 ; GCN1-NEXT: s_add_u32 s0, s4, s0
6175 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
6176 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6177 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6178 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6179 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6180 ; GCN1-NEXT: flat_load_ubyte v2, v[0:1] glc
6181 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6182 ; GCN1-NEXT: buffer_wbinvl1_vol
6183 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
6184 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
6185 ; GCN1-NEXT: flat_store_byte v[0:1], v2
6186 ; GCN1-NEXT: s_endpgm
6188 ; GCN2-LABEL: atomic_load_i8_addr64_offset:
6189 ; GCN2: ; %bb.0: ; %entry
6190 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6191 ; GCN2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34
6192 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6193 ; GCN2-NEXT: s_add_u32 s0, s4, s0
6194 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
6195 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6196 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6197 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6198 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6199 ; GCN2-NEXT: flat_load_ubyte v2, v[0:1] glc
6200 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6201 ; GCN2-NEXT: buffer_wbinvl1_vol
6202 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
6203 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
6204 ; GCN2-NEXT: flat_store_byte v[0:1], v2
6205 ; GCN2-NEXT: s_endpgm
6207 ; GCN3-LABEL: atomic_load_i8_addr64_offset:
6208 ; GCN3: ; %bb.0: ; %entry
6209 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6210 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6211 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6212 ; GCN3-NEXT: s_add_u32 s0, s4, s2
6213 ; GCN3-NEXT: s_addc_u32 s1, s5, s3
6214 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6215 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6216 ; GCN3-NEXT: flat_load_ubyte v2, v[0:1] offset:16 glc
6217 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6218 ; GCN3-NEXT: buffer_wbinvl1_vol
6219 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
6220 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
6221 ; GCN3-NEXT: flat_store_byte v[0:1], v2
6222 ; GCN3-NEXT: s_endpgm
6224 %ptr = getelementptr i8, ptr %in, i64 %index
6225 %gep = getelementptr i8, ptr %ptr, i64 16
6226 %val = load atomic i8, ptr %gep seq_cst, align 1
6227 store i8 %val, ptr %out
6231 define amdgpu_kernel void @atomic_store_i8_offset(i8 %in, ptr %out) {
6232 ; GCN1-LABEL: atomic_store_i8_offset:
6233 ; GCN1: ; %bb.0: ; %entry
6234 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6235 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9
6236 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6237 ; GCN1-NEXT: s_add_u32 s0, s2, 16
6238 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6239 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6240 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6241 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6242 ; GCN1-NEXT: flat_store_byte v[0:1], v2
6243 ; GCN1-NEXT: s_endpgm
6245 ; GCN2-LABEL: atomic_store_i8_offset:
6246 ; GCN2: ; %bb.0: ; %entry
6247 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6248 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24
6249 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6250 ; GCN2-NEXT: s_add_u32 s0, s2, 16
6251 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6252 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6253 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6254 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6255 ; GCN2-NEXT: flat_store_byte v[0:1], v2
6256 ; GCN2-NEXT: s_endpgm
6258 ; GCN3-LABEL: atomic_store_i8_offset:
6259 ; GCN3: ; %bb.0: ; %entry
6260 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6261 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6262 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6263 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6264 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6265 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6266 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16
6267 ; GCN3-NEXT: s_endpgm
6269 %gep = getelementptr i8, ptr %out, i64 16
6270 store atomic i8 %in, ptr %gep seq_cst, align 1
6274 define amdgpu_kernel void @atomic_store_i8(i8 %in, ptr %out) {
6275 ; GCN1-LABEL: atomic_store_i8:
6276 ; GCN1: ; %bb.0: ; %entry
6277 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6278 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9
6279 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6280 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6281 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6282 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
6283 ; GCN1-NEXT: flat_store_byte v[0:1], v2
6284 ; GCN1-NEXT: s_endpgm
6286 ; GCN2-LABEL: atomic_store_i8:
6287 ; GCN2: ; %bb.0: ; %entry
6288 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6289 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24
6290 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6291 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6292 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6293 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
6294 ; GCN2-NEXT: flat_store_byte v[0:1], v2
6295 ; GCN2-NEXT: s_endpgm
6297 ; GCN3-LABEL: atomic_store_i8:
6298 ; GCN3: ; %bb.0: ; %entry
6299 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6300 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6301 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6302 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6303 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6304 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6305 ; GCN3-NEXT: flat_store_byte v[0:1], v2
6306 ; GCN3-NEXT: s_endpgm
6308 store atomic i8 %in, ptr %out seq_cst, align 1
6312 define amdgpu_kernel void @atomic_store_i8_addr64_offset(i8 %in, ptr %out, i64 %index) {
6313 ; GCN1-LABEL: atomic_store_i8_addr64_offset:
6314 ; GCN1: ; %bb.0: ; %entry
6315 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
6316 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9
6317 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6318 ; GCN1-NEXT: s_add_u32 s0, s4, s6
6319 ; GCN1-NEXT: s_addc_u32 s1, s5, s7
6320 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6321 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6322 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6323 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6324 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
6325 ; GCN1-NEXT: flat_store_byte v[0:1], v2
6326 ; GCN1-NEXT: s_endpgm
6328 ; GCN2-LABEL: atomic_store_i8_addr64_offset:
6329 ; GCN2: ; %bb.0: ; %entry
6330 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6331 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24
6332 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6333 ; GCN2-NEXT: s_add_u32 s0, s4, s6
6334 ; GCN2-NEXT: s_addc_u32 s1, s5, s7
6335 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6336 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6337 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6338 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6339 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
6340 ; GCN2-NEXT: flat_store_byte v[0:1], v2
6341 ; GCN2-NEXT: s_endpgm
6343 ; GCN3-LABEL: atomic_store_i8_addr64_offset:
6344 ; GCN3: ; %bb.0: ; %entry
6345 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6346 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24
6347 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6348 ; GCN3-NEXT: s_add_u32 s0, s4, s6
6349 ; GCN3-NEXT: s_addc_u32 s1, s5, s7
6350 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6351 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6352 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
6353 ; GCN3-NEXT: flat_store_byte v[0:1], v2 offset:16
6354 ; GCN3-NEXT: s_endpgm
6356 %ptr = getelementptr i8, ptr %out, i64 %index
6357 %gep = getelementptr i8, ptr %ptr, i64 16
6358 store atomic i8 %in, ptr %gep seq_cst, align 1
6362 define amdgpu_kernel void @atomic_load_i16_offset(ptr %in, ptr %out) {
6363 ; GCN1-LABEL: atomic_load_i16_offset:
6364 ; GCN1: ; %bb.0: ; %entry
6365 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6366 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6367 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6368 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6369 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6370 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6371 ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
6372 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6373 ; GCN1-NEXT: buffer_wbinvl1_vol
6374 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6375 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6376 ; GCN1-NEXT: flat_store_short v[0:1], v2
6377 ; GCN1-NEXT: s_endpgm
6379 ; GCN2-LABEL: atomic_load_i16_offset:
6380 ; GCN2: ; %bb.0: ; %entry
6381 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6382 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6383 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6384 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6385 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6386 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6387 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
6388 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6389 ; GCN2-NEXT: buffer_wbinvl1_vol
6390 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6391 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6392 ; GCN2-NEXT: flat_store_short v[0:1], v2
6393 ; GCN2-NEXT: s_endpgm
6395 ; GCN3-LABEL: atomic_load_i16_offset:
6396 ; GCN3: ; %bb.0: ; %entry
6397 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6398 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6399 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6400 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6401 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
6402 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6403 ; GCN3-NEXT: buffer_wbinvl1_vol
6404 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6405 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6406 ; GCN3-NEXT: flat_store_short v[0:1], v2
6407 ; GCN3-NEXT: s_endpgm
6409 %gep = getelementptr i16, ptr %in, i64 8
6410 %val = load atomic i16, ptr %gep seq_cst, align 2
6411 store i16 %val, ptr %out
6415 define amdgpu_kernel void @atomic_load_i16(ptr %in, ptr %out) {
6416 ; GCN1-LABEL: atomic_load_i16:
6417 ; GCN1: ; %bb.0: ; %entry
6418 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6419 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6420 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6421 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6422 ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
6423 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6424 ; GCN1-NEXT: buffer_wbinvl1_vol
6425 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6426 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6427 ; GCN1-NEXT: flat_store_short v[0:1], v2
6428 ; GCN1-NEXT: s_endpgm
6430 ; GCN2-LABEL: atomic_load_i16:
6431 ; GCN2: ; %bb.0: ; %entry
6432 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6433 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6434 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6435 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6436 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
6437 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6438 ; GCN2-NEXT: buffer_wbinvl1_vol
6439 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6440 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6441 ; GCN2-NEXT: flat_store_short v[0:1], v2
6442 ; GCN2-NEXT: s_endpgm
6444 ; GCN3-LABEL: atomic_load_i16:
6445 ; GCN3: ; %bb.0: ; %entry
6446 ; GCN3-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6447 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6448 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6449 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6450 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] glc
6451 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6452 ; GCN3-NEXT: buffer_wbinvl1_vol
6453 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6454 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6455 ; GCN3-NEXT: flat_store_short v[0:1], v2
6456 ; GCN3-NEXT: s_endpgm
6458 %val = load atomic i16, ptr %in seq_cst, align 2
6459 store i16 %val, ptr %out
6463 define amdgpu_kernel void @atomic_load_i16_addr64_offset(ptr %in, ptr %out, i64 %index) {
6464 ; GCN1-LABEL: atomic_load_i16_addr64_offset:
6465 ; GCN1: ; %bb.0: ; %entry
6466 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0xd
6467 ; GCN1-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
6468 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6469 ; GCN1-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
6470 ; GCN1-NEXT: s_add_u32 s0, s0, s4
6471 ; GCN1-NEXT: s_addc_u32 s1, s1, s5
6472 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6473 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6474 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6475 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6476 ; GCN1-NEXT: flat_load_ushort v2, v[0:1] glc
6477 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6478 ; GCN1-NEXT: buffer_wbinvl1_vol
6479 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6480 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6481 ; GCN1-NEXT: flat_store_short v[0:1], v2
6482 ; GCN1-NEXT: s_endpgm
6484 ; GCN2-LABEL: atomic_load_i16_addr64_offset:
6485 ; GCN2: ; %bb.0: ; %entry
6486 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x34
6487 ; GCN2-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
6488 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6489 ; GCN2-NEXT: s_lshl_b64 s[4:5], s[4:5], 1
6490 ; GCN2-NEXT: s_add_u32 s0, s0, s4
6491 ; GCN2-NEXT: s_addc_u32 s1, s1, s5
6492 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6493 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6494 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6495 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6496 ; GCN2-NEXT: flat_load_ushort v2, v[0:1] glc
6497 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6498 ; GCN2-NEXT: buffer_wbinvl1_vol
6499 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6500 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6501 ; GCN2-NEXT: flat_store_short v[0:1], v2
6502 ; GCN2-NEXT: s_endpgm
6504 ; GCN3-LABEL: atomic_load_i16_addr64_offset:
6505 ; GCN3: ; %bb.0: ; %entry
6506 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6507 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6508 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6509 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 1
6510 ; GCN3-NEXT: s_add_u32 s0, s4, s0
6511 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
6512 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6513 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6514 ; GCN3-NEXT: flat_load_ushort v2, v[0:1] offset:16 glc
6515 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6516 ; GCN3-NEXT: buffer_wbinvl1_vol
6517 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
6518 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
6519 ; GCN3-NEXT: flat_store_short v[0:1], v2
6520 ; GCN3-NEXT: s_endpgm
6522 %ptr = getelementptr i16, ptr %in, i64 %index
6523 %gep = getelementptr i16, ptr %ptr, i64 8
6524 %val = load atomic i16, ptr %gep seq_cst, align 2
6525 store i16 %val, ptr %out
6529 define amdgpu_kernel void @atomic_store_i16_offset(i16 %in, ptr %out) {
6530 ; GCN1-LABEL: atomic_store_i16_offset:
6531 ; GCN1: ; %bb.0: ; %entry
6532 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6533 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9
6534 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6535 ; GCN1-NEXT: s_add_u32 s0, s2, 16
6536 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6537 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6538 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6539 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6540 ; GCN1-NEXT: flat_store_short v[0:1], v2
6541 ; GCN1-NEXT: s_endpgm
6543 ; GCN2-LABEL: atomic_store_i16_offset:
6544 ; GCN2: ; %bb.0: ; %entry
6545 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6546 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24
6547 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6548 ; GCN2-NEXT: s_add_u32 s0, s2, 16
6549 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6550 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6551 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6552 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6553 ; GCN2-NEXT: flat_store_short v[0:1], v2
6554 ; GCN2-NEXT: s_endpgm
6556 ; GCN3-LABEL: atomic_store_i16_offset:
6557 ; GCN3: ; %bb.0: ; %entry
6558 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6559 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6560 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6561 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6562 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6563 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6564 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
6565 ; GCN3-NEXT: s_endpgm
6567 %gep = getelementptr i16, ptr %out, i64 8
6568 store atomic i16 %in, ptr %gep seq_cst, align 2
6572 define amdgpu_kernel void @atomic_store_i16(i16 %in, ptr %out) {
6573 ; GCN1-LABEL: atomic_store_i16:
6574 ; GCN1: ; %bb.0: ; %entry
6575 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6576 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9
6577 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6578 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6579 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6580 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
6581 ; GCN1-NEXT: flat_store_short v[0:1], v2
6582 ; GCN1-NEXT: s_endpgm
6584 ; GCN2-LABEL: atomic_store_i16:
6585 ; GCN2: ; %bb.0: ; %entry
6586 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6587 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24
6588 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6589 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6590 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6591 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
6592 ; GCN2-NEXT: flat_store_short v[0:1], v2
6593 ; GCN2-NEXT: s_endpgm
6595 ; GCN3-LABEL: atomic_store_i16:
6596 ; GCN3: ; %bb.0: ; %entry
6597 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6598 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6599 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6600 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6601 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6602 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6603 ; GCN3-NEXT: flat_store_short v[0:1], v2
6604 ; GCN3-NEXT: s_endpgm
6606 store atomic i16 %in, ptr %out seq_cst, align 2
6610 define amdgpu_kernel void @atomic_store_i16_addr64_offset(i16 %in, ptr %out, i64 %index) {
6611 ; GCN1-LABEL: atomic_store_i16_addr64_offset:
6612 ; GCN1: ; %bb.0: ; %entry
6613 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb
6614 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0x9
6615 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6616 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
6617 ; GCN1-NEXT: s_add_u32 s0, s4, s0
6618 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
6619 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6620 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6621 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6622 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6623 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
6624 ; GCN1-NEXT: flat_store_short v[0:1], v2
6625 ; GCN1-NEXT: s_endpgm
6627 ; GCN2-LABEL: atomic_store_i16_addr64_offset:
6628 ; GCN2: ; %bb.0: ; %entry
6629 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6630 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x24
6631 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6632 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
6633 ; GCN2-NEXT: s_add_u32 s0, s4, s0
6634 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
6635 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6636 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6637 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6638 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6639 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
6640 ; GCN2-NEXT: flat_store_short v[0:1], v2
6641 ; GCN2-NEXT: s_endpgm
6643 ; GCN3-LABEL: atomic_store_i16_addr64_offset:
6644 ; GCN3: ; %bb.0: ; %entry
6645 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2c
6646 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x24
6647 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6648 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[6:7], 1
6649 ; GCN3-NEXT: s_add_u32 s0, s4, s0
6650 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
6651 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
6652 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
6653 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
6654 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
6655 ; GCN3-NEXT: s_endpgm
6657 %ptr = getelementptr i16, ptr %out, i64 %index
6658 %gep = getelementptr i16, ptr %ptr, i64 8
6659 store atomic i16 %in, ptr %gep seq_cst, align 2
6663 define amdgpu_kernel void @atomic_store_f16_offset(half %in, ptr %out) {
6664 ; GCN1-LABEL: atomic_store_f16_offset:
6665 ; GCN1: ; %bb.0: ; %entry
6666 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6667 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0x9
6668 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6669 ; GCN1-NEXT: s_add_u32 s0, s2, 16
6670 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6671 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6672 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6673 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6674 ; GCN1-NEXT: flat_store_short v[0:1], v2
6675 ; GCN1-NEXT: s_endpgm
6677 ; GCN2-LABEL: atomic_store_f16_offset:
6678 ; GCN2: ; %bb.0: ; %entry
6679 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6680 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x24
6681 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6682 ; GCN2-NEXT: s_add_u32 s0, s2, 16
6683 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6684 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6685 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6686 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6687 ; GCN2-NEXT: flat_store_short v[0:1], v2
6688 ; GCN2-NEXT: s_endpgm
6690 ; GCN3-LABEL: atomic_store_f16_offset:
6691 ; GCN3: ; %bb.0: ; %entry
6692 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6693 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6694 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6695 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6696 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6697 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6698 ; GCN3-NEXT: flat_store_short v[0:1], v2 offset:16
6699 ; GCN3-NEXT: s_endpgm
6701 %gep = getelementptr half, ptr %out, i64 8
6702 store atomic half %in, ptr %gep seq_cst, align 2
6706 define amdgpu_kernel void @atomic_store_f16(half %in, ptr %out) {
6707 ; GCN1-LABEL: atomic_store_f16:
6708 ; GCN1: ; %bb.0: ; %entry
6709 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xb
6710 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0x9
6711 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6712 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
6713 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
6714 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
6715 ; GCN1-NEXT: flat_store_short v[0:1], v2
6716 ; GCN1-NEXT: s_endpgm
6718 ; GCN2-LABEL: atomic_store_f16:
6719 ; GCN2: ; %bb.0: ; %entry
6720 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6721 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x24
6722 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6723 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
6724 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
6725 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
6726 ; GCN2-NEXT: flat_store_short v[0:1], v2
6727 ; GCN2-NEXT: s_endpgm
6729 ; GCN3-LABEL: atomic_store_f16:
6730 ; GCN3: ; %bb.0: ; %entry
6731 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
6732 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x24
6733 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6734 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6735 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6736 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6737 ; GCN3-NEXT: flat_store_short v[0:1], v2
6738 ; GCN3-NEXT: s_endpgm
6740 store atomic half %in, ptr %out seq_cst, align 2
6744 define amdgpu_kernel void @atomic_inc_i32_offset(ptr %out, i32 %in) {
6745 ; GCN1-LABEL: atomic_inc_i32_offset:
6746 ; GCN1: ; %bb.0: ; %entry
6747 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
6748 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
6749 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6750 ; GCN1-NEXT: s_add_u32 s0, s2, 16
6751 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6752 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6753 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6754 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6755 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
6756 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6757 ; GCN1-NEXT: buffer_wbinvl1_vol
6758 ; GCN1-NEXT: s_endpgm
6760 ; GCN2-LABEL: atomic_inc_i32_offset:
6761 ; GCN2: ; %bb.0: ; %entry
6762 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6763 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
6764 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6765 ; GCN2-NEXT: s_add_u32 s0, s2, 16
6766 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6767 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6768 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6769 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6770 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
6771 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6772 ; GCN2-NEXT: buffer_wbinvl1_vol
6773 ; GCN2-NEXT: s_endpgm
6775 ; GCN3-LABEL: atomic_inc_i32_offset:
6776 ; GCN3: ; %bb.0: ; %entry
6777 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6778 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
6779 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6780 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6781 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6782 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6783 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
6784 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6785 ; GCN3-NEXT: buffer_wbinvl1_vol
6786 ; GCN3-NEXT: s_endpgm
6788 %gep = getelementptr i32, ptr %out, i32 4
6789 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6793 define amdgpu_kernel void @atomic_inc_i32_max_offset(ptr %out, i32 %in) {
6794 ; GCN1-LABEL: atomic_inc_i32_max_offset:
6795 ; GCN1: ; %bb.0: ; %entry
6796 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
6797 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
6798 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6799 ; GCN1-NEXT: s_add_u32 s0, s2, 0xffc
6800 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6801 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6802 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6803 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6804 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
6805 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6806 ; GCN1-NEXT: buffer_wbinvl1_vol
6807 ; GCN1-NEXT: s_endpgm
6809 ; GCN2-LABEL: atomic_inc_i32_max_offset:
6810 ; GCN2: ; %bb.0: ; %entry
6811 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6812 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
6813 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6814 ; GCN2-NEXT: s_add_u32 s0, s2, 0xffc
6815 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6816 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6817 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6818 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6819 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
6820 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6821 ; GCN2-NEXT: buffer_wbinvl1_vol
6822 ; GCN2-NEXT: s_endpgm
6824 ; GCN3-LABEL: atomic_inc_i32_max_offset:
6825 ; GCN3: ; %bb.0: ; %entry
6826 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6827 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
6828 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6829 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6830 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6831 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6832 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:4092
6833 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6834 ; GCN3-NEXT: buffer_wbinvl1_vol
6835 ; GCN3-NEXT: s_endpgm
6837 %gep = getelementptr i32, ptr %out, i32 1023
6838 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6842 define amdgpu_kernel void @atomic_inc_i32_max_offset_p1(ptr %out, i32 %in) {
6843 ; GCN1-LABEL: atomic_inc_i32_max_offset_p1:
6844 ; GCN1: ; %bb.0: ; %entry
6845 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
6846 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
6847 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6848 ; GCN1-NEXT: s_add_u32 s0, s2, 0x1000
6849 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
6850 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6851 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6852 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
6853 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
6854 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6855 ; GCN1-NEXT: buffer_wbinvl1_vol
6856 ; GCN1-NEXT: s_endpgm
6858 ; GCN2-LABEL: atomic_inc_i32_max_offset_p1:
6859 ; GCN2: ; %bb.0: ; %entry
6860 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6861 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
6862 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6863 ; GCN2-NEXT: s_add_u32 s0, s2, 0x1000
6864 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
6865 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6866 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6867 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
6868 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
6869 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6870 ; GCN2-NEXT: buffer_wbinvl1_vol
6871 ; GCN2-NEXT: s_endpgm
6873 ; GCN3-LABEL: atomic_inc_i32_max_offset_p1:
6874 ; GCN3: ; %bb.0: ; %entry
6875 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
6876 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
6877 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6878 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
6879 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
6880 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
6881 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
6882 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
6883 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2
6884 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6885 ; GCN3-NEXT: buffer_wbinvl1_vol
6886 ; GCN3-NEXT: s_endpgm
6888 %gep = getelementptr i32, ptr %out, i32 1024
6889 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6893 define amdgpu_kernel void @atomic_inc_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
6894 ; GCN1-LABEL: atomic_inc_i32_ret_offset:
6895 ; GCN1: ; %bb.0: ; %entry
6896 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
6897 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
6898 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6899 ; GCN1-NEXT: s_add_u32 s0, s4, 16
6900 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
6901 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6902 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6903 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
6904 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
6905 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6906 ; GCN1-NEXT: buffer_wbinvl1_vol
6907 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
6908 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
6909 ; GCN1-NEXT: flat_store_dword v[0:1], v2
6910 ; GCN1-NEXT: s_endpgm
6912 ; GCN2-LABEL: atomic_inc_i32_ret_offset:
6913 ; GCN2: ; %bb.0: ; %entry
6914 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6915 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
6916 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6917 ; GCN2-NEXT: s_add_u32 s0, s4, 16
6918 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
6919 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6920 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6921 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
6922 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
6923 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6924 ; GCN2-NEXT: buffer_wbinvl1_vol
6925 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
6926 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
6927 ; GCN2-NEXT: flat_store_dword v[0:1], v2
6928 ; GCN2-NEXT: s_endpgm
6930 ; GCN3-LABEL: atomic_inc_i32_ret_offset:
6931 ; GCN3: ; %bb.0: ; %entry
6932 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
6933 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
6934 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6935 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
6936 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
6937 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
6938 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc
6939 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6940 ; GCN3-NEXT: buffer_wbinvl1_vol
6941 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
6942 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
6943 ; GCN3-NEXT: flat_store_dword v[0:1], v2
6944 ; GCN3-NEXT: s_endpgm
6946 %gep = getelementptr i32, ptr %out, i32 4
6947 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
6948 store i32 %val, ptr %out2
6952 define amdgpu_kernel void @atomic_inc_i32_incr64_offset(ptr %out, i32 %in, i64 %index) {
6953 ; GCN1-LABEL: atomic_inc_i32_incr64_offset:
6954 ; GCN1: ; %bb.0: ; %entry
6955 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
6956 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
6957 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
6958 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
6959 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6960 ; GCN1-NEXT: s_add_u32 s0, s4, s0
6961 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
6962 ; GCN1-NEXT: s_add_u32 s0, s0, 16
6963 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
6964 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
6965 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
6966 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
6967 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
6968 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6969 ; GCN1-NEXT: buffer_wbinvl1_vol
6970 ; GCN1-NEXT: s_endpgm
6972 ; GCN2-LABEL: atomic_inc_i32_incr64_offset:
6973 ; GCN2: ; %bb.0: ; %entry
6974 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6975 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
6976 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
6977 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
6978 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6979 ; GCN2-NEXT: s_add_u32 s0, s4, s0
6980 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
6981 ; GCN2-NEXT: s_add_u32 s0, s0, 16
6982 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
6983 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
6984 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
6985 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
6986 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
6987 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6988 ; GCN2-NEXT: buffer_wbinvl1_vol
6989 ; GCN2-NEXT: s_endpgm
6991 ; GCN3-LABEL: atomic_inc_i32_incr64_offset:
6992 ; GCN3: ; %bb.0: ; %entry
6993 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
6994 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
6995 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
6996 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
6997 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
6998 ; GCN3-NEXT: s_add_u32 s0, s4, s0
6999 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7000 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7001 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7002 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7003 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2 offset:16
7004 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7005 ; GCN3-NEXT: buffer_wbinvl1_vol
7006 ; GCN3-NEXT: s_endpgm
7008 %ptr = getelementptr i32, ptr %out, i64 %index
7009 %gep = getelementptr i32, ptr %ptr, i32 4
7010 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7014 define amdgpu_kernel void @atomic_inc_i32_ret_incr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
7015 ; GCN1-LABEL: atomic_inc_i32_ret_incr64_offset:
7016 ; GCN1: ; %bb.0: ; %entry
7017 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
7018 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7019 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
7020 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7021 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7022 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7023 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7024 ; GCN1-NEXT: s_add_u32 s0, s0, 16
7025 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
7026 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7027 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7028 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
7029 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7030 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7031 ; GCN1-NEXT: buffer_wbinvl1_vol
7032 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7033 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7034 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7035 ; GCN1-NEXT: s_endpgm
7037 ; GCN2-LABEL: atomic_inc_i32_ret_incr64_offset:
7038 ; GCN2: ; %bb.0: ; %entry
7039 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7040 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7041 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
7042 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7043 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7044 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7045 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7046 ; GCN2-NEXT: s_add_u32 s0, s0, 16
7047 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
7048 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7049 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7050 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
7051 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7052 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7053 ; GCN2-NEXT: buffer_wbinvl1_vol
7054 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7055 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7056 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7057 ; GCN2-NEXT: s_endpgm
7059 ; GCN3-LABEL: atomic_inc_i32_ret_incr64_offset:
7060 ; GCN3: ; %bb.0: ; %entry
7061 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7062 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7063 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
7064 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7065 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7066 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7067 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7068 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7069 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7070 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
7071 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 offset:16 glc
7072 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7073 ; GCN3-NEXT: buffer_wbinvl1_vol
7074 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7075 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7076 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7077 ; GCN3-NEXT: s_endpgm
7079 %ptr = getelementptr i32, ptr %out, i64 %index
7080 %gep = getelementptr i32, ptr %ptr, i32 4
7081 %val = atomicrmw volatile uinc_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7082 store i32 %val, ptr %out2
7086 define amdgpu_kernel void @atomic_inc_i32(ptr %out, i32 %in) {
7087 ; GCN1-LABEL: atomic_inc_i32:
7088 ; GCN1: ; %bb.0: ; %entry
7089 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
7090 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
7091 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7092 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
7093 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
7094 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
7095 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7096 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7097 ; GCN1-NEXT: buffer_wbinvl1_vol
7098 ; GCN1-NEXT: s_endpgm
7100 ; GCN2-LABEL: atomic_inc_i32:
7101 ; GCN2: ; %bb.0: ; %entry
7102 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7103 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
7104 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7105 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
7106 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
7107 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
7108 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7109 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7110 ; GCN2-NEXT: buffer_wbinvl1_vol
7111 ; GCN2-NEXT: s_endpgm
7113 ; GCN3-LABEL: atomic_inc_i32:
7114 ; GCN3: ; %bb.0: ; %entry
7115 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7116 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
7117 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7118 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7119 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7120 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7121 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2
7122 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7123 ; GCN3-NEXT: buffer_wbinvl1_vol
7124 ; GCN3-NEXT: s_endpgm
7126 %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
7130 define amdgpu_kernel void @atomic_inc_i32_ret(ptr %out, ptr %out2, i32 %in) {
7131 ; GCN1-LABEL: atomic_inc_i32_ret:
7132 ; GCN1: ; %bb.0: ; %entry
7133 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7134 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
7135 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7136 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
7137 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
7138 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
7139 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7140 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7141 ; GCN1-NEXT: buffer_wbinvl1_vol
7142 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7143 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7144 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7145 ; GCN1-NEXT: s_endpgm
7147 ; GCN2-LABEL: atomic_inc_i32_ret:
7148 ; GCN2: ; %bb.0: ; %entry
7149 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7150 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
7151 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7152 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
7153 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
7154 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
7155 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7156 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7157 ; GCN2-NEXT: buffer_wbinvl1_vol
7158 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7159 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7160 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7161 ; GCN2-NEXT: s_endpgm
7163 ; GCN3-LABEL: atomic_inc_i32_ret:
7164 ; GCN3: ; %bb.0: ; %entry
7165 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7166 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
7167 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7168 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7169 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7170 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
7171 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7172 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7173 ; GCN3-NEXT: buffer_wbinvl1_vol
7174 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7175 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7176 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7177 ; GCN3-NEXT: s_endpgm
7179 %val = atomicrmw volatile uinc_wrap ptr %out, i32 %in syncscope("agent") seq_cst
7180 store i32 %val, ptr %out2
7184 define amdgpu_kernel void @atomic_inc_i32_incr64(ptr %out, i32 %in, i64 %index) {
7185 ; GCN1-LABEL: atomic_inc_i32_incr64:
7186 ; GCN1: ; %bb.0: ; %entry
7187 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
7188 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
7189 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
7190 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7191 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7192 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7193 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7194 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7195 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7196 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7197 ; GCN1-NEXT: flat_atomic_inc v[0:1], v2
7198 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7199 ; GCN1-NEXT: buffer_wbinvl1_vol
7200 ; GCN1-NEXT: s_endpgm
7202 ; GCN2-LABEL: atomic_inc_i32_incr64:
7203 ; GCN2: ; %bb.0: ; %entry
7204 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7205 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
7206 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
7207 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7208 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7209 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7210 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7211 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7212 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7213 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7214 ; GCN2-NEXT: flat_atomic_inc v[0:1], v2
7215 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7216 ; GCN2-NEXT: buffer_wbinvl1_vol
7217 ; GCN2-NEXT: s_endpgm
7219 ; GCN3-LABEL: atomic_inc_i32_incr64:
7220 ; GCN3: ; %bb.0: ; %entry
7221 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7222 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
7223 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
7224 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7225 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7226 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7227 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7228 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7229 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7230 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7231 ; GCN3-NEXT: flat_atomic_inc v[0:1], v2
7232 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7233 ; GCN3-NEXT: buffer_wbinvl1_vol
7234 ; GCN3-NEXT: s_endpgm
7236 %ptr = getelementptr i32, ptr %out, i64 %index
7237 %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
7241 define amdgpu_kernel void @atomic_inc_i32_ret_incr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
7242 ; GCN1-LABEL: atomic_inc_i32_ret_incr64:
7243 ; GCN1: ; %bb.0: ; %entry
7244 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
7245 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7246 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
7247 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7248 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7249 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7250 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7251 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7252 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7253 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
7254 ; GCN1-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7255 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7256 ; GCN1-NEXT: buffer_wbinvl1_vol
7257 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7258 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7259 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7260 ; GCN1-NEXT: s_endpgm
7262 ; GCN2-LABEL: atomic_inc_i32_ret_incr64:
7263 ; GCN2: ; %bb.0: ; %entry
7264 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7265 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7266 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
7267 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7268 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7269 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7270 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7271 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7272 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7273 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
7274 ; GCN2-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7275 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7276 ; GCN2-NEXT: buffer_wbinvl1_vol
7277 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7278 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7279 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7280 ; GCN2-NEXT: s_endpgm
7282 ; GCN3-LABEL: atomic_inc_i32_ret_incr64:
7283 ; GCN3: ; %bb.0: ; %entry
7284 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7285 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7286 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
7287 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7288 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7289 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7290 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7291 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7292 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7293 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
7294 ; GCN3-NEXT: flat_atomic_inc v2, v[0:1], v2 glc
7295 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7296 ; GCN3-NEXT: buffer_wbinvl1_vol
7297 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7298 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7299 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7300 ; GCN3-NEXT: s_endpgm
7302 %ptr = getelementptr i32, ptr %out, i64 %index
7303 %val = atomicrmw volatile uinc_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
7304 store i32 %val, ptr %out2
7308 define amdgpu_kernel void @atomic_dec_i32_offset(ptr %out, i32 %in) {
7309 ; GCN1-LABEL: atomic_dec_i32_offset:
7310 ; GCN1: ; %bb.0: ; %entry
7311 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
7312 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
7313 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7314 ; GCN1-NEXT: s_add_u32 s0, s2, 16
7315 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
7316 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7317 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7318 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7319 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7320 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7321 ; GCN1-NEXT: buffer_wbinvl1_vol
7322 ; GCN1-NEXT: s_endpgm
7324 ; GCN2-LABEL: atomic_dec_i32_offset:
7325 ; GCN2: ; %bb.0: ; %entry
7326 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7327 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
7328 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7329 ; GCN2-NEXT: s_add_u32 s0, s2, 16
7330 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
7331 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7332 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7333 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7334 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7335 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7336 ; GCN2-NEXT: buffer_wbinvl1_vol
7337 ; GCN2-NEXT: s_endpgm
7339 ; GCN3-LABEL: atomic_dec_i32_offset:
7340 ; GCN3: ; %bb.0: ; %entry
7341 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7342 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
7343 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7344 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7345 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7346 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7347 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
7348 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7349 ; GCN3-NEXT: buffer_wbinvl1_vol
7350 ; GCN3-NEXT: s_endpgm
7352 %gep = getelementptr i32, ptr %out, i32 4
7353 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7357 define amdgpu_kernel void @atomic_dec_i32_max_offset(ptr %out, i32 %in) {
7358 ; GCN1-LABEL: atomic_dec_i32_max_offset:
7359 ; GCN1: ; %bb.0: ; %entry
7360 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
7361 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
7362 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7363 ; GCN1-NEXT: s_add_u32 s0, s2, 0xffc
7364 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
7365 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7366 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7367 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7368 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7369 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7370 ; GCN1-NEXT: buffer_wbinvl1_vol
7371 ; GCN1-NEXT: s_endpgm
7373 ; GCN2-LABEL: atomic_dec_i32_max_offset:
7374 ; GCN2: ; %bb.0: ; %entry
7375 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7376 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
7377 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7378 ; GCN2-NEXT: s_add_u32 s0, s2, 0xffc
7379 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
7380 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7381 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7382 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7383 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7384 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7385 ; GCN2-NEXT: buffer_wbinvl1_vol
7386 ; GCN2-NEXT: s_endpgm
7388 ; GCN3-LABEL: atomic_dec_i32_max_offset:
7389 ; GCN3: ; %bb.0: ; %entry
7390 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7391 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
7392 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7393 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7394 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7395 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7396 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:4092
7397 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7398 ; GCN3-NEXT: buffer_wbinvl1_vol
7399 ; GCN3-NEXT: s_endpgm
7401 %gep = getelementptr i32, ptr %out, i32 1023
7402 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7406 define amdgpu_kernel void @atomic_dec_i32_max_offset_p1(ptr %out, i32 %in) {
7407 ; GCN1-LABEL: atomic_dec_i32_max_offset_p1:
7408 ; GCN1: ; %bb.0: ; %entry
7409 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
7410 ; GCN1-NEXT: s_load_dword s4, s[0:1], 0xb
7411 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7412 ; GCN1-NEXT: s_add_u32 s0, s2, 0x1000
7413 ; GCN1-NEXT: s_addc_u32 s1, s3, 0
7414 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7415 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7416 ; GCN1-NEXT: v_mov_b32_e32 v2, s4
7417 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7418 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7419 ; GCN1-NEXT: buffer_wbinvl1_vol
7420 ; GCN1-NEXT: s_endpgm
7422 ; GCN2-LABEL: atomic_dec_i32_max_offset_p1:
7423 ; GCN2: ; %bb.0: ; %entry
7424 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7425 ; GCN2-NEXT: s_load_dword s4, s[0:1], 0x2c
7426 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7427 ; GCN2-NEXT: s_add_u32 s0, s2, 0x1000
7428 ; GCN2-NEXT: s_addc_u32 s1, s3, 0
7429 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7430 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7431 ; GCN2-NEXT: v_mov_b32_e32 v2, s4
7432 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7433 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7434 ; GCN2-NEXT: buffer_wbinvl1_vol
7435 ; GCN2-NEXT: s_endpgm
7437 ; GCN3-LABEL: atomic_dec_i32_max_offset_p1:
7438 ; GCN3: ; %bb.0: ; %entry
7439 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7440 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
7441 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7442 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7443 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7444 ; GCN3-NEXT: v_add_co_u32_e32 v0, vcc, 0x1000, v0
7445 ; GCN3-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
7446 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7447 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2
7448 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7449 ; GCN3-NEXT: buffer_wbinvl1_vol
7450 ; GCN3-NEXT: s_endpgm
7452 %gep = getelementptr i32, ptr %out, i32 1024
7453 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7457 define amdgpu_kernel void @atomic_dec_i32_ret_offset(ptr %out, ptr %out2, i32 %in) {
7458 ; GCN1-LABEL: atomic_dec_i32_ret_offset:
7459 ; GCN1: ; %bb.0: ; %entry
7460 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7461 ; GCN1-NEXT: s_load_dword s2, s[0:1], 0xd
7462 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7463 ; GCN1-NEXT: s_add_u32 s0, s4, 16
7464 ; GCN1-NEXT: s_addc_u32 s1, s5, 0
7465 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7466 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7467 ; GCN1-NEXT: v_mov_b32_e32 v2, s2
7468 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7469 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7470 ; GCN1-NEXT: buffer_wbinvl1_vol
7471 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7472 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7473 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7474 ; GCN1-NEXT: s_endpgm
7476 ; GCN2-LABEL: atomic_dec_i32_ret_offset:
7477 ; GCN2: ; %bb.0: ; %entry
7478 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7479 ; GCN2-NEXT: s_load_dword s2, s[0:1], 0x34
7480 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7481 ; GCN2-NEXT: s_add_u32 s0, s4, 16
7482 ; GCN2-NEXT: s_addc_u32 s1, s5, 0
7483 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7484 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7485 ; GCN2-NEXT: v_mov_b32_e32 v2, s2
7486 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7487 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7488 ; GCN2-NEXT: buffer_wbinvl1_vol
7489 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7490 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7491 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7492 ; GCN2-NEXT: s_endpgm
7494 ; GCN3-LABEL: atomic_dec_i32_ret_offset:
7495 ; GCN3: ; %bb.0: ; %entry
7496 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7497 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
7498 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7499 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7500 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7501 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
7502 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
7503 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7504 ; GCN3-NEXT: buffer_wbinvl1_vol
7505 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7506 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7507 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7508 ; GCN3-NEXT: s_endpgm
7510 %gep = getelementptr i32, ptr %out, i32 4
7511 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7512 store i32 %val, ptr %out2
7516 define amdgpu_kernel void @atomic_dec_i32_decr64_offset(ptr %out, i32 %in, i64 %index) {
7517 ; GCN1-LABEL: atomic_dec_i32_decr64_offset:
7518 ; GCN1: ; %bb.0: ; %entry
7519 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
7520 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
7521 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
7522 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7523 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7524 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7525 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7526 ; GCN1-NEXT: s_add_u32 s0, s0, 16
7527 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
7528 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7529 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7530 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7531 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7532 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7533 ; GCN1-NEXT: buffer_wbinvl1_vol
7534 ; GCN1-NEXT: s_endpgm
7536 ; GCN2-LABEL: atomic_dec_i32_decr64_offset:
7537 ; GCN2: ; %bb.0: ; %entry
7538 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7539 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
7540 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
7541 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7542 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7543 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7544 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7545 ; GCN2-NEXT: s_add_u32 s0, s0, 16
7546 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
7547 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7548 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7549 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7550 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7551 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7552 ; GCN2-NEXT: buffer_wbinvl1_vol
7553 ; GCN2-NEXT: s_endpgm
7555 ; GCN3-LABEL: atomic_dec_i32_decr64_offset:
7556 ; GCN3: ; %bb.0: ; %entry
7557 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7558 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
7559 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
7560 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7561 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7562 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7563 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7564 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7565 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7566 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7567 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2 offset:16
7568 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7569 ; GCN3-NEXT: buffer_wbinvl1_vol
7570 ; GCN3-NEXT: s_endpgm
7572 %ptr = getelementptr i32, ptr %out, i64 %index
7573 %gep = getelementptr i32, ptr %ptr, i32 4
7574 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7578 define amdgpu_kernel void @atomic_dec_i32_ret_decr64_offset(ptr %out, ptr %out2, i32 %in, i64 %index) {
7579 ; GCN1-LABEL: atomic_dec_i32_ret_decr64_offset:
7580 ; GCN1: ; %bb.0: ; %entry
7581 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
7582 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7583 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
7584 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7585 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7586 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7587 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7588 ; GCN1-NEXT: s_add_u32 s0, s0, 16
7589 ; GCN1-NEXT: s_addc_u32 s1, s1, 0
7590 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7591 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7592 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
7593 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7594 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7595 ; GCN1-NEXT: buffer_wbinvl1_vol
7596 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7597 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7598 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7599 ; GCN1-NEXT: s_endpgm
7601 ; GCN2-LABEL: atomic_dec_i32_ret_decr64_offset:
7602 ; GCN2: ; %bb.0: ; %entry
7603 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7604 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7605 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
7606 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7607 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7608 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7609 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7610 ; GCN2-NEXT: s_add_u32 s0, s0, 16
7611 ; GCN2-NEXT: s_addc_u32 s1, s1, 0
7612 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7613 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7614 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
7615 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7616 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7617 ; GCN2-NEXT: buffer_wbinvl1_vol
7618 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7619 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7620 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7621 ; GCN2-NEXT: s_endpgm
7623 ; GCN3-LABEL: atomic_dec_i32_ret_decr64_offset:
7624 ; GCN3: ; %bb.0: ; %entry
7625 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7626 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7627 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
7628 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7629 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7630 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7631 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7632 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7633 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7634 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
7635 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
7636 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7637 ; GCN3-NEXT: buffer_wbinvl1_vol
7638 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7639 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7640 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7641 ; GCN3-NEXT: s_endpgm
7643 %ptr = getelementptr i32, ptr %out, i64 %index
7644 %gep = getelementptr i32, ptr %ptr, i32 4
7645 %val = atomicrmw volatile udec_wrap ptr %gep, i32 %in syncscope("agent") seq_cst
7646 store i32 %val, ptr %out2
7650 define amdgpu_kernel void @atomic_dec_i32(ptr %out, i32 %in) {
7651 ; GCN1-LABEL: atomic_dec_i32:
7652 ; GCN1: ; %bb.0: ; %entry
7653 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
7654 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xb
7655 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7656 ; GCN1-NEXT: v_mov_b32_e32 v0, s2
7657 ; GCN1-NEXT: v_mov_b32_e32 v1, s3
7658 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
7659 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7660 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7661 ; GCN1-NEXT: buffer_wbinvl1_vol
7662 ; GCN1-NEXT: s_endpgm
7664 ; GCN2-LABEL: atomic_dec_i32:
7665 ; GCN2: ; %bb.0: ; %entry
7666 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7667 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x2c
7668 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7669 ; GCN2-NEXT: v_mov_b32_e32 v0, s2
7670 ; GCN2-NEXT: v_mov_b32_e32 v1, s3
7671 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
7672 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7673 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7674 ; GCN2-NEXT: buffer_wbinvl1_vol
7675 ; GCN2-NEXT: s_endpgm
7677 ; GCN3-LABEL: atomic_dec_i32:
7678 ; GCN3: ; %bb.0: ; %entry
7679 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
7680 ; GCN3-NEXT: s_load_dword s4, s[0:1], 0x2c
7681 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7682 ; GCN3-NEXT: v_mov_b32_e32 v0, s2
7683 ; GCN3-NEXT: v_mov_b32_e32 v1, s3
7684 ; GCN3-NEXT: v_mov_b32_e32 v2, s4
7685 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2
7686 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7687 ; GCN3-NEXT: buffer_wbinvl1_vol
7688 ; GCN3-NEXT: s_endpgm
7690 %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
7694 define amdgpu_kernel void @atomic_dec_i32_ret(ptr %out, ptr %out2, i32 %in) {
7695 ; GCN1-LABEL: atomic_dec_i32_ret:
7696 ; GCN1: ; %bb.0: ; %entry
7697 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7698 ; GCN1-NEXT: s_load_dword s0, s[0:1], 0xd
7699 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7700 ; GCN1-NEXT: v_mov_b32_e32 v0, s4
7701 ; GCN1-NEXT: v_mov_b32_e32 v1, s5
7702 ; GCN1-NEXT: v_mov_b32_e32 v2, s0
7703 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7704 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7705 ; GCN1-NEXT: buffer_wbinvl1_vol
7706 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7707 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7708 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7709 ; GCN1-NEXT: s_endpgm
7711 ; GCN2-LABEL: atomic_dec_i32_ret:
7712 ; GCN2: ; %bb.0: ; %entry
7713 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7714 ; GCN2-NEXT: s_load_dword s0, s[0:1], 0x34
7715 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7716 ; GCN2-NEXT: v_mov_b32_e32 v0, s4
7717 ; GCN2-NEXT: v_mov_b32_e32 v1, s5
7718 ; GCN2-NEXT: v_mov_b32_e32 v2, s0
7719 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7720 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7721 ; GCN2-NEXT: buffer_wbinvl1_vol
7722 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7723 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7724 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7725 ; GCN2-NEXT: s_endpgm
7727 ; GCN3-LABEL: atomic_dec_i32_ret:
7728 ; GCN3: ; %bb.0: ; %entry
7729 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7730 ; GCN3-NEXT: s_load_dword s2, s[0:1], 0x34
7731 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7732 ; GCN3-NEXT: v_mov_b32_e32 v0, s4
7733 ; GCN3-NEXT: v_mov_b32_e32 v1, s5
7734 ; GCN3-NEXT: v_mov_b32_e32 v2, s2
7735 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7736 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7737 ; GCN3-NEXT: buffer_wbinvl1_vol
7738 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7739 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7740 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7741 ; GCN3-NEXT: s_endpgm
7743 %val = atomicrmw volatile udec_wrap ptr %out, i32 %in syncscope("agent") seq_cst
7744 store i32 %val, ptr %out2
7748 define amdgpu_kernel void @atomic_dec_i32_decr64(ptr %out, i32 %in, i64 %index) {
7749 ; GCN1-LABEL: atomic_dec_i32_decr64:
7750 ; GCN1: ; %bb.0: ; %entry
7751 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xd
7752 ; GCN1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
7753 ; GCN1-NEXT: s_load_dword s6, s[0:1], 0xb
7754 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7755 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7756 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7757 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7758 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7759 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7760 ; GCN1-NEXT: v_mov_b32_e32 v2, s6
7761 ; GCN1-NEXT: flat_atomic_dec v[0:1], v2
7762 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7763 ; GCN1-NEXT: buffer_wbinvl1_vol
7764 ; GCN1-NEXT: s_endpgm
7766 ; GCN2-LABEL: atomic_dec_i32_decr64:
7767 ; GCN2: ; %bb.0: ; %entry
7768 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7769 ; GCN2-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
7770 ; GCN2-NEXT: s_load_dword s6, s[0:1], 0x2c
7771 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7772 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7773 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7774 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7775 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7776 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7777 ; GCN2-NEXT: v_mov_b32_e32 v2, s6
7778 ; GCN2-NEXT: flat_atomic_dec v[0:1], v2
7779 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7780 ; GCN2-NEXT: buffer_wbinvl1_vol
7781 ; GCN2-NEXT: s_endpgm
7783 ; GCN3-LABEL: atomic_dec_i32_decr64:
7784 ; GCN3: ; %bb.0: ; %entry
7785 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
7786 ; GCN3-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24
7787 ; GCN3-NEXT: s_load_dword s6, s[0:1], 0x2c
7788 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7789 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7790 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7791 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7792 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7793 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7794 ; GCN3-NEXT: v_mov_b32_e32 v2, s6
7795 ; GCN3-NEXT: flat_atomic_dec v[0:1], v2
7796 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7797 ; GCN3-NEXT: buffer_wbinvl1_vol
7798 ; GCN3-NEXT: s_endpgm
7800 %ptr = getelementptr i32, ptr %out, i64 %index
7801 %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
7805 define amdgpu_kernel void @atomic_dec_i32_ret_decr64(ptr %out, ptr %out2, i32 %in, i64 %index) {
7806 ; GCN1-LABEL: atomic_dec_i32_ret_decr64:
7807 ; GCN1: ; %bb.0: ; %entry
7808 ; GCN1-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0xf
7809 ; GCN1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
7810 ; GCN1-NEXT: s_load_dword s8, s[0:1], 0xd
7811 ; GCN1-NEXT: s_waitcnt lgkmcnt(0)
7812 ; GCN1-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7813 ; GCN1-NEXT: s_add_u32 s0, s4, s0
7814 ; GCN1-NEXT: s_addc_u32 s1, s5, s1
7815 ; GCN1-NEXT: v_mov_b32_e32 v0, s0
7816 ; GCN1-NEXT: v_mov_b32_e32 v1, s1
7817 ; GCN1-NEXT: v_mov_b32_e32 v2, s8
7818 ; GCN1-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7819 ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7820 ; GCN1-NEXT: buffer_wbinvl1_vol
7821 ; GCN1-NEXT: v_mov_b32_e32 v0, s6
7822 ; GCN1-NEXT: v_mov_b32_e32 v1, s7
7823 ; GCN1-NEXT: flat_store_dword v[0:1], v2
7824 ; GCN1-NEXT: s_endpgm
7826 ; GCN2-LABEL: atomic_dec_i32_ret_decr64:
7827 ; GCN2: ; %bb.0: ; %entry
7828 ; GCN2-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7829 ; GCN2-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7830 ; GCN2-NEXT: s_load_dword s8, s[0:1], 0x34
7831 ; GCN2-NEXT: s_waitcnt lgkmcnt(0)
7832 ; GCN2-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7833 ; GCN2-NEXT: s_add_u32 s0, s4, s0
7834 ; GCN2-NEXT: s_addc_u32 s1, s5, s1
7835 ; GCN2-NEXT: v_mov_b32_e32 v0, s0
7836 ; GCN2-NEXT: v_mov_b32_e32 v1, s1
7837 ; GCN2-NEXT: v_mov_b32_e32 v2, s8
7838 ; GCN2-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7839 ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7840 ; GCN2-NEXT: buffer_wbinvl1_vol
7841 ; GCN2-NEXT: v_mov_b32_e32 v0, s6
7842 ; GCN2-NEXT: v_mov_b32_e32 v1, s7
7843 ; GCN2-NEXT: flat_store_dword v[0:1], v2
7844 ; GCN2-NEXT: s_endpgm
7846 ; GCN3-LABEL: atomic_dec_i32_ret_decr64:
7847 ; GCN3: ; %bb.0: ; %entry
7848 ; GCN3-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x3c
7849 ; GCN3-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
7850 ; GCN3-NEXT: s_load_dword s8, s[0:1], 0x34
7851 ; GCN3-NEXT: s_waitcnt lgkmcnt(0)
7852 ; GCN3-NEXT: s_lshl_b64 s[0:1], s[2:3], 2
7853 ; GCN3-NEXT: s_add_u32 s0, s4, s0
7854 ; GCN3-NEXT: s_addc_u32 s1, s5, s1
7855 ; GCN3-NEXT: v_mov_b32_e32 v0, s0
7856 ; GCN3-NEXT: v_mov_b32_e32 v1, s1
7857 ; GCN3-NEXT: v_mov_b32_e32 v2, s8
7858 ; GCN3-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
7859 ; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7860 ; GCN3-NEXT: buffer_wbinvl1_vol
7861 ; GCN3-NEXT: v_mov_b32_e32 v0, s6
7862 ; GCN3-NEXT: v_mov_b32_e32 v1, s7
7863 ; GCN3-NEXT: flat_store_dword v[0:1], v2
7864 ; GCN3-NEXT: s_endpgm
7866 %ptr = getelementptr i32, ptr %out, i64 %index
7867 %val = atomicrmw volatile udec_wrap ptr %ptr, i32 %in syncscope("agent") seq_cst
7868 store i32 %val, ptr %out2