1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -O0 -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a < %s | FileCheck --check-prefixes=GFX90A-NOTTGSPLIT %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX90A-TGSPLIT %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 < %s | FileCheck --check-prefixes=GFX940-NOTTGSPLIT %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck --check-prefixes=GFX940-TGSPLIT %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s
11 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s
12 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s
13 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s
; Test: unordered atomic i32 load from %in at syncscope("workgroup"), followed
; by a plain store of the loaded value to %out.
; In every prefix block below the flat load is checked without any cache-policy
; modifier (no glc / sc0 / scope:), and one wait precedes the dependent store.
; Every prefix block ends with an s_endpgm check so the full kernel body is
; pinned for each RUN line. The assertions are autogenerated (see the NOTE at
; the top of the file).
15 define amdgpu_kernel void @flat_workgroup_unordered_load(
16 ; GFX7-LABEL: flat_workgroup_unordered_load:
17 ; GFX7: ; %bb.0: ; %entry
18 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
19 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
20 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
21 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
22 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
23 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
24 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
25 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
26 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
27 ; GFX7-NEXT: flat_store_dword v[0:1], v2
; GFX7-NEXT: s_endpgm
30 ; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
31 ; GFX10-WGP: ; %bb.0: ; %entry
32 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
33 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
34 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
36 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
37 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
38 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
39 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
40 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
41 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
42 ; GFX10-WGP-NEXT: s_endpgm
44 ; GFX10-CU-LABEL: flat_workgroup_unordered_load:
45 ; GFX10-CU: ; %bb.0: ; %entry
46 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
47 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
48 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
49 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
50 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
51 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
52 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
53 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
54 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
55 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
56 ; GFX10-CU-NEXT: s_endpgm
58 ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_load:
59 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
60 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
61 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
62 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
63 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
64 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
65 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
66 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
67 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
68 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
69 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
70 ; SKIP-CACHE-INV-NEXT: s_endpgm
72 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
73 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
74 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
75 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
76 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
77 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
78 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
79 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
80 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
81 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
82 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
84 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
85 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
86 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
87 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
88 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
89 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
90 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
91 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
92 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
93 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
94 ; GFX90A-TGSPLIT-NEXT: s_endpgm
96 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
97 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
98 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
99 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
100 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
101 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
102 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
103 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
104 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
105 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
106 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
108 ; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_load:
109 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
110 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
111 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
112 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
113 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
114 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
115 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
116 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
117 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
118 ; GFX940-TGSPLIT-NEXT: s_endpgm
120 ; GFX11-WGP-LABEL: flat_workgroup_unordered_load:
121 ; GFX11-WGP: ; %bb.0: ; %entry
122 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
123 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
124 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
125 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
126 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
127 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
128 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
129 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
130 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
131 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
132 ; GFX11-WGP-NEXT: s_endpgm
134 ; GFX11-CU-LABEL: flat_workgroup_unordered_load:
135 ; GFX11-CU: ; %bb.0: ; %entry
136 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
137 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
138 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
140 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
141 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
142 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
143 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
144 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
145 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
146 ; GFX11-CU-NEXT: s_endpgm
148 ; GFX12-WGP-LABEL: flat_workgroup_unordered_load:
149 ; GFX12-WGP: ; %bb.0: ; %entry
150 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
151 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
152 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
153 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
154 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
155 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
156 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
157 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
158 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
159 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
160 ; GFX12-WGP-NEXT: s_endpgm
162 ; GFX12-CU-LABEL: flat_workgroup_unordered_load:
163 ; GFX12-CU: ; %bb.0: ; %entry
164 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
165 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
166 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
167 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
168 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
169 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
170 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
171 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
172 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
173 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
174 ; GFX12-CU-NEXT: s_endpgm
177 %val = load atomic i32, ptr %in syncscope("workgroup") unordered, align 4
178 store i32 %val, ptr %out
; Test: monotonic atomic i32 load from %in at syncscope("workgroup"), followed
; by a plain store of the loaded value to %out.
; The checked flat load carries a modifier on some prefixes: glc on GFX10-WGP,
; GFX90A-TGSPLIT and GFX11-WGP; sc0 on both GFX940 prefixes; scope:SCOPE_SE on
; GFX12-WGP. The remaining prefixes check an unmodified flat load.
; No cache-invalidate instruction is checked for any prefix.
; The assertions are autogenerated (see the NOTE at the top of the file).
182 define amdgpu_kernel void @flat_workgroup_monotonic_load(
183 ; GFX7-LABEL: flat_workgroup_monotonic_load:
184 ; GFX7: ; %bb.0: ; %entry
185 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
186 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
187 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
188 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
189 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
190 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
191 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
192 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
193 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
194 ; GFX7-NEXT: flat_store_dword v[0:1], v2
195 ; GFX7-NEXT: s_endpgm
197 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
198 ; GFX10-WGP: ; %bb.0: ; %entry
199 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
200 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
201 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
202 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
203 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
204 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
205 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
206 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
207 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
208 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
209 ; GFX10-WGP-NEXT: s_endpgm
211 ; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
212 ; GFX10-CU: ; %bb.0: ; %entry
213 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
214 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
215 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
216 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
217 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
218 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
219 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
220 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
221 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
222 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
223 ; GFX10-CU-NEXT: s_endpgm
225 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_load:
226 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
227 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
228 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
229 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
230 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
231 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
232 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
233 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
234 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
235 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
236 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
237 ; SKIP-CACHE-INV-NEXT: s_endpgm
239 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
240 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
241 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
242 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
243 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
244 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
245 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
246 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
247 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
248 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
249 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
251 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
252 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
253 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
254 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
255 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
256 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
257 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
258 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
259 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
260 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
261 ; GFX90A-TGSPLIT-NEXT: s_endpgm
263 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
264 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
265 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
266 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
267 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
268 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
269 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
270 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
271 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
272 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
273 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
275 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
276 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
277 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
278 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
279 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
280 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
281 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
282 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
283 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
284 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
285 ; GFX940-TGSPLIT-NEXT: s_endpgm
287 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_load:
288 ; GFX11-WGP: ; %bb.0: ; %entry
289 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
290 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
291 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
292 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
293 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
294 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
295 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
296 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
297 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
298 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
299 ; GFX11-WGP-NEXT: s_endpgm
301 ; GFX11-CU-LABEL: flat_workgroup_monotonic_load:
302 ; GFX11-CU: ; %bb.0: ; %entry
303 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
304 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
305 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
307 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
308 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
309 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
310 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
311 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
312 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
313 ; GFX11-CU-NEXT: s_endpgm
315 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_load:
316 ; GFX12-WGP: ; %bb.0: ; %entry
317 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
318 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
319 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
320 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
321 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
322 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
323 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
324 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
325 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
326 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
327 ; GFX12-WGP-NEXT: s_endpgm
329 ; GFX12-CU-LABEL: flat_workgroup_monotonic_load:
330 ; GFX12-CU: ; %bb.0: ; %entry
331 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
332 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
333 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
334 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
335 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
336 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
337 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
338 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
339 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
340 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
341 ; GFX12-CU-NEXT: s_endpgm
344 %val = load atomic i32, ptr %in syncscope("workgroup") monotonic, align 4
345 store i32 %val, ptr %out
; Test: acquire atomic i32 load from %in at syncscope("workgroup"), followed by
; a plain store of the loaded value to %out.
; Some prefixes check a wait plus a cache invalidate directly after the load:
;   GFX10-WGP, GFX11-WGP  -> buffer_gl0_inv
;   GFX90A-TGSPLIT        -> buffer_wbinvl1_vol
;   GFX940-TGSPLIT        -> buffer_inv sc0
;   GFX12-WGP             -> global_inv scope:SCOPE_SE
; The remaining prefixes check no invalidate: only a wait after the load
; (s_waitcnt lgkmcnt(0), or s_wait_dscnt on GFX12-CU) and a second wait
; (s_waitcnt vmcnt(0), or s_wait_loadcnt on GFX12-CU) before the store.
; The assertions are autogenerated (see the NOTE at the top of the file).
349 define amdgpu_kernel void @flat_workgroup_acquire_load(
350 ; GFX7-LABEL: flat_workgroup_acquire_load:
351 ; GFX7: ; %bb.0: ; %entry
352 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
353 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
354 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
355 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
356 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
357 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
358 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
359 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
360 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
361 ; GFX7-NEXT: s_waitcnt vmcnt(0)
362 ; GFX7-NEXT: flat_store_dword v[0:1], v2
363 ; GFX7-NEXT: s_endpgm
365 ; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
366 ; GFX10-WGP: ; %bb.0: ; %entry
367 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
368 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
369 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
370 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
371 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
372 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
373 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
374 ; GFX10-WGP-NEXT: buffer_gl0_inv
375 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
376 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
377 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
378 ; GFX10-WGP-NEXT: s_endpgm
380 ; GFX10-CU-LABEL: flat_workgroup_acquire_load:
381 ; GFX10-CU: ; %bb.0: ; %entry
382 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
383 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
384 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
385 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
386 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
387 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
388 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
389 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
390 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
391 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
392 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
393 ; GFX10-CU-NEXT: s_endpgm
395 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_load:
396 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
397 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
398 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
399 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
400 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
401 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
402 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
403 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
404 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
405 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
406 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
407 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
408 ; SKIP-CACHE-INV-NEXT: s_endpgm
410 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
411 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
412 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
413 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
414 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
415 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
416 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
417 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
418 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
419 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
420 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
421 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
423 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
424 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
425 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
426 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
427 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
429 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
430 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
431 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
432 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
433 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
434 ; GFX90A-TGSPLIT-NEXT: s_endpgm
436 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
437 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
438 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
439 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
440 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
441 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
442 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
443 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
444 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
445 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
446 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
447 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
449 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_load:
450 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
451 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
452 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
453 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
454 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
455 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
456 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
457 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
458 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
459 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
460 ; GFX940-TGSPLIT-NEXT: s_endpgm
462 ; GFX11-WGP-LABEL: flat_workgroup_acquire_load:
463 ; GFX11-WGP: ; %bb.0: ; %entry
464 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
465 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
466 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
467 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
468 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
469 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
470 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
471 ; GFX11-WGP-NEXT: buffer_gl0_inv
472 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
473 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
474 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
475 ; GFX11-WGP-NEXT: s_endpgm
477 ; GFX11-CU-LABEL: flat_workgroup_acquire_load:
478 ; GFX11-CU: ; %bb.0: ; %entry
479 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
480 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
481 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
482 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
483 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
484 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
485 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
486 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
487 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
488 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
489 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
490 ; GFX11-CU-NEXT: s_endpgm
492 ; GFX12-WGP-LABEL: flat_workgroup_acquire_load:
493 ; GFX12-WGP: ; %bb.0: ; %entry
494 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
495 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
496 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
497 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
498 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
499 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
500 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
501 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
502 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
503 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
504 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
505 ; GFX12-WGP-NEXT: s_endpgm
507 ; GFX12-CU-LABEL: flat_workgroup_acquire_load:
508 ; GFX12-CU: ; %bb.0: ; %entry
509 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
510 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
511 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
512 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
513 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
514 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
515 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
516 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
517 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
518 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
519 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
520 ; GFX12-CU-NEXT: s_endpgm
523 %val = load atomic i32, ptr %in syncscope("workgroup") acquire, align 4
524 store i32 %val, ptr %out
; Test: seq_cst atomic i32 load from %in at syncscope("workgroup"), followed by
; a plain store of the loaded value to %out.
; Every prefix checks a wait immediately before the flat load:
;   - s_waitcnt lgkmcnt(0) (s_wait_dscnt on GFX12-CU) on the prefixes that
;     check no cache invalidate;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) on the TGSPLIT prefixes, plus
;     s_waitcnt_vscnt null, 0x0 on GFX10-WGP / GFX11-WGP;
;   - the s_wait_bvhcnt / samplecnt / storecnt / loadcnt_dscnt sequence on
;     GFX12-WGP.
; After the load, GFX10-WGP, GFX11-WGP, GFX90A-TGSPLIT, GFX940-TGSPLIT and
; GFX12-WGP check a wait followed by a cache invalidate; the other prefixes
; check only waits before the store.
; The assertions are autogenerated (see the NOTE at the top of the file).
528 define amdgpu_kernel void @flat_workgroup_seq_cst_load(
529 ; GFX7-LABEL: flat_workgroup_seq_cst_load:
530 ; GFX7: ; %bb.0: ; %entry
531 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
532 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
533 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
534 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
535 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
536 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
537 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
538 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
539 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
540 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
541 ; GFX7-NEXT: s_waitcnt vmcnt(0)
542 ; GFX7-NEXT: flat_store_dword v[0:1], v2
543 ; GFX7-NEXT: s_endpgm
545 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
546 ; GFX10-WGP: ; %bb.0: ; %entry
547 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
548 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
549 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
551 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
552 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
553 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
554 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
555 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
556 ; GFX10-WGP-NEXT: buffer_gl0_inv
557 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
558 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
559 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
560 ; GFX10-WGP-NEXT: s_endpgm
562 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
563 ; GFX10-CU: ; %bb.0: ; %entry
564 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
565 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
566 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
567 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
568 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
569 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
570 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
571 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
572 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
573 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
574 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
575 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
576 ; GFX10-CU-NEXT: s_endpgm
578 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_load:
579 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
580 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
581 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
582 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
583 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
584 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
585 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
586 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
587 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
588 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
589 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
590 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
591 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
592 ; SKIP-CACHE-INV-NEXT: s_endpgm
594 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
595 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
596 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
597 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
598 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
599 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
600 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
601 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
602 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
603 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
604 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
605 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
606 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
608 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
609 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
610 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
611 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
612 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
613 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
614 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
615 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
616 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
617 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
618 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
619 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
620 ; GFX90A-TGSPLIT-NEXT: s_endpgm
622 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
623 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
624 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
625 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
626 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
627 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
628 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
629 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
630 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
631 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
632 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
633 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
634 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
636 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
637 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
638 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
639 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
640 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
641 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
642 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
643 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
644 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
645 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
646 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
647 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
648 ; GFX940-TGSPLIT-NEXT: s_endpgm
650 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_load:
651 ; GFX11-WGP: ; %bb.0: ; %entry
652 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
653 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
654 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
656 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
657 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
658 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
659 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
660 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
661 ; GFX11-WGP-NEXT: buffer_gl0_inv
662 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
663 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
664 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
665 ; GFX11-WGP-NEXT: s_endpgm
667 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_load:
668 ; GFX11-CU: ; %bb.0: ; %entry
669 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
670 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
671 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
672 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
673 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
674 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
675 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
676 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
677 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
678 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
679 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
680 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
681 ; GFX11-CU-NEXT: s_endpgm
683 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_load:
684 ; GFX12-WGP: ; %bb.0: ; %entry
685 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
686 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
687 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
688 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
689 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
690 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
691 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
692 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
693 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
694 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
695 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
696 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
697 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
698 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
699 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
700 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
701 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
702 ; GFX12-WGP-NEXT: s_endpgm
704 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_load:
705 ; GFX12-CU: ; %bb.0: ; %entry
706 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
707 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
708 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
709 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
710 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
711 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
712 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
713 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
714 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
715 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
716 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
717 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
718 ; GFX12-CU-NEXT: s_endpgm
721 %val = load atomic i32, ptr %in syncscope("workgroup") seq_cst, align 4
722 store i32 %val, ptr %out
; Test case: atomic i32 store of %in through %out with syncscope("workgroup")
; and `unordered` ordering (see the `store atomic` line that closes this block).
; The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; Expected shape for every check prefix: two scalar kernarg loads, one wait
; (s_waitcnt lgkmcnt(0), or s_wait_kmcnt 0x0 on the GFX12 prefixes), register
; moves, then a single flat store followed directly by s_endpgm, with no
; further waits or cache invalidates.
; Only the two GFX940 prefixes expect modifiers on the store (`sc0 sc1`); every
; other prefix, including both GFX12 ones, expects a bare flat store.
726 define amdgpu_kernel void @flat_workgroup_unordered_store(
727 ; GFX7-LABEL: flat_workgroup_unordered_store:
728 ; GFX7: ; %bb.0: ; %entry
729 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
730 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
731 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
732 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
733 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
734 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
735 ; GFX7-NEXT: flat_store_dword v[0:1], v2
736 ; GFX7-NEXT: s_endpgm
738 ; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
739 ; GFX10-WGP: ; %bb.0: ; %entry
740 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
741 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
742 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
743 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
744 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
745 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
746 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
747 ; GFX10-WGP-NEXT: s_endpgm
749 ; GFX10-CU-LABEL: flat_workgroup_unordered_store:
750 ; GFX10-CU: ; %bb.0: ; %entry
751 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
752 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
753 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
754 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
755 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
756 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
757 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
758 ; GFX10-CU-NEXT: s_endpgm
760 ; SKIP-CACHE-INV-LABEL: flat_workgroup_unordered_store:
761 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
762 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
763 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
764 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
765 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
766 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
767 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
768 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
769 ; SKIP-CACHE-INV-NEXT: s_endpgm
771 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
772 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
773 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
774 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
775 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
776 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
777 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
778 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
779 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
781 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
782 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
783 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
784 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
785 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
786 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
787 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
788 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
789 ; GFX90A-TGSPLIT-NEXT: s_endpgm
791 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
792 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
793 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
794 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
795 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
796 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
797 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
798 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
799 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
801 ; GFX940-TGSPLIT-LABEL: flat_workgroup_unordered_store:
802 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
803 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
804 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
805 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
806 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
807 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
808 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
809 ; GFX940-TGSPLIT-NEXT: s_endpgm
811 ; GFX11-WGP-LABEL: flat_workgroup_unordered_store:
812 ; GFX11-WGP: ; %bb.0: ; %entry
813 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
814 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
815 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
816 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
817 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
818 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
819 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
820 ; GFX11-WGP-NEXT: s_endpgm
822 ; GFX11-CU-LABEL: flat_workgroup_unordered_store:
823 ; GFX11-CU: ; %bb.0: ; %entry
824 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
825 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
826 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
827 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
828 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
829 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
830 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
831 ; GFX11-CU-NEXT: s_endpgm
833 ; GFX12-WGP-LABEL: flat_workgroup_unordered_store:
834 ; GFX12-WGP: ; %bb.0: ; %entry
835 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
836 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
837 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
838 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
839 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
840 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
841 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
842 ; GFX12-WGP-NEXT: s_endpgm
844 ; GFX12-CU-LABEL: flat_workgroup_unordered_store:
845 ; GFX12-CU: ; %bb.0: ; %entry
846 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
847 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
848 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
849 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
850 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
851 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
852 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
853 ; GFX12-CU-NEXT: s_endpgm
856 store atomic i32 %in, ptr %out syncscope("workgroup") unordered, align 4
; Test case: atomic i32 store of %in through %out with syncscope("workgroup")
; and `monotonic` ordering (see the `store atomic` line that closes this block).
; The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; Expected shape for every check prefix: two scalar kernarg loads, one wait
; (s_waitcnt lgkmcnt(0), or s_wait_kmcnt 0x0 on the GFX12 prefixes), register
; moves, then a single flat store followed directly by s_endpgm, with no
; further waits or cache invalidates.
; Store modifiers expected: `sc0 sc1` on both GFX940 prefixes and
; `scope:SCOPE_SE` on the GFX12 WGP-mode prefix; all remaining prefixes
; (including the GFX12 CU-mode one) expect a bare flat store.
860 define amdgpu_kernel void @flat_workgroup_monotonic_store(
861 ; GFX7-LABEL: flat_workgroup_monotonic_store:
862 ; GFX7: ; %bb.0: ; %entry
863 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
864 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
865 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
866 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
867 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
868 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
869 ; GFX7-NEXT: flat_store_dword v[0:1], v2
870 ; GFX7-NEXT: s_endpgm
872 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
873 ; GFX10-WGP: ; %bb.0: ; %entry
874 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
875 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
876 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
877 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
878 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
879 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
880 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
881 ; GFX10-WGP-NEXT: s_endpgm
883 ; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
884 ; GFX10-CU: ; %bb.0: ; %entry
885 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
886 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
887 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
888 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
889 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
890 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
891 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
892 ; GFX10-CU-NEXT: s_endpgm
894 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_store:
895 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
896 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
897 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
898 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
899 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
900 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
901 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
902 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
903 ; SKIP-CACHE-INV-NEXT: s_endpgm
905 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
906 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
907 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
908 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
909 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
910 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
911 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
912 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
913 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
915 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
916 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
917 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
918 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
919 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
920 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
921 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
922 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
923 ; GFX90A-TGSPLIT-NEXT: s_endpgm
925 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
926 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
927 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
928 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
929 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
930 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
931 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
932 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
933 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
935 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
936 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
937 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
938 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
939 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
940 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
941 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
942 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
943 ; GFX940-TGSPLIT-NEXT: s_endpgm
945 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_store:
946 ; GFX11-WGP: ; %bb.0: ; %entry
947 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
948 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
949 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
950 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
951 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
952 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
953 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
954 ; GFX11-WGP-NEXT: s_endpgm
956 ; GFX11-CU-LABEL: flat_workgroup_monotonic_store:
957 ; GFX11-CU: ; %bb.0: ; %entry
958 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
959 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
960 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
961 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
962 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
963 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
964 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
965 ; GFX11-CU-NEXT: s_endpgm
967 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_store:
968 ; GFX12-WGP: ; %bb.0: ; %entry
969 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
970 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
971 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
972 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
973 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
974 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
975 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
976 ; GFX12-WGP-NEXT: s_endpgm
978 ; GFX12-CU-LABEL: flat_workgroup_monotonic_store:
979 ; GFX12-CU: ; %bb.0: ; %entry
980 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
981 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
982 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
983 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
984 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
985 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
986 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
987 ; GFX12-CU-NEXT: s_endpgm
990 store atomic i32 %in, ptr %out syncscope("workgroup") monotonic, align 4
; Test case: atomic i32 store of %in through %out with syncscope("workgroup")
; and `release` ordering (see the `store atomic` line that closes this block).
; The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; Compared with a plain flat store, every check prefix here expects an extra
; wait placed immediately before the flat store. The expected wait differs by
; configuration:
;   - s_waitcnt lgkmcnt(0) only, for GFX7, SKIP-CACHE-INV, GFX10 CU mode,
;     GFX11 CU mode, and the non-tgsplit GFX90A / GFX940 runs;
;   - s_waitcnt vmcnt(0) lgkmcnt(0), for the tgsplit GFX90A / GFX940 runs;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) plus s_waitcnt_vscnt null, 0x0, for the
;     GFX10 and GFX11 WGP-mode runs;
;   - s_wait_bvhcnt / s_wait_samplecnt / s_wait_storecnt /
;     s_wait_loadcnt_dscnt (all 0x0), for the GFX12 WGP-mode run;
;   - s_wait_dscnt 0x0 only, for the GFX12 CU-mode run.
; Store modifiers expected: `sc0 sc1` on both GFX940 prefixes and
; `scope:SCOPE_SE` on the GFX12 WGP-mode prefix; none elsewhere.
994 define amdgpu_kernel void @flat_workgroup_release_store(
995 ; GFX7-LABEL: flat_workgroup_release_store:
996 ; GFX7: ; %bb.0: ; %entry
997 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
998 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
999 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1000 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1001 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1002 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1003 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1004 ; GFX7-NEXT: flat_store_dword v[0:1], v2
1005 ; GFX7-NEXT: s_endpgm
1007 ; GFX10-WGP-LABEL: flat_workgroup_release_store:
1008 ; GFX10-WGP: ; %bb.0: ; %entry
1009 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
1010 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1011 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1012 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1013 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1014 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1015 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1016 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1017 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
1018 ; GFX10-WGP-NEXT: s_endpgm
1020 ; GFX10-CU-LABEL: flat_workgroup_release_store:
1021 ; GFX10-CU: ; %bb.0: ; %entry
1022 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
1023 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1024 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1025 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1026 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1027 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1028 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1029 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
1030 ; GFX10-CU-NEXT: s_endpgm
1032 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_store:
1033 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1034 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
1035 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
1036 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1037 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1038 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1039 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1040 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1041 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
1042 ; SKIP-CACHE-INV-NEXT: s_endpgm
1044 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
1045 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1046 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
1047 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1048 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1049 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1050 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1051 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1052 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
1053 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1055 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
1056 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1057 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
1058 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1059 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1060 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1061 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1062 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1063 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
1064 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1066 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
1067 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1068 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
1069 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
1070 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1071 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1072 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1073 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1074 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
1075 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1077 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_store:
1078 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1079 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
1080 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
1081 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1082 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1083 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1084 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1085 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
1086 ; GFX940-TGSPLIT-NEXT: s_endpgm
1088 ; GFX11-WGP-LABEL: flat_workgroup_release_store:
1089 ; GFX11-WGP: ; %bb.0: ; %entry
1090 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
1091 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1092 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1093 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1094 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1095 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
1096 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1097 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1098 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
1099 ; GFX11-WGP-NEXT: s_endpgm
1101 ; GFX11-CU-LABEL: flat_workgroup_release_store:
1102 ; GFX11-CU: ; %bb.0: ; %entry
1103 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
1104 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1105 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1106 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1107 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1108 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1109 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1110 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
1111 ; GFX11-CU-NEXT: s_endpgm
1113 ; GFX12-WGP-LABEL: flat_workgroup_release_store:
1114 ; GFX12-WGP: ; %bb.0: ; %entry
1115 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
1116 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1117 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1118 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1119 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1120 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1121 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1122 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1123 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
1124 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
1125 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
1126 ; GFX12-WGP-NEXT: s_endpgm
1128 ; GFX12-CU-LABEL: flat_workgroup_release_store:
1129 ; GFX12-CU: ; %bb.0: ; %entry
1130 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
1131 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1132 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1133 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1134 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1135 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1136 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1137 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
1138 ; GFX12-CU-NEXT: s_endpgm
1139 i32 %in, ptr %out) {
1141 store atomic i32 %in, ptr %out syncscope("workgroup") release, align 4
; Test case: atomic i32 store of %in through %out with syncscope("workgroup")
; and `seq_cst` ordering (see the `store atomic` line that closes this block).
; The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; Every check prefix expects an extra wait placed immediately before the flat
; store and nothing between the store and s_endpgm. The expected wait differs
; by configuration:
;   - s_waitcnt lgkmcnt(0) only, for GFX7, SKIP-CACHE-INV, GFX10 CU mode,
;     GFX11 CU mode, and the non-tgsplit GFX90A / GFX940 runs;
;   - s_waitcnt vmcnt(0) lgkmcnt(0), for the tgsplit GFX90A / GFX940 runs;
;   - s_waitcnt vmcnt(0) lgkmcnt(0) plus s_waitcnt_vscnt null, 0x0, for the
;     GFX10 and GFX11 WGP-mode runs;
;   - s_wait_bvhcnt / s_wait_samplecnt / s_wait_storecnt /
;     s_wait_loadcnt_dscnt (all 0x0), for the GFX12 WGP-mode run;
;   - s_wait_dscnt 0x0 only, for the GFX12 CU-mode run.
; Store modifiers expected: `sc0 sc1` on both GFX940 prefixes and
; `scope:SCOPE_SE` on the GFX12 WGP-mode prefix; none elsewhere.
1145 define amdgpu_kernel void @flat_workgroup_seq_cst_store(
1146 ; GFX7-LABEL: flat_workgroup_seq_cst_store:
1147 ; GFX7: ; %bb.0: ; %entry
1148 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
1149 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
1150 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1151 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1152 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1153 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1154 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1155 ; GFX7-NEXT: flat_store_dword v[0:1], v2
1156 ; GFX7-NEXT: s_endpgm
1158 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
1159 ; GFX10-WGP: ; %bb.0: ; %entry
1160 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
1161 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1162 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1163 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1164 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1165 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1166 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1167 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1168 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
1169 ; GFX10-WGP-NEXT: s_endpgm
1171 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
1172 ; GFX10-CU: ; %bb.0: ; %entry
1173 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
1174 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1175 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1176 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1177 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1178 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1179 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1180 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
1181 ; GFX10-CU-NEXT: s_endpgm
1183 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_store:
1184 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1185 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
1186 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
1187 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1188 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1189 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1190 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1191 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1192 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
1193 ; SKIP-CACHE-INV-NEXT: s_endpgm
1195 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1196 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1197 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
1198 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1199 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1200 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1201 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1202 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1203 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
1204 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1206 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1207 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1208 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
1209 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
1210 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1211 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1212 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1213 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1214 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
1215 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1217 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1218 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1219 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
1220 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
1221 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1222 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1223 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1224 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1225 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
1226 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1228 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
1229 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1230 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
1231 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
1232 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1233 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1234 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1235 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1236 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
1237 ; GFX940-TGSPLIT-NEXT: s_endpgm
1239 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_store:
1240 ; GFX11-WGP: ; %bb.0: ; %entry
1241 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
1242 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1243 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1244 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1245 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1246 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
1247 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1248 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1249 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
1250 ; GFX11-WGP-NEXT: s_endpgm
1252 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_store:
1253 ; GFX11-CU: ; %bb.0: ; %entry
1254 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
1255 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1256 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1257 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1258 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1259 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1260 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1261 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
1262 ; GFX11-CU-NEXT: s_endpgm
1264 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_store:
1265 ; GFX12-WGP: ; %bb.0: ; %entry
1266 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
1267 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1268 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1269 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1270 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1271 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1272 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1273 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1274 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
1275 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
1276 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
1277 ; GFX12-WGP-NEXT: s_endpgm
1279 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_store:
1280 ; GFX12-CU: ; %bb.0: ; %entry
1281 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
1282 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
1283 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1284 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1285 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1286 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1287 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1288 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
1289 ; GFX12-CU-NEXT: s_endpgm
1290 i32 %in, ptr %out) {
1292 store atomic i32 %in, ptr %out syncscope("workgroup") seq_cst, align 4
; Test case: `atomicrmw volatile xchg` of %in into the location %out with
; syncscope("workgroup") and `monotonic` ordering (see the atomicrmw line that
; closes this block).
; The per-target assertions below are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; Expected shape for every check prefix: two scalar kernarg loads, one wait
; (s_waitcnt lgkmcnt(0), or s_wait_kmcnt 0x0 on the GFX12 prefixes), register
; moves, then a single flat atomic swap (flat_atomic_swap, spelled
; flat_atomic_swap_b32 on the GFX11 / GFX12 prefixes) taking only the address
; pair and the data register, followed directly by s_endpgm with no further
; waits or cache invalidates.
; The only modifier expected on the atomic is `scope:SCOPE_SE` on the GFX12
; WGP-mode prefix; every other prefix expects none.
1296 define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
1297 ; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
1298 ; GFX7: ; %bb.0: ; %entry
1299 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1300 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
1301 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1302 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1303 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1304 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1305 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
1306 ; GFX7-NEXT: s_endpgm
1308 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
1309 ; GFX10-WGP: ; %bb.0: ; %entry
1310 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1311 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
1312 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1313 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1314 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1315 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1316 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
1317 ; GFX10-WGP-NEXT: s_endpgm
1319 ; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
1320 ; GFX10-CU: ; %bb.0: ; %entry
1321 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1322 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
1323 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1324 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1325 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1326 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1327 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
1328 ; GFX10-CU-NEXT: s_endpgm
1330 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_atomicrmw:
1331 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1332 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1333 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
1334 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1335 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1336 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1337 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1338 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
1339 ; SKIP-CACHE-INV-NEXT: s_endpgm
1341 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1342 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1343 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1344 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1345 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1346 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1347 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1348 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1349 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1351 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1352 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1353 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1354 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1355 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1356 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1357 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1358 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1359 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1361 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1362 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1363 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1364 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1365 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1366 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1367 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1368 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1369 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1371 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
1372 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1373 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1374 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1375 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1376 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1377 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1378 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1379 ; GFX940-TGSPLIT-NEXT: s_endpgm
1381 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
1382 ; GFX11-WGP: ; %bb.0: ; %entry
1383 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1384 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1385 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1386 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1387 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1388 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
1389 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
1390 ; GFX11-WGP-NEXT: s_endpgm
1392 ; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
1393 ; GFX11-CU: ; %bb.0: ; %entry
1394 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1395 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1396 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1397 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1398 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1399 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1400 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1401 ; GFX11-CU-NEXT: s_endpgm
1403 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
1404 ; GFX12-WGP: ; %bb.0: ; %entry
1405 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1406 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1407 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1408 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1409 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1410 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1411 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1412 ; GFX12-WGP-NEXT: s_endpgm
1414 ; GFX12-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
1415 ; GFX12-CU: ; %bb.0: ; %entry
1416 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1417 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1418 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1419 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1420 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1421 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1422 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1423 ; GFX12-CU-NEXT: s_endpgm
1424 ptr %out, i32 %in) {
1426 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") monotonic
; Kernel under test: a volatile atomicrmw xchg through a flat pointer at
; syncscope("workgroup") with acquire ordering.
; The expected ISA below uses the non-returning form of the swap (no
; destination register, no glc/sc0 bit) and always places a wait after it.
; The WGP-mode runs (gfx1010/gfx1100/gfx1200 without +cumode) and the +tgsplit
; runs (gfx90a/gfx940) additionally expect a cache invalidate after that wait;
; the remaining runs expect only the wait before s_endpgm.
1430 define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
1431 ; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
1432 ; GFX7: ; %bb.0: ; %entry
1433 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1434 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
1435 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1436 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1437 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1438 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1439 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
1440 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1441 ; GFX7-NEXT: s_endpgm
1443 ; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
1444 ; GFX10-WGP: ; %bb.0: ; %entry
1445 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1446 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
1447 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1448 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1449 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1450 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1451 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
1452 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1453 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1454 ; GFX10-WGP-NEXT: buffer_gl0_inv
1455 ; GFX10-WGP-NEXT: s_endpgm
1457 ; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
1458 ; GFX10-CU: ; %bb.0: ; %entry
1459 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1460 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
1461 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1462 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1463 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1464 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1465 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
1466 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1467 ; GFX10-CU-NEXT: s_endpgm
1469 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_atomicrmw:
1470 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1471 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1472 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
1473 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1474 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1475 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1476 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1477 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
1478 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1479 ; SKIP-CACHE-INV-NEXT: s_endpgm
1481 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1482 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1483 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1484 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1485 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1486 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1487 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1488 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1489 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1490 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1492 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1493 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1494 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1495 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1496 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1497 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1498 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1499 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1500 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1501 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
1502 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1504 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1505 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1506 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1507 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1508 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1509 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1510 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1511 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1512 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1513 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1515 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
1516 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1517 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1518 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1519 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1520 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1521 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1522 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1523 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1524 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
1525 ; GFX940-TGSPLIT-NEXT: s_endpgm
1527 ; GFX11-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
1528 ; GFX11-WGP: ; %bb.0: ; %entry
1529 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1530 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1531 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1532 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1533 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1534 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
1535 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
1536 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1537 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1538 ; GFX11-WGP-NEXT: buffer_gl0_inv
1539 ; GFX11-WGP-NEXT: s_endpgm
1541 ; GFX11-CU-LABEL: flat_workgroup_acquire_atomicrmw:
1542 ; GFX11-CU: ; %bb.0: ; %entry
1543 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1544 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1545 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1546 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1547 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1548 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1549 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1550 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1551 ; GFX11-CU-NEXT: s_endpgm
1553 ; GFX12-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
1554 ; GFX12-WGP: ; %bb.0: ; %entry
1555 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1556 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1557 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1558 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1559 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1560 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1561 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1562 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
1563 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
1564 ; GFX12-WGP-NEXT: s_endpgm
1566 ; GFX12-CU-LABEL: flat_workgroup_acquire_atomicrmw:
1567 ; GFX12-CU: ; %bb.0: ; %entry
1568 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1569 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1570 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1571 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1572 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1573 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1574 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1575 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1576 ; GFX12-CU-NEXT: s_endpgm
1577 ptr %out, i32 %in) {
; The single operation whose lowering the assertions above pin down.
1579 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
; Kernel under test: a volatile atomicrmw xchg through a flat pointer at
; syncscope("workgroup") with release ordering.
; The expected ISA below places all ordering waits before the non-returning
; swap; nothing but s_endpgm follows the swap and no cache invalidate is
; expected for any run.
; The gfx1010/gfx1100 WGP-mode runs expect vmcnt, lgkmcnt and vscnt waits, the
; +tgsplit runs expect a combined vmcnt/lgkmcnt wait, and the gfx1200 WGP-mode
; run expects bvhcnt, samplecnt, storecnt and loadcnt_dscnt waits. The
; remaining runs expect a single lgkmcnt (or dscnt on gfx1200) wait.
1583 define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
1584 ; GFX7-LABEL: flat_workgroup_release_atomicrmw:
1585 ; GFX7: ; %bb.0: ; %entry
1586 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1587 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
1588 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1589 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1590 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1591 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1592 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1593 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
1594 ; GFX7-NEXT: s_endpgm
1596 ; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
1597 ; GFX10-WGP: ; %bb.0: ; %entry
1598 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1599 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
1600 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1601 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1602 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1603 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1604 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1605 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1606 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
1607 ; GFX10-WGP-NEXT: s_endpgm
1609 ; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
1610 ; GFX10-CU: ; %bb.0: ; %entry
1611 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1612 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
1613 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1614 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1615 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1616 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1617 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1618 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
1619 ; GFX10-CU-NEXT: s_endpgm
1621 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_atomicrmw:
1622 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1623 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1624 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
1625 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1626 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1627 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1628 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1629 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1630 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
1631 ; SKIP-CACHE-INV-NEXT: s_endpgm
1633 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1634 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1635 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1636 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1637 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1638 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1639 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1640 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1641 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1642 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1644 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1645 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1646 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1647 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1648 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1649 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1650 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1651 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1652 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1653 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1655 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1656 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1657 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1658 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1659 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1660 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1661 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1662 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1663 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1664 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1666 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
1667 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1668 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1669 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1670 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1671 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1672 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1673 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1674 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1675 ; GFX940-TGSPLIT-NEXT: s_endpgm
1677 ; GFX11-WGP-LABEL: flat_workgroup_release_atomicrmw:
1678 ; GFX11-WGP: ; %bb.0: ; %entry
1679 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1680 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1681 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1682 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1683 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1684 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
1685 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1686 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1687 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
1688 ; GFX11-WGP-NEXT: s_endpgm
1690 ; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw:
1691 ; GFX11-CU: ; %bb.0: ; %entry
1692 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1693 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1694 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1695 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1696 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1697 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1698 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1699 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1700 ; GFX11-CU-NEXT: s_endpgm
1702 ; GFX12-WGP-LABEL: flat_workgroup_release_atomicrmw:
1703 ; GFX12-WGP: ; %bb.0: ; %entry
1704 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1705 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1706 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1707 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1708 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1709 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1710 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1711 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1712 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
1713 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
1714 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1715 ; GFX12-WGP-NEXT: s_endpgm
1717 ; GFX12-CU-LABEL: flat_workgroup_release_atomicrmw:
1718 ; GFX12-CU: ; %bb.0: ; %entry
1719 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1720 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1721 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1722 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1723 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1724 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1725 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1726 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1727 ; GFX12-CU-NEXT: s_endpgm
1728 ptr %out, i32 %in) {
; The single operation whose lowering the assertions above pin down.
1730 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") release
; Kernel under test: a volatile atomicrmw xchg through a flat pointer at
; syncscope("workgroup") with acq_rel ordering.
; The expected ISA below brackets the non-returning swap with waits on both
; sides: one group before the swap and one after it.
; The WGP-mode runs (gfx1010/gfx1100/gfx1200 without +cumode) and the +tgsplit
; runs (gfx90a/gfx940) additionally expect a cache invalidate after the
; trailing wait; the remaining runs expect only the two waits.
1734 define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
1735 ; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
1736 ; GFX7: ; %bb.0: ; %entry
1737 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1738 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
1739 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1740 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1741 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1742 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1743 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1744 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
1745 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1746 ; GFX7-NEXT: s_endpgm
1748 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
1749 ; GFX10-WGP: ; %bb.0: ; %entry
1750 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1751 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
1752 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1753 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1754 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1755 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1756 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1757 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1758 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
1759 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1760 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1761 ; GFX10-WGP-NEXT: buffer_gl0_inv
1762 ; GFX10-WGP-NEXT: s_endpgm
1764 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
1765 ; GFX10-CU: ; %bb.0: ; %entry
1766 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1767 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
1768 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1769 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1770 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1771 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1772 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1773 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
1774 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1775 ; GFX10-CU-NEXT: s_endpgm
1777 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_atomicrmw:
1778 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1779 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1780 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
1781 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1782 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1783 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1784 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1785 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1786 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
1787 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1788 ; SKIP-CACHE-INV-NEXT: s_endpgm
1790 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1791 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1792 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1793 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1794 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1795 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1796 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1797 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1798 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1799 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1800 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1802 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1803 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1804 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1805 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1806 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1807 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1808 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1809 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1810 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1811 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1812 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
1813 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1815 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1816 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1817 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1818 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1819 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1820 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1821 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1822 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1823 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1824 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1825 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1827 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
1828 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1829 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1830 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1831 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1832 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1833 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1834 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1835 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1836 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1837 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
1838 ; GFX940-TGSPLIT-NEXT: s_endpgm
1840 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
1841 ; GFX11-WGP: ; %bb.0: ; %entry
1842 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1843 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1844 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1845 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
1846 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
1847 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
1848 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1849 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1850 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
1851 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
1852 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1853 ; GFX11-WGP-NEXT: buffer_gl0_inv
1854 ; GFX11-WGP-NEXT: s_endpgm
1856 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
1857 ; GFX11-CU: ; %bb.0: ; %entry
1858 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1859 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1860 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1861 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
1862 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
1863 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
1864 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1865 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1866 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
1867 ; GFX11-CU-NEXT: s_endpgm
1869 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
1870 ; GFX12-WGP: ; %bb.0: ; %entry
1871 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1872 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
1873 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
1874 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
1875 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
1876 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
1877 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
1878 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
1879 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
1880 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
1881 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
1882 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
1883 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
1884 ; GFX12-WGP-NEXT: s_endpgm
1886 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
1887 ; GFX12-CU: ; %bb.0: ; %entry
1888 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
1889 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
1890 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
1891 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
1892 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
1893 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
1894 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1895 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
1896 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
1897 ; GFX12-CU-NEXT: s_endpgm
1898 ptr %out, i32 %in) {
; The single operation whose lowering the assertions above pin down.
1900 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
; Kernel under test: a volatile atomicrmw xchg through a flat pointer at
; syncscope("workgroup") with seq_cst ordering.
; The expected ISA below brackets the non-returning swap with waits on both
; sides: one group before the swap and one after it.
; The WGP-mode runs (gfx1010/gfx1100/gfx1200 without +cumode) and the +tgsplit
; runs (gfx90a/gfx940) additionally expect a cache invalidate after the
; trailing wait; the remaining runs expect only the two waits.
1904 define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
1905 ; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
1906 ; GFX7: ; %bb.0: ; %entry
1907 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1908 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
1909 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1910 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
1911 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
1912 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
1913 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1914 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
1915 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
1916 ; GFX7-NEXT: s_endpgm
1918 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
1919 ; GFX10-WGP: ; %bb.0: ; %entry
1920 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1921 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
1922 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1923 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
1924 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
1925 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
1926 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1927 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1928 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
1929 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
1930 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
1931 ; GFX10-WGP-NEXT: buffer_gl0_inv
1932 ; GFX10-WGP-NEXT: s_endpgm
1934 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
1935 ; GFX10-CU: ; %bb.0: ; %entry
1936 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1937 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
1938 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1939 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
1940 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
1941 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
1942 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1943 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
1944 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
1945 ; GFX10-CU-NEXT: s_endpgm
1947 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_atomicrmw:
1948 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
1949 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1950 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
1951 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1952 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
1953 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
1954 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
1955 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1956 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
1957 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
1958 ; SKIP-CACHE-INV-NEXT: s_endpgm
1960 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1961 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
1962 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1963 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1964 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1965 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1966 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1967 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1968 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1969 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1970 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
1972 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1973 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
1974 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
1975 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
1976 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1977 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
1978 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
1979 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1980 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1981 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
1982 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
1983 ; GFX90A-TGSPLIT-NEXT: s_endpgm
1985 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1986 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
1987 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
1988 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
1989 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1990 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1991 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
1992 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1993 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
1994 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
1995 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
1997 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
1998 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
1999 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
2000 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
2001 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2002 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
2003 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
2004 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2005 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
2006 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2007 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
2008 ; GFX940-TGSPLIT-NEXT: s_endpgm
2010 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
2011 ; GFX11-WGP: ; %bb.0: ; %entry
2012 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
2013 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
2014 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
2015 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
2016 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
2017 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
2018 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2019 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2020 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
2021 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
2022 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2023 ; GFX11-WGP-NEXT: buffer_gl0_inv
2024 ; GFX11-WGP-NEXT: s_endpgm
2026 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
2027 ; GFX11-CU: ; %bb.0: ; %entry
2028 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
2029 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
2030 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2031 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
2032 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
2033 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
2034 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2035 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
2036 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2037 ; GFX11-CU-NEXT: s_endpgm
2039 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
2040 ; GFX12-WGP: ; %bb.0: ; %entry
2041 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
2042 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
2043 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
2044 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
2045 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
2046 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
2047 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
2048 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
2049 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
2050 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
2051 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
2052 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
2053 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
2054 ; GFX12-WGP-NEXT: s_endpgm
2056 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
2057 ; GFX12-CU: ; %bb.0: ; %entry
2058 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
2059 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
2060 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
2061 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
2062 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
2063 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
2064 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2065 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
2066 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2067 ; GFX12-CU-NEXT: s_endpgm
2068 ptr %out, i32 %in) {
; The single operation whose lowering the assertions above pin down.
2070 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
2074 define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
2075 ; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2076 ; GFX7: ; %bb.0: ; %entry
2077 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2078 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
2079 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2080 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2081 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2082 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
2083 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2084 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2085 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2086 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2087 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2088 ; GFX7-NEXT: flat_store_dword v[0:1], v2
2089 ; GFX7-NEXT: s_endpgm
2091 ; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2092 ; GFX10-WGP: ; %bb.0: ; %entry
2093 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2094 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
2095 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
2096 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2097 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2098 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
2099 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2100 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2101 ; GFX10-WGP-NEXT: buffer_gl0_inv
2102 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2103 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2104 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
2105 ; GFX10-WGP-NEXT: s_endpgm
2107 ; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2108 ; GFX10-CU: ; %bb.0: ; %entry
2109 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2110 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
2111 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2112 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2113 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2114 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
2115 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2116 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2117 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2118 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2119 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
2120 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
2121 ; GFX10-CU-NEXT: s_endpgm
2123 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2124 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
2125 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2126 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
2127 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2128 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2129 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2130 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
2131 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2132 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2133 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2134 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2135 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
2136 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
2137 ; SKIP-CACHE-INV-NEXT: s_endpgm
2139 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2140 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
2141 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2142 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
2143 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2144 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2145 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
2146 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2147 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2148 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2149 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
2150 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
2151 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
2153 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2154 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
2155 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2156 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
2157 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2158 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2159 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
2160 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2161 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2162 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
2163 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2164 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
2165 ; GFX90A-TGSPLIT-NEXT: s_endpgm
2167 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2168 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
2169 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2170 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
2171 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2172 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2173 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
2174 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
2175 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2176 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2177 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
2178 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
2179 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
2181 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2182 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
2183 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2184 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
2185 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2186 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2187 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
2188 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
2189 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2190 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
2191 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2192 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
2193 ; GFX940-TGSPLIT-NEXT: s_endpgm
2195 ; GFX11-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2196 ; GFX11-WGP: ; %bb.0: ; %entry
2197 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2198 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
2199 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
2200 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2201 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2202 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
2203 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
2204 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2205 ; GFX11-WGP-NEXT: buffer_gl0_inv
2206 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2207 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2208 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
2209 ; GFX11-WGP-NEXT: s_endpgm
2211 ; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2212 ; GFX11-CU: ; %bb.0: ; %entry
2213 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2214 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
2215 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2216 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2217 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2218 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
2219 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
2220 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2221 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2222 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2223 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
2224 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
2225 ; GFX11-CU-NEXT: s_endpgm
2227 ; GFX12-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2228 ; GFX12-WGP: ; %bb.0: ; %entry
2229 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2230 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
2231 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
2232 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2233 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2234 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
2235 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2236 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
2237 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
2238 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2239 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2240 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
2241 ; GFX12-WGP-NEXT: s_endpgm
2243 ; GFX12-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
2244 ; GFX12-CU: ; %bb.0: ; %entry
2245 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2246 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
2247 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
2248 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2249 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2250 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
2251 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2252 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2253 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2254 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2255 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
2256 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
2257 ; GFX12-CU-NEXT: s_endpgm
2258 ptr %out, i32 %in) {
2260 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acquire
2261 store i32 %val, ptr %out, align 4
; Returning atomic exchange through a flat (default address space) pointer with
; syncscope("workgroup") and acq_rel ordering. The value returned by the
; exchange is stored back through %out.
;
; The check lines inside this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; As written, every configuration expects a wait both before and after the
; swap. An explicit cache invalidate after the swap (buffer_gl0_inv,
; buffer_wbinvl1_vol, buffer_inv sc0 or global_inv) is expected only for the
; gfx1010/gfx1100/gfx1200 runs without +cumode and for the gfx90a/gfx940 runs
; with +tgsplit; the other runs expect waits only.
2265 define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
2266 ; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2267 ; GFX7: ; %bb.0: ; %entry
2268 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2269 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
2270 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2271 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2272 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2273 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
2274 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2275 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2276 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2277 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2278 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2279 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2280 ; GFX7-NEXT: flat_store_dword v[0:1], v2
2281 ; GFX7-NEXT: s_endpgm
2283 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2284 ; GFX10-WGP: ; %bb.0: ; %entry
2285 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2286 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
2287 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
2288 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2289 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2290 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
2291 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2292 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2293 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2294 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2295 ; GFX10-WGP-NEXT: buffer_gl0_inv
2296 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2297 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2298 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
2299 ; GFX10-WGP-NEXT: s_endpgm
2301 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2302 ; GFX10-CU: ; %bb.0: ; %entry
2303 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2304 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
2305 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2306 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2307 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2308 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
2309 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2310 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2311 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2312 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2313 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2314 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
2315 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
2316 ; GFX10-CU-NEXT: s_endpgm
2318 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2319 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
2320 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2321 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
2322 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2323 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2324 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2325 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
2326 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2327 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2328 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2329 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2330 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2331 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
2332 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
2333 ; SKIP-CACHE-INV-NEXT: s_endpgm
2335 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2336 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
2337 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2338 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
2339 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2340 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2341 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
2342 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2343 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2344 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2345 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2346 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
2347 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
2348 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
2350 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2351 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
2352 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2353 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
2354 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2355 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2356 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
2357 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2358 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2359 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2360 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
2361 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2362 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
2363 ; GFX90A-TGSPLIT-NEXT: s_endpgm
2365 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2366 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
2367 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2368 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
2369 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2370 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2371 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
2372 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2373 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
2374 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2375 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2376 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
2377 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
2378 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
2380 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2381 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
2382 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2383 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
2384 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2385 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2386 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
2387 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2388 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
2389 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2390 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
2391 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2392 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
2393 ; GFX940-TGSPLIT-NEXT: s_endpgm
2395 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2396 ; GFX11-WGP: ; %bb.0: ; %entry
2397 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2398 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
2399 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
2400 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2401 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2402 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
2403 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2404 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2405 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
2406 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2407 ; GFX11-WGP-NEXT: buffer_gl0_inv
2408 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2409 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2410 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
2411 ; GFX11-WGP-NEXT: s_endpgm
2413 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2414 ; GFX11-CU: ; %bb.0: ; %entry
2415 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2416 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
2417 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2418 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2419 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2420 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
2421 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2422 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
2423 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2424 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2425 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2426 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
2427 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
2428 ; GFX11-CU-NEXT: s_endpgm
2430 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2431 ; GFX12-WGP: ; %bb.0: ; %entry
2432 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2433 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
2434 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
2435 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2436 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2437 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
2438 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
2439 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
2440 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
2441 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
2442 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2443 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
2444 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
2445 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
2446 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
2447 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2448 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2449 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
2450 ; GFX12-WGP-NEXT: s_endpgm
2452 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
2453 ; GFX12-CU: ; %bb.0: ; %entry
2454 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2455 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
2456 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
2457 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2458 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2459 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
2460 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2461 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2462 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2463 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2464 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2465 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
2466 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
2467 ; GFX12-CU-NEXT: s_endpgm
2468 ptr %out, i32 %in) {
; The exchange result is consumed by the store below; the expected swap
; instructions above carry a returning modifier (glc, sc0 or
; th:TH_ATOMIC_RETURN) and the expected store writes v2, the swap destination.
2470 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") acq_rel
2471 store i32 %val, ptr %out, align 4
; Returning atomic exchange through a flat (default address space) pointer with
; syncscope("workgroup") and seq_cst ordering. The value returned by the
; exchange is stored back through %out.
;
; The check lines inside this function are autogenerated by
; utils/update_llc_test_checks.py; regenerate them instead of editing by hand.
; As written, every configuration expects a wait both before and after the
; swap. An explicit cache invalidate after the swap (buffer_gl0_inv,
; buffer_wbinvl1_vol, buffer_inv sc0 or global_inv) is expected only for the
; gfx1010/gfx1100/gfx1200 runs without +cumode and for the gfx90a/gfx940 runs
; with +tgsplit; the other runs expect waits only.
2475 define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
2476 ; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2477 ; GFX7: ; %bb.0: ; %entry
2478 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2479 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
2480 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2481 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2482 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2483 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
2484 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2485 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2486 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2487 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2488 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2489 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2490 ; GFX7-NEXT: flat_store_dword v[0:1], v2
2491 ; GFX7-NEXT: s_endpgm
2493 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2494 ; GFX10-WGP: ; %bb.0: ; %entry
2495 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2496 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
2497 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
2498 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2499 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2500 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
2501 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2502 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2503 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2504 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2505 ; GFX10-WGP-NEXT: buffer_gl0_inv
2506 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2507 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2508 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
2509 ; GFX10-WGP-NEXT: s_endpgm
2511 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2512 ; GFX10-CU: ; %bb.0: ; %entry
2513 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2514 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
2515 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2516 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2517 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2518 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
2519 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2520 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2521 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2522 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2523 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2524 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
2525 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
2526 ; GFX10-CU-NEXT: s_endpgm
2528 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2529 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
2530 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2531 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
2532 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2533 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2534 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2535 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
2536 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2537 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2538 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2539 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2540 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2541 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
2542 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
2543 ; SKIP-CACHE-INV-NEXT: s_endpgm
2545 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2546 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
2547 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2548 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
2549 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2550 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2551 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
2552 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2553 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2554 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2555 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2556 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
2558 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
2560 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2561 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
2562 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2563 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
2564 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2565 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2566 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
2567 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2568 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
2569 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2570 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
2571 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2572 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
2573 ; GFX90A-TGSPLIT-NEXT: s_endpgm
2575 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2576 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
2577 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2578 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
2579 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2580 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2581 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
2582 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2583 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
2584 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2585 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2586 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
2587 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
2588 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
2590 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2591 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
2592 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2593 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
2594 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2595 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2596 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
2597 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2598 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
2599 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
2600 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
2601 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2602 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
2603 ; GFX940-TGSPLIT-NEXT: s_endpgm
2605 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2606 ; GFX11-WGP: ; %bb.0: ; %entry
2607 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2608 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
2609 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
2610 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2611 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2612 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
2613 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2614 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2615 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
2616 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2617 ; GFX11-WGP-NEXT: buffer_gl0_inv
2618 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2619 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2620 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
2621 ; GFX11-WGP-NEXT: s_endpgm
2623 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2624 ; GFX11-CU: ; %bb.0: ; %entry
2625 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2626 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
2627 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2628 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2629 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2630 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
2631 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2632 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
2633 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2634 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2635 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2636 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
2637 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
2638 ; GFX11-CU-NEXT: s_endpgm
2640 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2641 ; GFX12-WGP: ; %bb.0: ; %entry
2642 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2643 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
2644 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
2645 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2646 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2647 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
2648 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
2649 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
2650 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
2651 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
2652 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
2653 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
2654 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
2655 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
2656 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
2657 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2658 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2659 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
2660 ; GFX12-WGP-NEXT: s_endpgm
2662 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
2663 ; GFX12-CU: ; %bb.0: ; %entry
2664 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2665 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
2666 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
2667 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2668 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2669 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
2670 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2671 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
2672 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
2673 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2674 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2675 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
2676 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
2677 ; GFX12-CU-NEXT: s_endpgm
2678 ptr %out, i32 %in) {
; The exchange result is consumed by the store below; the expected swap
; instructions above carry a returning modifier (glc, sc0 or
; th:TH_ATOMIC_RETURN) and the expected store writes v2, the swap destination.
2680 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup") seq_cst
2681 store i32 %val, ptr %out, align 4
2685 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
2686 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2687 ; GFX7: ; %bb.0: ; %entry
2688 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
2689 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
2690 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
2691 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
2692 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
2693 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2694 ; GFX7-NEXT: s_mov_b32 s4, s8
2695 ; GFX7-NEXT: s_mov_b32 s5, s9
2696 ; GFX7-NEXT: s_mov_b32 s9, s10
2697 ; GFX7-NEXT: s_mov_b32 s8, s11
2698 ; GFX7-NEXT: s_add_u32 s4, s4, s9
2699 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
2700 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2701 ; GFX7-NEXT: s_mov_b32 s5, s8
2702 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
2703 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
2704 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2705 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
2706 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2707 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2708 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2709 ; GFX7-NEXT: s_endpgm
2711 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2712 ; GFX10-WGP: ; %bb.0: ; %entry
2713 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
2714 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
2715 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
2716 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
2717 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
2718 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
2719 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
2720 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
2721 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
2722 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
2723 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
2724 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
2725 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2726 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
2727 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
2728 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
2729 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2730 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
2731 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2732 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2733 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2734 ; GFX10-WGP-NEXT: s_endpgm
2736 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2737 ; GFX10-CU: ; %bb.0: ; %entry
2738 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
2739 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
2740 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
2741 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
2742 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
2743 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2744 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
2745 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
2746 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
2747 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
2748 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
2749 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
2750 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2751 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
2752 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
2753 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
2754 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2755 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
2756 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2757 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2758 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2759 ; GFX10-CU-NEXT: s_endpgm
2761 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2762 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
2763 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
2764 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
2765 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
2766 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
2767 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
2768 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2769 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
2770 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
2771 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
2772 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
2773 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
2774 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
2775 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
2776 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
2777 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
2778 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
2779 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2780 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
2781 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
2782 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
2783 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2784 ; SKIP-CACHE-INV-NEXT: s_endpgm
2786 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2787 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
2788 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2789 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
2790 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
2791 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2792 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
2793 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
2794 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2795 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
2796 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2797 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2798 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
2800 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2801 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
2802 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
2803 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
2804 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
2805 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2806 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
2807 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
2808 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2809 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
2810 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
2811 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2812 ; GFX90A-TGSPLIT-NEXT: s_endpgm
2814 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2815 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
2816 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2817 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
2818 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
2819 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2820 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
2821 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
2822 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2823 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
2824 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2825 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2826 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
2828 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2829 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
2830 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
2831 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
2832 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
2833 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
2834 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
2835 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
2836 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2837 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
2838 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2839 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
2840 ; GFX940-TGSPLIT-NEXT: s_endpgm
2842 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2843 ; GFX11-WGP: ; %bb.0: ; %entry
2844 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2845 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
2846 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
2847 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
2848 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
2849 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
2850 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2851 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
2852 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
2853 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
2854 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2855 ; GFX11-WGP-NEXT: s_endpgm
2857 ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2858 ; GFX11-CU: ; %bb.0: ; %entry
2859 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2860 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
2861 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
2862 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
2863 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
2864 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
2865 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2866 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
2867 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
2868 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
2869 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2870 ; GFX11-CU-NEXT: s_endpgm
2872 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2873 ; GFX12-WGP: ; %bb.0: ; %entry
2874 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2875 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
2876 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
2877 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
2878 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
2879 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
2880 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2881 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
2882 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
2883 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
2884 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
2885 ; GFX12-WGP-NEXT: s_endpgm
2887 ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
2888 ; GFX12-CU: ; %bb.0: ; %entry
2889 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
2890 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
2891 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
2892 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
2893 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
2894 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
2895 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2896 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
2897 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
2898 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
2899 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
2900 ; GFX12-CU-NEXT: s_endpgm
2901 ptr %out, i32 %in, i32 %old) {
2903 %gep = getelementptr i32, ptr %out, i32 4
2904 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
; Workgroup-scope volatile cmpxchg through a flat pointer, with acquire ordering
; on success and monotonic ordering on failure. The exchange targets
; %out + 4 i32 elements (byte offset 16); the checks below show that offset
; either as an explicit s_add_u32/s_addc_u32 of 16 or as an "offset:16"
; immediate on the atomic instruction.
; In every run configuration the atomic is followed by a wait instruction. On
; the GFX10-WGP, GFX11-WGP, GFX12-WGP, GFX90A-TGSPLIT and GFX940-TGSPLIT runs
; the wait is additionally followed by an invalidate instruction
; (buffer_gl0_inv, global_inv, buffer_wbinvl1_vol, buffer_inv sc0).
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
2908 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
2909 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2910 ; GFX7: ; %bb.0: ; %entry
2911 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
2912 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
2913 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
2914 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
2915 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
2916 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2917 ; GFX7-NEXT: s_mov_b32 s4, s8
2918 ; GFX7-NEXT: s_mov_b32 s5, s9
2919 ; GFX7-NEXT: s_mov_b32 s9, s10
2920 ; GFX7-NEXT: s_mov_b32 s8, s11
2921 ; GFX7-NEXT: s_add_u32 s4, s4, s9
2922 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
2923 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2924 ; GFX7-NEXT: s_mov_b32 s5, s8
2925 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
2926 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
2927 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2928 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
2929 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
2930 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
2931 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2932 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
2933 ; GFX7-NEXT: s_endpgm
2935 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2936 ; GFX10-WGP: ; %bb.0: ; %entry
2937 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
2938 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
2939 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
2940 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
2941 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
2942 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
2943 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
2944 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
2945 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
2946 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
2947 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
2948 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
2949 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2950 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
2951 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
2952 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
2953 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2954 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
2955 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
2956 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
2957 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2958 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
2959 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
2960 ; GFX10-WGP-NEXT: buffer_gl0_inv
2961 ; GFX10-WGP-NEXT: s_endpgm
2963 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2964 ; GFX10-CU: ; %bb.0: ; %entry
2965 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
2966 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
2967 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
2968 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
2969 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
2970 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2971 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
2972 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
2973 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
2974 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
2975 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
2976 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
2977 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
2978 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
2979 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
2980 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
2981 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
2982 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
2983 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
2984 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
2985 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
2986 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
2987 ; GFX10-CU-NEXT: s_endpgm
2989 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
2990 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
2991 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
2992 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
2993 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
2994 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
2995 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
2996 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
2997 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
2998 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
2999 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
3000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
3001 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
3002 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
3003 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3004 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
3005 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
3006 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
3007 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3008 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
3009 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
3010 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
3011 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3012 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3013 ; SKIP-CACHE-INV-NEXT: s_endpgm
3015 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3016 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
3017 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3018 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3019 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3020 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3021 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3022 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3023 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3024 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3025 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3026 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3027 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3028 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
3030 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3031 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
3032 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3033 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3034 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3035 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3036 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3037 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3038 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3039 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3040 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3041 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3042 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
3043 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
3044 ; GFX90A-TGSPLIT-NEXT: s_endpgm
3046 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3047 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
3048 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3049 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3050 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3051 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3052 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3053 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3054 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3055 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3056 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3057 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3058 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3059 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
3061 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3062 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
3063 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3064 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3065 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3066 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3067 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3068 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3069 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3070 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3071 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3072 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3073 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
3074 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
3075 ; GFX940-TGSPLIT-NEXT: s_endpgm
3077 ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3078 ; GFX11-WGP: ; %bb.0: ; %entry
3079 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3080 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3081 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3082 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3083 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
3084 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
3085 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3086 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
3087 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
3088 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
3089 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3090 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3091 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3092 ; GFX11-WGP-NEXT: buffer_gl0_inv
3093 ; GFX11-WGP-NEXT: s_endpgm
3095 ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3096 ; GFX11-CU: ; %bb.0: ; %entry
3097 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3098 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3099 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3100 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3101 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
3102 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
3103 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3104 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
3105 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
3106 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
3107 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3108 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3109 ; GFX11-CU-NEXT: s_endpgm
3111 ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3112 ; GFX12-WGP: ; %bb.0: ; %entry
3113 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3114 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3115 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3116 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
3117 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
3118 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
3119 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3120 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
3121 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
3122 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
3123 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3124 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
3125 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
3126 ; GFX12-WGP-NEXT: s_endpgm
3128 ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
3129 ; GFX12-CU: ; %bb.0: ; %entry
3130 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3131 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3132 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3133 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
3134 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
3135 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
3136 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3137 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
3138 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
3139 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
3140 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3141 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
3142 ; GFX12-CU-NEXT: s_endpgm
3143 ptr %out, i32 %in, i32 %old) {
3145 %gep = getelementptr i32, ptr %out, i32 4
3146 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
; Workgroup-scope volatile cmpxchg through a flat pointer, with release ordering
; on success and monotonic ordering on failure. The exchange targets
; %out + 4 i32 elements (byte offset 16); the checks below show that offset
; either as an explicit s_add_u32/s_addc_u32 of 16 or as an "offset:16"
; immediate on the atomic instruction.
; In every run configuration the checks show one or more wait instructions
; placed immediately before the atomic, and nothing between the atomic and
; s_endpgm.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
3150 define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
3151 ; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3152 ; GFX7: ; %bb.0: ; %entry
3153 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
3154 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3155 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
3156 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
3157 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
3158 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3159 ; GFX7-NEXT: s_mov_b32 s4, s8
3160 ; GFX7-NEXT: s_mov_b32 s5, s9
3161 ; GFX7-NEXT: s_mov_b32 s9, s10
3162 ; GFX7-NEXT: s_mov_b32 s8, s11
3163 ; GFX7-NEXT: s_add_u32 s4, s4, s9
3164 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
3165 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3166 ; GFX7-NEXT: s_mov_b32 s5, s8
3167 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
3168 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
3169 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3170 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
3171 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
3172 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
3173 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3174 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3175 ; GFX7-NEXT: s_endpgm
3177 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3178 ; GFX10-WGP: ; %bb.0: ; %entry
3179 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
3180 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3181 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
3182 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
3183 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
3184 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3185 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
3186 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
3187 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
3188 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
3189 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
3190 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
3191 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3192 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
3193 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
3194 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
3195 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3196 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
3197 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
3198 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
3199 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3200 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3201 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3202 ; GFX10-WGP-NEXT: s_endpgm
3204 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3205 ; GFX10-CU: ; %bb.0: ; %entry
3206 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
3207 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3208 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
3209 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
3210 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
3211 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3212 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
3213 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
3214 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
3215 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
3216 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
3217 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
3218 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3219 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
3220 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
3221 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
3222 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3223 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
3224 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
3225 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
3226 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3227 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3228 ; GFX10-CU-NEXT: s_endpgm
3230 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3231 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
3232 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
3233 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
3234 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
3235 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
3236 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
3237 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3238 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
3239 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
3240 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
3241 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
3242 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
3243 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
3244 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3245 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
3246 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
3247 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
3248 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3249 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
3250 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
3251 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
3252 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3253 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3254 ; SKIP-CACHE-INV-NEXT: s_endpgm
3256 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3257 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
3258 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3259 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3260 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3261 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3262 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3263 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3264 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3265 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3266 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3267 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3268 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3269 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
3271 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3272 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
3273 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3274 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3275 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3276 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3277 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3278 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3279 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3280 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3281 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3282 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3283 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3284 ; GFX90A-TGSPLIT-NEXT: s_endpgm
3286 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3287 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
3288 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3289 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3290 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3291 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3292 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3293 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3294 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3295 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3296 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3297 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3298 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3299 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
3301 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3302 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
3303 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3304 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3305 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3306 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3307 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3308 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3309 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3310 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3311 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3312 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3313 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3314 ; GFX940-TGSPLIT-NEXT: s_endpgm
3316 ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3317 ; GFX11-WGP: ; %bb.0: ; %entry
3318 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3319 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3320 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3321 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3322 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
3323 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
3324 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3325 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
3326 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
3327 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
3328 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3329 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3330 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3331 ; GFX11-WGP-NEXT: s_endpgm
3333 ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3334 ; GFX11-CU: ; %bb.0: ; %entry
3335 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3336 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3337 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3338 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3339 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
3340 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
3341 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3342 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
3343 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
3344 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
3345 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3346 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3347 ; GFX11-CU-NEXT: s_endpgm
3349 ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3350 ; GFX12-WGP: ; %bb.0: ; %entry
3351 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3352 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3353 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3354 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
3355 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
3356 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
3357 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3358 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
3359 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
3360 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
3361 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
3362 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
3363 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
3364 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
3365 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3366 ; GFX12-WGP-NEXT: s_endpgm
3368 ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
3369 ; GFX12-CU: ; %bb.0: ; %entry
3370 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3371 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3372 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3373 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
3374 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
3375 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
3376 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3377 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
3378 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
3379 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
3380 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
3381 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3382 ; GFX12-CU-NEXT: s_endpgm
3383 ptr %out, i32 %in, i32 %old) {
3385 %gep = getelementptr i32, ptr %out, i32 4
3386 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
3390 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
3391 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3392 ; GFX7: ; %bb.0: ; %entry
3393 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
3394 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3395 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
3396 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
3397 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
3398 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3399 ; GFX7-NEXT: s_mov_b32 s4, s8
3400 ; GFX7-NEXT: s_mov_b32 s5, s9
3401 ; GFX7-NEXT: s_mov_b32 s9, s10
3402 ; GFX7-NEXT: s_mov_b32 s8, s11
3403 ; GFX7-NEXT: s_add_u32 s4, s4, s9
3404 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
3405 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3406 ; GFX7-NEXT: s_mov_b32 s5, s8
3407 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
3408 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
3409 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3410 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
3411 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
3412 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
3413 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3414 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3415 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3416 ; GFX7-NEXT: s_endpgm
3418 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3419 ; GFX10-WGP: ; %bb.0: ; %entry
3420 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
3421 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3422 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
3423 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
3424 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
3425 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3426 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
3427 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
3428 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
3429 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
3430 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
3431 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
3432 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3433 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
3434 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
3435 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
3436 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3437 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
3438 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
3439 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
3440 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3441 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3442 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3443 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3444 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3445 ; GFX10-WGP-NEXT: buffer_gl0_inv
3446 ; GFX10-WGP-NEXT: s_endpgm
3448 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3449 ; GFX10-CU: ; %bb.0: ; %entry
3450 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
3451 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3452 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
3453 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
3454 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
3455 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3456 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
3457 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
3458 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
3459 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
3460 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
3461 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
3462 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3463 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
3464 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
3465 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
3466 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3467 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
3468 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
3469 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
3470 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3471 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3472 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3473 ; GFX10-CU-NEXT: s_endpgm
3475 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3476 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
3477 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
3478 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
3479 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
3480 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
3481 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
3482 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3483 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
3484 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
3485 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
3486 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
3487 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
3488 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
3489 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3490 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
3491 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
3492 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
3493 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3494 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
3495 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
3496 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
3497 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3498 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3499 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3500 ; SKIP-CACHE-INV-NEXT: s_endpgm
3502 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3503 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
3504 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3505 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3506 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3507 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3508 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3509 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3510 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3511 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3512 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3513 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3514 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3515 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3516 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
3518 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3519 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
3520 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3521 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3522 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3523 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3524 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3525 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3526 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3527 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3528 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3529 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3530 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3531 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
3532 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
3533 ; GFX90A-TGSPLIT-NEXT: s_endpgm
3535 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3536 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
3537 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3538 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3539 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3540 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3541 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3542 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3543 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3544 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3545 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3546 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3547 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3548 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3549 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
3551 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3552 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
3553 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3554 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3555 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3556 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3557 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3558 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3559 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3560 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3561 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3562 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3563 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3564 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
3565 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
3566 ; GFX940-TGSPLIT-NEXT: s_endpgm
3568 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3569 ; GFX11-WGP: ; %bb.0: ; %entry
3570 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3571 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3572 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3573 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3574 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
3575 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
3576 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3577 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
3578 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
3579 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
3580 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3581 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3582 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3583 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3584 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3585 ; GFX11-WGP-NEXT: buffer_gl0_inv
3586 ; GFX11-WGP-NEXT: s_endpgm
3588 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3589 ; GFX11-CU: ; %bb.0: ; %entry
3590 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3591 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3592 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3593 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3594 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
3595 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
3596 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3597 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
3598 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
3599 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
3600 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3601 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3602 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3603 ; GFX11-CU-NEXT: s_endpgm
3605 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3606 ; GFX12-WGP: ; %bb.0: ; %entry
3607 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3608 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3609 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3610 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
3611 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
3612 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
3613 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3614 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
3615 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
3616 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
3617 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
3618 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
3619 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
3620 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
3621 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3622 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
3623 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
3624 ; GFX12-WGP-NEXT: s_endpgm
3626 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
3627 ; GFX12-CU: ; %bb.0: ; %entry
3628 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3629 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3630 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3631 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
3632 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
3633 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
3634 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3635 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
3636 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
3637 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
3638 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
3639 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3640 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
3641 ; GFX12-CU-NEXT: s_endpgm
3642 ptr %out, i32 %in, i32 %old) {
3644 %gep = getelementptr i32, ptr %out, i32 4
3645 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
; Test: volatile cmpxchg through a plain `ptr` (selected as flat_atomic_cmpswap)
; with syncscope("workgroup"), success ordering seq_cst and failure ordering
; monotonic. The returned value is not referenced by any line shown here.
;
; The assertions inside the signature are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script instead of hand-editing.
;
; Summary of what the assertions expect around the atomic, per RUN line:
;   * gfx700 (amdhsa, and amdpal with -amdgcn-skip-cache-invalidations),
;     gfx1010/gfx1100 with +cumode, gfx90a/gfx940 without +tgsplit:
;     s_waitcnt lgkmcnt(0) before and after the atomic, no cache invalidate.
;   * gfx1010/gfx1100 without +cumode: s_waitcnt vmcnt(0) lgkmcnt(0) and
;     s_waitcnt_vscnt before; s_waitcnt lgkmcnt(0), s_waitcnt_vscnt and
;     buffer_gl0_inv after.
;   * gfx90a/gfx940 with +tgsplit: s_waitcnt vmcnt(0) lgkmcnt(0) before;
;     s_waitcnt vmcnt(0) plus buffer_wbinvl1_vol (gfx90a) or buffer_inv sc0
;     (gfx940) after.
;   * gfx1200 without +cumode: bvhcnt, samplecnt, storecnt and loadcnt_dscnt
;     waits before; the atomic carries scope:SCOPE_SE; s_wait_storecnt_dscnt
;     and global_inv scope:SCOPE_SE after.
;   * gfx1200 with +cumode: s_wait_dscnt 0x0 before and after, no scope
;     operand on the atomic.
;
; Addressing: gfx700 and gfx1010 add the constant 16 to the pointer with
; s_add_u32/s_addc_u32; gfx90a and newer use the offset:16 immediate instead.
3649 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
3650 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3651 ; GFX7: ; %bb.0: ; %entry
3652 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
3653 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3654 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
3655 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
3656 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
3657 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3658 ; GFX7-NEXT: s_mov_b32 s4, s8
3659 ; GFX7-NEXT: s_mov_b32 s5, s9
3660 ; GFX7-NEXT: s_mov_b32 s9, s10
3661 ; GFX7-NEXT: s_mov_b32 s8, s11
3662 ; GFX7-NEXT: s_add_u32 s4, s4, s9
3663 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
3664 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3665 ; GFX7-NEXT: s_mov_b32 s5, s8
3666 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
3667 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
3668 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3669 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
3670 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
3671 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
3672 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3673 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3674 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3675 ; GFX7-NEXT: s_endpgm
3677 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3678 ; GFX10-WGP: ; %bb.0: ; %entry
3679 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
3680 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3681 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
3682 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
3683 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
3684 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3685 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
3686 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
3687 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
3688 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
3689 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
3690 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
3691 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3692 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
3693 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
3694 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
3695 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3696 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
3697 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
3698 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
3699 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3700 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3701 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3702 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3703 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3704 ; GFX10-WGP-NEXT: buffer_gl0_inv
3705 ; GFX10-WGP-NEXT: s_endpgm
3707 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3708 ; GFX10-CU: ; %bb.0: ; %entry
3709 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
3710 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3711 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
3712 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
3713 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
3714 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3715 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
3716 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
3717 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
3718 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
3719 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
3720 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
3721 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3722 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
3723 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
3724 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
3725 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3726 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
3727 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
3728 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
3729 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3730 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3731 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3732 ; GFX10-CU-NEXT: s_endpgm
3734 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3735 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
3736 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
3737 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
3738 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
3739 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
3740 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
3741 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3742 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
3743 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
3744 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
3745 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
3746 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
3747 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
3748 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
3749 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
3750 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
3751 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
3752 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3753 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
3754 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
3755 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
3756 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3757 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3758 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3759 ; SKIP-CACHE-INV-NEXT: s_endpgm
3761 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3762 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
3763 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3764 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3765 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3766 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3767 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3768 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3769 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3770 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3771 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3772 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3773 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3774 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3775 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
3777 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3778 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
3779 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
3780 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
3781 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
3782 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3783 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
3784 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
3785 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3786 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3787 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
3788 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3789 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3790 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
3791 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
3792 ; GFX90A-TGSPLIT-NEXT: s_endpgm
3794 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3795 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
3796 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3797 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3798 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3799 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3800 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3801 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3802 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3803 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3804 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3805 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3806 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3807 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3808 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
3810 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3811 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
3812 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
3813 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
3814 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
3815 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
3816 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
3817 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
3818 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3819 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
3820 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
3821 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3822 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
3823 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
3824 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
3825 ; GFX940-TGSPLIT-NEXT: s_endpgm
3827 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3828 ; GFX11-WGP: ; %bb.0: ; %entry
3829 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3830 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3831 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3832 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3833 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
3834 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
3835 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3836 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
3837 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
3838 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
3839 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
3840 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3841 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3842 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
3843 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3844 ; GFX11-WGP-NEXT: buffer_gl0_inv
3845 ; GFX11-WGP-NEXT: s_endpgm
3847 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3848 ; GFX11-CU: ; %bb.0: ; %entry
3849 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3850 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3851 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3852 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3853 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
3854 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
3855 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3856 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
3857 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
3858 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
3859 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3860 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3861 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
3862 ; GFX11-CU-NEXT: s_endpgm
3864 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3865 ; GFX12-WGP: ; %bb.0: ; %entry
3866 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3867 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
3868 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
3869 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
3870 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
3871 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
3872 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3873 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
3874 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
3875 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
3876 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
3877 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
3878 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
3879 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
3880 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
3881 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
3882 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
3883 ; GFX12-WGP-NEXT: s_endpgm
3885 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
3886 ; GFX12-CU: ; %bb.0: ; %entry
3887 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
3888 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
3889 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
3890 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
3891 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
3892 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
3893 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3894 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
3895 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
3896 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
3897 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
3898 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
3899 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
3900 ; GFX12-CU-NEXT: s_endpgm
3901 ptr %out, i32 %in, i32 %old) {
; Address of i32 element 4 relative to %out, i.e. 16 bytes past it; this is the
; constant 16 / offset:16 that appears in the expected output above.
3903 %gep = getelementptr i32, ptr %out, i32 4
; %old is the expected (compare) value and %in the replacement value.
3904 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
; Test: volatile cmpxchg through a plain `ptr` (selected as flat_atomic_cmpswap)
; with syncscope("workgroup"), success ordering monotonic and failure ordering
; acquire. The returned value is not referenced by any line shown here.
;
; The assertions inside the signature are autogenerated (see the NOTE at the
; top of the file); regenerate them with the script instead of hand-editing.
;
; Summary of what the assertions expect around the atomic, per RUN line.
; No configuration expects a wait immediately before the atomic.
;   * gfx700 (amdhsa, and amdpal with -amdgcn-skip-cache-invalidations),
;     gfx1010/gfx1100 with +cumode, gfx90a/gfx940 without +tgsplit:
;     only s_waitcnt lgkmcnt(0) after the atomic, no cache invalidate.
;   * gfx1010/gfx1100 without +cumode: s_waitcnt lgkmcnt(0), s_waitcnt_vscnt
;     and buffer_gl0_inv after.
;   * gfx90a/gfx940 with +tgsplit: s_waitcnt vmcnt(0) plus buffer_wbinvl1_vol
;     (gfx90a) or buffer_inv sc0 (gfx940) after.
;   * gfx1200 without +cumode: the atomic carries scope:SCOPE_SE, followed by
;     s_wait_storecnt_dscnt and global_inv scope:SCOPE_SE.
;   * gfx1200 with +cumode: only s_wait_dscnt 0x0 after, no scope operand on
;     the atomic.
;
; Addressing: gfx700 and gfx1010 add the constant 16 to the pointer with
; s_add_u32/s_addc_u32; gfx90a and newer use the offset:16 immediate instead.
3908 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
3909 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3910 ; GFX7: ; %bb.0: ; %entry
3911 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
3912 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3913 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
3914 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
3915 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
3916 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3917 ; GFX7-NEXT: s_mov_b32 s4, s8
3918 ; GFX7-NEXT: s_mov_b32 s5, s9
3919 ; GFX7-NEXT: s_mov_b32 s9, s10
3920 ; GFX7-NEXT: s_mov_b32 s8, s11
3921 ; GFX7-NEXT: s_add_u32 s4, s4, s9
3922 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
3923 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3924 ; GFX7-NEXT: s_mov_b32 s5, s8
3925 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
3926 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
3927 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3928 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
3929 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
3930 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
3931 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3932 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
3933 ; GFX7-NEXT: s_endpgm
3935 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3936 ; GFX10-WGP: ; %bb.0: ; %entry
3937 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
3938 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3939 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
3940 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
3941 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
3942 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3943 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
3944 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
3945 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
3946 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
3947 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
3948 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
3949 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3950 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
3951 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
3952 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
3953 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3954 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
3955 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
3956 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
3957 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3958 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
3959 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
3960 ; GFX10-WGP-NEXT: buffer_gl0_inv
3961 ; GFX10-WGP-NEXT: s_endpgm
3963 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3964 ; GFX10-CU: ; %bb.0: ; %entry
3965 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
3966 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
3967 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
3968 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
3969 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
3970 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3971 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
3972 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
3973 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
3974 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
3975 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
3976 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
3977 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
3978 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
3979 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
3980 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
3981 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
3982 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
3983 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
3984 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
3985 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
3986 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
3987 ; GFX10-CU-NEXT: s_endpgm
3989 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
3990 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
3991 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
3992 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
3993 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
3994 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
3995 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
3996 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
3997 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
3998 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
3999 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
4000 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
4001 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
4002 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
4003 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4004 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
4005 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
4006 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
4007 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4008 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
4009 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
4010 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
4011 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4012 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4013 ; SKIP-CACHE-INV-NEXT: s_endpgm
4015 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4016 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
4017 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4018 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4019 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4020 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4021 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4022 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4023 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4024 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4025 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4026 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4027 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4028 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
4030 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4031 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
4032 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4033 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4034 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4035 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4036 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4037 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4038 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4039 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4040 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4041 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4042 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4043 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
4044 ; GFX90A-TGSPLIT-NEXT: s_endpgm
4046 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4047 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
4048 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4049 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4050 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4051 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4052 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4053 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4054 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4055 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4056 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4057 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4058 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4059 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
4061 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4062 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
4063 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4064 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4065 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4066 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4067 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4068 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4069 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4070 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4071 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4072 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4073 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4074 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
4075 ; GFX940-TGSPLIT-NEXT: s_endpgm
4077 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4078 ; GFX11-WGP: ; %bb.0: ; %entry
4079 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4080 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4081 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4082 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4083 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
4084 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
4085 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4086 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
4087 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
4088 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
4089 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4090 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4091 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4092 ; GFX11-WGP-NEXT: buffer_gl0_inv
4093 ; GFX11-WGP-NEXT: s_endpgm
4095 ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4096 ; GFX11-CU: ; %bb.0: ; %entry
4097 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4098 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4099 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4100 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4101 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
4102 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
4103 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4104 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
4105 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
4106 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
4107 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4108 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4109 ; GFX11-CU-NEXT: s_endpgm
4111 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4112 ; GFX12-WGP: ; %bb.0: ; %entry
4113 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4114 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4115 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4116 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
4117 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
4118 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
4119 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4120 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
4121 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
4122 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
4123 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4124 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
4125 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
4126 ; GFX12-WGP-NEXT: s_endpgm
4128 ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
4129 ; GFX12-CU: ; %bb.0: ; %entry
4130 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4131 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4132 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4133 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
4134 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
4135 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
4136 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4137 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
4138 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
4139 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
4140 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4141 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
4142 ; GFX12-CU-NEXT: s_endpgm
4143 ptr %out, i32 %in, i32 %old) {
; Address of i32 element 4 relative to %out, i.e. 16 bytes past it; this is the
; constant 16 / offset:16 that appears in the expected output above.
4145 %gep = getelementptr i32, ptr %out, i32 4
; %old is the expected (compare) value and %in the replacement value.
4146 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
4150 define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
4151 ; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4152 ; GFX7: ; %bb.0: ; %entry
4153 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
4154 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4155 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
4156 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
4157 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
4158 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4159 ; GFX7-NEXT: s_mov_b32 s4, s8
4160 ; GFX7-NEXT: s_mov_b32 s5, s9
4161 ; GFX7-NEXT: s_mov_b32 s9, s10
4162 ; GFX7-NEXT: s_mov_b32 s8, s11
4163 ; GFX7-NEXT: s_add_u32 s4, s4, s9
4164 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
4165 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4166 ; GFX7-NEXT: s_mov_b32 s5, s8
4167 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
4168 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
4169 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4170 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
4171 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
4172 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
4173 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4174 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4175 ; GFX7-NEXT: s_endpgm
4177 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4178 ; GFX10-WGP: ; %bb.0: ; %entry
4179 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
4180 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4181 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
4182 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
4183 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
4184 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4185 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
4186 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
4187 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
4188 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
4189 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
4190 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
4191 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4192 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
4193 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
4194 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
4195 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4196 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
4197 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
4198 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
4199 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4200 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4201 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4202 ; GFX10-WGP-NEXT: buffer_gl0_inv
4203 ; GFX10-WGP-NEXT: s_endpgm
4205 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4206 ; GFX10-CU: ; %bb.0: ; %entry
4207 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
4208 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4209 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
4210 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
4211 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
4212 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4213 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
4214 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
4215 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
4216 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
4217 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
4218 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
4219 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4220 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
4221 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
4222 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
4223 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4224 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
4225 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
4226 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
4227 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4228 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4229 ; GFX10-CU-NEXT: s_endpgm
4231 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4232 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
4233 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
4234 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
4235 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
4236 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
4237 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
4238 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4239 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
4240 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
4241 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
4242 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
4243 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
4244 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
4245 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4246 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
4247 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
4248 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
4249 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4250 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
4251 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
4252 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
4253 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4254 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4255 ; SKIP-CACHE-INV-NEXT: s_endpgm
4257 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4258 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
4259 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4260 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4261 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4262 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4263 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4264 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4265 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4266 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4267 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4268 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4269 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4270 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
4272 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4273 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
4274 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4275 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4276 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4277 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4278 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4279 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4280 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4281 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4282 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4283 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4284 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4285 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
4286 ; GFX90A-TGSPLIT-NEXT: s_endpgm
4288 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4289 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
4290 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4291 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4292 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4293 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4294 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4295 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4296 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4297 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4298 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4299 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4300 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4301 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
4303 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4304 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
4305 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4306 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4307 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4308 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4309 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4310 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4311 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4312 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4313 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4314 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4315 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4316 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
4317 ; GFX940-TGSPLIT-NEXT: s_endpgm
4319 ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4320 ; GFX11-WGP: ; %bb.0: ; %entry
4321 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4322 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4323 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4324 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4325 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
4326 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
4327 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4328 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
4329 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
4330 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
4331 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4332 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4333 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4334 ; GFX11-WGP-NEXT: buffer_gl0_inv
4335 ; GFX11-WGP-NEXT: s_endpgm
4337 ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4338 ; GFX11-CU: ; %bb.0: ; %entry
4339 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4340 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4341 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4342 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4343 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
4344 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
4345 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4346 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
4347 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
4348 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
4349 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4350 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4351 ; GFX11-CU-NEXT: s_endpgm
4353 ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4354 ; GFX12-WGP: ; %bb.0: ; %entry
4355 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4356 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4357 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4358 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
4359 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
4360 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
4361 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4362 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
4363 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
4364 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
4365 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4366 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
4367 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
4368 ; GFX12-WGP-NEXT: s_endpgm
4370 ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
4371 ; GFX12-CU: ; %bb.0: ; %entry
4372 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4373 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4374 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4375 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
4376 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
4377 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
4378 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4379 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
4380 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
4381 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
4382 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4383 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
4384 ; GFX12-CU-NEXT: s_endpgm
4385 ptr %out, i32 %in, i32 %old) {
4387 %gep = getelementptr i32, ptr %out, i32 4
4388 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
; Kernel under test: a volatile 32-bit cmpxchg through the pointer argument
; %out at syncscope("workgroup"), using release ordering on success and
; acquire ordering on failure.
;
; The expected-output lines below are autogenerated (see the header at the top
; of this file) and there is one group of them per check prefix used by the
; llc invocations. Regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
4392 define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
4393 ; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
4394 ; GFX7: ; %bb.0: ; %entry
4395 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
4396 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4397 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
4398 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
4399 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
4400 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4401 ; GFX7-NEXT: s_mov_b32 s4, s8
4402 ; GFX7-NEXT: s_mov_b32 s5, s9
4403 ; GFX7-NEXT: s_mov_b32 s9, s10
4404 ; GFX7-NEXT: s_mov_b32 s8, s11
4405 ; GFX7-NEXT: s_add_u32 s4, s4, s9
4406 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
4407 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4408 ; GFX7-NEXT: s_mov_b32 s5, s8
4409 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
4410 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
4411 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4412 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
4413 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
4414 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
4415 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4416 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4417 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4418 ; GFX7-NEXT: s_endpgm
4420 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
4421 ; GFX10-WGP: ; %bb.0: ; %entry
4422 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
4423 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4424 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
4425 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
4426 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
4427 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4428 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
4429 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
4430 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
4431 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
4432 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
4433 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
4434 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4435 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
4436 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
4437 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
4438 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4439 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
4440 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
4441 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
4442 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4443 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4444 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4445 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4446 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4447 ; GFX10-WGP-NEXT: buffer_gl0_inv
4448 ; GFX10-WGP-NEXT: s_endpgm
4450 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
4451 ; GFX10-CU: ; %bb.0: ; %entry
4452 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
4453 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4454 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
4455 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
4456 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
4457 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4458 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
4459 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
4460 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
4461 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
4462 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
4463 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
4464 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4465 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
4466 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
4467 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
4468 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4469 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
4470 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
4471 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
4472 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4473 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4474 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4475 ; GFX10-CU-NEXT: s_endpgm
4477 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_cmpxchg:
4478 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
4479 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
4480 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
4481 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
4482 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
4483 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
4484 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4485 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
4486 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
4487 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
4488 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
4489 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
4490 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
4491 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4492 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
4493 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
4494 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
4495 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4496 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
4497 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
4498 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
4499 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4500 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4501 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4502 ; SKIP-CACHE-INV-NEXT: s_endpgm
4504 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4505 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
4506 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4507 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4508 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4509 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4510 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4511 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4512 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4513 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4514 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4515 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4516 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4517 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4518 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
4520 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4521 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
4522 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4523 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4524 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4525 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4526 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4527 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4528 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4529 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4530 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4531 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4532 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4533 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4534 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
4535 ; GFX90A-TGSPLIT-NEXT: s_endpgm
4537 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4538 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
4539 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4540 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4541 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4542 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4543 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4544 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4545 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4546 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4547 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4548 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4549 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4550 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4551 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
4553 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
4554 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
4555 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4556 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4557 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4558 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4559 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4560 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4561 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4562 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4563 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4564 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4565 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4566 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4567 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
4568 ; GFX940-TGSPLIT-NEXT: s_endpgm
4570 ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
4571 ; GFX11-WGP: ; %bb.0: ; %entry
4572 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4573 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4574 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4575 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4576 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
4577 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
4578 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4579 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
4580 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
4581 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
4582 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4583 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4584 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4585 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4586 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4587 ; GFX11-WGP-NEXT: buffer_gl0_inv
4588 ; GFX11-WGP-NEXT: s_endpgm
4590 ; GFX11-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
4591 ; GFX11-CU: ; %bb.0: ; %entry
4592 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4593 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4594 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4595 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4596 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
4597 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
4598 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4599 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
4600 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
4601 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
4602 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4603 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4604 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4605 ; GFX11-CU-NEXT: s_endpgm
4607 ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
4608 ; GFX12-WGP: ; %bb.0: ; %entry
4609 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4610 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4611 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4612 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
4613 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
4614 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
4615 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4616 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
4617 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
4618 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
4619 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
4620 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
4621 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
4622 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
4623 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4624 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
4625 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
4626 ; GFX12-WGP-NEXT: s_endpgm
4628 ; GFX12-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
4629 ; GFX12-CU: ; %bb.0: ; %entry
4630 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4631 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4632 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4633 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
4634 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
4635 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
4636 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4637 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
4638 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
4639 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
4640 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
4641 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4642 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
4643 ; GFX12-CU-NEXT: s_endpgm
4644 ptr %out, i32 %in, i32 %old) {
; Address of the i32 located four elements past %out, i.e. 16 bytes further on;
; the expected output above shows this either as an explicit 64-bit add of 16
; or as an immediate offset of 16 on the atomic instruction.
4646 %gep = getelementptr i32, ptr %out, i32 4
; Atomically compare the i32 at %gep with %old and, on a match, store %in.
; The expected flat_atomic_cmpswap lines above carry only address and data
; operands (no destination register for the loaded value).
4647 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
4651 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
4652 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4653 ; GFX7: ; %bb.0: ; %entry
4654 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
4655 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4656 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
4657 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
4658 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
4659 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4660 ; GFX7-NEXT: s_mov_b32 s4, s8
4661 ; GFX7-NEXT: s_mov_b32 s5, s9
4662 ; GFX7-NEXT: s_mov_b32 s9, s10
4663 ; GFX7-NEXT: s_mov_b32 s8, s11
4664 ; GFX7-NEXT: s_add_u32 s4, s4, s9
4665 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
4666 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4667 ; GFX7-NEXT: s_mov_b32 s5, s8
4668 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
4669 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
4670 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4671 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
4672 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
4673 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
4674 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4675 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4676 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4677 ; GFX7-NEXT: s_endpgm
4679 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4680 ; GFX10-WGP: ; %bb.0: ; %entry
4681 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
4682 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4683 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
4684 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
4685 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
4686 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4687 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
4688 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
4689 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
4690 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
4691 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
4692 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
4693 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4694 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
4695 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
4696 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
4697 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4698 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
4699 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
4700 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
4701 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4702 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4703 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4704 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4705 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4706 ; GFX10-WGP-NEXT: buffer_gl0_inv
4707 ; GFX10-WGP-NEXT: s_endpgm
4709 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4710 ; GFX10-CU: ; %bb.0: ; %entry
4711 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
4712 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4713 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
4714 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
4715 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
4716 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4717 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
4718 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
4719 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
4720 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
4721 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
4722 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
4723 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4724 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
4725 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
4726 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
4727 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4728 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
4729 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
4730 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
4731 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4732 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4733 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4734 ; GFX10-CU-NEXT: s_endpgm
4736 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4737 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
4738 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
4739 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
4740 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
4741 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
4742 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
4743 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4744 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
4745 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
4746 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
4747 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
4748 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
4749 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
4750 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
4751 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
4752 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
4753 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
4754 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4755 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
4756 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
4757 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
4758 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4759 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4760 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
4761 ; SKIP-CACHE-INV-NEXT: s_endpgm
4763 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4764 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
4765 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4766 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4767 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4768 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4769 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4770 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4771 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4772 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4773 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4774 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4775 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4776 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4777 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
4779 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4780 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
4781 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
4782 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
4783 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
4784 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4785 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
4786 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
4787 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4788 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4789 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
4790 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4791 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4792 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4793 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
4794 ; GFX90A-TGSPLIT-NEXT: s_endpgm
4796 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4797 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
4798 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4799 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4800 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4801 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4802 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4803 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4804 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4805 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4806 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4807 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4808 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4809 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4810 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
4812 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4813 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
4814 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
4815 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
4816 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
4817 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
4818 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
4819 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
4820 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4821 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
4822 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
4823 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4824 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
4825 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
4826 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
4827 ; GFX940-TGSPLIT-NEXT: s_endpgm
4829 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4830 ; GFX11-WGP: ; %bb.0: ; %entry
4831 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4832 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4833 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4834 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4835 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
4836 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
4837 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4838 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
4839 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
4840 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
4841 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4842 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4843 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4844 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
4845 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4846 ; GFX11-WGP-NEXT: buffer_gl0_inv
4847 ; GFX11-WGP-NEXT: s_endpgm
4849 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4850 ; GFX11-CU: ; %bb.0: ; %entry
4851 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4852 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4853 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4854 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4855 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
4856 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
4857 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4858 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
4859 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
4860 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
4861 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4862 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4863 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
4864 ; GFX11-CU-NEXT: s_endpgm
4866 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4867 ; GFX12-WGP: ; %bb.0: ; %entry
4868 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4869 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
4870 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
4871 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
4872 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
4873 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
4874 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4875 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
4876 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
4877 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
4878 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
4879 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
4880 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
4881 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
4882 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
4883 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
4884 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
4885 ; GFX12-WGP-NEXT: s_endpgm
4887 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
4888 ; GFX12-CU: ; %bb.0: ; %entry
4889 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
4890 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
4891 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
4892 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
4893 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
4894 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
4895 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4896 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
4897 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
4898 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
4899 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
4900 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
4901 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
4902 ; GFX12-CU-NEXT: s_endpgm
4903 ptr %out, i32 %in, i32 %old) {
4905 %gep = getelementptr i32, ptr %out, i32 4
4906 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
; Workgroup-scope cmpxchg on a default-address-space pointer with success
; ordering seq_cst and failure ordering acquire; the result is not used.
;
; The IR performs a volatile cmpxchg on element 4 of the i32 array at %out
; (byte offset 16). Every configuration is expected to select
; flat_atomic_cmpswap without a destination register or glc. The 16-byte
; offset is added with s_add_u32/s_addc_u32 in the gfx700 and gfx1010
; expectations and folded into `offset:16` in the gfx90a, gfx940, gfx1100 and
; gfx1200 ones.
;
; Synchronization recorded around the atomic, per configuration:
;   * gfx700 (amdhsa and amdpal), gfx1010/gfx1100 with +cumode, gfx90a/gfx940
;     without +tgsplit - `s_waitcnt lgkmcnt(0)` before and after.
;   * gfx1200 with +cumode - `s_wait_dscnt 0x0` before and after.
;   * gfx1010/gfx1100 without +cumode - vmcnt/lgkmcnt and vscnt waits before;
;     lgkmcnt and vscnt waits followed by buffer_gl0_inv after.
;   * gfx1200 without +cumode - bvhcnt, samplecnt, storecnt and loadcnt_dscnt
;     waits before; s_wait_storecnt_dscnt and global_inv scope:SCOPE_SE after;
;     the atomic itself carries scope:SCOPE_SE.
;   * gfx90a/gfx940 with +tgsplit - `s_waitcnt vmcnt(0) lgkmcnt(0)` before;
;     `s_waitcnt vmcnt(0)` followed by buffer_wbinvl1_vol (gfx90a) or
;     buffer_inv sc0 (gfx940) after.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
4910 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
4911 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4912 ; GFX7: ; %bb.0: ; %entry
4913 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
4914 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4915 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
4916 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
4917 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
4918 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4919 ; GFX7-NEXT: s_mov_b32 s4, s8
4920 ; GFX7-NEXT: s_mov_b32 s5, s9
4921 ; GFX7-NEXT: s_mov_b32 s9, s10
4922 ; GFX7-NEXT: s_mov_b32 s8, s11
4923 ; GFX7-NEXT: s_add_u32 s4, s4, s9
4924 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
4925 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4926 ; GFX7-NEXT: s_mov_b32 s5, s8
4927 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
4928 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
4929 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4930 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
4931 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
4932 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
4933 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4934 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4935 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
4936 ; GFX7-NEXT: s_endpgm
4938 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4939 ; GFX10-WGP: ; %bb.0: ; %entry
4940 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
4941 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4942 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
4943 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
4944 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
4945 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4946 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
4947 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
4948 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
4949 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
4950 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
4951 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
4952 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4953 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
4954 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
4955 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
4956 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4957 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
4958 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
4959 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
4960 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
4961 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4962 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4963 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
4964 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
4965 ; GFX10-WGP-NEXT: buffer_gl0_inv
4966 ; GFX10-WGP-NEXT: s_endpgm
4968 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4969 ; GFX10-CU: ; %bb.0: ; %entry
4970 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
4971 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
4972 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
4973 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
4974 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
4975 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4976 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
4977 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
4978 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
4979 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
4980 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
4981 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
4982 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
4983 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
4984 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
4985 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
4986 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
4987 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
4988 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
4989 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
4990 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4991 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
4992 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
4993 ; GFX10-CU-NEXT: s_endpgm
4995 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
4996 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
4997 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
4998 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
4999 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
5000 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
5001 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
5002 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5003 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
5004 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
5005 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
5006 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
5007 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
5008 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
5009 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5010 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
5011 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
5012 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
5013 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5014 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
5015 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
5016 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
5017 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5018 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
5019 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5020 ; SKIP-CACHE-INV-NEXT: s_endpgm
5022 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5023 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
5024 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5025 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5026 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5027 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5028 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5029 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5030 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5031 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5032 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5033 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5034 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5035 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5036 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
5038 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5039 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
5040 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5041 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5042 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5043 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5044 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5045 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5046 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5047 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5048 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5049 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5050 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5051 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5052 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
5053 ; GFX90A-TGSPLIT-NEXT: s_endpgm
5055 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5056 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
5057 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5058 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5059 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5060 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5061 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5062 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5063 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5064 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5065 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5066 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5067 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5068 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5069 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
5071 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5072 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
5073 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5074 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5075 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5076 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5077 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5078 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5079 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5080 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5081 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5082 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5083 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5084 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5085 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
5086 ; GFX940-TGSPLIT-NEXT: s_endpgm
5088 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5089 ; GFX11-WGP: ; %bb.0: ; %entry
5090 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5091 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5092 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5093 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
5094 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
5095 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
5096 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5097 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
5098 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
5099 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
5100 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5101 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
5102 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5103 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
5104 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
5105 ; GFX11-WGP-NEXT: buffer_gl0_inv
5106 ; GFX11-WGP-NEXT: s_endpgm
5108 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5109 ; GFX11-CU: ; %bb.0: ; %entry
5110 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5111 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5112 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5113 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5114 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
5115 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
5116 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5117 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
5118 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
5119 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
5120 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5121 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5122 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5123 ; GFX11-CU-NEXT: s_endpgm
5125 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5126 ; GFX12-WGP: ; %bb.0: ; %entry
5127 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5128 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5129 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5130 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
5131 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
5132 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
5133 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5134 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
5135 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
5136 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
5137 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
5138 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
5139 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
5140 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
5141 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
5142 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
5143 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
5144 ; GFX12-WGP-NEXT: s_endpgm
5146 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
5147 ; GFX12-CU: ; %bb.0: ; %entry
5148 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5149 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5150 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5151 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
5152 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
5153 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
5154 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5155 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
5156 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
5157 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
5158 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
5159 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5160 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
5161 ; GFX12-CU-NEXT: s_endpgm
5162 ptr %out, i32 %in, i32 %old) {
; i32 element index 4 is byte offset 16, the `16` / `offset:16` in the checks.
5164 %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old, store %in on success; %val has no visible use.
5165 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
; Workgroup-scope cmpxchg on a default-address-space pointer with both the
; success and the failure ordering set to seq_cst; the result is not used.
;
; The IR performs a volatile cmpxchg on element 4 of the i32 array at %out
; (byte offset 16). Every configuration is expected to select
; flat_atomic_cmpswap without a destination register or glc. The 16-byte
; offset is added with s_add_u32/s_addc_u32 in the gfx700 and gfx1010
; expectations and folded into `offset:16` in the gfx90a, gfx940, gfx1100 and
; gfx1200 ones.
;
; Synchronization recorded around the atomic, per configuration:
;   * gfx700 (amdhsa and amdpal), gfx1010/gfx1100 with +cumode, gfx90a/gfx940
;     without +tgsplit - `s_waitcnt lgkmcnt(0)` before and after.
;   * gfx1200 with +cumode - `s_wait_dscnt 0x0` before and after.
;   * gfx1010/gfx1100 without +cumode - vmcnt/lgkmcnt and vscnt waits before;
;     lgkmcnt and vscnt waits followed by buffer_gl0_inv after.
;   * gfx1200 without +cumode - bvhcnt, samplecnt, storecnt and loadcnt_dscnt
;     waits before; s_wait_storecnt_dscnt and global_inv scope:SCOPE_SE after;
;     the atomic itself carries scope:SCOPE_SE.
;   * gfx90a/gfx940 with +tgsplit - `s_waitcnt vmcnt(0) lgkmcnt(0)` before;
;     `s_waitcnt vmcnt(0)` followed by buffer_wbinvl1_vol (gfx90a) or
;     buffer_inv sc0 (gfx940) after.
;
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with utils/update_llc_test_checks.py rather than editing
; them by hand.
5169 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
5170 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5171 ; GFX7: ; %bb.0: ; %entry
5172 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
5173 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
5174 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
5175 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
5176 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
5177 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5178 ; GFX7-NEXT: s_mov_b32 s4, s8
5179 ; GFX7-NEXT: s_mov_b32 s5, s9
5180 ; GFX7-NEXT: s_mov_b32 s9, s10
5181 ; GFX7-NEXT: s_mov_b32 s8, s11
5182 ; GFX7-NEXT: s_add_u32 s4, s4, s9
5183 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
5184 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5185 ; GFX7-NEXT: s_mov_b32 s5, s8
5186 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
5187 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
5188 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5189 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
5190 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
5191 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
5192 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5193 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
5194 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5195 ; GFX7-NEXT: s_endpgm
5197 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5198 ; GFX10-WGP: ; %bb.0: ; %entry
5199 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
5200 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
5201 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
5202 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
5203 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
5204 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
5205 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
5206 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
5207 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
5208 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
5209 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
5210 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
5211 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5212 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
5213 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
5214 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
5215 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5216 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
5217 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
5218 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
5219 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5220 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
5221 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
5222 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
5223 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
5224 ; GFX10-WGP-NEXT: buffer_gl0_inv
5225 ; GFX10-WGP-NEXT: s_endpgm
5227 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5228 ; GFX10-CU: ; %bb.0: ; %entry
5229 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
5230 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
5231 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
5232 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
5233 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
5234 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
5235 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
5236 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
5237 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
5238 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
5239 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
5240 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
5241 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
5242 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
5243 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
5244 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
5245 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5246 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
5247 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
5248 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
5249 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
5250 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
5251 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
5252 ; GFX10-CU-NEXT: s_endpgm
5254 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5255 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
5256 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
5257 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
5258 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
5259 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
5260 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
5261 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5262 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
5263 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
5264 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
5265 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
5266 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
5267 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
5268 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
5269 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
5270 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
5271 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
5272 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5273 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
5274 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
5275 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
5276 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5277 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
5278 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5279 ; SKIP-CACHE-INV-NEXT: s_endpgm
5281 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5282 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
5283 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5284 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5285 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5286 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5287 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5288 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5289 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5290 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5291 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5292 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5293 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5294 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5295 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
5297 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5298 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
5299 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5300 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5301 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5302 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5303 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5304 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5305 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5306 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5307 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5308 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5309 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5310 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5311 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
5312 ; GFX90A-TGSPLIT-NEXT: s_endpgm
5314 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5315 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
5316 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5317 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5318 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5319 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5320 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5321 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5322 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5323 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5324 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5325 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5326 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5327 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5328 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
5330 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5331 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
5332 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5333 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5334 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5335 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5336 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5337 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5338 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5339 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5340 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5341 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5342 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
5343 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5344 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
5345 ; GFX940-TGSPLIT-NEXT: s_endpgm
5347 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5348 ; GFX11-WGP: ; %bb.0: ; %entry
5349 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5350 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5351 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5352 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
5353 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
5354 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
5355 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5356 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
5357 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
5358 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
5359 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5360 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
5361 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5362 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
5363 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
5364 ; GFX11-WGP-NEXT: buffer_gl0_inv
5365 ; GFX11-WGP-NEXT: s_endpgm
5367 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5368 ; GFX11-CU: ; %bb.0: ; %entry
5369 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5370 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5371 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5372 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5373 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
5374 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
5375 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5376 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
5377 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
5378 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
5379 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5380 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5381 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5382 ; GFX11-CU-NEXT: s_endpgm
5384 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5385 ; GFX12-WGP: ; %bb.0: ; %entry
5386 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5387 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5388 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5389 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
5390 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
5391 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
5392 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5393 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
5394 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
5395 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
5396 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
5397 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
5398 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
5399 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
5400 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
5401 ; GFX12-WGP-NEXT: s_wait_storecnt_dscnt 0x0
5402 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
5403 ; GFX12-WGP-NEXT: s_endpgm
5405 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
5406 ; GFX12-CU: ; %bb.0: ; %entry
5407 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5408 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5409 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5410 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
5411 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
5412 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
5413 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5414 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
5415 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
5416 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
5417 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
5418 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
5419 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
5420 ; GFX12-CU-NEXT: s_endpgm
5421 ptr %out, i32 %in, i32 %old) {
; i32 element index 4 is byte offset 16, the `16` / `offset:16` in the checks.
5423 %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old, store %in on success; %val has no visible use.
5424 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
5428 define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
5429 ; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5430 ; GFX7: ; %bb.0: ; %entry
5431 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
5432 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5433 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
5434 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
5435 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
5436 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5437 ; GFX7-NEXT: s_mov_b32 s6, s4
5438 ; GFX7-NEXT: s_mov_b32 s7, s5
5439 ; GFX7-NEXT: s_mov_b32 s11, s12
5440 ; GFX7-NEXT: s_mov_b32 s10, s13
5441 ; GFX7-NEXT: s_add_u32 s6, s6, s11
5442 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
5443 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5444 ; GFX7-NEXT: s_mov_b32 s7, s10
5445 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
5446 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
5447 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5448 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
5449 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
5450 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
5451 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5452 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
5453 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
5454 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5455 ; GFX7-NEXT: flat_store_dword v[0:1], v2
5456 ; GFX7-NEXT: s_endpgm
5458 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5459 ; GFX10-WGP: ; %bb.0: ; %entry
5460 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
5461 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5462 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
5463 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
5464 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
5465 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
5466 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
5467 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
5468 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
5469 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
5470 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
5471 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
5472 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5473 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
5474 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
5475 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
5476 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5477 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
5478 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
5479 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
5480 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5481 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
5482 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
5483 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5484 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
5485 ; GFX10-WGP-NEXT: s_endpgm
5487 ; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5488 ; GFX10-CU: ; %bb.0: ; %entry
5489 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
5490 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5491 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
5492 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
5493 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
5494 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
5495 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
5496 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
5497 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
5498 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
5499 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
5500 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
5501 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5502 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
5503 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
5504 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
5505 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5506 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
5507 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
5508 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
5509 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5510 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
5511 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
5512 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5513 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
5514 ; GFX10-CU-NEXT: s_endpgm
5516 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5517 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
5518 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
5519 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
5520 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
5521 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
5522 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
5523 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5524 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
5525 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
5526 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
5527 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
5528 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
5529 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
5530 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
5531 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
5532 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
5533 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
5534 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5535 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
5536 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
5537 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
5538 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5539 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
5540 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
5541 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5542 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
5543 ; SKIP-CACHE-INV-NEXT: s_endpgm
5545 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5546 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
5547 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5548 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5549 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5550 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5551 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5552 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5553 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5554 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5555 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5556 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5557 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5558 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5559 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
5560 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
5562 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5563 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
5564 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5565 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5566 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5567 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5568 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5569 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5570 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5571 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5572 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5573 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5574 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5575 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5576 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
5577 ; GFX90A-TGSPLIT-NEXT: s_endpgm
5579 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5580 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
5581 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5582 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5583 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5584 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5585 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5586 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5587 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5588 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5589 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5590 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5591 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5592 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5593 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
5594 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
5596 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5597 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
5598 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5599 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5600 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5601 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5602 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5603 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5604 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5605 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5606 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5607 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5608 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5609 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5610 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
5611 ; GFX940-TGSPLIT-NEXT: s_endpgm
5613 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5614 ; GFX11-WGP: ; %bb.0: ; %entry
5615 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5616 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5617 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5618 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
5619 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
5620 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
5621 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5622 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
5623 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
5624 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
5625 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5626 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
5627 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
5628 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5629 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
5630 ; GFX11-WGP-NEXT: s_endpgm
5632 ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5633 ; GFX11-CU: ; %bb.0: ; %entry
5634 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5635 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5636 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5637 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5638 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
5639 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
5640 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5641 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
5642 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
5643 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
5644 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5645 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
5646 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
5647 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5648 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
5649 ; GFX11-CU-NEXT: s_endpgm
5651 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5652 ; GFX12-WGP: ; %bb.0: ; %entry
5653 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5654 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5655 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5656 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
5657 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
5658 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
5659 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5660 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
5661 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
5662 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
5663 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
5664 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
5665 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
5666 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
5667 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
5668 ; GFX12-WGP-NEXT: s_endpgm
5670 ; GFX12-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
5671 ; GFX12-CU: ; %bb.0: ; %entry
5672 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5673 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5674 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5675 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
5676 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
5677 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
5678 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5679 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
5680 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
5681 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
5682 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
5683 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
5684 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
5685 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
5686 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
5687 ; GFX12-CU-NEXT: s_endpgm
5688 ptr %out, i32 %in, i32 %old) {
5690 %gep = getelementptr i32, ptr %out, i32 4
5691 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic monotonic
5692 %val0 = extractvalue { i32, i1 } %val, 0
5693 store i32 %val0, ptr %out, align 4
; Test kernel: volatile cmpxchg through a plain `ptr` at workgroup scope with
; acquire success ordering and monotonic failure ordering. The value returned
; by the cmpxchg is stored back to %out. The expected output for every target
; uses a flat_atomic_cmpswap instruction that returns the old value.
;
; The per-target assertion lines inside the signature below are autogenerated
; (see the NOTE at the top of this file); regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
5697 define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
5698 ; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5699 ; GFX7: ; %bb.0: ; %entry
5700 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
5701 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5702 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
5703 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
5704 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
5705 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5706 ; GFX7-NEXT: s_mov_b32 s6, s4
5707 ; GFX7-NEXT: s_mov_b32 s7, s5
5708 ; GFX7-NEXT: s_mov_b32 s11, s12
5709 ; GFX7-NEXT: s_mov_b32 s10, s13
5710 ; GFX7-NEXT: s_add_u32 s6, s6, s11
5711 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
5712 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5713 ; GFX7-NEXT: s_mov_b32 s7, s10
5714 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
5715 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
5716 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5717 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
5718 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
5719 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
5720 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5721 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5722 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
5723 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
5724 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5725 ; GFX7-NEXT: flat_store_dword v[0:1], v2
5726 ; GFX7-NEXT: s_endpgm
5728 ; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5729 ; GFX10-WGP: ; %bb.0: ; %entry
5730 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
5731 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5732 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
5733 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
5734 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
5735 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
5736 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
5737 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
5738 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
5739 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
5740 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
5741 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
5742 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5743 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
5744 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
5745 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
5746 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5747 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
5748 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
5749 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
5750 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5751 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5752 ; GFX10-WGP-NEXT: buffer_gl0_inv
5753 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
5754 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
5755 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
5756 ; GFX10-WGP-NEXT: s_endpgm
5758 ; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5759 ; GFX10-CU: ; %bb.0: ; %entry
5760 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
5761 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5762 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
5763 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
5764 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
5765 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
5766 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
5767 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
5768 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
5769 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
5770 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
5771 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
5772 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5773 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
5774 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
5775 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
5776 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5777 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
5778 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
5779 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
5780 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5781 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
5782 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
5783 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
5784 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
5785 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
5786 ; GFX10-CU-NEXT: s_endpgm
5788 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5789 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
5790 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
5791 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
5792 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
5793 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
5794 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
5795 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5796 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
5797 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
5798 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
5799 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
5800 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
5801 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
5802 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
5803 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
5804 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
5805 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
5806 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5807 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
5808 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
5809 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
5810 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
5811 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
5812 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
5813 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
5814 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
5815 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
5816 ; SKIP-CACHE-INV-NEXT: s_endpgm
5818 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5819 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
5820 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5821 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5822 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5823 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5824 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5825 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5826 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5827 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5828 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5829 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5830 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5831 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5832 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
5833 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
5834 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
5836 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5837 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
5838 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
5839 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
5840 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
5841 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5842 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
5843 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
5844 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5845 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5846 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5847 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
5848 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5849 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
5850 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
5851 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
5852 ; GFX90A-TGSPLIT-NEXT: s_endpgm
5854 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5855 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
5856 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5857 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5858 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5859 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5860 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5861 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5862 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5863 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5864 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5865 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5866 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5867 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5868 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
5869 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
5870 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
5872 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5873 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
5874 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
5875 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
5876 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
5877 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
5878 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
5879 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
5880 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5881 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
5882 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5883 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
5884 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
5885 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
5886 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
5887 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
5888 ; GFX940-TGSPLIT-NEXT: s_endpgm
5890 ; GFX11-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5891 ; GFX11-WGP: ; %bb.0: ; %entry
5892 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5893 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5894 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5895 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
5896 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
5897 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
5898 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5899 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
5900 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
5901 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
5902 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5903 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
5904 ; GFX11-WGP-NEXT: buffer_gl0_inv
5905 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
5906 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
5907 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
5908 ; GFX11-WGP-NEXT: s_endpgm
5910 ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5911 ; GFX11-CU: ; %bb.0: ; %entry
5912 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5913 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5914 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5915 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5916 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
5917 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
5918 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5919 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
5920 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
5921 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
5922 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
5923 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
5924 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
5925 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
5926 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
5927 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
5928 ; GFX11-CU-NEXT: s_endpgm
5930 ; GFX12-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5931 ; GFX12-WGP: ; %bb.0: ; %entry
5932 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5933 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
5934 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
5935 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
5936 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
5937 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
5938 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5939 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
5940 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
5941 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
5942 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
5943 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
5944 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
5945 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
5946 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
5947 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
5948 ; GFX12-WGP-NEXT: s_endpgm
5950 ; GFX12-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
5951 ; GFX12-CU: ; %bb.0: ; %entry
5952 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5953 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
5954 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
5955 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
5956 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
5957 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
5958 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5959 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
5960 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
5961 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
5962 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
5963 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
5964 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
5965 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
5966 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
5967 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
5968 ; GFX12-CU-NEXT: s_endpgm
5969 ptr %out, i32 %in, i32 %old) {
; Address of i32 element 4 of %out, i.e. byte offset 16; this is the constant
; 16 / offset:16 that appears in the expected output above.
5971 %gep = getelementptr i32, ptr %out, i32 4
; %old is the compare operand and %in is the replacement value.
5972 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire monotonic
; Keep only field 0 of the { i32, i1 } result and store it to %out itself
; (not to %gep).
5973 %val0 = extractvalue { i32, i1 } %val, 0
5974 store i32 %val0, ptr %out, align 4
5978 define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
5979 ; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
5980 ; GFX7: ; %bb.0: ; %entry
5981 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
5982 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
5983 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
5984 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
5985 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
5986 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
5987 ; GFX7-NEXT: s_mov_b32 s6, s4
5988 ; GFX7-NEXT: s_mov_b32 s7, s5
5989 ; GFX7-NEXT: s_mov_b32 s11, s12
5990 ; GFX7-NEXT: s_mov_b32 s10, s13
5991 ; GFX7-NEXT: s_add_u32 s6, s6, s11
5992 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
5993 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
5994 ; GFX7-NEXT: s_mov_b32 s7, s10
5995 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
5996 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
5997 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
5998 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
5999 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
6000 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
6001 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6002 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6003 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
6004 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
6005 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6006 ; GFX7-NEXT: flat_store_dword v[0:1], v2
6007 ; GFX7-NEXT: s_endpgm
6009 ; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6010 ; GFX10-WGP: ; %bb.0: ; %entry
6011 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
6012 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6013 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
6014 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
6015 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
6016 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
6017 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
6018 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
6019 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
6020 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
6021 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
6022 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
6023 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6024 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
6025 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
6026 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
6027 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6028 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
6029 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
6030 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
6031 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6032 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
6033 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6034 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
6035 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
6036 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6037 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
6038 ; GFX10-WGP-NEXT: s_endpgm
6040 ; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6041 ; GFX10-CU: ; %bb.0: ; %entry
6042 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
6043 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6044 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
6045 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
6046 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
6047 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6048 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
6049 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
6050 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
6051 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
6052 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
6053 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
6054 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6055 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
6056 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
6057 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
6058 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6059 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
6060 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
6061 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
6062 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6063 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6064 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
6065 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
6066 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6067 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
6068 ; GFX10-CU-NEXT: s_endpgm
6070 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6071 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
6072 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
6073 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
6074 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
6075 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
6076 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
6077 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6078 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
6079 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
6080 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
6081 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
6082 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
6083 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
6084 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6085 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
6086 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
6087 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
6088 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6089 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
6090 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
6091 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
6092 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6093 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6094 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
6095 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
6096 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6097 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
6098 ; SKIP-CACHE-INV-NEXT: s_endpgm
6100 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6101 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
6102 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6103 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6104 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6105 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6106 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6107 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6108 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6109 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6110 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6111 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6112 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6113 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6114 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6115 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
6116 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
6118 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6119 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
6120 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6121 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6122 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6123 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6124 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6125 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6126 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6127 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6128 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6129 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6130 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6131 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6132 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
6133 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
6134 ; GFX90A-TGSPLIT-NEXT: s_endpgm
6136 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6137 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
6138 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6139 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
6140 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
6141 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6142 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
6143 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
6144 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6145 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6146 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6147 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6148 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6149 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6150 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6151 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
6152 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
6154 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6155 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
6156 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6157 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
6158 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
6159 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6160 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
6161 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
6162 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6163 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6164 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6165 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6166 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6167 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6168 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
6169 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
6170 ; GFX940-TGSPLIT-NEXT: s_endpgm
6172 ; GFX11-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6173 ; GFX11-WGP: ; %bb.0: ; %entry
6174 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6175 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
6176 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
6177 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
6178 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
6179 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
6180 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6181 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
6182 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
6183 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
6184 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6185 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
6186 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6187 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
6188 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
6189 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6190 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
6191 ; GFX11-WGP-NEXT: s_endpgm
6193 ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6194 ; GFX11-CU: ; %bb.0: ; %entry
6195 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6196 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
6197 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
6198 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6199 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
6200 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
6201 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6202 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
6203 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
6204 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
6205 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6206 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6207 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
6208 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
6209 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6210 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
6211 ; GFX11-CU-NEXT: s_endpgm
6213 ; GFX12-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6214 ; GFX12-WGP: ; %bb.0: ; %entry
6215 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6216 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
6217 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
6218 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
6219 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
6220 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
6221 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6222 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
6223 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
6224 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
6225 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
6226 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
6227 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
6228 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
6229 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
6230 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
6231 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
6232 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
6233 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
6234 ; GFX12-WGP-NEXT: s_endpgm
6236 ; GFX12-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
6237 ; GFX12-CU: ; %bb.0: ; %entry
6238 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6239 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
6240 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
6241 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
6242 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
6243 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
6244 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6245 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
6246 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
6247 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
6248 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
6249 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6250 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
6251 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
6252 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
6253 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
6254 ; GFX12-CU-NEXT: s_endpgm
6255 ptr %out, i32 %in, i32 %old) {
6257 %gep = getelementptr i32, ptr %out, i32 4
6258 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release monotonic
6259 %val0 = extractvalue { i32, i1 } %val, 0
6260 store i32 %val0, ptr %out, align 4
; Kernel under test: a volatile i32 cmpxchg at syncscope("workgroup") with
; acq_rel success ordering and monotonic failure ordering, performed on the
; address %out + 4 i32 elements. Field 0 of the cmpxchg result is then stored
; to %out, so the result of the atomic is live.
;
; In the expectations below, the runs for WGP-mode gfx10/gfx11/gfx12 and for
; tgsplit gfx90a/gfx940 have one extra instruction after the atomic
; (buffer_gl0_inv, global_inv, buffer_wbinvl1_vol or buffer_inv sc0); the
; remaining runs only have wait instructions around the atomic.
6264 define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
6265 ; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6266 ; GFX7: ; %bb.0: ; %entry
6267 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
6268 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6269 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
6270 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
6271 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
6272 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6273 ; GFX7-NEXT: s_mov_b32 s6, s4
6274 ; GFX7-NEXT: s_mov_b32 s7, s5
6275 ; GFX7-NEXT: s_mov_b32 s11, s12
6276 ; GFX7-NEXT: s_mov_b32 s10, s13
6277 ; GFX7-NEXT: s_add_u32 s6, s6, s11
6278 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
6279 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6280 ; GFX7-NEXT: s_mov_b32 s7, s10
6281 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
6282 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
6283 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6284 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
6285 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
6286 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
6287 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6288 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6289 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6290 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
6291 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
6292 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6293 ; GFX7-NEXT: flat_store_dword v[0:1], v2
6294 ; GFX7-NEXT: s_endpgm
6296 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6297 ; GFX10-WGP: ; %bb.0: ; %entry
6298 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
6299 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6300 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
6301 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
6302 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
6303 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
6304 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
6305 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
6306 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
6307 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
6308 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
6309 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
6310 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6311 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
6312 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
6313 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
6314 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6315 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
6316 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
6317 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
6318 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6319 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
6320 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6321 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6322 ; GFX10-WGP-NEXT: buffer_gl0_inv
6323 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
6324 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
6325 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
6326 ; GFX10-WGP-NEXT: s_endpgm
6328 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6329 ; GFX10-CU: ; %bb.0: ; %entry
6330 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
6331 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6332 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
6333 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
6334 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
6335 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6336 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
6337 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
6338 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
6339 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
6340 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
6341 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
6342 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6343 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
6344 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
6345 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
6346 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6347 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
6348 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
6349 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
6350 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6351 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6352 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6353 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
6354 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
6355 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
6356 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
6357 ; GFX10-CU-NEXT: s_endpgm
6359 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6360 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
6361 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
6362 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
6363 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
6364 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
6365 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
6366 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6367 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
6368 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
6369 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
6370 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
6371 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
6372 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
6373 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6374 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
6375 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
6376 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
6377 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6378 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
6379 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
6380 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
6381 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6382 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6383 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6384 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
6385 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
6386 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
6387 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
6388 ; SKIP-CACHE-INV-NEXT: s_endpgm
6390 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6391 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
6392 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6393 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6394 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6395 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6396 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6397 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6398 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6399 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6400 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6401 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6402 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6403 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6404 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6405 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
6406 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
6407 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
6409 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6410 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
6411 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6412 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6413 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6414 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6415 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6416 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6417 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6418 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6419 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6420 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6421 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6422 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
6423 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
6424 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6425 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
6426 ; GFX90A-TGSPLIT-NEXT: s_endpgm
6428 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6429 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
6430 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6431 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
6432 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
6433 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6434 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
6435 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
6436 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6437 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6438 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6439 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6440 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6441 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6442 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6443 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
6444 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
6445 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
6447 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6448 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
6449 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6450 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
6451 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
6452 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6453 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
6454 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
6455 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6456 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6457 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6458 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6459 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6460 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
6461 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
6462 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6463 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
6464 ; GFX940-TGSPLIT-NEXT: s_endpgm
6466 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6467 ; GFX11-WGP: ; %bb.0: ; %entry
6468 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6469 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
6470 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
6471 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
6472 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
6473 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
6474 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6475 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
6476 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
6477 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
6478 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6479 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
6480 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6481 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6482 ; GFX11-WGP-NEXT: buffer_gl0_inv
6483 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
6484 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
6485 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
6486 ; GFX11-WGP-NEXT: s_endpgm
6488 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6489 ; GFX11-CU: ; %bb.0: ; %entry
6490 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6491 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
6492 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
6493 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6494 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
6495 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
6496 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6497 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
6498 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
6499 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
6500 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6501 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6502 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6503 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
6504 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
6505 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
6506 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
6507 ; GFX11-CU-NEXT: s_endpgm
6509 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6510 ; GFX12-WGP: ; %bb.0: ; %entry
6511 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6512 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
6513 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
6514 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
6515 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
6516 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
6517 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6518 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
6519 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
6520 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
6521 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
6522 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
6523 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
6524 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
6525 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
6526 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
6527 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
6528 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
6529 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
6530 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
6531 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
6532 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
6533 ; GFX12-WGP-NEXT: s_endpgm
6535 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
6536 ; GFX12-CU: ; %bb.0: ; %entry
6537 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6538 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
6539 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
6540 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
6541 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
6542 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
6543 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6544 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
6545 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
6546 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
6547 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
6548 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6549 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
6550 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
6551 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
6552 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
6553 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
6554 ; GFX12-CU-NEXT: s_endpgm
6555 ptr %out, i32 %in, i32 %old) {
; Element 4 of the i32 array at %out, i.e. 16 bytes past %out; this is the
; constant 16 that appears in the expected code above.
6557 %gep = getelementptr i32, ptr %out, i32 4
; Compare the i32 at %gep with %old and, on a match, replace it with %in.
6558 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel monotonic
; Field 0 of the { i32, i1 } result is the i32 value; field 1 (i1) is unused.
6559 %val0 = extractvalue { i32, i1 } %val, 0
; Store that value to %out so the result of the atomic is used.
6560 store i32 %val0, ptr %out, align 4
; Kernel under test: a volatile i32 cmpxchg at syncscope("workgroup") with
; seq_cst success ordering and monotonic failure ordering, performed on the
; address %out + 4 i32 elements. Field 0 of the cmpxchg result is then stored
; to %out, so the result of the atomic is live.
;
; In the expectations below, the runs for WGP-mode gfx10/gfx11/gfx12 and for
; tgsplit gfx90a/gfx940 have one extra instruction after the atomic
; (buffer_gl0_inv, global_inv, buffer_wbinvl1_vol or buffer_inv sc0); the
; remaining runs only have wait instructions around the atomic.
6564 define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
6565 ; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6566 ; GFX7: ; %bb.0: ; %entry
6567 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
6568 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6569 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
6570 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
6571 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
6572 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6573 ; GFX7-NEXT: s_mov_b32 s6, s4
6574 ; GFX7-NEXT: s_mov_b32 s7, s5
6575 ; GFX7-NEXT: s_mov_b32 s11, s12
6576 ; GFX7-NEXT: s_mov_b32 s10, s13
6577 ; GFX7-NEXT: s_add_u32 s6, s6, s11
6578 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
6579 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6580 ; GFX7-NEXT: s_mov_b32 s7, s10
6581 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
6582 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
6583 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6584 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
6585 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
6586 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
6587 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6588 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6589 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6590 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
6591 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
6592 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6593 ; GFX7-NEXT: flat_store_dword v[0:1], v2
6594 ; GFX7-NEXT: s_endpgm
6596 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6597 ; GFX10-WGP: ; %bb.0: ; %entry
6598 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
6599 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6600 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
6601 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
6602 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
6603 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
6604 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
6605 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
6606 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
6607 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
6608 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
6609 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
6610 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6611 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
6612 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
6613 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
6614 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6615 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
6616 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
6617 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
6618 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6619 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
6620 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6621 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6622 ; GFX10-WGP-NEXT: buffer_gl0_inv
6623 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
6624 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
6625 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
6626 ; GFX10-WGP-NEXT: s_endpgm
6628 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6629 ; GFX10-CU: ; %bb.0: ; %entry
6630 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
6631 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6632 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
6633 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
6634 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
6635 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6636 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
6637 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
6638 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
6639 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
6640 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
6641 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
6642 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6643 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
6644 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
6645 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
6646 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6647 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
6648 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
6649 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
6650 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6651 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6652 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6653 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
6654 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
6655 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
6656 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
6657 ; GFX10-CU-NEXT: s_endpgm
6659 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6660 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
6661 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
6662 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
6663 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
6664 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
6665 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
6666 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6667 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
6668 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
6669 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
6670 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
6671 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
6672 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
6673 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6674 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
6675 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
6676 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
6677 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6678 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
6679 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
6680 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
6681 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6682 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6683 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6684 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
6685 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
6686 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
6687 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
6688 ; SKIP-CACHE-INV-NEXT: s_endpgm
6690 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6691 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
6692 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6693 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6694 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6695 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6696 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6697 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6698 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6699 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6700 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6701 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6702 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6703 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6704 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6705 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
6706 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
6707 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
6709 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6710 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
6711 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6712 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6713 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6714 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6715 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6716 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6717 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6718 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6719 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6720 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6721 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6722 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
6723 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
6724 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6725 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
6726 ; GFX90A-TGSPLIT-NEXT: s_endpgm
6728 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6729 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
6730 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6731 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
6732 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
6733 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6734 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
6735 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
6736 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6737 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6738 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6739 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6740 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6741 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6742 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6743 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
6744 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
6745 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
6747 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6748 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
6749 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
6750 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
6751 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
6752 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6753 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
6754 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
6755 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6756 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6757 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6758 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6759 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
6760 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
6761 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
6762 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
6763 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
6764 ; GFX940-TGSPLIT-NEXT: s_endpgm
6766 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6767 ; GFX11-WGP: ; %bb.0: ; %entry
6768 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6769 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
6770 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
6771 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
6772 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
6773 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
6774 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6775 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
6776 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
6777 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
6778 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6779 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
6780 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6781 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6782 ; GFX11-WGP-NEXT: buffer_gl0_inv
6783 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
6784 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
6785 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
6786 ; GFX11-WGP-NEXT: s_endpgm
6788 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6789 ; GFX11-CU: ; %bb.0: ; %entry
6790 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6791 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
6792 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
6793 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6794 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
6795 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
6796 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6797 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
6798 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
6799 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
6800 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6801 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
6802 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
6803 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
6804 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
6805 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
6806 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
6807 ; GFX11-CU-NEXT: s_endpgm
6809 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6810 ; GFX12-WGP: ; %bb.0: ; %entry
6811 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6812 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
6813 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
6814 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
6815 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
6816 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
6817 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6818 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
6819 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
6820 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
6821 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
6822 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
6823 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
6824 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
6825 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
6826 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
6827 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
6828 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
6829 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
6830 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
6831 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
6832 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
6833 ; GFX12-WGP-NEXT: s_endpgm
6835 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
6836 ; GFX12-CU: ; %bb.0: ; %entry
6837 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
6838 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
6839 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
6840 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
6841 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
6842 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
6843 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6844 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
6845 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
6846 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
6847 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
6848 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
6849 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
6850 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
6851 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
6852 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
6853 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
6854 ; GFX12-CU-NEXT: s_endpgm
6855 ptr %out, i32 %in, i32 %old) {
; Element 4 of the i32 array at %out, i.e. 16 bytes past %out; this is the
; constant 16 that appears in the expected code above.
6857 %gep = getelementptr i32, ptr %out, i32 4
; Compare the i32 at %gep with %old and, on a match, replace it with %in.
6858 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst monotonic
; Field 0 of the { i32, i1 } result is the i32 value; field 1 (i1) is unused.
6859 %val0 = extractvalue { i32, i1 } %val, 0
; Store that value to %out so the result of the atomic is used.
6860 store i32 %val0, ptr %out, align 4
; Returning cmpxchg at syncscope("workgroup") with monotonic success ordering
; and acquire failure ordering. The compare-and-swap is performed on the i32
; located four elements past %out (visible as the constant 16 / offset:16 in
; the expected output below), and the value it loaded is stored back to %out.
; The per-target expectations that follow are autogenerated (see the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
6864 define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
6865 ; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6866 ; GFX7: ; %bb.0: ; %entry
6867 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
6868 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6869 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
6870 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
6871 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
6872 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6873 ; GFX7-NEXT: s_mov_b32 s6, s4
6874 ; GFX7-NEXT: s_mov_b32 s7, s5
6875 ; GFX7-NEXT: s_mov_b32 s11, s12
6876 ; GFX7-NEXT: s_mov_b32 s10, s13
6877 ; GFX7-NEXT: s_add_u32 s6, s6, s11
6878 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
6879 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6880 ; GFX7-NEXT: s_mov_b32 s7, s10
6881 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
6882 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
6883 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6884 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
6885 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
6886 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
6887 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6888 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
6889 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
6890 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
6891 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6892 ; GFX7-NEXT: flat_store_dword v[0:1], v2
6893 ; GFX7-NEXT: s_endpgm
6895 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6896 ; GFX10-WGP: ; %bb.0: ; %entry
6897 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
6898 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6899 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
6900 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
6901 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
6902 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
6903 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
6904 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
6905 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
6906 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
6907 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
6908 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
6909 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6910 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
6911 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
6912 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
6913 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6914 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
6915 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
6916 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
6917 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6918 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
6919 ; GFX10-WGP-NEXT: buffer_gl0_inv
6920 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
6921 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
6922 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
6923 ; GFX10-WGP-NEXT: s_endpgm
6925 ; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6926 ; GFX10-CU: ; %bb.0: ; %entry
6927 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
6928 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
6929 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
6930 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
6931 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
6932 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6933 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
6934 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
6935 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
6936 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
6937 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
6938 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
6939 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
6940 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
6941 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
6942 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
6943 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6944 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
6945 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
6946 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
6947 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6948 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
6949 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
6950 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
6951 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
6952 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
6953 ; GFX10-CU-NEXT: s_endpgm
6955 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6956 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
6957 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
6958 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
6959 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
6960 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
6961 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
6962 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6963 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
6964 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
6965 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
6966 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
6967 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
6968 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
6969 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
6970 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
6971 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
6972 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
6973 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6974 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
6975 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
6976 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
6977 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
6978 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
6979 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
6980 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
6981 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
6982 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
6983 ; SKIP-CACHE-INV-NEXT: s_endpgm
6985 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
6986 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
6987 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
6988 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
6989 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
6990 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6991 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
6992 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
6993 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
6994 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
6995 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6996 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
6997 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
6998 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
6999 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7000 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
7001 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
7003 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7004 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
7005 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7006 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7007 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7008 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7009 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7010 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7011 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7012 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7013 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7014 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7015 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7016 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
7017 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7018 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
7019 ; GFX90A-TGSPLIT-NEXT: s_endpgm
7021 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7022 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
7023 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7024 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7025 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7026 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7027 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7028 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7029 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7030 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7031 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7032 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7033 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7034 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7035 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7036 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7037 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
7039 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7040 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
7041 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7042 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7043 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7044 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7045 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7046 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7047 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7048 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7049 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7050 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7051 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7052 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
7053 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7054 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7055 ; GFX940-TGSPLIT-NEXT: s_endpgm
7057 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7058 ; GFX11-WGP: ; %bb.0: ; %entry
7059 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7060 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7061 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7062 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
7063 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
7064 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
7065 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7066 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
7067 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7068 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7069 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7070 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7071 ; GFX11-WGP-NEXT: buffer_gl0_inv
7072 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7073 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7074 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
7075 ; GFX11-WGP-NEXT: s_endpgm
7077 ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7078 ; GFX11-CU: ; %bb.0: ; %entry
7079 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7080 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7081 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7082 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7083 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
7084 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
7085 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7086 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
7087 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7088 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7089 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7090 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7091 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7092 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7093 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
7094 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
7095 ; GFX11-CU-NEXT: s_endpgm
7097 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7098 ; GFX12-WGP: ; %bb.0: ; %entry
7099 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7100 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7101 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7102 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
7103 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
7104 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
7105 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7106 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
7107 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7108 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7109 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7110 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
7111 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
7112 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
7113 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
7114 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7115 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7116 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
7117 ; GFX12-WGP-NEXT: s_endpgm
7119 ; GFX12-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
7120 ; GFX12-CU: ; %bb.0: ; %entry
7121 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7122 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7123 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7124 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
7125 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
7126 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
7127 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7128 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
7129 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
7130 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
7131 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7132 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
7133 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
7134 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
7135 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
7136 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
7137 ; GFX12-CU-NEXT: s_endpgm
7138 ptr %out, i32 %in, i32 %old) {
; Address of the i32 at element index 4 past %out (a 16-byte offset).
7140 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: %old is the comparand and %in is the replacement.
7141 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic acquire
; Only field 0 of the result pair is consumed; field 1 is never read here.
7142 %val0 = extractvalue { i32, i1 } %val, 0
7143 store i32 %val0, ptr %out, align 4
; Returning cmpxchg at syncscope("workgroup") with acquire ordering on both
; the success and the failure path. The compare-and-swap is performed on the
; i32 located four elements past %out (visible as the constant 16 / offset:16
; in the expected output below), and the value it loaded is stored back to
; %out.
; The per-target expectations that follow are autogenerated (see the first
; line of this file); regenerate them with utils/update_llc_test_checks.py
; instead of editing them by hand.
; Reviewer remark: the GFX12 workgroup-processor-mode expectation below waits
; only with s_wait_loadcnt_dscnt before global_inv, whereas the
; monotonic/acquire variant in this file also expects s_wait_bvhcnt and
; s_wait_samplecnt. Presumably this follows from the acquire success ordering
; used here; confirm by regenerating, do not "align" the two by hand.
7147 define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
7148 ; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7149 ; GFX7: ; %bb.0: ; %entry
7150 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
7151 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7152 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
7153 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
7154 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
7155 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7156 ; GFX7-NEXT: s_mov_b32 s6, s4
7157 ; GFX7-NEXT: s_mov_b32 s7, s5
7158 ; GFX7-NEXT: s_mov_b32 s11, s12
7159 ; GFX7-NEXT: s_mov_b32 s10, s13
7160 ; GFX7-NEXT: s_add_u32 s6, s6, s11
7161 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
7162 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7163 ; GFX7-NEXT: s_mov_b32 s7, s10
7164 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
7165 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
7166 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7167 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
7168 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
7169 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
7170 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7171 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7172 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
7173 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
7174 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7175 ; GFX7-NEXT: flat_store_dword v[0:1], v2
7176 ; GFX7-NEXT: s_endpgm
7178 ; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7179 ; GFX10-WGP: ; %bb.0: ; %entry
7180 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
7181 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7182 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
7183 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
7184 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
7185 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
7186 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
7187 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
7188 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
7189 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
7190 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
7191 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
7192 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7193 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
7194 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
7195 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
7196 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7197 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
7198 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
7199 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
7200 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7201 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7202 ; GFX10-WGP-NEXT: buffer_gl0_inv
7203 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
7204 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
7205 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
7206 ; GFX10-WGP-NEXT: s_endpgm
7208 ; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7209 ; GFX10-CU: ; %bb.0: ; %entry
7210 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
7211 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7212 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
7213 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
7214 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
7215 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7216 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
7217 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
7218 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
7219 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
7220 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
7221 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
7222 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7223 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
7224 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
7225 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
7226 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7227 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
7228 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
7229 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
7230 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7231 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7232 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
7233 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
7234 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
7235 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
7236 ; GFX10-CU-NEXT: s_endpgm
7238 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7239 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
7240 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
7241 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
7242 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
7243 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
7244 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
7245 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7246 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
7247 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
7248 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
7249 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
7250 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
7251 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
7252 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7253 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
7254 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
7255 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
7256 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7257 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
7258 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
7259 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
7260 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7261 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7262 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
7263 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
7264 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
7265 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
7266 ; SKIP-CACHE-INV-NEXT: s_endpgm
7268 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7269 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
7270 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7271 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7272 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7273 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7274 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7275 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7276 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7277 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7278 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7279 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7280 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7281 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7282 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7283 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
7284 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
7286 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7287 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
7288 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7289 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7290 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7291 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7292 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7293 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7294 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7295 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7296 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7297 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7298 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7299 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
7300 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7301 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
7302 ; GFX90A-TGSPLIT-NEXT: s_endpgm
7304 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7305 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
7306 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7307 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7308 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7309 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7310 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7311 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7312 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7313 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7314 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7315 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7316 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7317 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7318 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7319 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7320 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
7322 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7323 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
7324 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7325 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7326 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7327 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7328 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7329 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7330 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7331 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7332 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7333 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7334 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7335 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
7336 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7337 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7338 ; GFX940-TGSPLIT-NEXT: s_endpgm
7340 ; GFX11-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7341 ; GFX11-WGP: ; %bb.0: ; %entry
7342 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7343 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7344 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7345 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
7346 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
7347 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
7348 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7349 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
7350 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7351 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7352 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7353 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7354 ; GFX11-WGP-NEXT: buffer_gl0_inv
7355 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7356 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7357 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
7358 ; GFX11-WGP-NEXT: s_endpgm
7360 ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7361 ; GFX11-CU: ; %bb.0: ; %entry
7362 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7363 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7364 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7365 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7366 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
7367 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
7368 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7369 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
7370 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7371 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7372 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7373 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7374 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7375 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7376 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
7377 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
7378 ; GFX11-CU-NEXT: s_endpgm
7380 ; GFX12-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7381 ; GFX12-WGP: ; %bb.0: ; %entry
7382 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7383 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7384 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7385 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
7386 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
7387 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
7388 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7389 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
7390 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7391 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7392 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7393 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
7394 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
7395 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7396 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7397 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
7398 ; GFX12-WGP-NEXT: s_endpgm
7400 ; GFX12-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
7401 ; GFX12-CU: ; %bb.0: ; %entry
7402 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7403 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7404 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7405 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
7406 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
7407 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
7408 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7409 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
7410 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
7411 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
7412 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7413 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
7414 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
7415 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
7416 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
7417 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
7418 ; GFX12-CU-NEXT: s_endpgm
7419 ptr %out, i32 %in, i32 %old) {
; Address of the i32 at element index 4 past %out (a 16-byte offset).
7421 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: %old is the comparand and %in is the replacement.
7422 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire acquire
; Only field 0 of the result pair is consumed; field 1 is never read here.
7423 %val0 = extractvalue { i32, i1 } %val, 0
7424 store i32 %val0, ptr %out, align 4
7428 define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
7429 ; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7430 ; GFX7: ; %bb.0: ; %entry
7431 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
7432 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7433 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
7434 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
7435 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
7436 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7437 ; GFX7-NEXT: s_mov_b32 s6, s4
7438 ; GFX7-NEXT: s_mov_b32 s7, s5
7439 ; GFX7-NEXT: s_mov_b32 s11, s12
7440 ; GFX7-NEXT: s_mov_b32 s10, s13
7441 ; GFX7-NEXT: s_add_u32 s6, s6, s11
7442 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
7443 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7444 ; GFX7-NEXT: s_mov_b32 s7, s10
7445 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
7446 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
7447 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7448 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
7449 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
7450 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
7451 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7452 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7453 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7454 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
7455 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
7456 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7457 ; GFX7-NEXT: flat_store_dword v[0:1], v2
7458 ; GFX7-NEXT: s_endpgm
7460 ; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7461 ; GFX10-WGP: ; %bb.0: ; %entry
7462 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
7463 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7464 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
7465 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
7466 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
7467 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
7468 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
7469 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
7470 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
7471 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
7472 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
7473 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
7474 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7475 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
7476 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
7477 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
7478 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7479 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
7480 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
7481 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
7482 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7483 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
7484 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7485 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7486 ; GFX10-WGP-NEXT: buffer_gl0_inv
7487 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
7488 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
7489 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
7490 ; GFX10-WGP-NEXT: s_endpgm
7492 ; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7493 ; GFX10-CU: ; %bb.0: ; %entry
7494 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
7495 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7496 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
7497 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
7498 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
7499 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7500 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
7501 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
7502 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
7503 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
7504 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
7505 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
7506 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7507 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
7508 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
7509 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
7510 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7511 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
7512 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
7513 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
7514 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7515 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7516 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7517 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
7518 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
7519 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
7520 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
7521 ; GFX10-CU-NEXT: s_endpgm
7523 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7524 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
7525 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
7526 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
7527 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
7528 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
7529 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
7530 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7531 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
7532 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
7533 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
7534 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
7535 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
7536 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
7537 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7538 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
7539 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
7540 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
7541 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7542 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
7543 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
7544 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
7545 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7546 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7547 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7548 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
7549 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
7550 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
7551 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
7552 ; SKIP-CACHE-INV-NEXT: s_endpgm
7554 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7555 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
7556 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7557 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7558 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7559 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7560 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7561 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7562 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7563 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7564 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7565 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7566 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7567 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7568 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7569 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7570 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
7571 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
7573 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7574 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
7575 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7576 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7577 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7578 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7579 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7580 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7581 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7582 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7583 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7584 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7585 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7586 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7587 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
7588 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7589 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
7590 ; GFX90A-TGSPLIT-NEXT: s_endpgm
7592 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7593 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
7594 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7595 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7596 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7597 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7598 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7599 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7600 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7601 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7602 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7603 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7604 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7605 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7606 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7607 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7608 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7609 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
7611 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7612 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
7613 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7614 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7615 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7616 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7617 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7618 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7619 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7620 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7621 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7622 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7623 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7624 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7625 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
7626 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7627 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7628 ; GFX940-TGSPLIT-NEXT: s_endpgm
7630 ; GFX11-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7631 ; GFX11-WGP: ; %bb.0: ; %entry
7632 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7633 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7634 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7635 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
7636 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
7637 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
7638 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7639 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
7640 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7641 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7642 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7643 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
7644 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7645 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7646 ; GFX11-WGP-NEXT: buffer_gl0_inv
7647 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7648 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7649 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
7650 ; GFX11-WGP-NEXT: s_endpgm
7652 ; GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7653 ; GFX11-CU: ; %bb.0: ; %entry
7654 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7655 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7656 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7657 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7658 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
7659 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
7660 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7661 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
7662 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7663 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7664 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7665 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7666 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7667 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7668 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7669 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
7670 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
7671 ; GFX11-CU-NEXT: s_endpgm
7673 ; GFX12-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7674 ; GFX12-WGP: ; %bb.0: ; %entry
7675 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7676 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7677 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7678 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
7679 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
7680 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
7681 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7682 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
7683 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7684 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7685 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
7686 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
7687 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
7688 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
7689 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7690 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
7691 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
7692 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
7693 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
7694 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7695 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7696 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
7697 ; GFX12-WGP-NEXT: s_endpgm
7699 ; GFX12-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
7700 ; GFX12-CU: ; %bb.0: ; %entry
7701 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7702 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7703 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7704 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
7705 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
7706 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
7707 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7708 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
7709 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
7710 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
7711 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
7712 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
7713 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
7714 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
7715 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
7716 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
7717 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
7718 ; GFX12-CU-NEXT: s_endpgm
7719 ptr %out, i32 %in, i32 %old) {
7721 %gep = getelementptr i32, ptr %out, i32 4
7722 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release acquire
7723 %val0 = extractvalue { i32, i1 } %val, 0
7724 store i32 %val0, ptr %out, align 4
; Volatile cmpxchg at syncscope("workgroup") with acq_rel success ordering and
; acquire failure ordering, performed through a pointer in the default address
; space. The value loaded by the atomic is then stored to %out. The expected
; output below performs both accesses with flat_* instructions.
;
; The per-target check lines are autogenerated (the file header names
; utils/update_llc_test_checks.py); regenerate them rather than editing them
; by hand.
;
; In the expectations below, the WGP and tgsplit configurations follow the
; atomic with a cache invalidate (buffer_gl0_inv, buffer_wbinvl1_vol,
; buffer_inv sc0 or global_inv), while the remaining configurations only wait
; on counters around the atomic.
7728 define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
7729 ; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7730 ; GFX7: ; %bb.0: ; %entry
7731 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
7732 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7733 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
7734 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
7735 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
7736 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7737 ; GFX7-NEXT: s_mov_b32 s6, s4
7738 ; GFX7-NEXT: s_mov_b32 s7, s5
7739 ; GFX7-NEXT: s_mov_b32 s11, s12
7740 ; GFX7-NEXT: s_mov_b32 s10, s13
7741 ; GFX7-NEXT: s_add_u32 s6, s6, s11
7742 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
7743 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7744 ; GFX7-NEXT: s_mov_b32 s7, s10
7745 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
7746 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
7747 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7748 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
7749 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
7750 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
7751 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7752 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7753 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
7754 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
7755 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
7756 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7757 ; GFX7-NEXT: flat_store_dword v[0:1], v2
7758 ; GFX7-NEXT: s_endpgm
7760 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7761 ; GFX10-WGP: ; %bb.0: ; %entry
7762 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
7763 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7764 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
7765 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
7766 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
7767 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
7768 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
7769 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
7770 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
7771 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
7772 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
7773 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
7774 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7775 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
7776 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
7777 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
7778 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7779 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
7780 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
7781 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
7782 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7783 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
7784 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7785 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7786 ; GFX10-WGP-NEXT: buffer_gl0_inv
7787 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
7788 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
7789 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
7790 ; GFX10-WGP-NEXT: s_endpgm
7792 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7793 ; GFX10-CU: ; %bb.0: ; %entry
7794 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
7795 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
7796 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
7797 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
7798 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
7799 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7800 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
7801 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
7802 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
7803 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
7804 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
7805 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
7806 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
7807 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
7808 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
7809 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
7810 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7811 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
7812 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
7813 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
7814 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7815 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7816 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
7817 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
7818 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
7819 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
7820 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
7821 ; GFX10-CU-NEXT: s_endpgm
7823 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7824 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
7825 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
7826 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
7827 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
7828 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
7829 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
7830 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7831 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
7832 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
7833 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
7834 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
7835 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
7836 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
7837 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
7838 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
7839 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
7840 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
7841 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7842 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
7843 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
7844 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
7845 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7846 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
7847 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
7848 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
7849 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
7850 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
7851 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
7852 ; SKIP-CACHE-INV-NEXT: s_endpgm
7854 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7855 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
7856 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7857 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7858 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7859 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7860 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7861 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7862 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7863 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7864 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7865 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7866 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7867 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7868 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7869 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7870 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
7871 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
7873 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7874 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
7875 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
7876 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
7877 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
7878 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7879 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
7880 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
7881 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7882 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7883 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7884 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7885 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
7886 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7887 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
7888 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
7889 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
7890 ; GFX90A-TGSPLIT-NEXT: s_endpgm
7892 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7893 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
7894 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7895 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7896 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7897 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7898 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7899 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7900 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7901 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7902 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7903 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7904 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7905 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7906 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7907 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
7908 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7909 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
7911 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7912 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
7913 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
7914 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
7915 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
7916 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
7917 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
7918 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
7919 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7920 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
7921 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7922 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7923 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
7924 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
7925 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
7926 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
7927 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
7928 ; GFX940-TGSPLIT-NEXT: s_endpgm
7930 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7931 ; GFX11-WGP: ; %bb.0: ; %entry
7932 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7933 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7934 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7935 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
7936 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
7937 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
7938 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7939 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
7940 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7941 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7942 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7943 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
7944 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7945 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
7946 ; GFX11-WGP-NEXT: buffer_gl0_inv
7947 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
7948 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
7949 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
7950 ; GFX11-WGP-NEXT: s_endpgm
7952 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7953 ; GFX11-CU: ; %bb.0: ; %entry
7954 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7955 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
7956 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
7957 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7958 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
7959 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
7960 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7961 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
7962 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7963 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7964 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7965 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
7966 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
7967 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
7968 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
7969 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
7970 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
7971 ; GFX11-CU-NEXT: s_endpgm
7973 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
7974 ; GFX12-WGP: ; %bb.0: ; %entry
7975 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
7976 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
7977 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
7978 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
7979 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
7980 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
7981 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
7982 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
7983 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7984 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7985 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
7986 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
7987 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
7988 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
7989 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
7990 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
7991 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
7992 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
7993 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
7994 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
7995 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
7996 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
7997 ; GFX12-WGP-NEXT: s_endpgm
7999 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
8000 ; GFX12-CU: ; %bb.0: ; %entry
8001 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8002 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8003 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8004 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
8005 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
8006 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
8007 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8008 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
8009 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8010 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8011 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8012 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8013 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8014 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8015 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8016 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
8017 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
8018 ; GFX12-CU-NEXT: s_endpgm
8019 ptr %out, i32 %in, i32 %old) {
; Address of the i32 element at index 4 from %out; this is the 16-byte
; displacement seen as `offset:16` or an explicit add of 16 in the checks above.
8021 %gep = getelementptr i32, ptr %out, i32 4
; The atomic under test: compare the value at %gep with %old and, on a match,
; replace it with %in.
8022 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel acquire
; Field 0 of the { i32, i1 } result is the i32 value returned by the cmpxchg.
8023 %val0 = extractvalue { i32, i1 } %val, 0
; Store that value to %out (presumably so the returning form of the atomic is
; exercised; the checks above expect the glc / sc0 / TH_ATOMIC_RETURN forms).
8024 store i32 %val0, ptr %out, align 4
8028 define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
8029 ; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8030 ; GFX7: ; %bb.0: ; %entry
8031 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
8032 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8033 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
8034 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
8035 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
8036 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8037 ; GFX7-NEXT: s_mov_b32 s6, s4
8038 ; GFX7-NEXT: s_mov_b32 s7, s5
8039 ; GFX7-NEXT: s_mov_b32 s11, s12
8040 ; GFX7-NEXT: s_mov_b32 s10, s13
8041 ; GFX7-NEXT: s_add_u32 s6, s6, s11
8042 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
8043 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8044 ; GFX7-NEXT: s_mov_b32 s7, s10
8045 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
8046 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
8047 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8048 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
8049 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
8050 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
8051 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8052 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8053 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8054 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
8055 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
8056 ; GFX7-NEXT: s_waitcnt vmcnt(0)
8057 ; GFX7-NEXT: flat_store_dword v[0:1], v2
8058 ; GFX7-NEXT: s_endpgm
8060 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8061 ; GFX10-WGP: ; %bb.0: ; %entry
8062 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
8063 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8064 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
8065 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
8066 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
8067 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
8068 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
8069 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
8070 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
8071 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
8072 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
8073 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
8074 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8075 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
8076 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
8077 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
8078 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8079 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
8080 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
8081 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
8082 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8083 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8084 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8085 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8086 ; GFX10-WGP-NEXT: buffer_gl0_inv
8087 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
8088 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
8089 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
8090 ; GFX10-WGP-NEXT: s_endpgm
8092 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8093 ; GFX10-CU: ; %bb.0: ; %entry
8094 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
8095 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8096 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
8097 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
8098 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
8099 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8100 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
8101 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
8102 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
8103 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
8104 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
8105 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
8106 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8107 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
8108 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
8109 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
8110 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8111 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
8112 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
8113 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
8114 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8115 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8116 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8117 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
8118 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
8119 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
8120 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
8121 ; GFX10-CU-NEXT: s_endpgm
8123 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8124 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
8125 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
8126 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
8127 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
8128 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
8129 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
8130 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8131 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
8132 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
8133 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
8134 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
8135 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
8136 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
8137 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8138 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
8139 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
8140 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
8141 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8142 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
8143 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
8144 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
8145 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8146 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8147 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8148 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
8149 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
8150 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
8151 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
8152 ; SKIP-CACHE-INV-NEXT: s_endpgm
8154 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8155 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
8156 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
8157 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
8158 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
8159 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8160 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
8161 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
8162 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8163 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8164 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8165 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8166 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8167 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8168 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8169 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
8170 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
8171 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
8173 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8174 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
8175 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
8176 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
8177 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
8178 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8179 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
8180 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
8181 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8182 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8183 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8184 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8185 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8186 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
8187 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
8188 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8189 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
8190 ; GFX90A-TGSPLIT-NEXT: s_endpgm
8192 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8193 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
8194 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
8195 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
8196 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
8197 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8198 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
8199 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
8200 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8201 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8202 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8203 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8204 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8205 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8206 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8207 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
8208 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
8209 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
8211 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8212 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
8213 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
8214 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
8215 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
8216 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8217 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
8218 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
8219 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8220 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8221 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8222 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8223 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8224 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
8225 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
8226 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8227 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
8228 ; GFX940-TGSPLIT-NEXT: s_endpgm
8230 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8231 ; GFX11-WGP: ; %bb.0: ; %entry
8232 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8233 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
8234 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
8235 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
8236 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
8237 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
8238 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8239 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
8240 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
8241 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
8242 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8243 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8244 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8245 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8246 ; GFX11-WGP-NEXT: buffer_gl0_inv
8247 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
8248 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
8249 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
8250 ; GFX11-WGP-NEXT: s_endpgm
8252 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8253 ; GFX11-CU: ; %bb.0: ; %entry
8254 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8255 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8256 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8257 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8258 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
8259 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
8260 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8261 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
8262 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
8263 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
8264 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8265 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8266 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8267 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
8268 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
8269 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
8270 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
8271 ; GFX11-CU-NEXT: s_endpgm
8273 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8274 ; GFX12-WGP: ; %bb.0: ; %entry
8275 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8276 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
8277 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
8278 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
8279 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
8280 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
8281 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8282 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
8283 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
8284 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
8285 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
8286 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
8287 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
8288 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
8289 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
8290 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
8291 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
8292 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
8293 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
8294 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
8295 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
8296 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
8297 ; GFX12-WGP-NEXT: s_endpgm
8299 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
8300 ; GFX12-CU: ; %bb.0: ; %entry
8301 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8302 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8303 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8304 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
8305 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
8306 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
8307 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8308 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
8309 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8310 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8311 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8312 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8313 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8314 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8315 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8316 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
8317 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
8318 ; GFX12-CU-NEXT: s_endpgm
8319 ptr %out, i32 %in, i32 %old) {
8321 %gep = getelementptr i32, ptr %out, i32 4
8322 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst acquire
8323 %val0 = extractvalue { i32, i1 } %val, 0
8324 store i32 %val0, ptr %out, align 4
; Memory-legalizer test: returning cmpxchg through a flat (generic `ptr`)
; pointer at workgroup scope.
;
; The kernel performs a volatile cmpxchg on the i32 located four elements past
; %out, with syncscope("workgroup"), success ordering monotonic and failure
; ordering seq_cst. Field 0 of the { i32, i1 } result is then stored to %out.
;
; The per-target assertions that follow are autogenerated (see the note on the
; first line of this file); regenerate them with the update script instead of
; editing them by hand.
;
; Shape of the expected output, as recorded in those assertions: the WGP-mode
; and tgsplit configurations wait before the atomic and emit a cache
; invalidate after it (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or
; global_inv scope:SCOPE_SE); the remaining configurations emit waits only.
8328 define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
8329 ; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8330 ; GFX7: ; %bb.0: ; %entry
8331 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
8332 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8333 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
8334 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
8335 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
8336 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8337 ; GFX7-NEXT: s_mov_b32 s6, s4
8338 ; GFX7-NEXT: s_mov_b32 s7, s5
8339 ; GFX7-NEXT: s_mov_b32 s11, s12
8340 ; GFX7-NEXT: s_mov_b32 s10, s13
8341 ; GFX7-NEXT: s_add_u32 s6, s6, s11
8342 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
8343 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8344 ; GFX7-NEXT: s_mov_b32 s7, s10
8345 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
8346 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
8347 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8348 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
8349 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
8350 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
8351 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8352 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8353 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8354 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
8355 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
8356 ; GFX7-NEXT: s_waitcnt vmcnt(0)
8357 ; GFX7-NEXT: flat_store_dword v[0:1], v2
8358 ; GFX7-NEXT: s_endpgm
8360 ; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8361 ; GFX10-WGP: ; %bb.0: ; %entry
8362 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
8363 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8364 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
8365 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
8366 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
8367 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
8368 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
8369 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
8370 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
8371 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
8372 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
8373 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
8374 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8375 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
8376 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
8377 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
8378 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8379 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
8380 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
8381 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
8382 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8383 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8384 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8385 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8386 ; GFX10-WGP-NEXT: buffer_gl0_inv
8387 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
8388 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
8389 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
8390 ; GFX10-WGP-NEXT: s_endpgm
8392 ; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8393 ; GFX10-CU: ; %bb.0: ; %entry
8394 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
8395 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8396 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
8397 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
8398 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
8399 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8400 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
8401 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
8402 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
8403 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
8404 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
8405 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
8406 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8407 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
8408 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
8409 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
8410 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8411 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
8412 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
8413 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
8414 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8415 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8416 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8417 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
8418 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
8419 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
8420 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
8421 ; GFX10-CU-NEXT: s_endpgm
8423 ; SKIP-CACHE-INV-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8424 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
8425 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
8426 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
8427 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
8428 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
8429 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
8430 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8431 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
8432 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
8433 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
8434 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
8435 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
8436 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
8437 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8438 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
8439 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
8440 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
8441 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8442 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
8443 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
8444 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
8445 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8446 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8447 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8448 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
8449 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
8450 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
8451 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
8452 ; SKIP-CACHE-INV-NEXT: s_endpgm
8454 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8455 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
8456 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
8457 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
8458 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
8459 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8460 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
8461 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
8462 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8463 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8464 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8465 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8466 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8467 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8468 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8469 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
8470 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
8471 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
8473 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8474 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
8475 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
8476 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
8477 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
8478 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8479 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
8480 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
8481 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8482 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8483 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8484 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8485 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8486 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
8487 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
8488 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8489 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
8490 ; GFX90A-TGSPLIT-NEXT: s_endpgm
8492 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8493 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
8494 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
8495 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
8496 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
8497 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8498 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
8499 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
8500 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8501 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8502 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8503 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8504 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8505 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8506 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8507 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
8508 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
8509 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
8511 ; GFX940-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8512 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
8513 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
8514 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
8515 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
8516 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8517 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
8518 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
8519 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8520 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8521 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8522 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8523 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8524 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
8525 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
8526 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8527 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
8528 ; GFX940-TGSPLIT-NEXT: s_endpgm
8530 ; GFX11-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8531 ; GFX11-WGP: ; %bb.0: ; %entry
8532 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8533 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
8534 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
8535 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
8536 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
8537 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
8538 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8539 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
8540 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
8541 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
8542 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8543 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8544 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8545 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8546 ; GFX11-WGP-NEXT: buffer_gl0_inv
8547 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
8548 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
8549 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
8550 ; GFX11-WGP-NEXT: s_endpgm
8552 ; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8553 ; GFX11-CU: ; %bb.0: ; %entry
8554 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8555 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8556 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8557 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8558 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
8559 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
8560 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8561 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
8562 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
8563 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
8564 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8565 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8566 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8567 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
8568 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
8569 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
8570 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
8571 ; GFX11-CU-NEXT: s_endpgm
8573 ; GFX12-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8574 ; GFX12-WGP: ; %bb.0: ; %entry
8575 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8576 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
8577 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
8578 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
8579 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
8580 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
8581 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8582 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
8583 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
8584 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
8585 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
8586 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
8587 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
8588 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
8589 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
8590 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
8591 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
8592 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
8593 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
8594 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
8595 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
8596 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
8597 ; GFX12-WGP-NEXT: s_endpgm
8599 ; GFX12-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
8600 ; GFX12-CU: ; %bb.0: ; %entry
8601 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8602 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8603 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8604 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
8605 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
8606 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
8607 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8608 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
8609 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8610 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8611 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8612 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8613 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8614 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8615 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8616 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
8617 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
8618 ; GFX12-CU-NEXT: s_endpgm
8619 ptr %out, i32 %in, i32 %old) {
; Address of the i32 at element index 4 from %out, i.e. 16 bytes past %out;
; this is the 16 that appears as an immediate offset or an explicit 64-bit add
; in the expected output above.
8621 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: %old is the compare operand and %in the replacement
; value; monotonic is the success ordering and seq_cst the failure ordering.
8622 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") monotonic seq_cst
; Using the i32 field of the result makes this the returning form of the atomic.
8623 %val0 = extractvalue { i32, i1 } %val, 0
8624 store i32 %val0, ptr %out, align 4
8628 define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
8629 ; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8630 ; GFX7: ; %bb.0: ; %entry
8631 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
8632 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8633 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
8634 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
8635 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
8636 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8637 ; GFX7-NEXT: s_mov_b32 s6, s4
8638 ; GFX7-NEXT: s_mov_b32 s7, s5
8639 ; GFX7-NEXT: s_mov_b32 s11, s12
8640 ; GFX7-NEXT: s_mov_b32 s10, s13
8641 ; GFX7-NEXT: s_add_u32 s6, s6, s11
8642 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
8643 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8644 ; GFX7-NEXT: s_mov_b32 s7, s10
8645 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
8646 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
8647 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8648 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
8649 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
8650 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
8651 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8652 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8653 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8654 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
8655 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
8656 ; GFX7-NEXT: s_waitcnt vmcnt(0)
8657 ; GFX7-NEXT: flat_store_dword v[0:1], v2
8658 ; GFX7-NEXT: s_endpgm
8660 ; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8661 ; GFX10-WGP: ; %bb.0: ; %entry
8662 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
8663 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8664 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
8665 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
8666 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
8667 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
8668 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
8669 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
8670 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
8671 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
8672 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
8673 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
8674 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8675 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
8676 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
8677 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
8678 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8679 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
8680 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
8681 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
8682 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8683 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8684 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8685 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8686 ; GFX10-WGP-NEXT: buffer_gl0_inv
8687 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
8688 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
8689 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
8690 ; GFX10-WGP-NEXT: s_endpgm
8692 ; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8693 ; GFX10-CU: ; %bb.0: ; %entry
8694 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
8695 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8696 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
8697 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
8698 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
8699 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8700 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
8701 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
8702 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
8703 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
8704 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
8705 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
8706 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8707 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
8708 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
8709 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
8710 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8711 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
8712 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
8713 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
8714 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8715 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8716 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8717 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
8718 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
8719 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
8720 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
8721 ; GFX10-CU-NEXT: s_endpgm
8723 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8724 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
8725 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
8726 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
8727 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
8728 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
8729 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
8730 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8731 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
8732 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
8733 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
8734 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
8735 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
8736 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
8737 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
8738 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
8739 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
8740 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
8741 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8742 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
8743 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
8744 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
8745 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8746 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8747 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
8748 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
8749 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
8750 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
8751 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
8752 ; SKIP-CACHE-INV-NEXT: s_endpgm
8754 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8755 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
8756 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
8757 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
8758 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
8759 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8760 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
8761 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
8762 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8763 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8764 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8765 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8766 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8767 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8768 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8769 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
8770 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
8771 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
8773 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8774 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
8775 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
8776 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
8777 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
8778 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8779 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
8780 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
8781 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8782 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8783 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8784 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8785 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
8786 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
8787 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
8788 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
8789 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
8790 ; GFX90A-TGSPLIT-NEXT: s_endpgm
8792 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8793 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
8794 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
8795 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
8796 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
8797 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8798 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
8799 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
8800 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8801 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8802 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8803 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8804 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8805 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8806 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8807 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
8808 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
8809 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
8811 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8812 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
8813 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
8814 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
8815 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
8816 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
8817 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
8818 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
8819 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8820 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
8821 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8822 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8823 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
8824 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
8825 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
8826 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
8827 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
8828 ; GFX940-TGSPLIT-NEXT: s_endpgm
8830 ; GFX11-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8831 ; GFX11-WGP: ; %bb.0: ; %entry
8832 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8833 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
8834 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
8835 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
8836 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
8837 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
8838 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8839 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
8840 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
8841 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
8842 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8843 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8844 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8845 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8846 ; GFX11-WGP-NEXT: buffer_gl0_inv
8847 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
8848 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
8849 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
8850 ; GFX11-WGP-NEXT: s_endpgm
8852 ; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8853 ; GFX11-CU: ; %bb.0: ; %entry
8854 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8855 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8856 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8857 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8858 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
8859 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
8860 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8861 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
8862 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
8863 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
8864 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8865 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
8866 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
8867 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
8868 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
8869 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
8870 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
8871 ; GFX11-CU-NEXT: s_endpgm
8873 ; GFX12-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8874 ; GFX12-WGP: ; %bb.0: ; %entry
8875 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8876 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
8877 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
8878 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
8879 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
8880 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
8881 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8882 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
8883 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
8884 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
8885 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
8886 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
8887 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
8888 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
8889 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
8890 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
8891 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
8892 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
8893 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
8894 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
8895 ; GFX12-WGP-NEXT: s_endpgm
8897 ; GFX12-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
8898 ; GFX12-CU: ; %bb.0: ; %entry
8899 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
8900 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
8901 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
8902 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
8903 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
8904 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
8905 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8906 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
8907 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8908 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8909 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8910 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
8911 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
8912 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
8913 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
8914 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
8915 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
8916 ; GFX12-CU-NEXT: s_endpgm
8917 ptr %out, i32 %in, i32 %old) {
8919 %gep = getelementptr i32, ptr %out, i32 4
8920 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acquire seq_cst
8921 %val0 = extractvalue { i32, i1 } %val, 0
8922 store i32 %val0, ptr %out, align 4
; Codegen test case: a returning 32-bit compare-and-swap through a flat pointer
; at syncscope("workgroup"), with success ordering `release` and failure
; ordering `seq_cst`.
;
; Kernel arguments (the signature follows the expectation lines):
;   %out - pointer; the atomic targets the i32 element at index 4 from it, and
;          the value returned by the atomic is stored back to %out itself.
;   %in  - new-value operand of the cmpxchg.
;   %old - compare-value operand of the cmpxchg.
;
; The lines between `define` and the parameter list are expected-assembly
; assertions, one group per llc run configuration in the file header. Per the
; note at the top of the file they are generated by
; utils/update_llc_test_checks.py, so regenerate them rather than hand-editing.
;
; What the expectations show for this ordering pair: every group places a wait
; before the atomic and another after it; the WGP-mode and tgsplit groups
; additionally expect a cache invalidate after the atomic (buffer_gl0_inv,
; buffer_wbinvl1_vol, buffer_inv sc0, or global_inv scope:SCOPE_SE), while the
; remaining groups expect none.
;
; NOTE(review): as listed here the body has no block label, no terminator and
; no closing brace, yet the expectations name the block %entry and end in
; s_endpgm - confirm `entry:`, `ret void` and `}` are present in the real file.
8926 define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
8927 ; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
8928 ; GFX7: ; %bb.0: ; %entry
8929 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
8930 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8931 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
8932 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
8933 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
8934 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8935 ; GFX7-NEXT: s_mov_b32 s6, s4
8936 ; GFX7-NEXT: s_mov_b32 s7, s5
8937 ; GFX7-NEXT: s_mov_b32 s11, s12
8938 ; GFX7-NEXT: s_mov_b32 s10, s13
8939 ; GFX7-NEXT: s_add_u32 s6, s6, s11
8940 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
8941 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8942 ; GFX7-NEXT: s_mov_b32 s7, s10
8943 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
8944 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
8945 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8946 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
8947 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
8948 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
8949 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8950 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8951 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
8952 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
8953 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
8954 ; GFX7-NEXT: s_waitcnt vmcnt(0)
8955 ; GFX7-NEXT: flat_store_dword v[0:1], v2
8956 ; GFX7-NEXT: s_endpgm
8958 ; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
8959 ; GFX10-WGP: ; %bb.0: ; %entry
8960 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
8961 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8962 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
8963 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
8964 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
8965 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
8966 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
8967 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
8968 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
8969 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
8970 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
8971 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
8972 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
8973 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
8974 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
8975 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
8976 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
8977 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
8978 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
8979 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
8980 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8981 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
8982 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
8983 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
8984 ; GFX10-WGP-NEXT: buffer_gl0_inv
8985 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
8986 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
8987 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
8988 ; GFX10-WGP-NEXT: s_endpgm
8990 ; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
8991 ; GFX10-CU: ; %bb.0: ; %entry
8992 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
8993 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
8994 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
8995 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
8996 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
8997 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
8998 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
8999 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
9000 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
9001 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
9002 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
9003 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
9004 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9005 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
9006 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
9007 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
9008 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9009 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
9010 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
9011 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
9012 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9013 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9014 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9015 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
9016 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
9017 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
9018 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
9019 ; GFX10-CU-NEXT: s_endpgm
9021 ; SKIP-CACHE-INV-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9022 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
9023 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
9024 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
9025 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
9026 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
9027 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
9028 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9029 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
9030 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
9031 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
9032 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
9033 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
9034 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
9035 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9036 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
9037 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
9038 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
9039 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9040 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
9041 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
9042 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
9043 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9044 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9045 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9046 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
9047 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
9048 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
9049 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
9050 ; SKIP-CACHE-INV-NEXT: s_endpgm
9052 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9053 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
9054 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
9055 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
9056 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
9057 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9058 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
9059 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
9060 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9061 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9062 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9063 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9064 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9065 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9066 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9067 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
9068 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
9069 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
9071 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9072 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
9073 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
9074 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
9075 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
9076 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9077 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
9078 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
9079 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9080 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9081 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9082 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9083 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9084 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9085 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
9086 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9087 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
9088 ; GFX90A-TGSPLIT-NEXT: s_endpgm
9090 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9091 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
9092 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
9093 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
9094 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
9095 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9096 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
9097 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
9098 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9099 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9100 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9101 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9102 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9103 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9104 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9105 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
9106 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9107 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
9109 ; GFX940-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9110 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
9111 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
9112 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
9113 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
9114 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9115 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
9116 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
9117 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9118 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9119 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9120 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9121 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9122 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9123 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
9124 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9125 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9126 ; GFX940-TGSPLIT-NEXT: s_endpgm
9128 ; GFX11-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9129 ; GFX11-WGP: ; %bb.0: ; %entry
9130 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9131 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
9132 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
9133 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
9134 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
9135 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
9136 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9137 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
9138 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9139 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9140 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9141 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
9142 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9143 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9144 ; GFX11-WGP-NEXT: buffer_gl0_inv
9145 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9146 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9147 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
9148 ; GFX11-WGP-NEXT: s_endpgm
9150 ; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9151 ; GFX11-CU: ; %bb.0: ; %entry
9152 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9153 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
9154 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
9155 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9156 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
9157 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
9158 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9159 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
9160 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9161 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9162 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9163 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9164 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9165 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9166 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9167 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
9168 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
9169 ; GFX11-CU-NEXT: s_endpgm
9171 ; GFX12-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9172 ; GFX12-WGP: ; %bb.0: ; %entry
9173 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9174 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
9175 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
9176 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
9177 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
9178 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
9179 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9180 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
9181 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9182 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9183 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
9184 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
9185 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
9186 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9187 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
9188 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
9189 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
9190 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9191 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
9192 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9193 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9194 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
9195 ; GFX12-WGP-NEXT: s_endpgm
9197 ; GFX12-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
9198 ; GFX12-CU: ; %bb.0: ; %entry
9199 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9200 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
9201 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
9202 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
9203 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
9204 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
9205 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9206 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
9207 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9208 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9209 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
9210 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9211 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
9212 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9213 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9214 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
9215 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
9216 ; GFX12-CU-NEXT: s_endpgm
9217 ptr %out, i32 %in, i32 %old) {
; Address of the i32 element at index 4 from %out; the expectations show this
; as a byte offset of 16.
9219 %gep = getelementptr i32, ptr %out, i32 4
; Volatile workgroup-scope compare-and-swap: %old is the compare value and %in
; the new value; `release` is the success ordering, `seq_cst` the failure one.
9220 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") release seq_cst
; Field 0 of the { i32, i1 } result is the value read from memory.
9221 %val0 = extractvalue { i32, i1 } %val, 0
; Store that value to %out; the expectations use the returning form of the
; atomic (glc / sc0 / TH_ATOMIC_RETURN) followed by a flat store.
9222 store i32 %val0, ptr %out, align 4
9226 define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
9227 ; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9228 ; GFX7: ; %bb.0: ; %entry
9229 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
9230 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
9231 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
9232 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
9233 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
9234 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9235 ; GFX7-NEXT: s_mov_b32 s6, s4
9236 ; GFX7-NEXT: s_mov_b32 s7, s5
9237 ; GFX7-NEXT: s_mov_b32 s11, s12
9238 ; GFX7-NEXT: s_mov_b32 s10, s13
9239 ; GFX7-NEXT: s_add_u32 s6, s6, s11
9240 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
9241 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9242 ; GFX7-NEXT: s_mov_b32 s7, s10
9243 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
9244 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
9245 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9246 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
9247 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
9248 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
9249 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9250 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9251 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9252 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
9253 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
9254 ; GFX7-NEXT: s_waitcnt vmcnt(0)
9255 ; GFX7-NEXT: flat_store_dword v[0:1], v2
9256 ; GFX7-NEXT: s_endpgm
9258 ; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9259 ; GFX10-WGP: ; %bb.0: ; %entry
9260 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
9261 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
9262 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
9263 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
9264 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
9265 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
9266 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
9267 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
9268 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
9269 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
9270 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
9271 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
9272 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9273 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
9274 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
9275 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
9276 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9277 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
9278 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
9279 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
9280 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9281 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
9282 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9283 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9284 ; GFX10-WGP-NEXT: buffer_gl0_inv
9285 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
9286 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
9287 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
9288 ; GFX10-WGP-NEXT: s_endpgm
9290 ; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9291 ; GFX10-CU: ; %bb.0: ; %entry
9292 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
9293 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
9294 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
9295 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
9296 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
9297 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9298 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
9299 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
9300 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
9301 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
9302 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
9303 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
9304 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9305 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
9306 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
9307 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
9308 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9309 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
9310 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
9311 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
9312 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9313 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9314 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9315 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
9316 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
9317 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
9318 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
9319 ; GFX10-CU-NEXT: s_endpgm
9321 ; SKIP-CACHE-INV-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9322 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
9323 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
9324 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
9325 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
9326 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
9327 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
9328 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9329 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
9330 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
9331 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
9332 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
9333 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
9334 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
9335 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9336 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
9337 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
9338 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
9339 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9340 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
9341 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
9342 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
9343 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9344 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9345 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9346 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
9347 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
9348 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
9349 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
9350 ; SKIP-CACHE-INV-NEXT: s_endpgm
9352 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9353 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
9354 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
9355 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
9356 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
9357 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9358 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
9359 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
9360 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9361 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9362 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9363 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9364 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9365 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9366 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9367 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
9368 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
9369 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
9371 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9372 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
9373 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
9374 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
9375 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
9376 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9377 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
9378 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
9379 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9380 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9381 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9382 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9383 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9384 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9385 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
9386 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9387 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
9388 ; GFX90A-TGSPLIT-NEXT: s_endpgm
9390 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9391 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
9392 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
9393 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
9394 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
9395 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9396 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
9397 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
9398 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9399 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9400 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9401 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9402 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9403 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9404 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9405 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
9406 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9407 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
9409 ; GFX940-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9410 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
9411 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
9412 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
9413 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
9414 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9415 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
9416 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
9417 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9418 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9419 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9420 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9421 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9422 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9423 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
9424 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9425 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9426 ; GFX940-TGSPLIT-NEXT: s_endpgm
9428 ; GFX11-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9429 ; GFX11-WGP: ; %bb.0: ; %entry
9430 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9431 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
9432 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
9433 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
9434 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
9435 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
9436 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9437 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
9438 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9439 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9440 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9441 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
9442 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9443 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9444 ; GFX11-WGP-NEXT: buffer_gl0_inv
9445 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9446 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9447 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
9448 ; GFX11-WGP-NEXT: s_endpgm
9450 ; GFX11-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9451 ; GFX11-CU: ; %bb.0: ; %entry
9452 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9453 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
9454 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
9455 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9456 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
9457 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
9458 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9459 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
9460 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9461 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9462 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9463 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9464 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9465 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9466 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9467 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
9468 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
9469 ; GFX11-CU-NEXT: s_endpgm
9471 ; GFX12-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9472 ; GFX12-WGP: ; %bb.0: ; %entry
9473 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9474 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
9475 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
9476 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
9477 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
9478 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
9479 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9480 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
9481 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9482 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9483 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
9484 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
9485 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
9486 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9487 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
9488 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
9489 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
9490 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9491 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
9492 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9493 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9494 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
9495 ; GFX12-WGP-NEXT: s_endpgm
9497 ; GFX12-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
9498 ; GFX12-CU: ; %bb.0: ; %entry
9499 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9500 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
9501 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
9502 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
9503 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
9504 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
9505 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9506 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
9507 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9508 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9509 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
9510 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9511 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
9512 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9513 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9514 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
9515 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
9516 ; GFX12-CU-NEXT: s_endpgm
9517 ptr %out, i32 %in, i32 %old) {
9519 %gep = getelementptr i32, ptr %out, i32 4
9520 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") acq_rel seq_cst
9521 %val0 = extractvalue { i32, i1 } %val, 0
9522 store i32 %val0, ptr %out, align 4
; Test: volatile cmpxchg through a flat pointer with syncscope("workgroup"),
; seq_cst success ordering and seq_cst failure ordering, where the result of
; the cmpxchg is used. The kernel performs the exchange at element 4 of %out
; (i32 elements) and stores field 0 of the cmpxchg result (the value read from
; memory) back to %out itself.
;
; In the expectations below, the WGP-mode and tgsplit runs also wait on vmcnt
; (loadcnt on gfx12) around the atomic and follow it with a cache invalidate
; (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or
; global_inv scope:SCOPE_SE). The remaining runs only wait on lgkmcnt (dscnt on
; gfx12) around the atomic and on vmcnt (loadcnt on gfx12) before the final
; store, and contain no cache invalidate.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
9526 define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
9527 ; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9528 ; GFX7: ; %bb.0: ; %entry
9529 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
9530 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
9531 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
9532 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
9533 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
9534 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9535 ; GFX7-NEXT: s_mov_b32 s6, s4
9536 ; GFX7-NEXT: s_mov_b32 s7, s5
9537 ; GFX7-NEXT: s_mov_b32 s11, s12
9538 ; GFX7-NEXT: s_mov_b32 s10, s13
9539 ; GFX7-NEXT: s_add_u32 s6, s6, s11
9540 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
9541 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9542 ; GFX7-NEXT: s_mov_b32 s7, s10
9543 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
9544 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
9545 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9546 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
9547 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
9548 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
9549 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9550 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9551 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9552 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
9553 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
9554 ; GFX7-NEXT: s_waitcnt vmcnt(0)
9555 ; GFX7-NEXT: flat_store_dword v[0:1], v2
9556 ; GFX7-NEXT: s_endpgm
9558 ; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9559 ; GFX10-WGP: ; %bb.0: ; %entry
9560 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
9561 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
9562 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
9563 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
9564 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
9565 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
9566 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
9567 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
9568 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
9569 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
9570 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
9571 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
9572 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9573 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
9574 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
9575 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
9576 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9577 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
9578 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
9579 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
9580 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9581 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
9582 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9583 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9584 ; GFX10-WGP-NEXT: buffer_gl0_inv
9585 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
9586 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
9587 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
9588 ; GFX10-WGP-NEXT: s_endpgm
9590 ; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9591 ; GFX10-CU: ; %bb.0: ; %entry
9592 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
9593 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
9594 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
9595 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
9596 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
9597 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9598 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
9599 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
9600 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
9601 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
9602 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
9603 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
9604 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
9605 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
9606 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
9607 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
9608 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9609 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
9610 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
9611 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
9612 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9613 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9614 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9615 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
9616 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
9617 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
9618 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
9619 ; GFX10-CU-NEXT: s_endpgm
9621 ; SKIP-CACHE-INV-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9622 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
9623 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
9624 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
9625 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
9626 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
9627 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
9628 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9629 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
9630 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
9631 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
9632 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
9633 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
9634 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
9635 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
9636 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
9637 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
9638 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
9639 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9640 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
9641 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
9642 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
9643 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9644 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
9645 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9646 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
9647 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
9648 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
9649 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
9650 ; SKIP-CACHE-INV-NEXT: s_endpgm
9652 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9653 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
9654 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
9655 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
9656 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
9657 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9658 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
9659 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
9660 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9661 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9662 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9663 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9664 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9665 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9666 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9667 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
9668 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
9669 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
9671 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9672 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
9673 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
9674 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
9675 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
9676 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9677 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
9678 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
9679 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9680 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9681 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9682 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9683 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
9684 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9685 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
9686 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9687 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
9688 ; GFX90A-TGSPLIT-NEXT: s_endpgm
9690 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9691 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
9692 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
9693 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
9694 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
9695 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9696 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
9697 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
9698 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9699 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9700 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9701 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9702 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9703 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9704 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9705 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0)
9706 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9707 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
9709 ; GFX940-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9710 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
9711 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
9712 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
9713 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
9714 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9715 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
9716 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
9717 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9718 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
9719 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9720 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9721 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
9722 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9723 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
9724 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9725 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9726 ; GFX940-TGSPLIT-NEXT: s_endpgm
9728 ; GFX11-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9729 ; GFX11-WGP: ; %bb.0: ; %entry
9730 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9731 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
9732 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
9733 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
9734 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
9735 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
9736 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9737 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
9738 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9739 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9740 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9741 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
9742 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9743 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9744 ; GFX11-WGP-NEXT: buffer_gl0_inv
9745 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9746 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9747 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
9748 ; GFX11-WGP-NEXT: s_endpgm
9750 ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9751 ; GFX11-CU: ; %bb.0: ; %entry
9752 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9753 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
9754 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
9755 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9756 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
9757 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
9758 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9759 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
9760 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9761 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9762 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9763 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
9764 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9765 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9766 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9767 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0)
9768 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
9769 ; GFX11-CU-NEXT: s_endpgm
9771 ; GFX12-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9772 ; GFX12-WGP: ; %bb.0: ; %entry
9773 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9774 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
9775 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
9776 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
9777 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
9778 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
9779 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9780 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
9781 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9782 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9783 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
9784 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
9785 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
9786 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9787 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
9788 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
9789 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
9790 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9791 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
9792 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9793 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9794 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
9795 ; GFX12-WGP-NEXT: s_endpgm
9797 ; GFX12-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
9798 ; GFX12-CU: ; %bb.0: ; %entry
9799 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
9800 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
9801 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
9802 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
9803 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
9804 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
9805 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
9806 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
9807 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9808 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9809 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
9810 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
9811 ; GFX12-CU-NEXT: s_wait_dscnt 0x0
9812 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9813 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9814 ; GFX12-CU-NEXT: s_wait_loadcnt 0x0
9815 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
9816 ; GFX12-CU-NEXT: s_endpgm
9817 ptr %out, i32 %in, i32 %old) {
; %gep is 4 x i32 = 16 bytes past %out; this is the explicit 16-byte add or
; the offset:16 operand on the atomic in the expected code above.
9819 %gep = getelementptr i32, ptr %out, i32 4
9820 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup") seq_cst seq_cst
; Only field 0 of the { i32, i1 } result is used; the success flag is ignored.
9821 %val0 = extractvalue { i32, i1 } %val, 0
9822 store i32 %val0, ptr %out, align 4
; Test: atomic i32 load through a flat pointer with
; syncscope("workgroup-one-as") and unordered ordering. The loaded value is
; written to %out with an ordinary (non-atomic) store.
;
; Every run below expects the flat load to carry no glc / sc0 / scope
; modifier; the "sc0 sc1" that appears in the gfx940 runs is on the store, not
; on the load. No cache invalidate is expected in any run.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
9826 define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
9827 ; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
9828 ; GFX7: ; %bb.0: ; %entry
9829 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
9830 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
9831 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9832 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
9833 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
9834 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
9835 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
9836 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
9837 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9838 ; GFX7-NEXT: flat_store_dword v[0:1], v2
9839 ; GFX7-NEXT: s_endpgm
9841 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
9842 ; GFX10-WGP: ; %bb.0: ; %entry
9843 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
9844 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
9845 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
9846 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
9847 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
9848 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1]
9849 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
9850 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
9851 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9852 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
9853 ; GFX10-WGP-NEXT: s_endpgm
9855 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
9856 ; GFX10-CU: ; %bb.0: ; %entry
9857 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
9858 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
9859 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
9860 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
9861 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
9862 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
9863 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
9864 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
9865 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9866 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
9867 ; GFX10-CU-NEXT: s_endpgm
9869 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_load:
9870 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
9871 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
9872 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
9873 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
9874 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
9875 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
9876 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
9877 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
9878 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
9879 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9880 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
9881 ; SKIP-CACHE-INV-NEXT: s_endpgm
9883 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9884 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
9885 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
9886 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
9887 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9888 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
9889 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
9890 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9891 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9892 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
9893 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
9895 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9896 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
9897 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
9898 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
9899 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9900 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
9901 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
9902 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
9903 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9904 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
9905 ; GFX90A-TGSPLIT-NEXT: s_endpgm
9907 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9908 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
9909 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
9910 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
9911 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9912 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
9913 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
9914 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9915 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9916 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9917 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
9919 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
9920 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
9921 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
9922 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
9923 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
9924 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
9925 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1]
9926 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
9927 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
9928 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
9929 ; GFX940-TGSPLIT-NEXT: s_endpgm
9931 ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_load:
9932 ; GFX11-WGP: ; %bb.0: ; %entry
9933 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
9934 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
9935 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
9936 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
9937 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
9938 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1]
9939 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
9940 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
9941 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9942 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
9943 ; GFX11-WGP-NEXT: s_endpgm
9945 ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load:
9946 ; GFX11-CU: ; %bb.0: ; %entry
9947 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
9948 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
9949 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
9950 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
9951 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
9952 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
9953 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
9954 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
9955 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
9956 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
9957 ; GFX11-CU-NEXT: s_endpgm
9959 ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_load:
9960 ; GFX12-WGP: ; %bb.0: ; %entry
9961 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
9962 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
9963 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
9964 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
9965 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
9966 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1]
9967 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
9968 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
9969 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
9970 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
9971 ; GFX12-WGP-NEXT: s_endpgm
9973 ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_load:
9974 ; GFX12-CU: ; %bb.0: ; %entry
9975 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
9976 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
9977 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
9978 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
9979 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
9980 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
9981 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
9982 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
9983 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
9984 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
9985 ; GFX12-CU-NEXT: s_endpgm
9986 ptr %in, ptr %out) {
; The atomic load is the operation under test; the store only keeps the
; loaded value live.
9988 %val = load atomic i32, ptr %in syncscope("workgroup-one-as") unordered, align 4
9989 store i32 %val, ptr %out
; Test: atomic i32 load through a flat pointer with
; syncscope("workgroup-one-as") and monotonic ordering. The loaded value is
; written to %out with an ordinary (non-atomic) store.
;
; Expected modifier on the flat load, per run below:
;   - glc             in the gfx10 WGP, gfx11 WGP and gfx90a tgsplit runs
;   - sc0             in both gfx940 runs (tgsplit and non-tgsplit)
;   - scope:SCOPE_SE  in the gfx12 WGP run
;   - none            in the gfx7, skip-cache-inv, CU-mode and gfx90a
;                     non-tgsplit runs
; No cache invalidate is expected in any run.
;
; The check lines are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
9993 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
9994 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
9995 ; GFX7: ; %bb.0: ; %entry
9996 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
9997 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
9998 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
9999 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10000 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10001 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
10002 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
10003 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
10004 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10005 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10006 ; GFX7-NEXT: s_endpgm
10008 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
10009 ; GFX10-WGP: ; %bb.0: ; %entry
10010 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10011 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10012 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10013 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10014 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10015 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
10016 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
10017 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
10018 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10019 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10020 ; GFX10-WGP-NEXT: s_endpgm
10022 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
10023 ; GFX10-CU: ; %bb.0: ; %entry
10024 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10025 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10026 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10027 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10028 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10029 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
10030 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
10031 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
10032 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10033 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10034 ; GFX10-CU-NEXT: s_endpgm
10036 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_load:
10037 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10038 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10039 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
10040 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10041 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10042 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10043 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
10044 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
10045 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
10046 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10047 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10048 ; SKIP-CACHE-INV-NEXT: s_endpgm
10050 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10051 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10052 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10053 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10054 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10055 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10056 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
10057 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10058 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10059 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10060 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10062 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10063 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10064 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10065 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10066 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10067 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10068 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
10069 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10070 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10071 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
10072 ; GFX90A-TGSPLIT-NEXT: s_endpgm
10074 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10075 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
10076 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10077 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
10078 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10079 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10080 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
10081 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
10082 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10083 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10084 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
10086 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
10087 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
10088 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10089 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
10090 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10091 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10092 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
10093 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
10094 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10095 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10096 ; GFX940-TGSPLIT-NEXT: s_endpgm
10098 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
10099 ; GFX11-WGP: ; %bb.0: ; %entry
10100 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10101 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10102 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10103 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
10104 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
10105 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
10106 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
10107 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
10108 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10109 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
10110 ; GFX11-WGP-NEXT: s_endpgm
10112 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load:
10113 ; GFX11-CU: ; %bb.0: ; %entry
10114 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10115 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10116 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
10117 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
10118 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
10119 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
10120 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
10121 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
10122 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10123 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
10124 ; GFX11-CU-NEXT: s_endpgm
10126 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
10127 ; GFX12-WGP: ; %bb.0: ; %entry
10128 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10129 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10130 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
10131 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
10132 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
10133 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
10134 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
10135 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
10136 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
10137 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
10138 ; GFX12-WGP-NEXT: s_endpgm
10140 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_load:
10141 ; GFX12-CU: ; %bb.0: ; %entry
10142 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10143 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10144 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
10145 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
10146 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
10147 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
10148 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
10149 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
10150 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
10151 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
10152 ; GFX12-CU-NEXT: s_endpgm
10153 ptr %in, ptr %out) {
; The atomic load is the operation under test; the store only keeps the
; loaded value live.
10155 %val = load atomic i32, ptr %in syncscope("workgroup-one-as") monotonic, align 4
10156 store i32 %val, ptr %out
; Kernel under test. It atomically loads an i32 from %in with acquire ordering
; at syncscope("workgroup-one-as") and writes the loaded value to %out with a
; plain store.
; The check lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file) and pin the
; expected llc -O0 output for each RUN configuration, so they should be
; regenerated rather than edited by hand.
; The GFX10-WGP, GFX90A-TGSPLIT, GFX940-TGSPLIT, GFX11-WGP and GFX12-WGP blocks
; expect a wait (s_waitcnt vmcnt(0) or s_wait_loadcnt 0x0) followed by a cache
; invalidate instruction immediately after the load. The remaining blocks
; expect no cache invalidate instruction.
10160 define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
10161 ; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
10162 ; GFX7: ; %bb.0: ; %entry
10163 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10164 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
10165 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
10166 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10167 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10168 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
10169 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
10170 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
10171 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10172 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10173 ; GFX7-NEXT: s_endpgm
10175 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
10176 ; GFX10-WGP: ; %bb.0: ; %entry
10177 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10178 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10179 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10180 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10181 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10182 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
10183 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
10184 ; GFX10-WGP-NEXT: buffer_gl0_inv
10185 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
10186 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
10187 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10188 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10189 ; GFX10-WGP-NEXT: s_endpgm
10191 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
10192 ; GFX10-CU: ; %bb.0: ; %entry
10193 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10194 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10195 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10196 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10197 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10198 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
10199 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
10200 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
10201 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10202 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10203 ; GFX10-CU-NEXT: s_endpgm
10205 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_load:
10206 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10207 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10208 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
10209 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10210 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10211 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10212 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
10213 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
10214 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
10215 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10216 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10217 ; SKIP-CACHE-INV-NEXT: s_endpgm
10219 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10220 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10221 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10222 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10223 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10224 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10225 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
10226 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10227 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10228 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10229 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10231 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10232 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10233 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10234 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10235 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10236 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10237 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
10238 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10239 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
10240 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10241 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
10242 ; GFX90A-TGSPLIT-NEXT: s_endpgm
10244 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10245 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
10246 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10247 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
10248 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10249 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10250 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
10251 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
10252 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10253 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10254 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
10256 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
10257 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
10258 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10259 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
10260 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10261 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10262 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
10263 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10264 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
10265 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
10266 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10267 ; GFX940-TGSPLIT-NEXT: s_endpgm
10269 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_load:
10270 ; GFX11-WGP: ; %bb.0: ; %entry
10271 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10272 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10273 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10274 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
10275 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
10276 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
10277 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
10278 ; GFX11-WGP-NEXT: buffer_gl0_inv
10279 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
10280 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
10281 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10282 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
10283 ; GFX11-WGP-NEXT: s_endpgm
10285 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load:
10286 ; GFX11-CU: ; %bb.0: ; %entry
10287 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10288 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10289 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
10290 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
10291 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
10292 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
10293 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
10294 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
10295 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10296 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
10297 ; GFX11-CU-NEXT: s_endpgm
10299 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_load:
10300 ; GFX12-WGP: ; %bb.0: ; %entry
10301 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10302 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10303 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
10304 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
10305 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
10306 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
10307 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
10308 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
10309 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
10310 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
10311 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
10312 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
10313 ; GFX12-WGP-NEXT: s_endpgm
10315 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_load:
10316 ; GFX12-CU: ; %bb.0: ; %entry
10317 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10318 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10319 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
10320 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
10321 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
10322 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
10323 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
10324 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
10325 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
10326 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
10327 ; GFX12-CU-NEXT: s_endpgm
10328 ptr %in, ptr %out) {
; Operation under test, an acquire atomic load; the store that follows is a
; plain (non-atomic) store that consumes the loaded value.
10330 %val = load atomic i32, ptr %in syncscope("workgroup-one-as") acquire, align 4
10331 store i32 %val, ptr %out
; Kernel under test. It atomically loads an i32 from %in with seq_cst ordering
; at syncscope("workgroup-one-as") and writes the loaded value to %out with a
; plain store.
; The check lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file) and pin the
; expected llc -O0 output for each RUN configuration, so they should be
; regenerated rather than edited by hand.
; The GFX10-WGP, GFX90A-TGSPLIT, GFX940-TGSPLIT, GFX11-WGP and GFX12-WGP blocks
; expect extra wait instructions immediately before the load, and a wait plus
; a cache invalidate instruction immediately after it. The remaining blocks
; expect neither a wait immediately before the load nor a cache invalidate.
10335 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
10336 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
10337 ; GFX7: ; %bb.0: ; %entry
10338 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10339 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
10340 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
10341 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10342 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10343 ; GFX7-NEXT: flat_load_dword v2, v[0:1]
10344 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
10345 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
10346 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10347 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10348 ; GFX7-NEXT: s_endpgm
10350 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
10351 ; GFX10-WGP: ; %bb.0: ; %entry
10352 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10353 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10354 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10355 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10356 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10357 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
10358 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
10359 ; GFX10-WGP-NEXT: flat_load_dword v2, v[0:1] glc
10360 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
10361 ; GFX10-WGP-NEXT: buffer_gl0_inv
10362 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
10363 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
10364 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10365 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10366 ; GFX10-WGP-NEXT: s_endpgm
10368 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
10369 ; GFX10-CU: ; %bb.0: ; %entry
10370 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10371 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10372 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10373 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10374 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10375 ; GFX10-CU-NEXT: flat_load_dword v2, v[0:1]
10376 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
10377 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
10378 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10379 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10380 ; GFX10-CU-NEXT: s_endpgm
10382 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_load:
10383 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10384 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10385 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
10386 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10387 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10388 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10389 ; SKIP-CACHE-INV-NEXT: flat_load_dword v2, v[0:1]
10390 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
10391 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
10392 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10393 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10394 ; SKIP-CACHE-INV-NEXT: s_endpgm
10396 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10397 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10398 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10399 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10400 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10401 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10402 ; GFX90A-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1]
10403 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10404 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10405 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10406 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10408 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10409 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10410 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
10411 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
10412 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10413 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10414 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10415 ; GFX90A-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] glc
10416 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10417 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
10418 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
10419 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
10420 ; GFX90A-TGSPLIT-NEXT: s_endpgm
10422 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10423 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
10424 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10425 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
10426 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10427 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10428 ; GFX940-NOTTGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
10429 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
10430 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10431 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10432 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
10434 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
10435 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
10436 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
10437 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
10438 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10439 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10440 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10441 ; GFX940-TGSPLIT-NEXT: flat_load_dword v2, v[0:1] sc0
10442 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10443 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
10444 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
10445 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10446 ; GFX940-TGSPLIT-NEXT: s_endpgm
10448 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
10449 ; GFX11-WGP: ; %bb.0: ; %entry
10450 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10451 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10452 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10453 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
10454 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
10455 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
10456 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
10457 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc
10458 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
10459 ; GFX11-WGP-NEXT: buffer_gl0_inv
10460 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
10461 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
10462 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10463 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
10464 ; GFX11-WGP-NEXT: s_endpgm
10466 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
10467 ; GFX11-CU: ; %bb.0: ; %entry
10468 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10469 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10470 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
10471 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
10472 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
10473 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1]
10474 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
10475 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
10476 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
10477 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
10478 ; GFX11-CU-NEXT: s_endpgm
10480 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
10481 ; GFX12-WGP: ; %bb.0: ; %entry
10482 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10483 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10484 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
10485 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
10486 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
10487 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
10488 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
10489 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
10490 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
10491 ; GFX12-WGP-NEXT: flat_load_b32 v2, v[0:1] scope:SCOPE_SE
10492 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
10493 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
10494 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
10495 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
10496 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
10497 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
10498 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
10499 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
10500 ; GFX12-WGP-NEXT: s_endpgm
10502 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
10503 ; GFX12-CU: ; %bb.0: ; %entry
10504 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
10505 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x8
10506 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
10507 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
10508 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
10509 ; GFX12-CU-NEXT: flat_load_b32 v2, v[0:1]
10510 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
10511 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
10512 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
10513 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
10514 ; GFX12-CU-NEXT: s_endpgm
10515 ptr %in, ptr %out) {
; Operation under test, a seq_cst atomic load; the store that follows is a
; plain (non-atomic) store that consumes the loaded value.
10517 %val = load atomic i32, ptr %in syncscope("workgroup-one-as") seq_cst, align 4
10518 store i32 %val, ptr %out
; Kernel under test. It atomically stores the i32 argument %in to %out with
; unordered ordering at syncscope("workgroup-one-as").
; The check lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file) and pin the
; expected llc -O0 output for each RUN configuration, so they should be
; regenerated rather than edited by hand.
; Every block expects the two scalar argument loads, one wait on them, the
; register moves and a single flat store, with no extra wait or cache
; maintenance instruction next to the store. The two GFX940 blocks expect
; sc0 sc1 on the store.
10522 define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
10523 ; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
10524 ; GFX7: ; %bb.0: ; %entry
10525 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
10526 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
10527 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
10528 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10529 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10530 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
10531 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10532 ; GFX7-NEXT: s_endpgm
10534 ; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
10535 ; GFX10-WGP: ; %bb.0: ; %entry
10536 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
10537 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10538 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10539 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10540 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10541 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
10542 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10543 ; GFX10-WGP-NEXT: s_endpgm
10545 ; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
10546 ; GFX10-CU: ; %bb.0: ; %entry
10547 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
10548 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10549 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10550 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10551 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10552 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
10553 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10554 ; GFX10-CU-NEXT: s_endpgm
10556 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_unordered_store:
10557 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10558 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
10559 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
10560 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10561 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10562 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10563 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
10564 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10565 ; SKIP-CACHE-INV-NEXT: s_endpgm
10567 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10568 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10569 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10570 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10571 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10572 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10573 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10574 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10575 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10577 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10578 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10579 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10580 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10581 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10582 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10583 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10584 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
10585 ; GFX90A-TGSPLIT-NEXT: s_endpgm
10587 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10588 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
10589 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
10590 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
10591 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10592 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10593 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
10594 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10595 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
10597 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
10598 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
10599 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
10600 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
10601 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10602 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10603 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
10604 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10605 ; GFX940-TGSPLIT-NEXT: s_endpgm
10607 ; GFX11-WGP-LABEL: flat_workgroup_one_as_unordered_store:
10608 ; GFX11-WGP: ; %bb.0: ; %entry
10609 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
10610 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10611 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10612 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
10613 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
10614 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
10615 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
10616 ; GFX11-WGP-NEXT: s_endpgm
10618 ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store:
10619 ; GFX11-CU: ; %bb.0: ; %entry
10620 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
10621 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10622 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
10623 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
10624 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
10625 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
10626 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
10627 ; GFX11-CU-NEXT: s_endpgm
10629 ; GFX12-WGP-LABEL: flat_workgroup_one_as_unordered_store:
10630 ; GFX12-WGP: ; %bb.0: ; %entry
10631 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
10632 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10633 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
10634 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
10635 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
10636 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
10637 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
10638 ; GFX12-WGP-NEXT: s_endpgm
10640 ; GFX12-CU-LABEL: flat_workgroup_one_as_unordered_store:
10641 ; GFX12-CU: ; %bb.0: ; %entry
10642 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
10643 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10644 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
10645 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
10646 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
10647 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
10648 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
10649 ; GFX12-CU-NEXT: s_endpgm
10650 i32 %in, ptr %out) {
; Operation under test, an unordered atomic store of the kernel argument.
10652 store atomic i32 %in, ptr %out syncscope("workgroup-one-as") unordered, align 4
; Kernel under test. It atomically stores the i32 argument %in to %out with
; monotonic ordering at syncscope("workgroup-one-as").
; The check lines between the define line and the parameter list are
; autogenerated (see the NOTE on the first line of this file) and pin the
; expected llc -O0 output for each RUN configuration, so they should be
; regenerated rather than edited by hand.
; Every block expects the two scalar argument loads, one wait on them, the
; register moves and a single flat store, with no extra wait or cache
; maintenance instruction next to the store. The two GFX940 blocks expect
; sc0 sc1 on the store, and the GFX12-WGP block expects an explicit scope
; operand on it.
10656 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
10657 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
10658 ; GFX7: ; %bb.0: ; %entry
10659 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
10660 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
10661 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
10662 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10663 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10664 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
10665 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10666 ; GFX7-NEXT: s_endpgm
10668 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
10669 ; GFX10-WGP: ; %bb.0: ; %entry
10670 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
10671 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10672 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10673 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10674 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10675 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
10676 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10677 ; GFX10-WGP-NEXT: s_endpgm
10679 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
10680 ; GFX10-CU: ; %bb.0: ; %entry
10681 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
10682 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10683 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10684 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10685 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10686 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
10687 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10688 ; GFX10-CU-NEXT: s_endpgm
10690 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_store:
10691 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10692 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
10693 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
10694 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10695 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10696 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10697 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
10698 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10699 ; SKIP-CACHE-INV-NEXT: s_endpgm
10701 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10702 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10703 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10704 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10705 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10706 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10707 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10708 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10709 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10711 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10712 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10713 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10714 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10715 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10716 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10717 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10718 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
10719 ; GFX90A-TGSPLIT-NEXT: s_endpgm
10721 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10722 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
10723 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
10724 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
10725 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10726 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10727 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
10728 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10729 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
10731 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
10732 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
10733 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
10734 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
10735 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10736 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10737 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
10738 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10739 ; GFX940-TGSPLIT-NEXT: s_endpgm
10741 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
10742 ; GFX11-WGP: ; %bb.0: ; %entry
10743 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
10744 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10745 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10746 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
10747 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
10748 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
10749 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
10750 ; GFX11-WGP-NEXT: s_endpgm
10752 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store:
10753 ; GFX11-CU: ; %bb.0: ; %entry
10754 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
10755 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10756 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
10757 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
10758 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
10759 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
10760 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
10761 ; GFX11-CU-NEXT: s_endpgm
10763 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
10764 ; GFX12-WGP: ; %bb.0: ; %entry
10765 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
10766 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10767 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
10768 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
10769 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
10770 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
10771 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
10772 ; GFX12-WGP-NEXT: s_endpgm
10774 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_store:
10775 ; GFX12-CU: ; %bb.0: ; %entry
10776 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
10777 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10778 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
10779 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
10780 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
10781 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
10782 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
10783 ; GFX12-CU-NEXT: s_endpgm
10784 i32 %in, ptr %out) {
; Operation under test, a monotonic atomic store of the kernel argument.
10786 store atomic i32 %in, ptr %out syncscope("workgroup-one-as") monotonic, align 4
; Atomic i32 store through a flat pointer with release ordering at
; syncscope("workgroup-one-as"). The kernel receives the value (%in) and the
; destination pointer (%out) as kernel arguments.
;
; Every check group below belongs to one RUN line at the top of the file and
; was autogenerated by utils/update_llc_test_checks.py, so regenerate the
; checks instead of editing them by hand.
;
; What the expectations show for this ordering
;  - gfx700 (both runs) and gfx90a without tgsplit expect the bare store;
;  - gfx1010 and gfx1100 in default mode expect "s_waitcnt vmcnt(0)" and
;    "s_waitcnt_vscnt null, 0x0" ahead of the store, while the +cumode runs
;    expect the bare store;
;  - the +tgsplit runs (gfx90a, gfx940) expect "s_waitcnt vmcnt(0)" ahead of
;    the store, the runs without tgsplit do not;
;  - both gfx940 runs expect the sc0 and sc1 bits on the store;
;  - gfx1200 in default mode expects s_wait_bvhcnt, s_wait_samplecnt,
;    s_wait_loadcnt and s_wait_storecnt ahead of the store and SCOPE_SE on
;    the store itself, while the +cumode run expects neither.
10790 define amdgpu_kernel void @flat_workgroup_one_as_release_store(
10791 ; GFX7-LABEL: flat_workgroup_one_as_release_store:
10792 ; GFX7: ; %bb.0: ; %entry
10793 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
10794 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
10795 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
10796 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10797 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10798 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
10799 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10800 ; GFX7-NEXT: s_endpgm
10802 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
10803 ; GFX10-WGP: ; %bb.0: ; %entry
10804 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
10805 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10806 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10807 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10808 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10809 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
10810 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
10811 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
10812 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10813 ; GFX10-WGP-NEXT: s_endpgm
10815 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
10816 ; GFX10-CU: ; %bb.0: ; %entry
10817 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
10818 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10819 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10820 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10821 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10822 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
10823 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10824 ; GFX10-CU-NEXT: s_endpgm
10826 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_store:
10827 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10828 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
10829 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
10830 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10831 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10832 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10833 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
10834 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10835 ; SKIP-CACHE-INV-NEXT: s_endpgm
10837 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10838 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10839 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10840 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10841 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10842 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10843 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10844 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10845 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10847 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10848 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10849 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10850 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10851 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10852 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10853 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10854 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10855 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
10856 ; GFX90A-TGSPLIT-NEXT: s_endpgm
10858 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10859 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
10860 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
10861 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
10862 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10863 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10864 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
10865 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10866 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
10868 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
10869 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
10870 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
10871 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
10872 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10873 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
10874 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
10875 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10876 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
10877 ; GFX940-TGSPLIT-NEXT: s_endpgm
10879 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_store:
10880 ; GFX11-WGP: ; %bb.0: ; %entry
10881 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
10882 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10883 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
10884 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
10885 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
10886 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
10887 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
10888 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
10889 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
10890 ; GFX11-WGP-NEXT: s_endpgm
10892 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_store:
10893 ; GFX11-CU: ; %bb.0: ; %entry
10894 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
10895 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10896 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
10897 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
10898 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
10899 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
10900 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
10901 ; GFX11-CU-NEXT: s_endpgm
10903 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_store:
10904 ; GFX12-WGP: ; %bb.0: ; %entry
10905 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
10906 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10907 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
10908 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
10909 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
10910 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
10911 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
10912 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
10913 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
10914 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
10915 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
10916 ; GFX12-WGP-NEXT: s_endpgm
10918 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_store:
10919 ; GFX12-CU: ; %bb.0: ; %entry
10920 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
10921 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
10922 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
10923 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
10924 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
10925 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
10926 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
10927 ; GFX12-CU-NEXT: s_endpgm
10928 i32 %in, ptr %out) {
; Operation under test, the only IR instruction visible in this kernel body.
10930 store atomic i32 %in, ptr %out syncscope("workgroup-one-as") release, align 4
; Atomic i32 store through a flat pointer with seq_cst ordering at
; syncscope("workgroup-one-as"). The kernel receives the value (%in) and the
; destination pointer (%out) as kernel arguments.
;
; Every check group below belongs to one RUN line at the top of the file and
; was autogenerated by utils/update_llc_test_checks.py, so regenerate the
; checks instead of editing them by hand.
;
; What the expectations show for this ordering
;  - gfx700 (both runs) and gfx90a without tgsplit expect the bare store;
;  - gfx1010 and gfx1100 in default mode expect "s_waitcnt vmcnt(0)" and
;    "s_waitcnt_vscnt null, 0x0" ahead of the store, while the +cumode runs
;    expect the bare store;
;  - the +tgsplit runs (gfx90a, gfx940) expect "s_waitcnt vmcnt(0)" ahead of
;    the store, the runs without tgsplit do not;
;  - both gfx940 runs expect the sc0 and sc1 bits on the store;
;  - gfx1200 in default mode expects s_wait_bvhcnt, s_wait_samplecnt,
;    s_wait_loadcnt and s_wait_storecnt ahead of the store and SCOPE_SE on
;    the store itself, while the +cumode run expects neither.
10934 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
10935 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
10936 ; GFX7: ; %bb.0: ; %entry
10937 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
10938 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
10939 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
10940 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
10941 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
10942 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
10943 ; GFX7-NEXT: flat_store_dword v[0:1], v2
10944 ; GFX7-NEXT: s_endpgm
10946 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
10947 ; GFX10-WGP: ; %bb.0: ; %entry
10948 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
10949 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10950 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
10951 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
10952 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
10953 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
10954 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
10955 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
10956 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
10957 ; GFX10-WGP-NEXT: s_endpgm
10959 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
10960 ; GFX10-CU: ; %bb.0: ; %entry
10961 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
10962 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10963 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
10964 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
10965 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
10966 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
10967 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
10968 ; GFX10-CU-NEXT: s_endpgm
10970 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_store:
10971 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
10972 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x0
10973 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2
10974 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
10975 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
10976 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
10977 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
10978 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
10979 ; SKIP-CACHE-INV-NEXT: s_endpgm
10981 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
10982 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
10983 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10984 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10985 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10986 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10987 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10988 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
10989 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
10991 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
10992 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
10993 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
10994 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
10995 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
10996 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
10997 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
10998 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
10999 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
11000 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11002 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
11003 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11004 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
11005 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
11006 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11007 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11008 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11009 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
11010 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11012 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
11013 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11014 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x0
11015 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8
11016 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11017 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11018 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11019 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11020 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
11021 ; GFX940-TGSPLIT-NEXT: s_endpgm
11023 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
11024 ; GFX11-WGP: ; %bb.0: ; %entry
11025 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
11026 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
11027 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11028 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
11029 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
11030 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
11031 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
11032 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11033 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
11034 ; GFX11-WGP-NEXT: s_endpgm
11036 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
11037 ; GFX11-CU: ; %bb.0: ; %entry
11038 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
11039 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
11040 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11041 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
11042 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
11043 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
11044 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
11045 ; GFX11-CU-NEXT: s_endpgm
11047 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
11048 ; GFX12-WGP: ; %bb.0: ; %entry
11049 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x0
11050 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
11051 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11052 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
11053 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
11054 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
11055 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
11056 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
11057 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
11058 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11059 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SE
11060 ; GFX12-WGP-NEXT: s_endpgm
11062 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
11063 ; GFX12-CU: ; %bb.0: ; %entry
11064 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x0
11065 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x8
11066 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11067 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
11068 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
11069 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
11070 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
11071 ; GFX12-CU-NEXT: s_endpgm
11072 i32 %in, ptr %out) {
; Operation under test, the only IR instruction visible in this kernel body.
11074 store atomic i32 %in, ptr %out syncscope("workgroup-one-as") seq_cst, align 4
; Volatile atomicrmw xchg on a flat pointer with monotonic ordering at
; syncscope("workgroup-one-as"). The kernel receives the pointer (%out) and
; the value to exchange (%in) as kernel arguments.
;
; Every check group below belongs to one RUN line at the top of the file and
; was autogenerated by utils/update_llc_test_checks.py, so regenerate the
; checks instead of editing them by hand.
;
; What the expectations show for this ordering
;  - every run expects a single flat_atomic_swap (flat_atomic_swap_b32 on
;    gfx1100 and gfx1200) with no destination register and with no wait or
;    cache invalidate instruction around it;
;  - gfx1200 in default mode is the only run that expects SCOPE_SE on the
;    instruction.
11078 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
11079 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11080 ; GFX7: ; %bb.0: ; %entry
11081 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11082 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
11083 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
11084 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
11085 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
11086 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
11087 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
11088 ; GFX7-NEXT: s_endpgm
11090 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11091 ; GFX10-WGP: ; %bb.0: ; %entry
11092 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11093 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
11094 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11095 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
11096 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
11097 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
11098 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
11099 ; GFX10-WGP-NEXT: s_endpgm
11101 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11102 ; GFX10-CU: ; %bb.0: ; %entry
11103 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11104 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
11105 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
11106 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
11107 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
11108 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
11109 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
11110 ; GFX10-CU-NEXT: s_endpgm
11112 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11113 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
11114 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11115 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
11116 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
11117 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
11118 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
11119 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
11120 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
11121 ; SKIP-CACHE-INV-NEXT: s_endpgm
11123 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11124 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
11125 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11126 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11127 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11128 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11129 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11130 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11131 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
11133 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11134 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
11135 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11136 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11137 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11138 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11139 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11140 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11141 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11143 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11144 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11145 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11146 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11147 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11148 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11149 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11150 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11151 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11153 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11154 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11155 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11156 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11157 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11158 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11159 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11160 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11161 ; GFX940-TGSPLIT-NEXT: s_endpgm
11163 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11164 ; GFX11-WGP: ; %bb.0: ; %entry
11165 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11166 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11167 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11168 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
11169 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
11170 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
11171 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
11172 ; GFX11-WGP-NEXT: s_endpgm
11174 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11175 ; GFX11-CU: ; %bb.0: ; %entry
11176 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11177 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11178 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11179 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
11180 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
11181 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
11182 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11183 ; GFX11-CU-NEXT: s_endpgm
11185 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11186 ; GFX12-WGP: ; %bb.0: ; %entry
11187 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11188 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11189 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11190 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
11191 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
11192 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
11193 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11194 ; GFX12-WGP-NEXT: s_endpgm
11196 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
11197 ; GFX12-CU: ; %bb.0: ; %entry
11198 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11199 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11200 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11201 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
11202 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
11203 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
11204 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11205 ; GFX12-CU-NEXT: s_endpgm
11206 ptr %out, i32 %in) {
; Operation under test, the only IR instruction visible in this kernel body.
11208 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") monotonic
; Volatile atomicrmw xchg on a flat pointer with acquire ordering at
; syncscope("workgroup-one-as"). The kernel receives the pointer (%out) and
; the value to exchange (%in) as kernel arguments.
;
; Every check group below belongs to one RUN line at the top of the file and
; was autogenerated by utils/update_llc_test_checks.py, so regenerate the
; checks instead of editing them by hand.
;
; What the expectations show for this ordering
;  - gfx700 (both runs), the +cumode runs and the gfx90a and gfx940 runs
;    without tgsplit expect the bare flat_atomic_swap;
;  - gfx1010 and gfx1100 in default mode expect "s_waitcnt_vscnt null, 0x0"
;    followed by buffer_gl0_inv after the swap;
;  - gfx90a with +tgsplit expects "s_waitcnt vmcnt(0)" followed by
;    buffer_wbinvl1_vol after the swap;
;  - gfx940 with +tgsplit expects "s_waitcnt vmcnt(0)" followed by
;    "buffer_inv sc0" after the swap;
;  - gfx1200 in default mode expects SCOPE_SE on the swap, then
;    "s_wait_storecnt 0x0" and a global_inv at SCOPE_SE.
11212 define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
11213 ; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11214 ; GFX7: ; %bb.0: ; %entry
11215 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11216 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
11217 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
11218 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
11219 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
11220 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
11221 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
11222 ; GFX7-NEXT: s_endpgm
11224 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11225 ; GFX10-WGP: ; %bb.0: ; %entry
11226 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11227 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
11228 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11229 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
11230 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
11231 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
11232 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
11233 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11234 ; GFX10-WGP-NEXT: buffer_gl0_inv
11235 ; GFX10-WGP-NEXT: s_endpgm
11237 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11238 ; GFX10-CU: ; %bb.0: ; %entry
11239 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11240 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
11241 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
11242 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
11243 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
11244 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
11245 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
11246 ; GFX10-CU-NEXT: s_endpgm
11248 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11249 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
11250 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11251 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
11252 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
11253 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
11254 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
11255 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
11256 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
11257 ; SKIP-CACHE-INV-NEXT: s_endpgm
11259 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11260 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
11261 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11262 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11263 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11264 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11265 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11266 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11267 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
11269 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11270 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
11271 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11272 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11273 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11274 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11275 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11276 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11277 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11278 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
11279 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11281 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11282 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11283 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11284 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11285 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11286 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11287 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11288 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11289 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11291 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11292 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11293 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11294 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11295 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11296 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11297 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11298 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11299 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11300 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
11301 ; GFX940-TGSPLIT-NEXT: s_endpgm
11303 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11304 ; GFX11-WGP: ; %bb.0: ; %entry
11305 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11306 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11307 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11308 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
11309 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
11310 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
11311 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
11312 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11313 ; GFX11-WGP-NEXT: buffer_gl0_inv
11314 ; GFX11-WGP-NEXT: s_endpgm
11316 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11317 ; GFX11-CU: ; %bb.0: ; %entry
11318 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11319 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11320 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11321 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
11322 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
11323 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
11324 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11325 ; GFX11-CU-NEXT: s_endpgm
11327 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11328 ; GFX12-WGP: ; %bb.0: ; %entry
11329 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11330 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11331 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11332 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
11333 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
11334 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
11335 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11336 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11337 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
11338 ; GFX12-WGP-NEXT: s_endpgm
11340 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
11341 ; GFX12-CU: ; %bb.0: ; %entry
11342 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11343 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11344 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11345 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
11346 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
11347 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
11348 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11349 ; GFX12-CU-NEXT: s_endpgm
11350 ptr %out, i32 %in) {
; Operation under test, the only IR instruction visible in this kernel body.
11352 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
; Volatile atomicrmw xchg on a flat pointer with release ordering at
; syncscope("workgroup-one-as"). The kernel receives the pointer (%out) and
; the value to exchange (%in) as kernel arguments.
;
; Every check group below belongs to one RUN line at the top of the file and
; was autogenerated by utils/update_llc_test_checks.py, so regenerate the
; checks instead of editing them by hand.
;
; What the expectations show for this ordering
;  - gfx700 (both runs), the +cumode runs and the gfx90a and gfx940 runs
;    without tgsplit expect the bare flat_atomic_swap;
;  - gfx1010 and gfx1100 in default mode expect "s_waitcnt vmcnt(0)" and
;    "s_waitcnt_vscnt null, 0x0" ahead of the swap;
;  - the +tgsplit runs (gfx90a, gfx940) expect "s_waitcnt vmcnt(0)" ahead of
;    the swap;
;  - gfx1200 in default mode expects s_wait_bvhcnt, s_wait_samplecnt,
;    s_wait_loadcnt and s_wait_storecnt ahead of the swap and SCOPE_SE on
;    the swap itself;
;  - no run expects any instruction between the swap and s_endpgm.
11356 define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
11357 ; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
11358 ; GFX7: ; %bb.0: ; %entry
11359 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11360 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
11361 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
11362 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
11363 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
11364 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
11365 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
11366 ; GFX7-NEXT: s_endpgm
11368 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
11369 ; GFX10-WGP: ; %bb.0: ; %entry
11370 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11371 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
11372 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11373 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
11374 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
11375 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
11376 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
11377 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11378 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
11379 ; GFX10-WGP-NEXT: s_endpgm
11381 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
11382 ; GFX10-CU: ; %bb.0: ; %entry
11383 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11384 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
11385 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
11386 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
11387 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
11388 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
11389 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
11390 ; GFX10-CU-NEXT: s_endpgm
11392 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_atomicrmw:
11393 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
11394 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11395 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
11396 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
11397 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
11398 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
11399 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
11400 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
11401 ; SKIP-CACHE-INV-NEXT: s_endpgm
11403 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11404 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
11405 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11406 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11407 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11408 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11409 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11410 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11411 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
11413 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11414 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
11415 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11416 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11417 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11418 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11419 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11420 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11421 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11422 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11424 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11425 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11426 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11427 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11428 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11429 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11430 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11431 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11432 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11434 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
11435 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11436 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11437 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11438 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11439 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11440 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11441 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11442 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11443 ; GFX940-TGSPLIT-NEXT: s_endpgm
11445 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
11446 ; GFX11-WGP: ; %bb.0: ; %entry
11447 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11448 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11449 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11450 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
11451 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
11452 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
11453 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
11454 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11455 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
11456 ; GFX11-WGP-NEXT: s_endpgm
11458 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
11459 ; GFX11-CU: ; %bb.0: ; %entry
11460 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11461 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11462 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11463 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
11464 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
11465 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
11466 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11467 ; GFX11-CU-NEXT: s_endpgm
11469 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
11470 ; GFX12-WGP: ; %bb.0: ; %entry
11471 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11472 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11473 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11474 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
11475 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
11476 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
11477 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
11478 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
11479 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
11480 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11481 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11482 ; GFX12-WGP-NEXT: s_endpgm
11484 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
11485 ; GFX12-CU: ; %bb.0: ; %entry
11486 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11487 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11488 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11489 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
11490 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
11491 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
11492 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11493 ; GFX12-CU-NEXT: s_endpgm
11494 ptr %out, i32 %in) {
; Operation under test, the only IR instruction visible in this kernel body.
11496 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") release
; Kernel under test: a volatile `atomicrmw xchg` on pointer %out with
; syncscope("workgroup-one-as") and acq_rel ordering. The result %val is not
; used in the lines shown here.
; The assertion lines below are autogenerated (see the note at the top of the
; file) and pin the exact llc output for each configuration listed there; do
; not edit them by hand.
; As recorded below, the GFX10/GFX11/GFX12 WGP runs and the two tgsplit runs
; expect waits between the operand moves and the swap, plus a wait and a cache
; invalidate after it. Every other run expects the bare flat atomic swap.
11500 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
11501 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11502 ; GFX7: ; %bb.0: ; %entry
11503 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11504 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
11505 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
11506 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
11507 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
11508 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
11509 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
11510 ; GFX7-NEXT: s_endpgm
11512 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11513 ; GFX10-WGP: ; %bb.0: ; %entry
11514 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11515 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
11516 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11517 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
11518 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
11519 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
11520 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
11521 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11522 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
11523 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11524 ; GFX10-WGP-NEXT: buffer_gl0_inv
11525 ; GFX10-WGP-NEXT: s_endpgm
11527 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11528 ; GFX10-CU: ; %bb.0: ; %entry
11529 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11530 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
11531 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
11532 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
11533 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
11534 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
11535 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
11536 ; GFX10-CU-NEXT: s_endpgm
11538 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11539 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
11540 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11541 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
11542 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
11543 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
11544 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
11545 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
11546 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
11547 ; SKIP-CACHE-INV-NEXT: s_endpgm
11549 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11550 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
11551 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11552 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11553 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11554 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11555 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11556 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11557 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
11559 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11560 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
11561 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11562 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11563 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11564 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11565 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11566 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11567 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11568 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11569 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
11570 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11572 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11573 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11574 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11575 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11576 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11577 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11578 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11579 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11580 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11582 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11583 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11584 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11585 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11586 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11587 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11588 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11589 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11590 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11591 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11592 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
11593 ; GFX940-TGSPLIT-NEXT: s_endpgm
11595 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11596 ; GFX11-WGP: ; %bb.0: ; %entry
11597 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11598 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11599 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11600 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
11601 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
11602 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
11603 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
11604 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11605 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
11606 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11607 ; GFX11-WGP-NEXT: buffer_gl0_inv
11608 ; GFX11-WGP-NEXT: s_endpgm
11610 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11611 ; GFX11-CU: ; %bb.0: ; %entry
11612 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11613 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11614 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11615 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
11616 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
11617 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
11618 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11619 ; GFX11-CU-NEXT: s_endpgm
11621 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11622 ; GFX12-WGP: ; %bb.0: ; %entry
11623 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11624 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11625 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11626 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
11627 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
11628 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
11629 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
11630 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
11631 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
11632 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11633 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11634 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11635 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
11636 ; GFX12-WGP-NEXT: s_endpgm
11638 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
11639 ; GFX12-CU: ; %bb.0: ; %entry
11640 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11641 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11642 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11643 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
11644 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
11645 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
11646 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11647 ; GFX12-CU-NEXT: s_endpgm
; The kernel signature resumes here: the assertion lines above sit between the
; opening `define ...(` and the parameter list.
11648 ptr %out, i32 %in) {
11650 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
; Kernel under test: a volatile `atomicrmw xchg` on pointer %out with
; syncscope("workgroup-one-as") and seq_cst ordering. The result %val is not
; used in the lines shown here.
; The assertion lines below are autogenerated (see the note at the top of the
; file) and pin the exact llc output for each configuration listed there; do
; not edit them by hand.
; As recorded below, the GFX10/GFX11/GFX12 WGP runs and the two tgsplit runs
; expect waits between the operand moves and the swap, plus a wait and a cache
; invalidate after it. Every other run expects the bare flat atomic swap.
11654 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
11655 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11656 ; GFX7: ; %bb.0: ; %entry
11657 ; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11658 ; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
11659 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
11660 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
11661 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
11662 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
11663 ; GFX7-NEXT: flat_atomic_swap v[0:1], v2
11664 ; GFX7-NEXT: s_endpgm
11666 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11667 ; GFX10-WGP: ; %bb.0: ; %entry
11668 ; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11669 ; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
11670 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11671 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
11672 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
11673 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s4
11674 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
11675 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11676 ; GFX10-WGP-NEXT: flat_atomic_swap v[0:1], v2
11677 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11678 ; GFX10-WGP-NEXT: buffer_gl0_inv
11679 ; GFX10-WGP-NEXT: s_endpgm
11681 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11682 ; GFX10-CU: ; %bb.0: ; %entry
11683 ; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11684 ; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
11685 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
11686 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
11687 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
11688 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s4
11689 ; GFX10-CU-NEXT: flat_atomic_swap v[0:1], v2
11690 ; GFX10-CU-NEXT: s_endpgm
11692 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11693 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
11694 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11695 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[4:5], 0x2
11696 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
11697 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
11698 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
11699 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
11700 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v[0:1], v2
11701 ; SKIP-CACHE-INV-NEXT: s_endpgm
11703 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11704 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
11705 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11706 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11707 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11708 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11709 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11710 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11711 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
11713 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11714 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
11715 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
11716 ; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
11717 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11718 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[6:7], s[6:7] op_sel:[0,1]
11719 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s4
11720 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11721 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11722 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11723 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
11724 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11726 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11727 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11728 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11729 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11730 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11731 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11732 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11733 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11734 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11736 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11737 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11738 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0
11739 ; GFX940-TGSPLIT-NEXT: s_load_dword s0, s[4:5], 0x8
11740 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11741 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
11742 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s0
11743 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11744 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v[0:1], v2
11745 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11746 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
11747 ; GFX940-TGSPLIT-NEXT: s_endpgm
11749 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11750 ; GFX11-WGP: ; %bb.0: ; %entry
11751 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11752 ; GFX11-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11753 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11754 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
11755 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3
11756 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0
11757 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
11758 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11759 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2
11760 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
11761 ; GFX11-WGP-NEXT: buffer_gl0_inv
11762 ; GFX11-WGP-NEXT: s_endpgm
11764 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11765 ; GFX11-CU: ; %bb.0: ; %entry
11766 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11767 ; GFX11-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11768 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11769 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
11770 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3
11771 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0
11772 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11773 ; GFX11-CU-NEXT: s_endpgm
11775 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11776 ; GFX12-WGP: ; %bb.0: ; %entry
11777 ; GFX12-WGP-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11778 ; GFX12-WGP-NEXT: s_load_b32 s0, s[4:5], 0x8
11779 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11780 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
11781 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s3
11782 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s0
11783 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
11784 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
11785 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
11786 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11787 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SE
11788 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
11789 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
11790 ; GFX12-WGP-NEXT: s_endpgm
11792 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
11793 ; GFX12-CU: ; %bb.0: ; %entry
11794 ; GFX12-CU-NEXT: s_load_b64 s[2:3], s[4:5], 0x0
11795 ; GFX12-CU-NEXT: s_load_b32 s0, s[4:5], 0x8
11796 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11797 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
11798 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s3
11799 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s0
11800 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2
11801 ; GFX12-CU-NEXT: s_endpgm
; The kernel signature resumes here: the assertion lines above sit between the
; opening `define ...(` and the parameter list.
11802 ptr %out, i32 %in) {
11804 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
; Kernel under test: a volatile `atomicrmw xchg` on pointer %out with
; syncscope("workgroup-one-as") and acquire ordering, whose returned value %val
; is then stored back to %out.
; The assertion lines below are autogenerated (see the note at the top of the
; file) and pin the exact llc output for each configuration listed there; do
; not edit them by hand.
; As recorded below, every run expects the returning form of the swap (glc,
; sc0 or TH_ATOMIC_RETURN). The GFX10/GFX11/GFX12 WGP runs and the two tgsplit
; runs additionally expect a wait on the swap followed by a cache invalidate
; before the store. No run expects a wait between the operand moves and the
; swap itself.
11808 define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
11809 ; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11810 ; GFX7: ; %bb.0: ; %entry
11811 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
11812 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
11813 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
11814 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
11815 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
11816 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
11817 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
11818 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
11819 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
11820 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11821 ; GFX7-NEXT: flat_store_dword v[0:1], v2
11822 ; GFX7-NEXT: s_endpgm
11824 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11825 ; GFX10-WGP: ; %bb.0: ; %entry
11826 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
11827 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
11828 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11829 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
11830 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
11831 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
11832 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
11833 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
11834 ; GFX10-WGP-NEXT: buffer_gl0_inv
11835 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
11836 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
11837 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
11838 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
11839 ; GFX10-WGP-NEXT: s_endpgm
11841 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11842 ; GFX10-CU: ; %bb.0: ; %entry
11843 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
11844 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
11845 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
11846 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
11847 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
11848 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
11849 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
11850 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
11851 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
11852 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11853 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
11854 ; GFX10-CU-NEXT: s_endpgm
11856 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11857 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
11858 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
11859 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
11860 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
11861 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
11862 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
11863 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
11864 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
11865 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
11866 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
11867 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11868 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
11869 ; SKIP-CACHE-INV-NEXT: s_endpgm
11871 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11872 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
11873 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
11874 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
11875 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11876 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11877 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
11878 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
11879 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11880 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11881 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
11882 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
11884 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11885 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
11886 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
11887 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
11888 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11889 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11890 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
11891 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
11892 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11893 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
11894 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
11895 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
11896 ; GFX90A-TGSPLIT-NEXT: s_endpgm
11898 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11899 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
11900 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
11901 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
11902 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11903 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
11904 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
11905 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
11906 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
11907 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11908 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
11909 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
11911 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11912 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
11913 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
11914 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
11915 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
11916 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
11917 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
11918 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
11919 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
11920 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
11921 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
11922 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
11923 ; GFX940-TGSPLIT-NEXT: s_endpgm
11925 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11926 ; GFX11-WGP: ; %bb.0: ; %entry
11927 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
11928 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
11929 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11930 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
11931 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
11932 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
11933 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
11934 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
11935 ; GFX11-WGP-NEXT: buffer_gl0_inv
11936 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
11937 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
11938 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
11939 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
11940 ; GFX11-WGP-NEXT: s_endpgm
11942 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11943 ; GFX11-CU: ; %bb.0: ; %entry
11944 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
11945 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
11946 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
11947 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
11948 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
11949 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
11950 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
11951 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
11952 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
11953 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11954 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
11955 ; GFX11-CU-NEXT: s_endpgm
11957 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11958 ; GFX12-WGP: ; %bb.0: ; %entry
11959 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
11960 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
11961 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
11962 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
11963 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
11964 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
11965 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
11966 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
11967 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
11968 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
11969 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
11970 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
11971 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
11972 ; GFX12-WGP-NEXT: s_endpgm
11974 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
11975 ; GFX12-CU: ; %bb.0: ; %entry
11976 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
11977 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
11978 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
11979 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
11980 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
11981 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
11982 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
11983 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
11984 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
11985 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
11986 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
11987 ; GFX12-CU-NEXT: s_endpgm
; The kernel signature resumes here: the assertion lines above sit between the
; opening `define ...(` and the parameter list.
11988 ptr %out, i32 %in) {
11990 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acquire
; Using %val here is what makes this the value-returning variant of the test.
11991 store i32 %val, ptr %out, align 4
; Kernel under test: a volatile `atomicrmw xchg` on pointer %out with
; syncscope("workgroup-one-as") and acq_rel ordering, whose returned value %val
; is then stored back to %out.
; The assertion lines below are autogenerated (see the note at the top of the
; file) and pin the exact llc output for each configuration listed there; do
; not edit them by hand.
; As recorded below, every run expects the returning form of the swap (glc,
; sc0 or TH_ATOMIC_RETURN). The GFX10/GFX11/GFX12 WGP runs and the two tgsplit
; runs additionally expect waits between the operand moves and the swap, and a
; wait plus a cache invalidate between the swap and the store.
11995 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
11996 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
11997 ; GFX7: ; %bb.0: ; %entry
11998 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
11999 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
12000 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
12001 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12002 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12003 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
12004 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12005 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12006 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12007 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12008 ; GFX7-NEXT: flat_store_dword v[0:1], v2
12009 ; GFX7-NEXT: s_endpgm
12011 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12012 ; GFX10-WGP: ; %bb.0: ; %entry
12013 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12014 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
12015 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12016 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12017 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12018 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
12019 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
12020 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12021 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12022 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
12023 ; GFX10-WGP-NEXT: buffer_gl0_inv
12024 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12025 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12026 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12027 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
12028 ; GFX10-WGP-NEXT: s_endpgm
12030 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12031 ; GFX10-CU: ; %bb.0: ; %entry
12032 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12033 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
12034 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
12035 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12036 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12037 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
12038 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12039 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12040 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12041 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12042 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
12043 ; GFX10-CU-NEXT: s_endpgm
12045 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12046 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
12047 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12048 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
12049 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
12050 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12051 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12052 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
12053 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12054 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12055 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12056 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12057 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
12058 ; SKIP-CACHE-INV-NEXT: s_endpgm
12060 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12061 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
12062 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12063 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
12064 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12065 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12066 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
12067 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12068 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12069 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12070 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
12071 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
12073 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12074 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
12075 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12076 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
12077 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12078 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12079 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
12080 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12081 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12082 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12083 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
12084 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12085 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
12086 ; GFX90A-TGSPLIT-NEXT: s_endpgm
12088 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12089 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
12090 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12091 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
12092 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12093 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12094 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
12095 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
12096 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12097 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12098 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
12099 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
12101 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12102 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
12103 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12104 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
12105 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12106 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12107 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
12108 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12109 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
12110 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12111 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
12112 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12113 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
12114 ; GFX940-TGSPLIT-NEXT: s_endpgm
12116 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12117 ; GFX11-WGP: ; %bb.0: ; %entry
12118 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12119 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
12120 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
12121 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
12122 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
12123 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
12124 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
12125 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12126 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
12127 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
12128 ; GFX11-WGP-NEXT: buffer_gl0_inv
12129 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
12130 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
12131 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
12132 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
12133 ; GFX11-WGP-NEXT: s_endpgm
12135 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12136 ; GFX11-CU: ; %bb.0: ; %entry
12137 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12138 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
12139 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
12140 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
12141 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
12142 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
12143 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
12144 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
12145 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
12146 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12147 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
12148 ; GFX11-CU-NEXT: s_endpgm
12150 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12151 ; GFX12-WGP: ; %bb.0: ; %entry
12152 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12153 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
12154 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
12155 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
12156 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
12157 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
12158 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
12159 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
12160 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
12161 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
12162 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
12163 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
12164 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
12165 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
12166 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
12167 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
12168 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
12169 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
12170 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
12171 ; GFX12-WGP-NEXT: s_endpgm
12173 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
12174 ; GFX12-CU: ; %bb.0: ; %entry
12175 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12176 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
12177 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
12178 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
12179 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
12180 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
12181 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12182 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
12183 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
12184 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
12185 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
12186 ; GFX12-CU-NEXT: s_endpgm
; The kernel signature resumes here: the assertion lines above sit between the
; opening `define ...(` and the parameter list.
12187 ptr %out, i32 %in) {
12189 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") acq_rel
; Using %val here is what makes this the value-returning variant of the test.
12190 store i32 %val, ptr %out, align 4
; Test: returning `atomicrmw volatile xchg` through a plain `ptr` at
; syncscope("workgroup-one-as") with seq_cst ordering. The value returned by
; the atomic is then stored back through %out, so every run line expects the
; returning form of the swap followed by a flat store of its result.
;
; What the expectations below show:
;   * gfx700, the amdpal/skip-cache-invalidations run, the cumode runs and the
;     non-tgsplit gfx90a/gfx940 runs expect only the atomic itself, plus the
;     wait needed before the dependent store.
;   * The default (non-cumode) gfx1010/gfx1100/gfx1200 runs and the tgsplit
;     gfx90a/gfx940 runs additionally expect waits ahead of the atomic and a
;     wait plus cache invalidate after it.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
12194 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
12195 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12196 ; GFX7: ; %bb.0: ; %entry
12197 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12198 ; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
12199 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
12200 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12201 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12202 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
12203 ; GFX7-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12204 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12205 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12206 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12207 ; GFX7-NEXT: flat_store_dword v[0:1], v2
12208 ; GFX7-NEXT: s_endpgm
12210 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12211 ; GFX10-WGP: ; %bb.0: ; %entry
12212 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12213 ; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
12214 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12215 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12216 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12217 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s6
12218 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
12219 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12220 ; GFX10-WGP-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12221 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
12222 ; GFX10-WGP-NEXT: buffer_gl0_inv
12223 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12224 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12225 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12226 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
12227 ; GFX10-WGP-NEXT: s_endpgm
12229 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12230 ; GFX10-CU: ; %bb.0: ; %entry
12231 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12232 ; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
12233 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
12234 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12235 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12236 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s6
12237 ; GFX10-CU-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12238 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12239 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12240 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12241 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
12242 ; GFX10-CU-NEXT: s_endpgm
12244 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12245 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
12246 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12247 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[4:5], 0x2
12248 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
12249 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12250 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12251 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s2
12252 ; SKIP-CACHE-INV-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12253 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12254 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12255 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12256 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
12257 ; SKIP-CACHE-INV-NEXT: s_endpgm
12259 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12260 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
12261 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12262 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
12263 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12264 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12265 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s6
12266 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12267 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12268 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12269 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
12270 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
12272 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12273 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
12274 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12275 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
12276 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12277 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12278 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s6
12279 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12280 ; GFX90A-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 glc
12281 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12282 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
12283 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12284 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
12285 ; GFX90A-TGSPLIT-NEXT: s_endpgm
12287 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12288 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
12289 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12290 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
12291 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12292 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12293 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s2
12294 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
12295 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12296 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12297 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
12298 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
12300 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12301 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
12302 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12303 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8
12304 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12305 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12306 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s2
12307 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12308 ; GFX940-TGSPLIT-NEXT: flat_atomic_swap v2, v[0:1], v2 sc0
12309 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12310 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
12311 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12312 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
12313 ; GFX940-TGSPLIT-NEXT: s_endpgm
12315 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12316 ; GFX11-WGP: ; %bb.0: ; %entry
12317 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12318 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
12319 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
12320 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
12321 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
12322 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2
12323 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
12324 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12325 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
12326 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
12327 ; GFX11-WGP-NEXT: buffer_gl0_inv
12328 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
12329 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
12330 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
12331 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
12332 ; GFX11-WGP-NEXT: s_endpgm
12334 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12335 ; GFX11-CU: ; %bb.0: ; %entry
12336 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12337 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
12338 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
12339 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
12340 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
12341 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2
12342 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc
12343 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
12344 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
12345 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
12346 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
12347 ; GFX11-CU-NEXT: s_endpgm
12349 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12350 ; GFX12-WGP: ; %bb.0: ; %entry
12351 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12352 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0x8
12353 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
12354 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
12355 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
12356 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s2
12357 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
12358 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
12359 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
12360 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
12361 ; GFX12-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SE
12362 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
12363 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
12364 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
12365 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
12366 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
12367 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
12368 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
12369 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
12370 ; GFX12-WGP-NEXT: s_endpgm
12372 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
12373 ; GFX12-CU: ; %bb.0: ; %entry
12374 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12375 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0x8
12376 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
12377 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
12378 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
12379 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s2
12380 ; GFX12-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN
12381 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
12382 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
12383 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
12384 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
12385 ; GFX12-CU-NEXT: s_endpgm
12386 ptr %out, i32 %in) {
; Swap %in into *%out, then write the previous value back to the same address.
12388 %val = atomicrmw volatile xchg ptr %out, i32 %in syncscope("workgroup-one-as") seq_cst
12389 store i32 %val, ptr %out, align 4
; Test: `cmpxchg volatile` through a plain `ptr` at
; syncscope("workgroup-one-as") with monotonic success and monotonic failure
; orderings, applied to element 4 of %out.
;
; What the expectations below show:
;   * Every run line expects the no-return form of the flat compare-and-swap
;     (address pair plus a {new, expected} data pair) and nothing else around
;     it: no extra waits and no cache invalidates.
;   * The default (non-cumode) gfx1200 run additionally carries
;     scope:SCOPE_SE on the atomic.
;   * The 16-byte GEP offset is materialized with a scalar 64-bit add on
;     gfx700, gfx1010 and the amdpal run, and folded into `offset:16` on the
;     atomic for the remaining runs.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
12393 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
12394 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12395 ; GFX7: ; %bb.0: ; %entry
12396 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
12397 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12398 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
12399 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
12400 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
12401 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
12402 ; GFX7-NEXT: s_mov_b32 s4, s8
12403 ; GFX7-NEXT: s_mov_b32 s5, s9
12404 ; GFX7-NEXT: s_mov_b32 s9, s10
12405 ; GFX7-NEXT: s_mov_b32 s8, s11
12406 ; GFX7-NEXT: s_add_u32 s4, s4, s9
12407 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
12408 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12409 ; GFX7-NEXT: s_mov_b32 s5, s8
12410 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
12411 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
12412 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12413 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
12414 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12415 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12416 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12417 ; GFX7-NEXT: s_endpgm
12419 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12420 ; GFX10-WGP: ; %bb.0: ; %entry
12421 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
12422 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12423 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
12424 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
12425 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
12426 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12427 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
12428 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
12429 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
12430 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
12431 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
12432 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
12433 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12434 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
12435 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
12436 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
12437 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12438 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
12439 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12440 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12441 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12442 ; GFX10-WGP-NEXT: s_endpgm
12444 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12445 ; GFX10-CU: ; %bb.0: ; %entry
12446 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
12447 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12448 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
12449 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
12450 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
12451 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
12452 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
12453 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
12454 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
12455 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
12456 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
12457 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
12458 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12459 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
12460 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
12461 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
12462 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12463 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
12464 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12465 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12466 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12467 ; GFX10-CU-NEXT: s_endpgm
12469 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12470 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
12471 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
12472 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
12473 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
12474 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
12475 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
12476 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
12477 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
12478 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
12479 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
12480 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
12481 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
12482 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
12483 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12484 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
12485 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
12486 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
12487 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12488 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
12489 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12490 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12491 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12492 ; SKIP-CACHE-INV-NEXT: s_endpgm
12494 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12495 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
12496 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12497 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
12498 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
12499 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12500 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
12501 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
12502 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12503 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12504 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12505 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12506 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
12508 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12509 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
12510 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12511 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
12512 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
12513 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12514 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
12515 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
12516 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12517 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12518 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12519 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12520 ; GFX90A-TGSPLIT-NEXT: s_endpgm
12522 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12523 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
12524 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12525 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
12526 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
12527 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12528 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
12529 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
12530 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12531 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12532 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12533 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12534 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
12536 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12537 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
12538 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12539 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
12540 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
12541 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12542 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
12543 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
12544 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12545 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12546 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12547 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12548 ; GFX940-TGSPLIT-NEXT: s_endpgm
12550 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12551 ; GFX11-WGP: ; %bb.0: ; %entry
12552 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12553 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
12554 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
12555 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
12556 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
12557 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
12558 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12559 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
12560 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
12561 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
12562 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12563 ; GFX11-WGP-NEXT: s_endpgm
12565 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12566 ; GFX11-CU: ; %bb.0: ; %entry
12567 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12568 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
12569 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
12570 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
12571 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
12572 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
12573 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12574 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
12575 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
12576 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
12577 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12578 ; GFX11-CU-NEXT: s_endpgm
12580 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12581 ; GFX12-WGP: ; %bb.0: ; %entry
12582 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12583 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
12584 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
12585 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
12586 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
12587 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
12588 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12589 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
12590 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
12591 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
12592 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
12593 ; GFX12-WGP-NEXT: s_endpgm
12595 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
12596 ; GFX12-CU: ; %bb.0: ; %entry
12597 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12598 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
12599 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
12600 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
12601 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
12602 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
12603 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12604 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
12605 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
12606 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
12607 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12608 ; GFX12-CU-NEXT: s_endpgm
12609 ptr %out, i32 %in, i32 %old) {
; %gep addresses the i32 at index 4 from %out, i.e. 16 bytes past the base;
; this is the constant 16 seen in the expectations above.
12611 %gep = getelementptr i32, ptr %out, i32 4
12612 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
; Test: `cmpxchg volatile` through a plain `ptr` at
; syncscope("workgroup-one-as") with acquire success and monotonic failure
; orderings, applied to element 4 of %out.
;
; What the expectations below show:
;   * Every run line expects the no-return form of the flat compare-and-swap.
;   * The default (non-cumode) gfx1010/gfx1100/gfx1200 runs and the tgsplit
;     gfx90a/gfx940 runs additionally expect a wait followed by a cache
;     invalidate after the atomic.
;   * gfx700, the amdpal/skip-cache-invalidations run, the cumode runs and the
;     non-tgsplit gfx90a/gfx940 runs expect nothing after the atomic.
;   * The 16-byte GEP offset is materialized with a scalar 64-bit add on
;     gfx700, gfx1010 and the amdpal run, and folded into `offset:16` on the
;     atomic for the remaining runs.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
12616 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
12617 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12618 ; GFX7: ; %bb.0: ; %entry
12619 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
12620 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12621 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
12622 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
12623 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
12624 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
12625 ; GFX7-NEXT: s_mov_b32 s4, s8
12626 ; GFX7-NEXT: s_mov_b32 s5, s9
12627 ; GFX7-NEXT: s_mov_b32 s9, s10
12628 ; GFX7-NEXT: s_mov_b32 s8, s11
12629 ; GFX7-NEXT: s_add_u32 s4, s4, s9
12630 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
12631 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12632 ; GFX7-NEXT: s_mov_b32 s5, s8
12633 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
12634 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
12635 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12636 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
12637 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12638 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12639 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12640 ; GFX7-NEXT: s_endpgm
12642 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12643 ; GFX10-WGP: ; %bb.0: ; %entry
12644 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
12645 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12646 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
12647 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
12648 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
12649 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12650 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
12651 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
12652 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
12653 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
12654 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
12655 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
12656 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12657 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
12658 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
12659 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
12660 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12661 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
12662 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12663 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12664 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12665 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12666 ; GFX10-WGP-NEXT: buffer_gl0_inv
12667 ; GFX10-WGP-NEXT: s_endpgm
12669 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12670 ; GFX10-CU: ; %bb.0: ; %entry
12671 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
12672 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12673 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
12674 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
12675 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
12676 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
12677 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
12678 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
12679 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
12680 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
12681 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
12682 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
12683 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12684 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
12685 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
12686 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
12687 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12688 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
12689 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12690 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12691 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12692 ; GFX10-CU-NEXT: s_endpgm
12694 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12695 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
12696 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
12697 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
12698 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
12699 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
12700 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
12701 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
12702 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
12703 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
12704 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
12705 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
12706 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
12707 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
12708 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12709 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
12710 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
12711 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
12712 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12713 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
12714 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12715 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12716 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12717 ; SKIP-CACHE-INV-NEXT: s_endpgm
12719 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12720 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
12721 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12722 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
12723 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
12724 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12725 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
12726 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
12727 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12728 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12729 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12730 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12731 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
12733 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12734 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
12735 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12736 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
12737 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
12738 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12739 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
12740 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
12741 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12742 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12743 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12744 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12745 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12746 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
12747 ; GFX90A-TGSPLIT-NEXT: s_endpgm
12749 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12750 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
12751 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12752 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
12753 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
12754 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12755 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
12756 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
12757 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12758 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12759 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12760 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12761 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
12763 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12764 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
12765 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12766 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
12767 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
12768 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12769 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
12770 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
12771 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12772 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12773 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12774 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12775 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12776 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
12777 ; GFX940-TGSPLIT-NEXT: s_endpgm
12779 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12780 ; GFX11-WGP: ; %bb.0: ; %entry
12781 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12782 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
12783 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
12784 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
12785 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
12786 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
12787 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12788 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
12789 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
12790 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
12791 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12792 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12793 ; GFX11-WGP-NEXT: buffer_gl0_inv
12794 ; GFX11-WGP-NEXT: s_endpgm
12796 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12797 ; GFX11-CU: ; %bb.0: ; %entry
12798 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12799 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
12800 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
12801 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
12802 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
12803 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
12804 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12805 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
12806 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
12807 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
12808 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12809 ; GFX11-CU-NEXT: s_endpgm
12811 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12812 ; GFX12-WGP: ; %bb.0: ; %entry
12813 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12814 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
12815 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
12816 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
12817 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
12818 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
12819 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12820 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
12821 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
12822 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
12823 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
12824 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
12825 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
12826 ; GFX12-WGP-NEXT: s_endpgm
12828 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
12829 ; GFX12-CU: ; %bb.0: ; %entry
12830 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
12831 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
12832 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
12833 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
12834 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
12835 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
12836 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12837 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
12838 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
12839 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
12840 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
12841 ; GFX12-CU-NEXT: s_endpgm
12842 ptr %out, i32 %in, i32 %old) {
; %gep addresses the i32 at index 4 from %out, i.e. 16 bytes past the base;
; this is the constant 16 seen in the expectations above.
12844 %gep = getelementptr i32, ptr %out, i32 4
12845 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
; Kernel under test: a volatile 32-bit compare-and-swap through a flat pointer
; with syncscope("workgroup-one-as"), success ordering `release` and failure
; ordering `monotonic`. The accessed address is %out advanced by four i32
; elements, which appears in the expected output as a 16-byte offset.
;
; The expectation lines below are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
;
; What the expectations show for `release`: wait instructions appear
; immediately before the atomic in the gfx1010 and gfx1100 runs without
; +cumode, in both +tgsplit runs, and in the gfx1200 run without +cumode
; (which additionally tags the atomic with scope:SCOPE_SE). Every other run
; expects the atomic with nothing extra directly around it.
12849 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
12850 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12851 ; GFX7: ; %bb.0: ; %entry
12852 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
12853 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12854 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
12855 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
12856 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
12857 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
12858 ; GFX7-NEXT: s_mov_b32 s4, s8
12859 ; GFX7-NEXT: s_mov_b32 s5, s9
12860 ; GFX7-NEXT: s_mov_b32 s9, s10
12861 ; GFX7-NEXT: s_mov_b32 s8, s11
12862 ; GFX7-NEXT: s_add_u32 s4, s4, s9
12863 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
12864 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12865 ; GFX7-NEXT: s_mov_b32 s5, s8
12866 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
12867 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
12868 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12869 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
12870 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
12871 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
12872 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12873 ; GFX7-NEXT: s_endpgm
12875 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12876 ; GFX10-WGP: ; %bb.0: ; %entry
12877 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
12878 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12879 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
12880 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
12881 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
12882 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
12883 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
12884 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
12885 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
12886 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
12887 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
12888 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
12889 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12890 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
12891 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
12892 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
12893 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12894 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
12895 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
12896 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
12897 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
12898 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
12899 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12900 ; GFX10-WGP-NEXT: s_endpgm
12902 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12903 ; GFX10-CU: ; %bb.0: ; %entry
12904 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
12905 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
12906 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
12907 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
12908 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
12909 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
12910 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
12911 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
12912 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
12913 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
12914 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
12915 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
12916 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
12917 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
12918 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
12919 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
12920 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12921 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
12922 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
12923 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
12924 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12925 ; GFX10-CU-NEXT: s_endpgm
12927 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12928 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
12929 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
12930 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
12931 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
12932 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
12933 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
12934 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
12935 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
12936 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
12937 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
12938 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
12939 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
12940 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
12941 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
12942 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
12943 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
12944 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
12945 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12946 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
12947 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
12948 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
12949 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
12950 ; SKIP-CACHE-INV-NEXT: s_endpgm
12952 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12953 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
12954 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12955 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
12956 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
12957 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12958 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
12959 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
12960 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12961 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12962 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12963 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12964 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
12966 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12967 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
12968 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
12969 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
12970 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
12971 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12972 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
12973 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
12974 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12975 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12976 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
12977 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
12978 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12979 ; GFX90A-TGSPLIT-NEXT: s_endpgm
12981 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12982 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
12983 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12984 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
12985 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
12986 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
12987 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
12988 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
12989 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
12990 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
12991 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
12992 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
12993 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
12995 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
12996 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
12997 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
12998 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
12999 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13000 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13001 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13002 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13003 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13004 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13005 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13006 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13007 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13008 ; GFX940-TGSPLIT-NEXT: s_endpgm
13010 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13011 ; GFX11-WGP: ; %bb.0: ; %entry
13012 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13013 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13014 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13015 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
13016 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
13017 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
13018 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13019 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
13020 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
13021 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
13022 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
13023 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13024 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13025 ; GFX11-WGP-NEXT: s_endpgm
13027 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13028 ; GFX11-CU: ; %bb.0: ; %entry
13029 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13030 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13031 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13032 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
13033 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
13034 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
13035 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13036 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
13037 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
13038 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
13039 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13040 ; GFX11-CU-NEXT: s_endpgm
13042 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13043 ; GFX12-WGP: ; %bb.0: ; %entry
13044 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13045 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13046 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13047 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
13048 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
13049 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
13050 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13051 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
13052 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
13053 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
13054 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
13055 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
13056 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
13057 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
13058 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13059 ; GFX12-WGP-NEXT: s_endpgm
13061 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
13062 ; GFX12-CU: ; %bb.0: ; %entry
13063 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13064 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13065 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13066 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
13067 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
13068 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
13069 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13070 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
13071 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
13072 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
13073 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13074 ; GFX12-CU-NEXT: s_endpgm
; Kernel arguments: destination base pointer, replacement value, expected value.
13075 ptr %out, i32 %in, i32 %old) {
; Element index 4 of an i32 array at %out, i.e. 16 bytes past the base.
13077 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare the word at %gep with %old and, on a match,
; store %in.
13078 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
; Kernel under test: a volatile 32-bit compare-and-swap through a flat pointer
; with syncscope("workgroup-one-as"), success ordering `acq_rel` and failure
; ordering `monotonic`. The accessed address is %out advanced by four i32
; elements, which appears in the expected output as a 16-byte offset.
;
; The expectation lines below are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
;
; What the expectations show for `acq_rel`: the gfx1010 and gfx1100 runs
; without +cumode, both +tgsplit runs, and the gfx1200 run without +cumode
; expect waits immediately before the atomic and, after it, one more wait
; followed by a cache invalidate (buffer_gl0_inv, buffer_wbinvl1_vol,
; buffer_inv sc0 or global_inv, depending on the target). Every other run
; expects the atomic with nothing extra directly around it.
13082 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
13083 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13084 ; GFX7: ; %bb.0: ; %entry
13085 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
13086 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13087 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
13088 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
13089 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
13090 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
13091 ; GFX7-NEXT: s_mov_b32 s4, s8
13092 ; GFX7-NEXT: s_mov_b32 s5, s9
13093 ; GFX7-NEXT: s_mov_b32 s9, s10
13094 ; GFX7-NEXT: s_mov_b32 s8, s11
13095 ; GFX7-NEXT: s_add_u32 s4, s4, s9
13096 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
13097 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13098 ; GFX7-NEXT: s_mov_b32 s5, s8
13099 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
13100 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
13101 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13102 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
13103 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
13104 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
13105 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13106 ; GFX7-NEXT: s_endpgm
13108 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13109 ; GFX10-WGP: ; %bb.0: ; %entry
13110 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
13111 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13112 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
13113 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
13114 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
13115 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
13116 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
13117 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
13118 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
13119 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
13120 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
13121 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
13122 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13123 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
13124 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
13125 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
13126 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13127 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
13128 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
13129 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
13130 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
13131 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13132 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13133 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13134 ; GFX10-WGP-NEXT: buffer_gl0_inv
13135 ; GFX10-WGP-NEXT: s_endpgm
13137 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13138 ; GFX10-CU: ; %bb.0: ; %entry
13139 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
13140 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13141 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
13142 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
13143 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
13144 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
13145 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
13146 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
13147 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
13148 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
13149 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
13150 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
13151 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13152 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
13153 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
13154 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
13155 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13156 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
13157 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
13158 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
13159 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13160 ; GFX10-CU-NEXT: s_endpgm
13162 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13163 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
13164 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
13165 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
13166 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
13167 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
13168 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
13169 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
13170 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
13171 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
13172 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
13173 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
13174 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
13175 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
13176 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13177 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
13178 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
13179 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
13180 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13181 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
13182 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
13183 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
13184 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13185 ; SKIP-CACHE-INV-NEXT: s_endpgm
13187 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13188 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
13189 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13190 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13191 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13192 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13193 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13194 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13195 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13196 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13197 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13198 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13199 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
13201 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13202 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
13203 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13204 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13205 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13206 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13207 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13208 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13209 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13210 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13211 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13212 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13213 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13214 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13215 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
13216 ; GFX90A-TGSPLIT-NEXT: s_endpgm
13218 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13219 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
13220 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13221 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13222 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13223 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13224 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13225 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13226 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13227 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13228 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13229 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13230 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
13232 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13233 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
13234 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13235 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13236 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13237 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13238 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13239 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13240 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13241 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13242 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13243 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13244 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13245 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13246 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
13247 ; GFX940-TGSPLIT-NEXT: s_endpgm
13249 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13250 ; GFX11-WGP: ; %bb.0: ; %entry
13251 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13252 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13253 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13254 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
13255 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
13256 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
13257 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13258 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
13259 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
13260 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
13261 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
13262 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13263 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13264 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13265 ; GFX11-WGP-NEXT: buffer_gl0_inv
13266 ; GFX11-WGP-NEXT: s_endpgm
13268 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13269 ; GFX11-CU: ; %bb.0: ; %entry
13270 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13271 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13272 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13273 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
13274 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
13275 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
13276 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13277 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
13278 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
13279 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
13280 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13281 ; GFX11-CU-NEXT: s_endpgm
13283 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13284 ; GFX12-WGP: ; %bb.0: ; %entry
13285 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13286 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13287 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13288 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
13289 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
13290 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
13291 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13292 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
13293 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
13294 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
13295 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
13296 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
13297 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
13298 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
13299 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13300 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
13301 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
13302 ; GFX12-WGP-NEXT: s_endpgm
13304 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
13305 ; GFX12-CU: ; %bb.0: ; %entry
13306 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13307 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13308 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13309 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
13310 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
13311 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
13312 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13313 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
13314 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
13315 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
13316 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13317 ; GFX12-CU-NEXT: s_endpgm
; Kernel arguments: destination base pointer, replacement value, expected value.
13318 ptr %out, i32 %in, i32 %old) {
; Element index 4 of an i32 array at %out, i.e. 16 bytes past the base.
13320 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare the word at %gep with %old and, on a match,
; store %in.
13321 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
; Kernel under test: a volatile 32-bit compare-and-swap through a flat pointer
; with syncscope("workgroup-one-as"), success ordering `seq_cst` and failure
; ordering `monotonic`. The accessed address is %out advanced by four i32
; elements, which appears in the expected output as a 16-byte offset.
;
; The expectation lines below are produced by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
;
; What the expectations show for `seq_cst`: the gfx1010 and gfx1100 runs
; without +cumode, both +tgsplit runs, and the gfx1200 run without +cumode
; expect waits immediately before the atomic and, after it, one more wait
; followed by a cache invalidate (buffer_gl0_inv, buffer_wbinvl1_vol,
; buffer_inv sc0 or global_inv, depending on the target). Every other run
; expects the atomic with nothing extra directly around it.
13325 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
13326 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13327 ; GFX7: ; %bb.0: ; %entry
13328 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
13329 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13330 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
13331 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
13332 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
13333 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
13334 ; GFX7-NEXT: s_mov_b32 s4, s8
13335 ; GFX7-NEXT: s_mov_b32 s5, s9
13336 ; GFX7-NEXT: s_mov_b32 s9, s10
13337 ; GFX7-NEXT: s_mov_b32 s8, s11
13338 ; GFX7-NEXT: s_add_u32 s4, s4, s9
13339 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
13340 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13341 ; GFX7-NEXT: s_mov_b32 s5, s8
13342 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
13343 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
13344 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13345 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
13346 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
13347 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
13348 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13349 ; GFX7-NEXT: s_endpgm
13351 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13352 ; GFX10-WGP: ; %bb.0: ; %entry
13353 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
13354 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13355 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
13356 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
13357 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
13358 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
13359 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
13360 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
13361 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
13362 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
13363 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
13364 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
13365 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13366 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
13367 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
13368 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
13369 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13370 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
13371 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
13372 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
13373 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
13374 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13375 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13376 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13377 ; GFX10-WGP-NEXT: buffer_gl0_inv
13378 ; GFX10-WGP-NEXT: s_endpgm
13380 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13381 ; GFX10-CU: ; %bb.0: ; %entry
13382 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
13383 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13384 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
13385 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
13386 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
13387 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
13388 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
13389 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
13390 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
13391 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
13392 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
13393 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
13394 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13395 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
13396 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
13397 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
13398 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13399 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
13400 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
13401 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
13402 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13403 ; GFX10-CU-NEXT: s_endpgm
13405 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13406 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
13407 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
13408 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
13409 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
13410 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
13411 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
13412 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
13413 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
13414 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
13415 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
13416 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
13417 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
13418 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
13419 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13420 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
13421 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
13422 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
13423 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13424 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
13425 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
13426 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
13427 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13428 ; SKIP-CACHE-INV-NEXT: s_endpgm
13430 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13431 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
13432 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13433 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13434 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13435 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13436 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13437 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13438 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13439 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13440 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13441 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13442 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
13444 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13445 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
13446 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13447 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13448 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13449 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13450 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13451 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13452 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13453 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13454 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13455 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13456 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13457 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13458 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
13459 ; GFX90A-TGSPLIT-NEXT: s_endpgm
13461 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13462 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
13463 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13464 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13465 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13466 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13467 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13468 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13469 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13470 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13471 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13472 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13473 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
13475 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13476 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
13477 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13478 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13479 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13480 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13481 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13482 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13483 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13484 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13485 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13486 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13487 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13488 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13489 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
13490 ; GFX940-TGSPLIT-NEXT: s_endpgm
13492 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13493 ; GFX11-WGP: ; %bb.0: ; %entry
13494 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13495 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13496 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13497 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
13498 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
13499 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
13500 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13501 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
13502 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
13503 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
13504 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
13505 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13506 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13507 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13508 ; GFX11-WGP-NEXT: buffer_gl0_inv
13509 ; GFX11-WGP-NEXT: s_endpgm
13511 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13512 ; GFX11-CU: ; %bb.0: ; %entry
13513 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13514 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13515 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13516 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
13517 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
13518 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
13519 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13520 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
13521 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
13522 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
13523 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13524 ; GFX11-CU-NEXT: s_endpgm
13526 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13527 ; GFX12-WGP: ; %bb.0: ; %entry
13528 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13529 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13530 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13531 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
13532 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
13533 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
13534 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13535 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
13536 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
13537 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
13538 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
13539 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
13540 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
13541 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
13542 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13543 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
13544 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
13545 ; GFX12-WGP-NEXT: s_endpgm
13547 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
13548 ; GFX12-CU: ; %bb.0: ; %entry
13549 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13550 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13551 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13552 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
13553 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
13554 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
13555 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13556 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
13557 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
13558 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
13559 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13560 ; GFX12-CU-NEXT: s_endpgm
; Kernel arguments: destination base pointer, replacement value, expected value.
13561 ptr %out, i32 %in, i32 %old) {
; Element index 4 of an i32 array at %out, i.e. 16 bytes past the base.
13563 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare the word at %gep with %old and, on a match,
; store %in.
13564 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
; Kernel under test - flat_workgroup_one_as_monotonic_acquire_cmpxchg
;
; IR body: a volatile cmpxchg on the generic (flat) pointer %out advanced by
; four i32 elements (16 bytes), with syncscope("workgroup-one-as"), success
; ordering monotonic and failure ordering acquire. %old is the compare value
; and %in is the new value.
;
; What the check lines below expect to follow the atomic on each run line:
;   * gfx10 and gfx11 without cumode -> s_waitcnt_vscnt null, 0x0 and then
;     buffer_gl0_inv
;   * gfx12 without cumode -> the atomic carries scope:SCOPE_SE and is
;     followed by s_wait_storecnt 0x0 and global_inv scope:SCOPE_SE
;   * gfx90a with tgsplit -> s_waitcnt vmcnt(0) and then buffer_wbinvl1_vol
;   * gfx940 with tgsplit -> s_waitcnt vmcnt(0) and then buffer_inv sc0
;   * every other run line -> s_endpgm directly after the atomic
; NOTE(review): the extra wait/invalidate pairs are presumably the lowering of
; the acquire failure ordering - confirm against the AMDGPU memory model docs.
;
; gfx7, gfx10 and the amdpal run materialise the 16-byte offset with scalar
; adds; the gfx90a and later expectations fold it into "offset:16".
;
; The assertions are autogenerated (see the first line of this file); refresh
; them with utils/update_llc_test_checks.py instead of editing them by hand.
13568 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
13569 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13570 ; GFX7: ; %bb.0: ; %entry
13571 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
13572 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13573 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
13574 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
13575 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
13576 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
13577 ; GFX7-NEXT: s_mov_b32 s4, s8
13578 ; GFX7-NEXT: s_mov_b32 s5, s9
13579 ; GFX7-NEXT: s_mov_b32 s9, s10
13580 ; GFX7-NEXT: s_mov_b32 s8, s11
13581 ; GFX7-NEXT: s_add_u32 s4, s4, s9
13582 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
13583 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13584 ; GFX7-NEXT: s_mov_b32 s5, s8
13585 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
13586 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
13587 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13588 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
13589 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
13590 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
13591 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13592 ; GFX7-NEXT: s_endpgm
13594 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13595 ; GFX10-WGP: ; %bb.0: ; %entry
13596 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
13597 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13598 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
13599 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
13600 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
13601 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
13602 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
13603 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
13604 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
13605 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
13606 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
13607 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
13608 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13609 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
13610 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
13611 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
13612 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13613 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
13614 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
13615 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
13616 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13617 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13618 ; GFX10-WGP-NEXT: buffer_gl0_inv
13619 ; GFX10-WGP-NEXT: s_endpgm
13621 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13622 ; GFX10-CU: ; %bb.0: ; %entry
13623 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
13624 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13625 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
13626 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
13627 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
13628 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
13629 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
13630 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
13631 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
13632 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
13633 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
13634 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
13635 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13636 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
13637 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
13638 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
13639 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13640 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
13641 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
13642 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
13643 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13644 ; GFX10-CU-NEXT: s_endpgm
13646 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13647 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
13648 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
13649 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
13650 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
13651 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
13652 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
13653 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
13654 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
13655 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
13656 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
13657 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
13658 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
13659 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
13660 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13661 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
13662 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
13663 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
13664 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13665 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
13666 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
13667 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
13668 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13669 ; SKIP-CACHE-INV-NEXT: s_endpgm
13671 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13672 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
13673 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13674 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13675 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13676 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13677 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13678 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13679 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13680 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13681 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13682 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13683 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
13685 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13686 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
13687 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13688 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13689 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13690 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13691 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13692 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13693 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13694 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13695 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13696 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13697 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13698 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
13699 ; GFX90A-TGSPLIT-NEXT: s_endpgm
13701 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13702 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
13703 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13704 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13705 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13706 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13707 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13708 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13709 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13710 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13711 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13712 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13713 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
13715 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13716 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
13717 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13718 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13719 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13720 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13721 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13722 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13723 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13724 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13725 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13726 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13727 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13728 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
13729 ; GFX940-TGSPLIT-NEXT: s_endpgm
13731 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13732 ; GFX11-WGP: ; %bb.0: ; %entry
13733 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13734 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13735 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13736 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
13737 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
13738 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
13739 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13740 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
13741 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
13742 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
13743 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13744 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13745 ; GFX11-WGP-NEXT: buffer_gl0_inv
13746 ; GFX11-WGP-NEXT: s_endpgm
13748 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13749 ; GFX11-CU: ; %bb.0: ; %entry
13750 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13751 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13752 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13753 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
13754 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
13755 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
13756 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13757 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
13758 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
13759 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
13760 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13761 ; GFX11-CU-NEXT: s_endpgm
13763 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13764 ; GFX12-WGP: ; %bb.0: ; %entry
13765 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13766 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13767 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13768 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
13769 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
13770 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
13771 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13772 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
13773 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
13774 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
13775 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
13776 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
13777 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
13778 ; GFX12-WGP-NEXT: s_endpgm
13780 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
13781 ; GFX12-CU: ; %bb.0: ; %entry
13782 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13783 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13784 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13785 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
13786 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
13787 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
13788 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13789 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
13790 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
13791 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
13792 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13793 ; GFX12-CU-NEXT: s_endpgm
13794 ptr %out, i32 %in, i32 %old) {
13796 %gep = getelementptr i32, ptr %out, i32 4
13797 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
; Kernel under test - flat_workgroup_one_as_acquire_acquire_cmpxchg
;
; IR body: a volatile cmpxchg on the generic (flat) pointer %out advanced by
; four i32 elements (16 bytes), with syncscope("workgroup-one-as") and acquire
; for both the success and the failure ordering. %old is the compare value and
; %in is the new value.
;
; What the check lines below expect around the atomic on each run line:
;   * no run line expects a wait between the address/data setup and the atomic
;   * gfx10 and gfx11 without cumode -> the atomic is followed by
;     s_waitcnt_vscnt null, 0x0 and buffer_gl0_inv
;   * gfx12 without cumode -> the atomic carries scope:SCOPE_SE and is
;     followed by s_wait_storecnt 0x0 and global_inv scope:SCOPE_SE
;   * gfx90a with tgsplit -> s_waitcnt vmcnt(0) and then buffer_wbinvl1_vol
;   * gfx940 with tgsplit -> s_waitcnt vmcnt(0) and then buffer_inv sc0
;   * every other run line -> s_endpgm directly after the atomic
; NOTE(review): the trailing wait/invalidate pairs are presumably how the
; acquire ordering is lowered at this scope - confirm against the AMDGPU
; memory model docs.
;
; gfx7, gfx10 and the amdpal run materialise the 16-byte offset with scalar
; adds; the gfx90a and later expectations fold it into "offset:16".
;
; The assertions are autogenerated (see the first line of this file); refresh
; them with utils/update_llc_test_checks.py instead of editing them by hand.
13801 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
13802 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13803 ; GFX7: ; %bb.0: ; %entry
13804 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
13805 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13806 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
13807 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
13808 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
13809 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
13810 ; GFX7-NEXT: s_mov_b32 s4, s8
13811 ; GFX7-NEXT: s_mov_b32 s5, s9
13812 ; GFX7-NEXT: s_mov_b32 s9, s10
13813 ; GFX7-NEXT: s_mov_b32 s8, s11
13814 ; GFX7-NEXT: s_add_u32 s4, s4, s9
13815 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
13816 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13817 ; GFX7-NEXT: s_mov_b32 s5, s8
13818 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
13819 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
13820 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13821 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
13822 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
13823 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
13824 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13825 ; GFX7-NEXT: s_endpgm
13827 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13828 ; GFX10-WGP: ; %bb.0: ; %entry
13829 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
13830 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13831 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
13832 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
13833 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
13834 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
13835 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
13836 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
13837 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
13838 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
13839 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
13840 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
13841 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13842 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
13843 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
13844 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
13845 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13846 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
13847 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
13848 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
13849 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13850 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13851 ; GFX10-WGP-NEXT: buffer_gl0_inv
13852 ; GFX10-WGP-NEXT: s_endpgm
13854 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13855 ; GFX10-CU: ; %bb.0: ; %entry
13856 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
13857 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
13858 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
13859 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
13860 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
13861 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
13862 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
13863 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
13864 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
13865 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
13866 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
13867 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
13868 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
13869 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
13870 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
13871 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
13872 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13873 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
13874 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
13875 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
13876 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13877 ; GFX10-CU-NEXT: s_endpgm
13879 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13880 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
13881 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
13882 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
13883 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
13884 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
13885 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
13886 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
13887 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
13888 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
13889 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
13890 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
13891 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
13892 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
13893 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
13894 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
13895 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
13896 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
13897 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13898 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
13899 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
13900 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
13901 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
13902 ; SKIP-CACHE-INV-NEXT: s_endpgm
13904 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13905 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
13906 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13907 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13908 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13909 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13910 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13911 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13912 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13913 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13914 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13915 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13916 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
13918 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13919 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
13920 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
13921 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
13922 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
13923 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13924 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
13925 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
13926 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13927 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13928 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
13929 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13930 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13931 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
13932 ; GFX90A-TGSPLIT-NEXT: s_endpgm
13934 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13935 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
13936 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13937 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13938 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13939 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13940 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13941 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13942 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13943 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13944 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13945 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13946 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
13948 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13949 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
13950 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
13951 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
13952 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
13953 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
13954 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
13955 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
13956 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13957 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
13958 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
13959 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
13960 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
13961 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
13962 ; GFX940-TGSPLIT-NEXT: s_endpgm
13964 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13965 ; GFX11-WGP: ; %bb.0: ; %entry
13966 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13967 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
13968 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
13969 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
13970 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
13971 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
13972 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13973 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
13974 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
13975 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
13976 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13977 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
13978 ; GFX11-WGP-NEXT: buffer_gl0_inv
13979 ; GFX11-WGP-NEXT: s_endpgm
13981 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13982 ; GFX11-CU: ; %bb.0: ; %entry
13983 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13984 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
13985 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
13986 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
13987 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
13988 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
13989 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
13990 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
13991 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
13992 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
13993 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
13994 ; GFX11-CU-NEXT: s_endpgm
13996 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
13997 ; GFX12-WGP: ; %bb.0: ; %entry
13998 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
13999 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14000 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14001 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
14002 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
14003 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
14004 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14005 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
14006 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
14007 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
14008 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14009 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14010 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
14011 ; GFX12-WGP-NEXT: s_endpgm
14013 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
14014 ; GFX12-CU: ; %bb.0: ; %entry
14015 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14016 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14017 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14018 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
14019 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
14020 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
14021 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14022 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
14023 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
14024 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
14025 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14026 ; GFX12-CU-NEXT: s_endpgm
14027 ptr %out, i32 %in, i32 %old) {
14029 %gep = getelementptr i32, ptr %out, i32 4
14030 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
14034 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
14035 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14036 ; GFX7: ; %bb.0: ; %entry
14037 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
14038 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14039 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
14040 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
14041 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
14042 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
14043 ; GFX7-NEXT: s_mov_b32 s4, s8
14044 ; GFX7-NEXT: s_mov_b32 s5, s9
14045 ; GFX7-NEXT: s_mov_b32 s9, s10
14046 ; GFX7-NEXT: s_mov_b32 s8, s11
14047 ; GFX7-NEXT: s_add_u32 s4, s4, s9
14048 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
14049 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14050 ; GFX7-NEXT: s_mov_b32 s5, s8
14051 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
14052 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
14053 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14054 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
14055 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
14056 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
14057 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14058 ; GFX7-NEXT: s_endpgm
14060 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14061 ; GFX10-WGP: ; %bb.0: ; %entry
14062 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
14063 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14064 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
14065 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
14066 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
14067 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
14068 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
14069 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
14070 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
14071 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
14072 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
14073 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
14074 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14075 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
14076 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
14077 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
14078 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14079 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
14080 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
14081 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
14082 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
14083 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14084 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14085 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14086 ; GFX10-WGP-NEXT: buffer_gl0_inv
14087 ; GFX10-WGP-NEXT: s_endpgm
14089 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14090 ; GFX10-CU: ; %bb.0: ; %entry
14091 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
14092 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14093 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
14094 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
14095 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
14096 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
14097 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
14098 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
14099 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
14100 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
14101 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
14102 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
14103 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14104 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
14105 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
14106 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
14107 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14108 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
14109 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
14110 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
14111 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14112 ; GFX10-CU-NEXT: s_endpgm
14114 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14115 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
14116 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
14117 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
14118 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
14119 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
14120 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
14121 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
14122 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
14123 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
14124 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
14125 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
14126 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
14127 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
14128 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14129 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
14130 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
14131 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
14132 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14133 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
14134 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
14135 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
14136 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14137 ; SKIP-CACHE-INV-NEXT: s_endpgm
14139 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14140 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
14141 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14142 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14143 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14144 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14145 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14146 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14147 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14148 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14149 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14150 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14151 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
14153 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14154 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
14155 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14156 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14157 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14158 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14159 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14160 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14161 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14162 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14163 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14164 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14165 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14166 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14167 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
14168 ; GFX90A-TGSPLIT-NEXT: s_endpgm
14170 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14171 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
14172 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14173 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14174 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14175 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14176 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14177 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14178 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14179 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14180 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14181 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14182 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
14184 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14185 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
14186 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14187 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14188 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14189 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14190 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14191 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14192 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14193 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14194 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14195 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14196 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14197 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14198 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
14199 ; GFX940-TGSPLIT-NEXT: s_endpgm
14201 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14202 ; GFX11-WGP: ; %bb.0: ; %entry
14203 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14204 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14205 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14206 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
14207 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
14208 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
14209 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14210 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
14211 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
14212 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
14213 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
14214 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14215 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14216 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14217 ; GFX11-WGP-NEXT: buffer_gl0_inv
14218 ; GFX11-WGP-NEXT: s_endpgm
14220 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14221 ; GFX11-CU: ; %bb.0: ; %entry
14222 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14223 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14224 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14225 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
14226 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
14227 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
14228 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14229 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
14230 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
14231 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
14232 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14233 ; GFX11-CU-NEXT: s_endpgm
14235 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14236 ; GFX12-WGP: ; %bb.0: ; %entry
14237 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14238 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14239 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14240 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
14241 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
14242 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
14243 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14244 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
14245 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
14246 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
14247 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
14248 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
14249 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
14250 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14251 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14252 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14253 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
14254 ; GFX12-WGP-NEXT: s_endpgm
14256 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
14257 ; GFX12-CU: ; %bb.0: ; %entry
14258 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14259 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14260 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14261 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
14262 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
14263 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
14264 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14265 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
14266 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
14267 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
14268 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14269 ; GFX12-CU-NEXT: s_endpgm
14270 ptr %out, i32 %in, i32 %old) {
14272 %gep = getelementptr i32, ptr %out, i32 4
14273 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
; Codegen test: volatile cmpxchg on an i32 through a flat pointer with
; syncscope("workgroup-one-as"), success ordering acq_rel and failure
; ordering acquire. The pointer operand is %out advanced by four i32
; elements, which the expected output shows as a 16-byte offset.
;
; The assertion comments inside the definition are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead
; of editing them by hand. As recorded below, the -WGP and -TGSPLIT run
; configurations expect counter waits around the atomic followed by a cache
; invalidate, while the remaining configurations (including -CU and
; -NOTTGSPLIT) expect the bare atomic instruction.
14277 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
14278 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14279 ; GFX7: ; %bb.0: ; %entry
14280 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
14281 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14282 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
14283 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
14284 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
14285 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
14286 ; GFX7-NEXT: s_mov_b32 s4, s8
14287 ; GFX7-NEXT: s_mov_b32 s5, s9
14288 ; GFX7-NEXT: s_mov_b32 s9, s10
14289 ; GFX7-NEXT: s_mov_b32 s8, s11
14290 ; GFX7-NEXT: s_add_u32 s4, s4, s9
14291 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
14292 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14293 ; GFX7-NEXT: s_mov_b32 s5, s8
14294 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
14295 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
14296 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14297 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
14298 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
14299 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
14300 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14301 ; GFX7-NEXT: s_endpgm
14303 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14304 ; GFX10-WGP: ; %bb.0: ; %entry
14305 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
14306 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14307 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
14308 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
14309 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
14310 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
14311 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
14312 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
14313 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
14314 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
14315 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
14316 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
14317 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14318 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
14319 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
14320 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
14321 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14322 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
14323 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
14324 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
14325 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
14326 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14327 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14328 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14329 ; GFX10-WGP-NEXT: buffer_gl0_inv
14330 ; GFX10-WGP-NEXT: s_endpgm
14332 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14333 ; GFX10-CU: ; %bb.0: ; %entry
14334 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
14335 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14336 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
14337 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
14338 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
14339 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
14340 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
14341 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
14342 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
14343 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
14344 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
14345 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
14346 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14347 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
14348 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
14349 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
14350 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14351 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
14352 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
14353 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
14354 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14355 ; GFX10-CU-NEXT: s_endpgm
14357 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14358 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
14359 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
14360 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
14361 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
14362 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
14363 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
14364 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
14365 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
14366 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
14367 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
14368 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
14369 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
14370 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
14371 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14372 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
14373 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
14374 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
14375 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14376 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
14377 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
14378 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
14379 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14380 ; SKIP-CACHE-INV-NEXT: s_endpgm
14382 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14383 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
14384 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14385 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14386 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14387 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14388 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14389 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14390 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14391 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14392 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14393 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14394 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
14396 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14397 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
14398 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14399 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14400 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14401 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14402 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14403 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14404 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14405 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14406 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14407 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14408 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14409 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14410 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
14411 ; GFX90A-TGSPLIT-NEXT: s_endpgm
14413 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14414 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
14415 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14416 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14417 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14418 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14419 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14420 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14421 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14422 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14423 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14424 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14425 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
14427 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14428 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
14429 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14430 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14431 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14432 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14433 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14434 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14435 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14436 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14437 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14438 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14439 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14440 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14441 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
14442 ; GFX940-TGSPLIT-NEXT: s_endpgm
14444 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14445 ; GFX11-WGP: ; %bb.0: ; %entry
14446 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14447 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14448 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14449 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
14450 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
14451 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
14452 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14453 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
14454 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
14455 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
14456 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
14457 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14458 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14459 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14460 ; GFX11-WGP-NEXT: buffer_gl0_inv
14461 ; GFX11-WGP-NEXT: s_endpgm
14463 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14464 ; GFX11-CU: ; %bb.0: ; %entry
14465 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14466 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14467 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14468 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
14469 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
14470 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
14471 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14472 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
14473 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
14474 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
14475 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14476 ; GFX11-CU-NEXT: s_endpgm
14478 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14479 ; GFX12-WGP: ; %bb.0: ; %entry
14480 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14481 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14482 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14483 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
14484 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
14485 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
14486 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14487 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
14488 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
14489 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
14490 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
14491 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
14492 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
14493 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14494 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14495 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14496 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
14497 ; GFX12-WGP-NEXT: s_endpgm
14499 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
14500 ; GFX12-CU: ; %bb.0: ; %entry
14501 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14502 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14503 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14504 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
14505 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
14506 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
14507 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14508 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
14509 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
14510 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
14511 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14512 ; GFX12-CU-NEXT: s_endpgm
14513 ptr %out, i32 %in, i32 %old) {
; Step to i32 element index 4 of %out so the atomic is issued at a non-zero offset.
14515 %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is not used by any line shown here.
14516 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
; Codegen test: volatile cmpxchg on an i32 through a flat pointer with
; syncscope("workgroup-one-as"), success ordering seq_cst and failure
; ordering acquire. The pointer operand is %out advanced by four i32
; elements, which the expected output shows as a 16-byte offset.
;
; The assertion comments inside the definition are autogenerated by
; utils/update_llc_test_checks.py; regenerate them with that script instead
; of editing them by hand. As recorded below, the -WGP and -TGSPLIT run
; configurations expect counter waits around the atomic followed by a cache
; invalidate, while the remaining configurations (including -CU and
; -NOTTGSPLIT) expect the bare atomic instruction.
14520 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
14521 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14522 ; GFX7: ; %bb.0: ; %entry
14523 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
14524 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14525 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
14526 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
14527 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
14528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
14529 ; GFX7-NEXT: s_mov_b32 s4, s8
14530 ; GFX7-NEXT: s_mov_b32 s5, s9
14531 ; GFX7-NEXT: s_mov_b32 s9, s10
14532 ; GFX7-NEXT: s_mov_b32 s8, s11
14533 ; GFX7-NEXT: s_add_u32 s4, s4, s9
14534 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
14535 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14536 ; GFX7-NEXT: s_mov_b32 s5, s8
14537 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
14538 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
14539 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14540 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
14541 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
14542 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
14543 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14544 ; GFX7-NEXT: s_endpgm
14546 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14547 ; GFX10-WGP: ; %bb.0: ; %entry
14548 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
14549 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14550 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
14551 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
14552 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
14553 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
14554 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
14555 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
14556 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
14557 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
14558 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
14559 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
14560 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14561 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
14562 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
14563 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
14564 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14565 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
14566 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
14567 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
14568 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
14569 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14570 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14571 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14572 ; GFX10-WGP-NEXT: buffer_gl0_inv
14573 ; GFX10-WGP-NEXT: s_endpgm
14575 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14576 ; GFX10-CU: ; %bb.0: ; %entry
14577 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
14578 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14579 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
14580 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
14581 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
14582 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
14583 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
14584 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
14585 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
14586 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
14587 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
14588 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
14589 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14590 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
14591 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
14592 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
14593 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14594 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
14595 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
14596 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
14597 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14598 ; GFX10-CU-NEXT: s_endpgm
14600 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14601 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
14602 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
14603 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
14604 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
14605 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
14606 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
14607 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
14608 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
14609 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
14610 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
14611 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
14612 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
14613 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
14614 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14615 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
14616 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
14617 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
14618 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14619 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
14620 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
14621 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
14622 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14623 ; SKIP-CACHE-INV-NEXT: s_endpgm
14625 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14626 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
14627 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14628 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14629 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14630 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14631 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14632 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14633 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14634 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14635 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14636 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14637 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
14639 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14640 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
14641 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14642 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14643 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14644 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14645 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14646 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14647 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14648 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14649 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14650 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14651 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14652 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14653 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
14654 ; GFX90A-TGSPLIT-NEXT: s_endpgm
14656 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14657 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
14658 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14659 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14660 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14661 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14662 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14663 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14664 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14665 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14666 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14667 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14668 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
14670 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14671 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
14672 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14673 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14674 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14675 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14676 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14677 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14678 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14679 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14680 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14681 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14682 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14683 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14684 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
14685 ; GFX940-TGSPLIT-NEXT: s_endpgm
14687 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14688 ; GFX11-WGP: ; %bb.0: ; %entry
14689 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14690 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14691 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14692 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
14693 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
14694 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
14695 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14696 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
14697 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
14698 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
14699 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
14700 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14701 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14702 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14703 ; GFX11-WGP-NEXT: buffer_gl0_inv
14704 ; GFX11-WGP-NEXT: s_endpgm
14706 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14707 ; GFX11-CU: ; %bb.0: ; %entry
14708 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14709 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14710 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14711 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
14712 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
14713 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
14714 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14715 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
14716 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
14717 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
14718 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14719 ; GFX11-CU-NEXT: s_endpgm
14721 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14722 ; GFX12-WGP: ; %bb.0: ; %entry
14723 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14724 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14725 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14726 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
14727 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
14728 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
14729 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14730 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
14731 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
14732 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
14733 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
14734 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
14735 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
14736 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14737 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14738 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14739 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
14740 ; GFX12-WGP-NEXT: s_endpgm
14742 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
14743 ; GFX12-CU: ; %bb.0: ; %entry
14744 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14745 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14746 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14747 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
14748 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
14749 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
14750 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14751 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
14752 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
14753 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
14754 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14755 ; GFX12-CU-NEXT: s_endpgm
14756 ptr %out, i32 %in, i32 %old) {
; Step to i32 element index 4 of %out so the atomic is issued at a non-zero offset.
14758 %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and store %in on a match; %val is not used by any line shown here.
14759 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
14763 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
14764 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14765 ; GFX7: ; %bb.0: ; %entry
14766 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
14767 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14768 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
14769 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
14770 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
14771 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
14772 ; GFX7-NEXT: s_mov_b32 s4, s8
14773 ; GFX7-NEXT: s_mov_b32 s5, s9
14774 ; GFX7-NEXT: s_mov_b32 s9, s10
14775 ; GFX7-NEXT: s_mov_b32 s8, s11
14776 ; GFX7-NEXT: s_add_u32 s4, s4, s9
14777 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
14778 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14779 ; GFX7-NEXT: s_mov_b32 s5, s8
14780 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
14781 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
14782 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14783 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
14784 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
14785 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
14786 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14787 ; GFX7-NEXT: s_endpgm
14789 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14790 ; GFX10-WGP: ; %bb.0: ; %entry
14791 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
14792 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14793 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
14794 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
14795 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
14796 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
14797 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
14798 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
14799 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
14800 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
14801 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
14802 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
14803 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14804 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
14805 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
14806 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
14807 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14808 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
14809 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
14810 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
14811 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
14812 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14813 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14814 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14815 ; GFX10-WGP-NEXT: buffer_gl0_inv
14816 ; GFX10-WGP-NEXT: s_endpgm
14818 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14819 ; GFX10-CU: ; %bb.0: ; %entry
14820 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
14821 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
14822 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
14823 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
14824 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
14825 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
14826 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
14827 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
14828 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
14829 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
14830 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
14831 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
14832 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
14833 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
14834 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
14835 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
14836 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14837 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
14838 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
14839 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
14840 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14841 ; GFX10-CU-NEXT: s_endpgm
14843 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14844 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
14845 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
14846 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
14847 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
14848 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
14849 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
14850 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
14851 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
14852 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
14853 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
14854 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
14855 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
14856 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
14857 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
14858 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
14859 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
14860 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
14861 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14862 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
14863 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
14864 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
14865 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
14866 ; SKIP-CACHE-INV-NEXT: s_endpgm
14868 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14869 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
14870 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14871 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14872 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14873 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14874 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14875 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14876 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14877 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14878 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14879 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14880 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
14882 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14883 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
14884 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
14885 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
14886 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
14887 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14888 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
14889 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
14890 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14891 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14892 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
14893 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14894 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14895 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14896 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
14897 ; GFX90A-TGSPLIT-NEXT: s_endpgm
14899 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14900 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
14901 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14902 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14903 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14904 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14905 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14906 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14907 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14908 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14909 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14910 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14911 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
14913 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14914 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
14915 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
14916 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
14917 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
14918 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
14919 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
14920 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
14921 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14922 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
14923 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
14924 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14925 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
14926 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
14927 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
14928 ; GFX940-TGSPLIT-NEXT: s_endpgm
14930 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14931 ; GFX11-WGP: ; %bb.0: ; %entry
14932 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14933 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14934 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14935 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
14936 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
14937 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
14938 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14939 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
14940 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
14941 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
14942 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
14943 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14944 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14945 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
14946 ; GFX11-WGP-NEXT: buffer_gl0_inv
14947 ; GFX11-WGP-NEXT: s_endpgm
14949 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14950 ; GFX11-CU: ; %bb.0: ; %entry
14951 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14952 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14953 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14954 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
14955 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
14956 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
14957 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14958 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
14959 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
14960 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
14961 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14962 ; GFX11-CU-NEXT: s_endpgm
14964 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14965 ; GFX12-WGP: ; %bb.0: ; %entry
14966 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14967 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
14968 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
14969 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
14970 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
14971 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
14972 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14973 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
14974 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
14975 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
14976 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
14977 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
14978 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
14979 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14980 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
14981 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
14982 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
14983 ; GFX12-WGP-NEXT: s_endpgm
14985 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
14986 ; GFX12-CU: ; %bb.0: ; %entry
14987 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
14988 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
14989 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
14990 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
14991 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
14992 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
14993 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
14994 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
14995 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
14996 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
14997 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
14998 ; GFX12-CU-NEXT: s_endpgm
14999 ptr %out, i32 %in, i32 %old) {
15001 %gep = getelementptr i32, ptr %out, i32 4
15002 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
; Test kernel: a volatile cmpxchg through a plain `ptr` (the expected code uses
; flat_atomic_cmpswap) with syncscope("workgroup-one-as"), success ordering
; acquire and failure ordering seq_cst. The access targets %out advanced by
; four i32 elements; in the expected output this appears either as an
; "offset:16" operand on the atomic or as an explicit 64-bit add of 16 to the
; base pointer.
;
; Each group of assertions below is keyed by the FileCheck prefix of one RUN
; line at the top of the file, so every target/feature combination is checked
; separately. The assertions are autogenerated (see the NOTE on the first line
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
15006 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; -- gfx700 (amdhsa): the 16-byte offset is added to the pointer with
;    s_add_u32/s_addc_u32; the atomic is expected with no surrounding waits.
15007 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15008 ; GFX7: ; %bb.0: ; %entry
15009 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
15010 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15011 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
15012 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
15013 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
15014 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15015 ; GFX7-NEXT: s_mov_b32 s4, s8
15016 ; GFX7-NEXT: s_mov_b32 s5, s9
15017 ; GFX7-NEXT: s_mov_b32 s9, s10
15018 ; GFX7-NEXT: s_mov_b32 s8, s11
15019 ; GFX7-NEXT: s_add_u32 s4, s4, s9
15020 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
15021 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15022 ; GFX7-NEXT: s_mov_b32 s5, s8
15023 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
15024 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
15025 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15026 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
15027 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
15028 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
15029 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15030 ; GFX7-NEXT: s_endpgm
; -- gfx1010, default mode: vmcnt and vscnt waits precede the atomic; a vscnt
;    wait and buffer_gl0_inv follow it.
15032 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15033 ; GFX10-WGP: ; %bb.0: ; %entry
15034 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
15035 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15036 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
15037 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
15038 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
15039 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
15040 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
15041 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
15042 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
15043 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
15044 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
15045 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
15046 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15047 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
15048 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
15049 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
15050 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15051 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
15052 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
15053 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
15054 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
15055 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15056 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15057 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15058 ; GFX10-WGP-NEXT: buffer_gl0_inv
15059 ; GFX10-WGP-NEXT: s_endpgm
; -- gfx1010 with +cumode: same address computation; the atomic is expected
;    with no surrounding waits or invalidate.
15061 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15062 ; GFX10-CU: ; %bb.0: ; %entry
15063 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
15064 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15065 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
15066 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
15067 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
15068 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
15069 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
15070 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
15071 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
15072 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
15073 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
15074 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
15075 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15076 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
15077 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
15078 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
15079 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15080 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
15081 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
15082 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
15083 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15084 ; GFX10-CU-NEXT: s_endpgm
; -- gfx700 (amdpal) with -amdgcn-skip-cache-invalidations: the atomic is
;    expected with no surrounding waits.
15086 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15087 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
15088 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
15089 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
15090 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
15091 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
15092 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
15093 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
15094 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
15095 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
15096 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
15097 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
15098 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
15099 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
15100 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15101 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
15102 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
15103 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
15104 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15105 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
15106 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
15107 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
15108 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15109 ; SKIP-CACHE-INV-NEXT: s_endpgm
; -- gfx90a: the offset is folded into the atomic (offset:16); no surrounding
;    waits are expected.
15111 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15112 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
15113 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15114 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15115 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15116 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15117 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15118 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15119 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15120 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15121 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15122 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15123 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
; -- gfx90a with +tgsplit: vmcnt(0) waits are expected before and after the
;    atomic, followed by buffer_wbinvl1_vol.
15125 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15126 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
15127 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15128 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15129 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15130 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15131 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15132 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15133 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15134 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15135 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15136 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15137 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15138 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15139 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
15140 ; GFX90A-TGSPLIT-NEXT: s_endpgm
; -- gfx940: offset:16 on the atomic; no surrounding waits are expected.
15142 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15143 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
15144 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15145 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15146 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15147 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15148 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15149 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15150 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15151 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15152 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15153 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15154 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
; -- gfx940 with +tgsplit: vmcnt(0) waits are expected before and after the
;    atomic, followed by buffer_inv sc0.
15156 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15157 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
15158 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15159 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15160 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15161 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15162 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15163 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15164 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15165 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15166 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15167 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15168 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15169 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15170 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
15171 ; GFX940-TGSPLIT-NEXT: s_endpgm
; -- gfx1100, default mode: vmcnt and vscnt waits precede the atomic
;    (offset:16); a vscnt wait and buffer_gl0_inv follow it.
15173 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15174 ; GFX11-WGP: ; %bb.0: ; %entry
15175 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15176 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15177 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15178 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
15179 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
15180 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
15181 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15182 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
15183 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
15184 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
15185 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
15186 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15187 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15188 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15189 ; GFX11-WGP-NEXT: buffer_gl0_inv
15190 ; GFX11-WGP-NEXT: s_endpgm
; -- gfx1100 with +cumode: atomic with offset:16 and no surrounding waits.
15192 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15193 ; GFX11-CU: ; %bb.0: ; %entry
15194 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15195 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15196 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15197 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
15198 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
15199 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
15200 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15201 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
15202 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
15203 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
15204 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15205 ; GFX11-CU-NEXT: s_endpgm
; -- gfx1200, default mode: bvhcnt/samplecnt/loadcnt/storecnt waits precede the
;    atomic, which carries scope:SCOPE_SE; a storecnt wait and
;    global_inv scope:SCOPE_SE follow it.
15207 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15208 ; GFX12-WGP: ; %bb.0: ; %entry
15209 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15210 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15211 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15212 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
15213 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
15214 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
15215 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15216 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
15217 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
15218 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
15219 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
15220 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
15221 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
15222 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15223 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15224 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15225 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
15226 ; GFX12-WGP-NEXT: s_endpgm
; -- gfx1200 with +cumode: atomic with offset:16, no scope operand and no
;    surrounding waits.
15228 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
15229 ; GFX12-CU: ; %bb.0: ; %entry
15230 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15231 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15232 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15233 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
15234 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
15235 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
15236 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15237 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
15238 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
15239 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
15240 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15241 ; GFX12-CU-NEXT: s_endpgm
15242 ptr %out, i32 %in, i32 %old) {
; Address the i32 element at index 4 from %out (the 16-byte offset seen in the
; expected output above).
15244 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare against %old, write %in on a match; success
; ordering acquire, failure ordering seq_cst, scope "workgroup-one-as".
15245 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
; Test kernel: a volatile cmpxchg through a plain `ptr` (the expected code uses
; flat_atomic_cmpswap) with syncscope("workgroup-one-as"), success ordering
; release and failure ordering seq_cst. The access targets %out advanced by
; four i32 elements; in the expected output this appears either as an
; "offset:16" operand on the atomic or as an explicit 64-bit add of 16 to the
; base pointer.
;
; Each group of assertions below is keyed by the FileCheck prefix of one RUN
; line at the top of the file, so every target/feature combination is checked
; separately. The assertions are autogenerated (see the NOTE on the first line
; of the file); regenerate them with utils/update_llc_test_checks.py instead of
; editing them by hand.
15249 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; -- gfx700 (amdhsa): the 16-byte offset is added to the pointer with
;    s_add_u32/s_addc_u32; the atomic is expected with no surrounding waits.
15250 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15251 ; GFX7: ; %bb.0: ; %entry
15252 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
15253 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15254 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
15255 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
15256 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
15257 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15258 ; GFX7-NEXT: s_mov_b32 s4, s8
15259 ; GFX7-NEXT: s_mov_b32 s5, s9
15260 ; GFX7-NEXT: s_mov_b32 s9, s10
15261 ; GFX7-NEXT: s_mov_b32 s8, s11
15262 ; GFX7-NEXT: s_add_u32 s4, s4, s9
15263 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
15264 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15265 ; GFX7-NEXT: s_mov_b32 s5, s8
15266 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
15267 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
15268 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15269 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
15270 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
15271 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
15272 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15273 ; GFX7-NEXT: s_endpgm
; -- gfx1010, default mode: vmcnt and vscnt waits precede the atomic; a vscnt
;    wait and buffer_gl0_inv follow it.
15275 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15276 ; GFX10-WGP: ; %bb.0: ; %entry
15277 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
15278 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15279 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
15280 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
15281 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
15282 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
15283 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
15284 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
15285 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
15286 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
15287 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
15288 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
15289 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15290 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
15291 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
15292 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
15293 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15294 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
15295 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
15296 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
15297 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
15298 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15299 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15300 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15301 ; GFX10-WGP-NEXT: buffer_gl0_inv
15302 ; GFX10-WGP-NEXT: s_endpgm
; -- gfx1010 with +cumode: same address computation; the atomic is expected
;    with no surrounding waits or invalidate.
15304 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15305 ; GFX10-CU: ; %bb.0: ; %entry
15306 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
15307 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15308 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
15309 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
15310 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
15311 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
15312 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
15313 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
15314 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
15315 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
15316 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
15317 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
15318 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15319 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
15320 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
15321 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
15322 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15323 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
15324 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
15325 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
15326 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15327 ; GFX10-CU-NEXT: s_endpgm
; -- gfx700 (amdpal) with -amdgcn-skip-cache-invalidations: the atomic is
;    expected with no surrounding waits.
15329 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15330 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
15331 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
15332 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
15333 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
15334 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
15335 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
15336 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
15337 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
15338 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
15339 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
15340 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
15341 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
15342 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
15343 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15344 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
15345 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
15346 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
15347 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15348 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
15349 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
15350 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
15351 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15352 ; SKIP-CACHE-INV-NEXT: s_endpgm
; -- gfx90a: the offset is folded into the atomic (offset:16); no surrounding
;    waits are expected.
15354 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15355 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
15356 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15357 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15358 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15359 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15360 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15361 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15362 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15363 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15364 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15365 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15366 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
; -- gfx90a with +tgsplit: vmcnt(0) waits are expected before and after the
;    atomic, followed by buffer_wbinvl1_vol.
15368 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15369 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
15370 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15371 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15372 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15373 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15374 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15375 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15376 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15377 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15378 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15379 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15380 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15381 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15382 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
15383 ; GFX90A-TGSPLIT-NEXT: s_endpgm
; -- gfx940: offset:16 on the atomic; no surrounding waits are expected.
15385 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15386 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
15387 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15388 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15389 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15390 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15391 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15392 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15393 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15394 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15395 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15396 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15397 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
; -- gfx940 with +tgsplit: vmcnt(0) waits are expected before and after the
;    atomic, followed by buffer_inv sc0.
15399 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15400 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
15401 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15402 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15403 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15404 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15405 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15406 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15407 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15408 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15409 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15410 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15411 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15412 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15413 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
15414 ; GFX940-TGSPLIT-NEXT: s_endpgm
; -- gfx1100, default mode: vmcnt and vscnt waits precede the atomic
;    (offset:16); a vscnt wait and buffer_gl0_inv follow it.
15416 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15417 ; GFX11-WGP: ; %bb.0: ; %entry
15418 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15419 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15420 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15421 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
15422 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
15423 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
15424 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15425 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
15426 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
15427 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
15428 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
15429 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15430 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15431 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15432 ; GFX11-WGP-NEXT: buffer_gl0_inv
15433 ; GFX11-WGP-NEXT: s_endpgm
; -- gfx1100 with +cumode: atomic with offset:16 and no surrounding waits.
15435 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15436 ; GFX11-CU: ; %bb.0: ; %entry
15437 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15438 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15439 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15440 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
15441 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
15442 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
15443 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15444 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
15445 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
15446 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
15447 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15448 ; GFX11-CU-NEXT: s_endpgm
; -- gfx1200, default mode: bvhcnt/samplecnt/loadcnt/storecnt waits precede the
;    atomic, which carries scope:SCOPE_SE; a storecnt wait and
;    global_inv scope:SCOPE_SE follow it.
15450 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15451 ; GFX12-WGP: ; %bb.0: ; %entry
15452 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15453 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15454 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15455 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
15456 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
15457 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
15458 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15459 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
15460 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
15461 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
15462 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
15463 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
15464 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
15465 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15466 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15467 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15468 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
15469 ; GFX12-WGP-NEXT: s_endpgm
; -- gfx1200 with +cumode: atomic with offset:16, no scope operand and no
;    surrounding waits.
15471 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
15472 ; GFX12-CU: ; %bb.0: ; %entry
15473 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15474 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15475 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15476 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
15477 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
15478 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
15479 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15480 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
15481 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
15482 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
15483 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15484 ; GFX12-CU-NEXT: s_endpgm
15485 ptr %out, i32 %in, i32 %old) {
; Address the i32 element at index 4 from %out (the 16-byte offset seen in the
; expected output above).
15487 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare against %old, write %in on a match; success
; ordering release, failure ordering seq_cst, scope "workgroup-one-as".
15488 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
; Test: volatile cmpxchg through a flat pointer with syncscope("workgroup-one-as"),
; success ordering acq_rel and failure ordering seq_cst. None of the expected
; atomics below names a destination register, i.e. the no-return form is checked.
;
; Expected code shape per RUN configuration, as spelled out by the check lines:
;   - gfx700, the amdpal skip-cache-invalidations run, the +cumode runs and the
;     gfx90a/gfx940 runs without tgsplit expect a bare flat_atomic_cmpswap.
;   - the gfx1010/gfx1100 runs without +cumode expect s_waitcnt vmcnt(0) and
;     s_waitcnt_vscnt before the atomic, then s_waitcnt_vscnt and buffer_gl0_inv.
;   - the gfx1200 run without +cumode expects the four s_wait_*cnt instructions
;     before the atomic, scope:SCOPE_SE on the atomic itself, then
;     s_wait_storecnt and global_inv scope:SCOPE_SE.
;   - the +tgsplit runs expect s_waitcnt vmcnt(0) on both sides of the atomic,
;     followed by buffer_wbinvl1_vol (gfx90a) or buffer_inv sc0 (gfx940).
;
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); presumably they should be regenerated with that
; script rather than edited by hand.
15492 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
15493 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15494 ; GFX7: ; %bb.0: ; %entry
15495 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
15496 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15497 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
15498 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
15499 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
15500 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15501 ; GFX7-NEXT: s_mov_b32 s4, s8
15502 ; GFX7-NEXT: s_mov_b32 s5, s9
15503 ; GFX7-NEXT: s_mov_b32 s9, s10
15504 ; GFX7-NEXT: s_mov_b32 s8, s11
15505 ; GFX7-NEXT: s_add_u32 s4, s4, s9
15506 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
15507 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15508 ; GFX7-NEXT: s_mov_b32 s5, s8
15509 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
15510 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
15511 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15512 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
15513 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
15514 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
15515 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15516 ; GFX7-NEXT: s_endpgm
15518 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15519 ; GFX10-WGP: ; %bb.0: ; %entry
15520 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
15521 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15522 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
15523 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
15524 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
15525 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
15526 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
15527 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
15528 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
15529 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
15530 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
15531 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
15532 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15533 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
15534 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
15535 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
15536 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15537 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
15538 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
15539 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
15540 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
15541 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15542 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15543 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15544 ; GFX10-WGP-NEXT: buffer_gl0_inv
15545 ; GFX10-WGP-NEXT: s_endpgm
15547 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15548 ; GFX10-CU: ; %bb.0: ; %entry
15549 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
15550 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15551 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
15552 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
15553 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
15554 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
15555 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
15556 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
15557 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
15558 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
15559 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
15560 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
15561 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15562 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
15563 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
15564 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
15565 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15566 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
15567 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
15568 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
15569 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15570 ; GFX10-CU-NEXT: s_endpgm
15572 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15573 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
15574 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
15575 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
15576 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
15577 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
15578 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
15579 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
15580 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
15581 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
15582 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
15583 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
15584 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
15585 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
15586 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15587 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
15588 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
15589 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
15590 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15591 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
15592 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
15593 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
15594 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15595 ; SKIP-CACHE-INV-NEXT: s_endpgm
15597 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15598 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
15599 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15600 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15601 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15602 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15603 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15604 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15605 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15606 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15607 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15608 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15609 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
15611 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15612 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
15613 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15614 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15615 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15616 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15617 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15618 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15619 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15620 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15621 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15622 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15623 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15624 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15625 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
15626 ; GFX90A-TGSPLIT-NEXT: s_endpgm
15628 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15629 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
15630 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15631 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15632 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15633 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15634 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15635 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15636 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15637 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15638 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15639 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15640 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
15642 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15643 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
15644 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15645 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15646 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15647 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15648 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15649 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15650 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15651 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15652 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15653 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15654 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15655 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15656 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
15657 ; GFX940-TGSPLIT-NEXT: s_endpgm
15659 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15660 ; GFX11-WGP: ; %bb.0: ; %entry
15661 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15662 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15663 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15664 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
15665 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
15666 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
15667 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15668 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
15669 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
15670 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
15671 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
15672 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15673 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15674 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15675 ; GFX11-WGP-NEXT: buffer_gl0_inv
15676 ; GFX11-WGP-NEXT: s_endpgm
15678 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15679 ; GFX11-CU: ; %bb.0: ; %entry
15680 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15681 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15682 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15683 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
15684 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
15685 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
15686 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15687 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
15688 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
15689 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
15690 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15691 ; GFX11-CU-NEXT: s_endpgm
15693 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15694 ; GFX12-WGP: ; %bb.0: ; %entry
15695 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15696 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15697 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15698 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
15699 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
15700 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
15701 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15702 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
15703 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
15704 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
15705 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
15706 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
15707 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
15708 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15709 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15710 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15711 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
15712 ; GFX12-WGP-NEXT: s_endpgm
15714 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
15715 ; GFX12-CU: ; %bb.0: ; %entry
15716 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15717 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15718 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15719 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
15720 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
15721 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
15722 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15723 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
15724 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
15725 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
15726 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15727 ; GFX12-CU-NEXT: s_endpgm
15728 ptr %out, i32 %in, i32 %old) {
; i32 element index 4 is a 16-byte displacement; the checks above show it either
; folded into the atomic as offset:16 or added with s_add_u32/s_addc_u32 and the
; constant 16.
15730 %gep = getelementptr i32, ptr %out, i32 4
; Compare the word at %gep with %old and, on a match, replace it with %in.
15731 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
; Test: volatile cmpxchg through a flat pointer with syncscope("workgroup-one-as"),
; success and failure orderings both seq_cst. None of the expected atomics
; below names a destination register, i.e. the no-return form is checked.
;
; Expected code shape per RUN configuration, as spelled out by the check lines:
;   - gfx700, the amdpal skip-cache-invalidations run, the +cumode runs and the
;     gfx90a/gfx940 runs without tgsplit expect a bare flat_atomic_cmpswap.
;   - the gfx1010/gfx1100 runs without +cumode expect s_waitcnt vmcnt(0) and
;     s_waitcnt_vscnt before the atomic, then s_waitcnt_vscnt and buffer_gl0_inv.
;   - the gfx1200 run without +cumode expects the four s_wait_*cnt instructions
;     before the atomic, scope:SCOPE_SE on the atomic itself, then
;     s_wait_storecnt and global_inv scope:SCOPE_SE.
;   - the +tgsplit runs expect s_waitcnt vmcnt(0) on both sides of the atomic,
;     followed by buffer_wbinvl1_vol (gfx90a) or buffer_inv sc0 (gfx940).
;
; The assertions are autogenerated by utils/update_llc_test_checks.py (see the
; NOTE at the top of the file); presumably they should be regenerated with that
; script rather than edited by hand.
15735 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
15736 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15737 ; GFX7: ; %bb.0: ; %entry
15738 ; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
15739 ; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15740 ; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
15741 ; GFX7-NEXT: s_load_dword s6, s[4:5], 0x3
15742 ; GFX7-NEXT: s_mov_b64 s[10:11], 16
15743 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15744 ; GFX7-NEXT: s_mov_b32 s4, s8
15745 ; GFX7-NEXT: s_mov_b32 s5, s9
15746 ; GFX7-NEXT: s_mov_b32 s9, s10
15747 ; GFX7-NEXT: s_mov_b32 s8, s11
15748 ; GFX7-NEXT: s_add_u32 s4, s4, s9
15749 ; GFX7-NEXT: s_addc_u32 s8, s5, s8
15750 ; GFX7-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15751 ; GFX7-NEXT: s_mov_b32 s5, s8
15752 ; GFX7-NEXT: v_mov_b32_e32 v2, s7
15753 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
15754 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15755 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
15756 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
15757 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
15758 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15759 ; GFX7-NEXT: s_endpgm
15761 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15762 ; GFX10-WGP: ; %bb.0: ; %entry
15763 ; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
15764 ; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15765 ; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
15766 ; GFX10-WGP-NEXT: s_load_dword s6, s[4:5], 0xc
15767 ; GFX10-WGP-NEXT: s_mov_b64 s[10:11], 16
15768 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
15769 ; GFX10-WGP-NEXT: s_mov_b32 s4, s8
15770 ; GFX10-WGP-NEXT: s_mov_b32 s5, s9
15771 ; GFX10-WGP-NEXT: s_mov_b32 s9, s10
15772 ; GFX10-WGP-NEXT: s_mov_b32 s8, s11
15773 ; GFX10-WGP-NEXT: s_add_u32 s4, s4, s9
15774 ; GFX10-WGP-NEXT: s_addc_u32 s8, s5, s8
15775 ; GFX10-WGP-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15776 ; GFX10-WGP-NEXT: s_mov_b32 s5, s8
15777 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s7
15778 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
15779 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15780 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
15781 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
15782 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
15783 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
15784 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15785 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15786 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15787 ; GFX10-WGP-NEXT: buffer_gl0_inv
15788 ; GFX10-WGP-NEXT: s_endpgm
15790 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15791 ; GFX10-CU: ; %bb.0: ; %entry
15792 ; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
15793 ; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
15794 ; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
15795 ; GFX10-CU-NEXT: s_load_dword s6, s[4:5], 0xc
15796 ; GFX10-CU-NEXT: s_mov_b64 s[10:11], 16
15797 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
15798 ; GFX10-CU-NEXT: s_mov_b32 s4, s8
15799 ; GFX10-CU-NEXT: s_mov_b32 s5, s9
15800 ; GFX10-CU-NEXT: s_mov_b32 s9, s10
15801 ; GFX10-CU-NEXT: s_mov_b32 s8, s11
15802 ; GFX10-CU-NEXT: s_add_u32 s4, s4, s9
15803 ; GFX10-CU-NEXT: s_addc_u32 s8, s5, s8
15804 ; GFX10-CU-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5
15805 ; GFX10-CU-NEXT: s_mov_b32 s5, s8
15806 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s7
15807 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
15808 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15809 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
15810 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
15811 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
15812 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15813 ; GFX10-CU-NEXT: s_endpgm
15815 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15816 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
15817 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[0:1], s[4:5]
15818 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
15819 ; SKIP-CACHE-INV-NEXT: s_load_dword s3, s[0:1], 0x2
15820 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x3
15821 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[6:7], 16
15822 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
15823 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
15824 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
15825 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s6
15826 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s7
15827 ; SKIP-CACHE-INV-NEXT: s_add_u32 s0, s0, s5
15828 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s4, s1, s4
15829 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
15830 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s4
15831 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3
15832 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
15833 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15834 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
15835 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
15836 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
15837 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3]
15838 ; SKIP-CACHE-INV-NEXT: s_endpgm
15840 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15841 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
15842 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15843 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15844 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15845 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15846 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15847 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15848 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15849 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15850 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15851 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15852 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
15854 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15855 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
15856 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
15857 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
15858 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
15859 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15860 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
15861 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
15862 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15863 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15864 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
15865 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15866 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15867 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15868 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
15869 ; GFX90A-TGSPLIT-NEXT: s_endpgm
15871 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15872 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
15873 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15874 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15875 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15876 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15877 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15878 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15879 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15880 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15881 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15882 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15883 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
15885 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15886 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
15887 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
15888 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
15889 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
15890 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
15891 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
15892 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
15893 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15894 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
15895 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
15896 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15897 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] offset:16
15898 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
15899 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
15900 ; GFX940-TGSPLIT-NEXT: s_endpgm
15902 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15903 ; GFX11-WGP: ; %bb.0: ; %entry
15904 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15905 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15906 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15907 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
15908 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
15909 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
15910 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15911 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
15912 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
15913 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
15914 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
15915 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15916 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15917 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
15918 ; GFX11-WGP-NEXT: buffer_gl0_inv
15919 ; GFX11-WGP-NEXT: s_endpgm
15921 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15922 ; GFX11-CU: ; %bb.0: ; %entry
15923 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15924 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15925 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15926 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
15927 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
15928 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
15929 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15930 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
15931 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
15932 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
15933 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15934 ; GFX11-CU-NEXT: s_endpgm
15936 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15937 ; GFX12-WGP: ; %bb.0: ; %entry
15938 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15939 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
15940 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
15941 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
15942 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
15943 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
15944 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15945 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
15946 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
15947 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
15948 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
15949 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
15950 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
15951 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15952 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 scope:SCOPE_SE
15953 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
15954 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
15955 ; GFX12-WGP-NEXT: s_endpgm
15957 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
15958 ; GFX12-CU: ; %bb.0: ; %entry
15959 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
15960 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
15961 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
15962 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
15963 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
15964 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
15965 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15966 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
15967 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
15968 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
15969 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16
15970 ; GFX12-CU-NEXT: s_endpgm
15971 ptr %out, i32 %in, i32 %old) {
; i32 element index 4 is a 16-byte displacement; the checks above show it either
; folded into the atomic as offset:16 or added with s_add_u32/s_addc_u32 and the
; constant 16.
15973 %gep = getelementptr i32, ptr %out, i32 4
; Compare the word at %gep with %old and, on a match, replace it with %in.
15974 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
15978 define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
15979 ; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
15980 ; GFX7: ; %bb.0: ; %entry
15981 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
15982 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
15983 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
15984 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
15985 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
15986 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
15987 ; GFX7-NEXT: s_mov_b32 s6, s4
15988 ; GFX7-NEXT: s_mov_b32 s7, s5
15989 ; GFX7-NEXT: s_mov_b32 s11, s12
15990 ; GFX7-NEXT: s_mov_b32 s10, s13
15991 ; GFX7-NEXT: s_add_u32 s6, s6, s11
15992 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
15993 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
15994 ; GFX7-NEXT: s_mov_b32 s7, s10
15995 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
15996 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
15997 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
15998 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
15999 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
16000 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
16001 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16002 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
16003 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
16004 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16005 ; GFX7-NEXT: flat_store_dword v[0:1], v2
16006 ; GFX7-NEXT: s_endpgm
16008 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16009 ; GFX10-WGP: ; %bb.0: ; %entry
16010 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
16011 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16012 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
16013 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
16014 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
16015 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
16016 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
16017 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
16018 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
16019 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
16020 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
16021 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
16022 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16023 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
16024 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
16025 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
16026 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16027 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
16028 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
16029 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
16030 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16031 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
16032 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
16033 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16034 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
16035 ; GFX10-WGP-NEXT: s_endpgm
16037 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16038 ; GFX10-CU: ; %bb.0: ; %entry
16039 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
16040 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16041 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
16042 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
16043 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
16044 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
16045 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
16046 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
16047 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
16048 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
16049 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
16050 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
16051 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16052 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
16053 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
16054 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
16055 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16056 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
16057 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
16058 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
16059 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16060 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
16061 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
16062 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16063 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
16064 ; GFX10-CU-NEXT: s_endpgm
16066 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16067 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
16068 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
16069 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
16070 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
16071 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
16072 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
16073 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
16074 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
16075 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
16076 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
16077 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
16078 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
16079 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
16080 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16081 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
16082 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
16083 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
16084 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16085 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
16086 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
16087 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
16088 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16089 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
16090 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
16091 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16092 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
16093 ; SKIP-CACHE-INV-NEXT: s_endpgm
16095 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16096 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
16097 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16098 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16099 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16100 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16101 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16102 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16103 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16104 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16105 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16106 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16107 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16108 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16109 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
16110 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
16112 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16113 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
16114 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16115 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16116 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16117 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16118 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16119 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16120 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16121 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16122 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16123 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16124 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16125 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16126 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
16127 ; GFX90A-TGSPLIT-NEXT: s_endpgm
16129 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16130 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
16131 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16132 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16133 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16134 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16135 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16136 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16137 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16138 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16139 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16140 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16141 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16142 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16143 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16144 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
16146 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16147 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
16148 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16149 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16150 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16151 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16152 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16153 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16154 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16155 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16156 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16157 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16158 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16159 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16160 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16161 ; GFX940-TGSPLIT-NEXT: s_endpgm
16163 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16164 ; GFX11-WGP: ; %bb.0: ; %entry
16165 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16166 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
16167 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
16168 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
16169 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
16170 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
16171 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16172 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
16173 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
16174 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
16175 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16176 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
16177 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
16178 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16179 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
16180 ; GFX11-WGP-NEXT: s_endpgm
16182 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16183 ; GFX11-CU: ; %bb.0: ; %entry
16184 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16185 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
16186 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
16187 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
16188 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
16189 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
16190 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16191 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
16192 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
16193 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
16194 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16195 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
16196 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
16197 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16198 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
16199 ; GFX11-CU-NEXT: s_endpgm
16201 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16202 ; GFX12-WGP: ; %bb.0: ; %entry
16203 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16204 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
16205 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
16206 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
16207 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
16208 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
16209 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16210 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
16211 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
16212 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
16213 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
16214 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
16215 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
16216 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
16217 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
16218 ; GFX12-WGP-NEXT: s_endpgm
16220 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
16221 ; GFX12-CU: ; %bb.0: ; %entry
16222 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16223 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
16224 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
16225 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
16226 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
16227 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
16228 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16229 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
16230 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
16231 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
16232 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16233 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
16234 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
16235 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
16236 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
16237 ; GFX12-CU-NEXT: s_endpgm
16238 ptr %out, i32 %in, i32 %old) {
16240 %gep = getelementptr i32, ptr %out, i32 4
16241 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic monotonic
16242 %val0 = extractvalue { i32, i1 } %val, 0
16243 store i32 %val0, ptr %out, align 4
; Returning cmpxchg on a generic `ptr` with syncscope("workgroup-one-as"),
; acquire success ordering and monotonic failure ordering.
;
; The kernel takes (%out, %in, %old), compare-and-swaps the i32 located four
; i32 elements past %out (byte offset 16 in the expected assembly), and stores
; field 0 of the cmpxchg result back to %out.
;
; Each group of assertions below belongs to one FileCheck prefix from the llc
; invocations at the top of the file. In the expected output, the prefixes
; GFX10-WGP, GFX11-WGP, GFX12-WGP, GFX90A-TGSPLIT and GFX940-TGSPLIT wait on the
; atomic and then issue an invalidate instruction (buffer_gl0_inv, global_inv,
; buffer_wbinvl1_vol or buffer_inv) before the final store; the remaining
; prefixes only wait on the atomic before the store.
;
; These assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
16247 define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
16248 ; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16249 ; GFX7: ; %bb.0: ; %entry
16250 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
16251 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16252 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
16253 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
16254 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
16255 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
16256 ; GFX7-NEXT: s_mov_b32 s6, s4
16257 ; GFX7-NEXT: s_mov_b32 s7, s5
16258 ; GFX7-NEXT: s_mov_b32 s11, s12
16259 ; GFX7-NEXT: s_mov_b32 s10, s13
16260 ; GFX7-NEXT: s_add_u32 s6, s6, s11
16261 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
16262 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16263 ; GFX7-NEXT: s_mov_b32 s7, s10
16264 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
16265 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
16266 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16267 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
16268 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
16269 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
16270 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16271 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
16272 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
16273 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16274 ; GFX7-NEXT: flat_store_dword v[0:1], v2
16275 ; GFX7-NEXT: s_endpgm
16277 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16278 ; GFX10-WGP: ; %bb.0: ; %entry
16279 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
16280 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16281 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
16282 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
16283 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
16284 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
16285 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
16286 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
16287 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
16288 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
16289 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
16290 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
16291 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16292 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
16293 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
16294 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
16295 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16296 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
16297 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
16298 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
16299 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16300 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
16301 ; GFX10-WGP-NEXT: buffer_gl0_inv
16302 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
16303 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
16304 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
16305 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
16306 ; GFX10-WGP-NEXT: s_endpgm
16308 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16309 ; GFX10-CU: ; %bb.0: ; %entry
16310 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
16311 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16312 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
16313 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
16314 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
16315 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
16316 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
16317 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
16318 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
16319 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
16320 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
16321 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
16322 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16323 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
16324 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
16325 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
16326 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16327 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
16328 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
16329 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
16330 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16331 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
16332 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
16333 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16334 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
16335 ; GFX10-CU-NEXT: s_endpgm
16337 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16338 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
16339 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
16340 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
16341 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
16342 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
16343 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
16344 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
16345 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
16346 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
16347 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
16348 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
16349 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
16350 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
16351 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16352 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
16353 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
16354 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
16355 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16356 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
16357 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
16358 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
16359 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16360 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
16361 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
16362 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16363 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
16364 ; SKIP-CACHE-INV-NEXT: s_endpgm
16366 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16367 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
16368 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16369 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16370 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16371 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16372 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16373 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16374 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16375 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16376 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16377 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16378 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16379 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16380 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
16381 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
16383 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16384 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
16385 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16386 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16387 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16388 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16389 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16390 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16391 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16392 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16393 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16394 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16395 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16396 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
16397 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16398 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
16399 ; GFX90A-TGSPLIT-NEXT: s_endpgm
16401 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16402 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
16403 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16404 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16405 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16406 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16407 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16408 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16409 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16410 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16411 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16412 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16413 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16414 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16415 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16416 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
16418 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16419 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
16420 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16421 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16422 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16423 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16424 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16425 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16426 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16427 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16428 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16429 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16430 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16431 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
16432 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16433 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16434 ; GFX940-TGSPLIT-NEXT: s_endpgm
16436 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16437 ; GFX11-WGP: ; %bb.0: ; %entry
16438 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16439 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
16440 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
16441 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
16442 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
16443 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
16444 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16445 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
16446 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
16447 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
16448 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16449 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
16450 ; GFX11-WGP-NEXT: buffer_gl0_inv
16451 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
16452 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
16453 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
16454 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
16455 ; GFX11-WGP-NEXT: s_endpgm
16457 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16458 ; GFX11-CU: ; %bb.0: ; %entry
16459 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16460 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
16461 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
16462 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
16463 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
16464 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
16465 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16466 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
16467 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
16468 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
16469 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16470 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
16471 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
16472 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16473 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
16474 ; GFX11-CU-NEXT: s_endpgm
16476 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16477 ; GFX12-WGP: ; %bb.0: ; %entry
16478 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16479 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
16480 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
16481 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
16482 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
16483 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
16484 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16485 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
16486 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
16487 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
16488 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
16489 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
16490 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
16491 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
16492 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
16493 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
16494 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
16495 ; GFX12-WGP-NEXT: s_endpgm
16497 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
16498 ; GFX12-CU: ; %bb.0: ; %entry
16499 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16500 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
16501 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
16502 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
16503 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
16504 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
16505 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16506 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
16507 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
16508 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
16509 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16510 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
16511 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
16512 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
16513 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
16514 ; GFX12-CU-NEXT: s_endpgm
16515 ptr %out, i32 %in, i32 %old) {
; Target address of the atomic: element index 4 of the i32 array at %out.
16517 %gep = getelementptr i32, ptr %out, i32 4
; Volatile compare-and-swap at %gep: compare against %old, write %in on match.
; Orderings are acquire (success) and monotonic (failure), at the
; "workgroup-one-as" synchronization scope.
16518 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire monotonic
; Take the i32 member (index 0) of the { i32, i1 } result pair.
16519 %val0 = extractvalue { i32, i1 } %val, 0
; Plain (non-atomic) store of that value to the base pointer %out.
16520 store i32 %val0, ptr %out, align 4
16524 define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
16525 ; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16526 ; GFX7: ; %bb.0: ; %entry
16527 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
16528 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16529 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
16530 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
16531 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
16532 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
16533 ; GFX7-NEXT: s_mov_b32 s6, s4
16534 ; GFX7-NEXT: s_mov_b32 s7, s5
16535 ; GFX7-NEXT: s_mov_b32 s11, s12
16536 ; GFX7-NEXT: s_mov_b32 s10, s13
16537 ; GFX7-NEXT: s_add_u32 s6, s6, s11
16538 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
16539 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16540 ; GFX7-NEXT: s_mov_b32 s7, s10
16541 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
16542 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
16543 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16544 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
16545 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
16546 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
16547 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16548 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
16549 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
16550 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16551 ; GFX7-NEXT: flat_store_dword v[0:1], v2
16552 ; GFX7-NEXT: s_endpgm
16554 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16555 ; GFX10-WGP: ; %bb.0: ; %entry
16556 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
16557 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16558 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
16559 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
16560 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
16561 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
16562 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
16563 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
16564 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
16565 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
16566 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
16567 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
16568 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16569 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
16570 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
16571 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
16572 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16573 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
16574 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
16575 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
16576 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
16577 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
16578 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16579 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
16580 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
16581 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16582 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
16583 ; GFX10-WGP-NEXT: s_endpgm
16585 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16586 ; GFX10-CU: ; %bb.0: ; %entry
16587 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
16588 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16589 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
16590 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
16591 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
16592 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
16593 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
16594 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
16595 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
16596 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
16597 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
16598 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
16599 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16600 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
16601 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
16602 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
16603 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16604 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
16605 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
16606 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
16607 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16608 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
16609 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
16610 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16611 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
16612 ; GFX10-CU-NEXT: s_endpgm
16614 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16615 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
16616 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
16617 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
16618 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
16619 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
16620 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
16621 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
16622 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
16623 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
16624 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
16625 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
16626 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
16627 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
16628 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16629 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
16630 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
16631 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
16632 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16633 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
16634 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
16635 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
16636 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16637 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
16638 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
16639 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16640 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
16641 ; SKIP-CACHE-INV-NEXT: s_endpgm
16643 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16644 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
16645 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16646 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16647 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16648 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16649 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16650 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16651 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16652 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16653 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16654 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16655 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16656 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16657 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
16658 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
16660 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16661 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
16662 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16663 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16664 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16665 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16666 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16667 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16668 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16669 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16670 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16671 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16672 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16673 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16674 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16675 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
16676 ; GFX90A-TGSPLIT-NEXT: s_endpgm
16678 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16679 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
16680 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16681 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16682 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16683 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16684 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16685 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16686 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16687 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16688 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16689 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16690 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16691 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16692 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16693 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
16695 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16696 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
16697 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16698 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16699 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16700 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16701 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16702 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16703 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16704 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16705 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16706 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16707 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16708 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16709 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16710 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16711 ; GFX940-TGSPLIT-NEXT: s_endpgm
16713 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16714 ; GFX11-WGP: ; %bb.0: ; %entry
16715 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16716 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
16717 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
16718 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
16719 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
16720 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
16721 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16722 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
16723 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
16724 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
16725 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
16726 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
16727 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16728 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
16729 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
16730 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16731 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
16732 ; GFX11-WGP-NEXT: s_endpgm
16734 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16735 ; GFX11-CU: ; %bb.0: ; %entry
16736 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16737 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
16738 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
16739 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
16740 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
16741 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
16742 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16743 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
16744 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
16745 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
16746 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
16747 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
16748 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
16749 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16750 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
16751 ; GFX11-CU-NEXT: s_endpgm
16753 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16754 ; GFX12-WGP: ; %bb.0: ; %entry
16755 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16756 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
16757 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
16758 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
16759 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
16760 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
16761 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16762 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
16763 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
16764 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
16765 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
16766 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
16767 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
16768 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
16769 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
16770 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
16771 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
16772 ; GFX12-WGP-NEXT: s_wait_loadcnt_dscnt 0x0
16773 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
16774 ; GFX12-WGP-NEXT: s_endpgm
16776 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
16777 ; GFX12-CU: ; %bb.0: ; %entry
16778 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16779 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
16780 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
16781 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
16782 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
16783 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
16784 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16785 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
16786 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
16787 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
16788 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
16789 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
16790 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
16791 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
16792 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
16793 ; GFX12-CU-NEXT: s_endpgm
16794 ptr %out, i32 %in, i32 %old) {
16796 %gep = getelementptr i32, ptr %out, i32 4
16797 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release monotonic
16798 %val0 = extractvalue { i32, i1 } %val, 0
16799 store i32 %val0, ptr %out, align 4
; Returning cmpxchg through a flat pointer at syncscope("workgroup-one-as"),
; with success ordering acq_rel and failure ordering monotonic.
; The kernel performs the exchange on i32 element 4 of %out (the expected code
; shows this as a 16-byte offset from the base pointer) and then stores field 0
; of the cmpxchg result back to %out.
; The assertion lines inside this definition are autogenerated (see the note on
; the first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; In the expected code, the WGP-mode and tgsplit configurations wait before the
; atomic and invalidate a cache after it; the remaining configurations only
; wait before the final store.
16803 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
16804 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16805 ; GFX7: ; %bb.0: ; %entry
16806 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
16807 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16808 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
16809 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
16810 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
16811 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
16812 ; GFX7-NEXT: s_mov_b32 s6, s4
16813 ; GFX7-NEXT: s_mov_b32 s7, s5
16814 ; GFX7-NEXT: s_mov_b32 s11, s12
16815 ; GFX7-NEXT: s_mov_b32 s10, s13
16816 ; GFX7-NEXT: s_add_u32 s6, s6, s11
16817 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
16818 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16819 ; GFX7-NEXT: s_mov_b32 s7, s10
16820 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
16821 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
16822 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16823 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
16824 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
16825 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
16826 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16827 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
16828 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
16829 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16830 ; GFX7-NEXT: flat_store_dword v[0:1], v2
16831 ; GFX7-NEXT: s_endpgm
16833 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16834 ; GFX10-WGP: ; %bb.0: ; %entry
16835 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
16836 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16837 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
16838 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
16839 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
16840 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
16841 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
16842 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
16843 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
16844 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
16845 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
16846 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
16847 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16848 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
16849 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
16850 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
16851 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16852 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
16853 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
16854 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
16855 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
16856 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
16857 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16858 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
16859 ; GFX10-WGP-NEXT: buffer_gl0_inv
16860 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
16861 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
16862 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
16863 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
16864 ; GFX10-WGP-NEXT: s_endpgm
16866 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16867 ; GFX10-CU: ; %bb.0: ; %entry
16868 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
16869 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
16870 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
16871 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
16872 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
16873 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
16874 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
16875 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
16876 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
16877 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
16878 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
16879 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
16880 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
16881 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
16882 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
16883 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
16884 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16885 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
16886 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
16887 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
16888 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16889 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
16890 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
16891 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16892 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
16893 ; GFX10-CU-NEXT: s_endpgm
16895 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16896 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
16897 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
16898 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
16899 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
16900 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
16901 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
16902 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
16903 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
16904 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
16905 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
16906 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
16907 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
16908 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
16909 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
16910 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
16911 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
16912 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
16913 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16914 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
16915 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
16916 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
16917 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
16918 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
16919 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
16920 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16921 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
16922 ; SKIP-CACHE-INV-NEXT: s_endpgm
16924 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16925 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
16926 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16927 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16928 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16929 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16930 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16931 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16932 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16933 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16934 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16935 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16936 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16937 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16938 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
16939 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
16941 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16942 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
16943 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
16944 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
16945 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
16946 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16947 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
16948 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
16949 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16950 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16951 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16952 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16953 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
16954 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16955 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
16956 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
16957 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
16958 ; GFX90A-TGSPLIT-NEXT: s_endpgm
16960 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16961 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
16962 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16963 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16964 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16965 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16966 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16967 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16968 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16969 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16970 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16971 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16972 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16973 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
16974 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16975 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
16977 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16978 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
16979 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
16980 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
16981 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
16982 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
16983 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
16984 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
16985 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
16986 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
16987 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16988 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16989 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
16990 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
16991 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
16992 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
16993 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
16994 ; GFX940-TGSPLIT-NEXT: s_endpgm
16996 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
16997 ; GFX11-WGP: ; %bb.0: ; %entry
16998 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
16999 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17000 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17001 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17002 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
17003 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
17004 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17005 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
17006 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17007 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17008 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
17009 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
17010 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17011 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
17012 ; GFX11-WGP-NEXT: buffer_gl0_inv
17013 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17014 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17015 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17016 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
17017 ; GFX11-WGP-NEXT: s_endpgm
17019 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
17020 ; GFX11-CU: ; %bb.0: ; %entry
17021 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17022 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17023 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17024 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
17025 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
17026 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
17027 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17028 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
17029 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17030 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17031 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17032 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17033 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17034 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17035 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
17036 ; GFX11-CU-NEXT: s_endpgm
17038 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
17039 ; GFX12-WGP: ; %bb.0: ; %entry
17040 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17041 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17042 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17043 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
17044 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
17045 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
17046 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17047 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
17048 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17049 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17050 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
17051 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
17052 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
17053 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
17054 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17055 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
17056 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
17057 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
17058 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
17059 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17060 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17061 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
17062 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
17063 ; GFX12-WGP-NEXT: s_endpgm
17065 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
17066 ; GFX12-CU: ; %bb.0: ; %entry
17067 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17068 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17069 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17070 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
17071 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
17072 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
17073 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17074 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
17075 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17076 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17077 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17078 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17079 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17080 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
17081 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
17082 ; GFX12-CU-NEXT: s_endpgm
17083 ptr %out, i32 %in, i32 %old) {
; Address of i32 element 4 of %out; this is the location the cmpxchg targets.
17085 %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and, on a match, write %in.
17086 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel monotonic
; Field 0 of the { i32, i1 } result is the i32 component; it is stored to the
; base pointer so the returned value is actually used.
17087 %val0 = extractvalue { i32, i1 } %val, 0
17088 store i32 %val0, ptr %out, align 4
; Returning cmpxchg through a flat pointer at syncscope("workgroup-one-as"),
; with success ordering seq_cst and failure ordering monotonic.
; The kernel performs the exchange on i32 element 4 of %out (the expected code
; shows this as a 16-byte offset from the base pointer) and then stores field 0
; of the cmpxchg result back to %out.
; The assertion lines inside this definition are autogenerated (see the note on
; the first line of the file); regenerate them with
; utils/update_llc_test_checks.py rather than editing them by hand.
; In the expected code, the WGP-mode and tgsplit configurations wait before the
; atomic and invalidate a cache after it; the remaining configurations only
; wait before the final store.
17092 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
17093 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17094 ; GFX7: ; %bb.0: ; %entry
17095 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
17096 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17097 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
17098 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
17099 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
17100 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17101 ; GFX7-NEXT: s_mov_b32 s6, s4
17102 ; GFX7-NEXT: s_mov_b32 s7, s5
17103 ; GFX7-NEXT: s_mov_b32 s11, s12
17104 ; GFX7-NEXT: s_mov_b32 s10, s13
17105 ; GFX7-NEXT: s_add_u32 s6, s6, s11
17106 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
17107 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17108 ; GFX7-NEXT: s_mov_b32 s7, s10
17109 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
17110 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
17111 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17112 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
17113 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
17114 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
17115 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17116 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
17117 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
17118 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17119 ; GFX7-NEXT: flat_store_dword v[0:1], v2
17120 ; GFX7-NEXT: s_endpgm
17122 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17123 ; GFX10-WGP: ; %bb.0: ; %entry
17124 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
17125 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17126 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
17127 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
17128 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
17129 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17130 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
17131 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
17132 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
17133 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
17134 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
17135 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
17136 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17137 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
17138 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
17139 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
17140 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17141 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
17142 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
17143 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
17144 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
17145 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
17146 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17147 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
17148 ; GFX10-WGP-NEXT: buffer_gl0_inv
17149 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
17150 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
17151 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17152 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
17153 ; GFX10-WGP-NEXT: s_endpgm
17155 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17156 ; GFX10-CU: ; %bb.0: ; %entry
17157 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
17158 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17159 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
17160 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
17161 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
17162 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
17163 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
17164 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
17165 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
17166 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
17167 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
17168 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
17169 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17170 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
17171 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
17172 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
17173 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17174 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
17175 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
17176 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
17177 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17178 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
17179 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
17180 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17181 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
17182 ; GFX10-CU-NEXT: s_endpgm
17184 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17185 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
17186 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
17187 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
17188 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
17189 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
17190 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
17191 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
17192 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
17193 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
17194 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
17195 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
17196 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
17197 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
17198 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17199 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
17200 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
17201 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
17202 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17203 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
17204 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
17205 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
17206 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17207 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
17208 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
17209 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17210 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
17211 ; SKIP-CACHE-INV-NEXT: s_endpgm
17213 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17214 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
17215 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
17216 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
17217 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
17218 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17219 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
17220 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
17221 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17222 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17223 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17224 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17225 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17226 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17227 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
17228 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
17230 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17231 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
17232 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
17233 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
17234 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
17235 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17236 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
17237 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
17238 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17239 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17240 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17241 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17242 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17243 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17244 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
17245 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17246 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
17247 ; GFX90A-TGSPLIT-NEXT: s_endpgm
17249 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17250 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
17251 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
17252 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
17253 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
17254 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17255 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
17256 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
17257 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17258 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17259 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17260 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17261 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17262 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17263 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
17264 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
17266 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17267 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
17268 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
17269 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
17270 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
17271 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17272 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
17273 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
17274 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17275 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17276 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17277 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17278 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17279 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17280 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
17281 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17282 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
17283 ; GFX940-TGSPLIT-NEXT: s_endpgm
17285 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17286 ; GFX11-WGP: ; %bb.0: ; %entry
17287 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17288 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17289 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17290 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17291 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
17292 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
17293 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17294 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
17295 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17296 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17297 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
17298 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
17299 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17300 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
17301 ; GFX11-WGP-NEXT: buffer_gl0_inv
17302 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17303 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17304 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17305 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
17306 ; GFX11-WGP-NEXT: s_endpgm
17308 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17309 ; GFX11-CU: ; %bb.0: ; %entry
17310 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17311 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17312 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17313 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
17314 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
17315 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
17316 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17317 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
17318 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17319 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17320 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17321 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17322 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17323 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17324 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
17325 ; GFX11-CU-NEXT: s_endpgm
17327 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17328 ; GFX12-WGP: ; %bb.0: ; %entry
17329 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17330 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17331 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17332 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
17333 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
17334 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
17335 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17336 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
17337 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17338 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17339 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
17340 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
17341 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
17342 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
17343 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17344 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
17345 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
17346 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
17347 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
17348 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17349 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17350 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
17351 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
17352 ; GFX12-WGP-NEXT: s_endpgm
17354 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
17355 ; GFX12-CU: ; %bb.0: ; %entry
17356 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17357 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17358 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17359 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
17360 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
17361 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
17362 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17363 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
17364 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17365 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17366 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17367 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17368 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17369 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
17370 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
17371 ; GFX12-CU-NEXT: s_endpgm
17372 ptr %out, i32 %in, i32 %old) {
; Address of i32 element 4 of %out; this is the location the cmpxchg targets.
17374 %gep = getelementptr i32, ptr %out, i32 4
; Compare against %old and, on a match, write %in.
17375 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst monotonic
; Field 0 of the { i32, i1 } result is the i32 component; it is stored to the
; base pointer so the returned value is actually used.
17376 %val0 = extractvalue { i32, i1 } %val, 0
17377 store i32 %val0, ptr %out, align 4
; Returning cmpxchg through a flat pointer at syncscope "workgroup-one-as",
; with success ordering monotonic and failure ordering acquire.
; The operation targets i32 element 4 of %out (byte offset 16); field 0 of
; the result pair (the loaded value) is then stored to %out itself.
; In the expectations below, the GFX10/GFX11/GFX12 WGP prefixes and the
; GFX90A/GFX940 TGSPLIT prefixes (not the NOTTGSPLIT ones) expect a
; vmcnt/loadcnt wait followed by a cache invalidate right after the atomic
; (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or global_inv); every
; other prefix expects no invalidate. The GFX12 WGP prefix additionally
; expects s_wait_bvhcnt and s_wait_samplecnt ahead of s_wait_loadcnt.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
17381 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
17382 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17383 ; GFX7: ; %bb.0: ; %entry
17384 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
17385 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17386 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
17387 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
17388 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
17389 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17390 ; GFX7-NEXT: s_mov_b32 s6, s4
17391 ; GFX7-NEXT: s_mov_b32 s7, s5
17392 ; GFX7-NEXT: s_mov_b32 s11, s12
17393 ; GFX7-NEXT: s_mov_b32 s10, s13
17394 ; GFX7-NEXT: s_add_u32 s6, s6, s11
17395 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
17396 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17397 ; GFX7-NEXT: s_mov_b32 s7, s10
17398 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
17399 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
17400 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17401 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
17402 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
17403 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
17404 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17405 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
17406 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
17407 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17408 ; GFX7-NEXT: flat_store_dword v[0:1], v2
17409 ; GFX7-NEXT: s_endpgm
17411 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17412 ; GFX10-WGP: ; %bb.0: ; %entry
17413 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
17414 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17415 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
17416 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
17417 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
17418 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17419 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
17420 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
17421 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
17422 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
17423 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
17424 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
17425 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17426 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
17427 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
17428 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
17429 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17430 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
17431 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
17432 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
17433 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17434 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
17435 ; GFX10-WGP-NEXT: buffer_gl0_inv
17436 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
17437 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
17438 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17439 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
17440 ; GFX10-WGP-NEXT: s_endpgm
17442 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17443 ; GFX10-CU: ; %bb.0: ; %entry
17444 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
17445 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17446 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
17447 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
17448 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
17449 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
17450 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
17451 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
17452 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
17453 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
17454 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
17455 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
17456 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17457 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
17458 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
17459 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
17460 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17461 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
17462 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
17463 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
17464 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17465 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
17466 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
17467 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17468 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
17469 ; GFX10-CU-NEXT: s_endpgm
17471 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17472 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
17473 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
17474 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
17475 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
17476 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
17477 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
17478 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
17479 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
17480 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
17481 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
17482 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
17483 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
17484 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
17485 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17486 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
17487 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
17488 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
17489 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17490 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
17491 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
17492 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
17493 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17494 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
17495 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
17496 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17497 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
17498 ; SKIP-CACHE-INV-NEXT: s_endpgm
17500 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17501 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
17502 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
17503 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
17504 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
17505 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17506 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
17507 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
17508 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17509 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17510 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17511 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17512 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17513 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17514 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
17515 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
17517 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17518 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
17519 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
17520 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
17521 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
17522 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17523 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
17524 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
17525 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17526 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17527 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17528 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17529 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17530 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
17531 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17532 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
17533 ; GFX90A-TGSPLIT-NEXT: s_endpgm
17535 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17536 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
17537 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
17538 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
17539 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
17540 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17541 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
17542 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
17543 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17544 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17545 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17546 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17547 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17548 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17549 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
17550 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
17552 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17553 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
17554 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
17555 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
17556 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
17557 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17558 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
17559 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
17560 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17561 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17562 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17563 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17564 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17565 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
17566 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17567 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
17568 ; GFX940-TGSPLIT-NEXT: s_endpgm
17570 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17571 ; GFX11-WGP: ; %bb.0: ; %entry
17572 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17573 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17574 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17575 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17576 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
17577 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
17578 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17579 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
17580 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17581 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17582 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17583 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
17584 ; GFX11-WGP-NEXT: buffer_gl0_inv
17585 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17586 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17587 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17588 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
17589 ; GFX11-WGP-NEXT: s_endpgm
17591 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17592 ; GFX11-CU: ; %bb.0: ; %entry
17593 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17594 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17595 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17596 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
17597 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
17598 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
17599 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17600 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
17601 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17602 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17603 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17604 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17605 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17606 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17607 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
17608 ; GFX11-CU-NEXT: s_endpgm
17610 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17611 ; GFX12-WGP: ; %bb.0: ; %entry
17612 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17613 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17614 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17615 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
17616 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
17617 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
17618 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17619 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
17620 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17621 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17622 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17623 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
17624 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
17625 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
17626 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
17627 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17628 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17629 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
17630 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
17631 ; GFX12-WGP-NEXT: s_endpgm
17633 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
17634 ; GFX12-CU: ; %bb.0: ; %entry
17635 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17636 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17637 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17638 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
17639 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
17640 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
17641 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17642 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
17643 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17644 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17645 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17646 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17647 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17648 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
17649 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
17650 ; GFX12-CU-NEXT: s_endpgm
17651 ptr %out, i32 %in, i32 %old) {
17653 %gep = getelementptr i32, ptr %out, i32 4
17654 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic acquire
17655 %val0 = extractvalue { i32, i1 } %val, 0
17656 store i32 %val0, ptr %out, align 4
; Returning cmpxchg through a flat pointer at syncscope "workgroup-one-as",
; with acquire as both the success and the failure ordering.
; The operation targets i32 element 4 of %out (byte offset 16); field 0 of
; the result pair (the loaded value) is then stored to %out itself.
; In the expectations below, the GFX10/GFX11/GFX12 WGP prefixes and the
; GFX90A/GFX940 TGSPLIT prefixes (not the NOTTGSPLIT ones) expect a
; vmcnt/loadcnt wait followed by a cache invalidate right after the atomic
; (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or global_inv); every
; other prefix expects no invalidate. For the GFX12 WGP prefix the only wait
; expected between the atomic and global_inv is s_wait_loadcnt 0x0.
; The assertions are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script instead of editing them by hand.
17660 define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
17661 ; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17662 ; GFX7: ; %bb.0: ; %entry
17663 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
17664 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17665 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
17666 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
17667 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
17668 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17669 ; GFX7-NEXT: s_mov_b32 s6, s4
17670 ; GFX7-NEXT: s_mov_b32 s7, s5
17671 ; GFX7-NEXT: s_mov_b32 s11, s12
17672 ; GFX7-NEXT: s_mov_b32 s10, s13
17673 ; GFX7-NEXT: s_add_u32 s6, s6, s11
17674 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
17675 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17676 ; GFX7-NEXT: s_mov_b32 s7, s10
17677 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
17678 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
17679 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17680 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
17681 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
17682 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
17683 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17684 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
17685 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
17686 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17687 ; GFX7-NEXT: flat_store_dword v[0:1], v2
17688 ; GFX7-NEXT: s_endpgm
17690 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17691 ; GFX10-WGP: ; %bb.0: ; %entry
17692 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
17693 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17694 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
17695 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
17696 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
17697 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17698 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
17699 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
17700 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
17701 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
17702 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
17703 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
17704 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17705 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
17706 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
17707 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
17708 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17709 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
17710 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
17711 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
17712 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17713 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
17714 ; GFX10-WGP-NEXT: buffer_gl0_inv
17715 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
17716 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
17717 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17718 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
17719 ; GFX10-WGP-NEXT: s_endpgm
17721 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17722 ; GFX10-CU: ; %bb.0: ; %entry
17723 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
17724 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17725 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
17726 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
17727 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
17728 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
17729 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
17730 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
17731 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
17732 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
17733 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
17734 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
17735 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17736 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
17737 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
17738 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
17739 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17740 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
17741 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
17742 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
17743 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17744 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
17745 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
17746 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17747 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
17748 ; GFX10-CU-NEXT: s_endpgm
17750 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17751 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
17752 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
17753 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
17754 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
17755 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
17756 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
17757 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
17758 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
17759 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
17760 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
17761 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
17762 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
17763 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
17764 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
17765 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
17766 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
17767 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
17768 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17769 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
17770 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
17771 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
17772 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17773 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
17774 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
17775 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17776 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
17777 ; SKIP-CACHE-INV-NEXT: s_endpgm
17779 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17780 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
17781 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
17782 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
17783 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
17784 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17785 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
17786 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
17787 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17788 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17789 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17790 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17791 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17792 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17793 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
17794 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
17796 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17797 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
17798 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
17799 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
17800 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
17801 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17802 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
17803 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
17804 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17805 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17806 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17807 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
17808 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17809 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
17810 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
17811 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
17812 ; GFX90A-TGSPLIT-NEXT: s_endpgm
17814 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17815 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
17816 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
17817 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
17818 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
17819 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17820 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
17821 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
17822 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17823 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17824 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17825 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17826 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17827 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17828 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
17829 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
17831 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17832 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
17833 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
17834 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
17835 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
17836 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
17837 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
17838 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
17839 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17840 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
17841 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17842 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
17843 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
17844 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
17845 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
17846 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
17847 ; GFX940-TGSPLIT-NEXT: s_endpgm
17849 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17850 ; GFX11-WGP: ; %bb.0: ; %entry
17851 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17852 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17853 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17854 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17855 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
17856 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
17857 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17858 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
17859 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17860 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17861 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17862 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
17863 ; GFX11-WGP-NEXT: buffer_gl0_inv
17864 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
17865 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
17866 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
17867 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
17868 ; GFX11-WGP-NEXT: s_endpgm
17870 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17871 ; GFX11-CU: ; %bb.0: ; %entry
17872 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17873 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17874 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17875 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
17876 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
17877 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
17878 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17879 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
17880 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17881 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17882 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
17883 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
17884 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
17885 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17886 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
17887 ; GFX11-CU-NEXT: s_endpgm
17889 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17890 ; GFX12-WGP: ; %bb.0: ; %entry
17891 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17892 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
17893 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
17894 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
17895 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
17896 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
17897 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17898 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
17899 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17900 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17901 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
17902 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
17903 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
17904 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
17905 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
17906 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
17907 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
17908 ; GFX12-WGP-NEXT: s_endpgm
17910 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
17911 ; GFX12-CU: ; %bb.0: ; %entry
17912 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
17913 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
17914 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
17915 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
17916 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
17917 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
17918 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17919 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
17920 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17921 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17922 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
17923 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
17924 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
17925 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
17926 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
17927 ; GFX12-CU-NEXT: s_endpgm
17928 ptr %out, i32 %in, i32 %old) {
17930 %gep = getelementptr i32, ptr %out, i32 4
17931 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire acquire
17932 %val0 = extractvalue { i32, i1 } %val, 0
17933 store i32 %val0, ptr %out, align 4
17937 define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
17938 ; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
17939 ; GFX7: ; %bb.0: ; %entry
17940 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
17941 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17942 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
17943 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
17944 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
17945 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
17946 ; GFX7-NEXT: s_mov_b32 s6, s4
17947 ; GFX7-NEXT: s_mov_b32 s7, s5
17948 ; GFX7-NEXT: s_mov_b32 s11, s12
17949 ; GFX7-NEXT: s_mov_b32 s10, s13
17950 ; GFX7-NEXT: s_add_u32 s6, s6, s11
17951 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
17952 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17953 ; GFX7-NEXT: s_mov_b32 s7, s10
17954 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
17955 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
17956 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17957 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
17958 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
17959 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
17960 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17961 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
17962 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
17963 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
17964 ; GFX7-NEXT: flat_store_dword v[0:1], v2
17965 ; GFX7-NEXT: s_endpgm
17967 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
17968 ; GFX10-WGP: ; %bb.0: ; %entry
17969 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
17970 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
17971 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
17972 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
17973 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
17974 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17975 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
17976 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
17977 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
17978 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
17979 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
17980 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
17981 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
17982 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
17983 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
17984 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
17985 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
17986 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
17987 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
17988 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
17989 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
17990 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
17991 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
17992 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
17993 ; GFX10-WGP-NEXT: buffer_gl0_inv
17994 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
17995 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
17996 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
17997 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
17998 ; GFX10-WGP-NEXT: s_endpgm
18000 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18001 ; GFX10-CU: ; %bb.0: ; %entry
18002 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
18003 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18004 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
18005 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
18006 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
18007 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
18008 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
18009 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
18010 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
18011 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
18012 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
18013 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
18014 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18015 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
18016 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
18017 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
18018 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18019 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
18020 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
18021 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
18022 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18023 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
18024 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
18025 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18026 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
18027 ; GFX10-CU-NEXT: s_endpgm
18029 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18030 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
18031 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
18032 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
18033 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
18034 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
18035 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
18036 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
18037 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
18038 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
18039 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
18040 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
18041 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
18042 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
18043 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18044 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
18045 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
18046 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
18047 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18048 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
18049 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
18050 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
18051 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18052 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
18053 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
18054 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18055 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
18056 ; SKIP-CACHE-INV-NEXT: s_endpgm
18058 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18059 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
18060 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18061 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18062 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18063 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18064 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18065 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18066 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18067 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18068 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18069 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18070 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18071 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18072 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
18073 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
18075 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18076 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
18077 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18078 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18079 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18080 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18081 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18082 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18083 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18084 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18085 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18086 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18087 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18088 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18089 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
18090 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18091 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
18092 ; GFX90A-TGSPLIT-NEXT: s_endpgm
18094 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18095 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
18096 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18097 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18098 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18099 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18100 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18101 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18102 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18103 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18104 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18105 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18106 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18107 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18108 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18109 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
18111 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18112 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
18113 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18114 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18115 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18116 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18117 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18118 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18119 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18120 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18121 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18122 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18123 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18124 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18125 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
18126 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18127 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18128 ; GFX940-TGSPLIT-NEXT: s_endpgm
18130 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18131 ; GFX11-WGP: ; %bb.0: ; %entry
18132 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18133 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
18134 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
18135 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
18136 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
18137 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
18138 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18139 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
18140 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
18141 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
18142 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
18143 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
18144 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18145 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
18146 ; GFX11-WGP-NEXT: buffer_gl0_inv
18147 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
18148 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
18149 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
18150 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
18151 ; GFX11-WGP-NEXT: s_endpgm
18153 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18154 ; GFX11-CU: ; %bb.0: ; %entry
18155 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18156 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
18157 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
18158 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
18159 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
18160 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
18161 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18162 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
18163 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
18164 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
18165 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18166 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
18167 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
18168 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18169 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
18170 ; GFX11-CU-NEXT: s_endpgm
18172 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18173 ; GFX12-WGP: ; %bb.0: ; %entry
18174 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18175 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
18176 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
18177 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
18178 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
18179 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
18180 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18181 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
18182 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
18183 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
18184 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
18185 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
18186 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
18187 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
18188 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
18189 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
18190 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
18191 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
18192 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
18193 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
18194 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
18195 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
18196 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
18197 ; GFX12-WGP-NEXT: s_endpgm
18199 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
18200 ; GFX12-CU: ; %bb.0: ; %entry
18201 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18202 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
18203 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
18204 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
18205 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
18206 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
18207 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18208 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
18209 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
18210 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
18211 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18212 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
18213 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
18214 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
18215 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
18216 ; GFX12-CU-NEXT: s_endpgm
18217 ptr %out, i32 %in, i32 %old) {
18219 %gep = getelementptr i32, ptr %out, i32 4
18220 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release acquire
18221 %val0 = extractvalue { i32, i1 } %val, 0
18222 store i32 %val0, ptr %out, align 4
; Returning cmpxchg on a flat pointer at syncscope("workgroup-one-as") with
; acq_rel success ordering and acquire failure ordering.
; The kernel exchanges the i32 located 4 elements past %out (the 16-byte offset
; visible in the expected code) and stores the value that was loaded to %out.
; In the expectations below, the WGP-mode and tgsplit configurations wait
; before the atomic and issue a cache-invalidate instruction after it; the
; remaining configurations emit the atomic with no surrounding cache
; maintenance and a single combined wait before the final store.
; The check lines are autogenerated (see the note on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
18226 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
18227 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18228 ; GFX7: ; %bb.0: ; %entry
18229 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
18230 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18231 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
18232 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
18233 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
18234 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18235 ; GFX7-NEXT: s_mov_b32 s6, s4
18236 ; GFX7-NEXT: s_mov_b32 s7, s5
18237 ; GFX7-NEXT: s_mov_b32 s11, s12
18238 ; GFX7-NEXT: s_mov_b32 s10, s13
18239 ; GFX7-NEXT: s_add_u32 s6, s6, s11
18240 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
18241 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18242 ; GFX7-NEXT: s_mov_b32 s7, s10
18243 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
18244 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
18245 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18246 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
18247 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
18248 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
18249 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18250 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
18251 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
18252 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18253 ; GFX7-NEXT: flat_store_dword v[0:1], v2
18254 ; GFX7-NEXT: s_endpgm
18256 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18257 ; GFX10-WGP: ; %bb.0: ; %entry
18258 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
18259 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18260 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
18261 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
18262 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
18263 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
18264 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
18265 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
18266 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
18267 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
18268 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
18269 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
18270 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18271 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
18272 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
18273 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
18274 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18275 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
18276 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
18277 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
18278 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
18279 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
18280 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18281 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
18282 ; GFX10-WGP-NEXT: buffer_gl0_inv
18283 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
18284 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
18285 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
18286 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
18287 ; GFX10-WGP-NEXT: s_endpgm
18289 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18290 ; GFX10-CU: ; %bb.0: ; %entry
18291 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
18292 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18293 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
18294 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
18295 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
18296 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
18297 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
18298 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
18299 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
18300 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
18301 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
18302 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
18303 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18304 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
18305 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
18306 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
18307 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18308 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
18309 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
18310 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
18311 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18312 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
18313 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
18314 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18315 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
18316 ; GFX10-CU-NEXT: s_endpgm
18318 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18319 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
18320 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
18321 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
18322 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
18323 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
18324 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
18325 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
18326 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
18327 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
18328 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
18329 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
18330 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
18331 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
18332 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18333 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
18334 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
18335 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
18336 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18337 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
18338 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
18339 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
18340 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18341 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
18342 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
18343 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18344 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
18345 ; SKIP-CACHE-INV-NEXT: s_endpgm
18347 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18348 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
18349 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18350 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18351 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18352 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18353 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18354 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18355 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18356 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18357 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18358 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18359 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18360 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18361 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
18362 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
18364 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18365 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
18366 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18367 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18368 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18369 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18370 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18371 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18372 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18373 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18374 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18375 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18376 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18377 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18378 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
18379 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18380 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
18381 ; GFX90A-TGSPLIT-NEXT: s_endpgm
18383 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18384 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
18385 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18386 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18387 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18388 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18389 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18390 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18391 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18392 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18393 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18394 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18395 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18396 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18397 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18398 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
18400 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18401 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
18402 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18403 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18404 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18405 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18406 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18407 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18408 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18409 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18410 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18411 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18412 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18413 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18414 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
18415 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18416 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18417 ; GFX940-TGSPLIT-NEXT: s_endpgm
18419 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18420 ; GFX11-WGP: ; %bb.0: ; %entry
18421 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18422 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
18423 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
18424 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
18425 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
18426 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
18427 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18428 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
18429 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
18430 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
18431 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
18432 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
18433 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18434 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
18435 ; GFX11-WGP-NEXT: buffer_gl0_inv
18436 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
18437 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
18438 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
18439 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
18440 ; GFX11-WGP-NEXT: s_endpgm
18442 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18443 ; GFX11-CU: ; %bb.0: ; %entry
18444 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18445 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
18446 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
18447 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
18448 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
18449 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
18450 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18451 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
18452 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
18453 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
18454 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18455 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
18456 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
18457 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18458 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
18459 ; GFX11-CU-NEXT: s_endpgm
18461 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18462 ; GFX12-WGP: ; %bb.0: ; %entry
18463 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18464 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
18465 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
18466 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
18467 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
18468 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
18469 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18470 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
18471 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
18472 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
18473 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
18474 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
18475 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
18476 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
18477 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
18478 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
18479 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
18480 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
18481 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
18482 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
18483 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
18484 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
18485 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
18486 ; GFX12-WGP-NEXT: s_endpgm
18488 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
18489 ; GFX12-CU: ; %bb.0: ; %entry
18490 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18491 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
18492 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
18493 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
18494 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
18495 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
18496 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18497 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
18498 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
18499 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
18500 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18501 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
18502 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
18503 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
18504 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
18505 ; GFX12-CU-NEXT: s_endpgm
18506 ptr %out, i32 %in, i32 %old) {
; The atomic targets the i32 four elements past %out, so the cmpxchg address
; differs from the address of the final store.
18508 %gep = getelementptr i32, ptr %out, i32 4
18509 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel acquire
; Field 0 of the cmpxchg result is the loaded value; storing it keeps the
; returning form of the atomic in use.
18510 %val0 = extractvalue { i32, i1 } %val, 0
18511 store i32 %val0, ptr %out, align 4
18515 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
18516 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18517 ; GFX7: ; %bb.0: ; %entry
18518 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
18519 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18520 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
18521 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
18522 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
18523 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18524 ; GFX7-NEXT: s_mov_b32 s6, s4
18525 ; GFX7-NEXT: s_mov_b32 s7, s5
18526 ; GFX7-NEXT: s_mov_b32 s11, s12
18527 ; GFX7-NEXT: s_mov_b32 s10, s13
18528 ; GFX7-NEXT: s_add_u32 s6, s6, s11
18529 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
18530 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18531 ; GFX7-NEXT: s_mov_b32 s7, s10
18532 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
18533 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
18534 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18535 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
18536 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
18537 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
18538 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18539 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
18540 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
18541 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18542 ; GFX7-NEXT: flat_store_dword v[0:1], v2
18543 ; GFX7-NEXT: s_endpgm
18545 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18546 ; GFX10-WGP: ; %bb.0: ; %entry
18547 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
18548 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18549 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
18550 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
18551 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
18552 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
18553 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
18554 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
18555 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
18556 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
18557 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
18558 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
18559 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18560 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
18561 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
18562 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
18563 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18564 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
18565 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
18566 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
18567 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
18568 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
18569 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18570 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
18571 ; GFX10-WGP-NEXT: buffer_gl0_inv
18572 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
18573 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
18574 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
18575 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
18576 ; GFX10-WGP-NEXT: s_endpgm
18578 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18579 ; GFX10-CU: ; %bb.0: ; %entry
18580 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
18581 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18582 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
18583 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
18584 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
18585 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
18586 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
18587 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
18588 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
18589 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
18590 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
18591 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
18592 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18593 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
18594 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
18595 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
18596 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18597 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
18598 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
18599 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
18600 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18601 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
18602 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
18603 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18604 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
18605 ; GFX10-CU-NEXT: s_endpgm
18607 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18608 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
18609 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
18610 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
18611 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
18612 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
18613 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
18614 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
18615 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
18616 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
18617 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
18618 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
18619 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
18620 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
18621 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18622 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
18623 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
18624 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
18625 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18626 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
18627 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
18628 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
18629 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18630 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
18631 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
18632 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18633 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
18634 ; SKIP-CACHE-INV-NEXT: s_endpgm
18636 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18637 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
18638 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18639 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18640 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18641 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18642 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18643 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18644 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18645 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18646 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18647 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18648 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18649 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18650 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
18651 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
18653 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18654 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
18655 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18656 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18657 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18658 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18659 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18660 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18661 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18662 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18663 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18664 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18665 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18666 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18667 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
18668 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18669 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
18670 ; GFX90A-TGSPLIT-NEXT: s_endpgm
18672 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18673 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
18674 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18675 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18676 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18677 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18678 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18679 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18680 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18681 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18682 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18683 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18684 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18685 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18686 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18687 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
18689 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18690 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
18691 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18692 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18693 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18694 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18695 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18696 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18697 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18698 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18699 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18700 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18701 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18702 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18703 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
18704 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18705 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18706 ; GFX940-TGSPLIT-NEXT: s_endpgm
18708 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18709 ; GFX11-WGP: ; %bb.0: ; %entry
18710 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18711 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
18712 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
18713 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
18714 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
18715 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
18716 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18717 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
18718 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
18719 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
18720 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
18721 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
18722 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18723 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
18724 ; GFX11-WGP-NEXT: buffer_gl0_inv
18725 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
18726 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
18727 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
18728 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
18729 ; GFX11-WGP-NEXT: s_endpgm
18731 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18732 ; GFX11-CU: ; %bb.0: ; %entry
18733 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18734 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
18735 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
18736 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
18737 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
18738 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
18739 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18740 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
18741 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
18742 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
18743 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
18744 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
18745 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
18746 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18747 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
18748 ; GFX11-CU-NEXT: s_endpgm
18750 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18751 ; GFX12-WGP: ; %bb.0: ; %entry
18752 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18753 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
18754 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
18755 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
18756 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
18757 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
18758 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18759 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
18760 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
18761 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
18762 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
18763 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
18764 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
18765 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
18766 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
18767 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
18768 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
18769 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
18770 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
18771 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
18772 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
18773 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
18774 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
18775 ; GFX12-WGP-NEXT: s_endpgm
18777 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
18778 ; GFX12-CU: ; %bb.0: ; %entry
18779 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
18780 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
18781 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
18782 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
18783 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
18784 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
18785 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18786 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
18787 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
18788 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
18789 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
18790 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
18791 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
18792 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
18793 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
18794 ; GFX12-CU-NEXT: s_endpgm
18795 ptr %out, i32 %in, i32 %old) {
18797 %gep = getelementptr i32, ptr %out, i32 4
18798 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst acquire
18799 %val0 = extractvalue { i32, i1 } %val, 0
18800 store i32 %val0, ptr %out, align 4
; Test kernel: a volatile cmpxchg through a generic (flat) pointer using
; syncscope("workgroup-one-as"), with success ordering monotonic and failure
; ordering seq_cst. The exchange targets element 4 of %out and the i32 member
; (index 0) of the cmpxchg result is then stored to %out itself.
;
; Parameters:
;   %out - base pointer; used both as the cmpxchg base and the store target.
;   %in  - the new value operand of the cmpxchg.
;   %old - the comparison value operand of the cmpxchg.
;
; The per-target assertion lines between the signature and the IR body are
; autogenerated (see the note on the first line of this file); regenerate them
; with the update script rather than editing them by hand.
18804 define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
18805 ; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18806 ; GFX7: ; %bb.0: ; %entry
18807 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
18808 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18809 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
18810 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
18811 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
18812 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
18813 ; GFX7-NEXT: s_mov_b32 s6, s4
18814 ; GFX7-NEXT: s_mov_b32 s7, s5
18815 ; GFX7-NEXT: s_mov_b32 s11, s12
18816 ; GFX7-NEXT: s_mov_b32 s10, s13
18817 ; GFX7-NEXT: s_add_u32 s6, s6, s11
18818 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
18819 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18820 ; GFX7-NEXT: s_mov_b32 s7, s10
18821 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
18822 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
18823 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18824 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
18825 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
18826 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
18827 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18828 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
18829 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
18830 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18831 ; GFX7-NEXT: flat_store_dword v[0:1], v2
18832 ; GFX7-NEXT: s_endpgm
18834 ; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18835 ; GFX10-WGP: ; %bb.0: ; %entry
18836 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
18837 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18838 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
18839 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
18840 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
18841 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
18842 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
18843 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
18844 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
18845 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
18846 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
18847 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
18848 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18849 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
18850 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
18851 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
18852 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18853 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
18854 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
18855 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
18856 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
18857 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
18858 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18859 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
18860 ; GFX10-WGP-NEXT: buffer_gl0_inv
18861 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
18862 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
18863 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
18864 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
18865 ; GFX10-WGP-NEXT: s_endpgm
18867 ; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18868 ; GFX10-CU: ; %bb.0: ; %entry
18869 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
18870 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
18871 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
18872 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
18873 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
18874 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
18875 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
18876 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
18877 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
18878 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
18879 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
18880 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
18881 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
18882 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
18883 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
18884 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
18885 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18886 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
18887 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
18888 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
18889 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18890 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
18891 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
18892 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18893 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
18894 ; GFX10-CU-NEXT: s_endpgm
18896 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18897 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
18898 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
18899 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
18900 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
18901 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
18902 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
18903 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
18904 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
18905 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
18906 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
18907 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
18908 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
18909 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
18910 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
18911 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
18912 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
18913 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
18914 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18915 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
18916 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
18917 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
18918 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
18919 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
18920 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
18921 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18922 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
18923 ; SKIP-CACHE-INV-NEXT: s_endpgm
18925 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18926 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
18927 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18928 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18929 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18930 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18931 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18932 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18933 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18934 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18935 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18936 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18937 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18938 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18939 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
18940 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
18942 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18943 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
18944 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
18945 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
18946 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
18947 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18948 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
18949 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
18950 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18951 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18952 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18953 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18954 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
18955 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18956 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
18957 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
18958 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
18959 ; GFX90A-TGSPLIT-NEXT: s_endpgm
18961 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18962 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
18963 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18964 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18965 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18966 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18967 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18968 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18969 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18970 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18971 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18972 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18973 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18974 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18975 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18976 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
18978 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18979 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
18980 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
18981 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
18982 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
18983 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
18984 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
18985 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
18986 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
18987 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
18988 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18989 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18990 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
18991 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
18992 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
18993 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
18994 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
18995 ; GFX940-TGSPLIT-NEXT: s_endpgm
18997 ; GFX11-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
18998 ; GFX11-WGP: ; %bb.0: ; %entry
18999 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19000 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19001 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19002 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19003 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
19004 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
19005 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19006 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
19007 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19008 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19009 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19010 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19011 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19012 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19013 ; GFX11-WGP-NEXT: buffer_gl0_inv
19014 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19015 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19016 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19017 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
19018 ; GFX11-WGP-NEXT: s_endpgm
19020 ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
19021 ; GFX11-CU: ; %bb.0: ; %entry
19022 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19023 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19024 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19025 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
19026 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
19027 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
19028 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19029 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
19030 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19031 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19032 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19033 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19034 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19035 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19036 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
19037 ; GFX11-CU-NEXT: s_endpgm
19039 ; GFX12-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
19040 ; GFX12-WGP: ; %bb.0: ; %entry
19041 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19042 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19043 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19044 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
19045 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
19046 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
19047 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19048 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
19049 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19050 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19051 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19052 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19053 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19054 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
19055 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19056 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19057 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19058 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19059 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
19060 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19061 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19062 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
19063 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
19064 ; GFX12-WGP-NEXT: s_endpgm
19066 ; GFX12-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
19067 ; GFX12-CU: ; %bb.0: ; %entry
19068 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19069 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19070 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19071 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
19072 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
19073 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
19074 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19075 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
19076 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19077 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19078 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19079 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19080 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19081 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
19082 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
19083 ; GFX12-CU-NEXT: s_endpgm
19084 ptr %out, i32 %in, i32 %old) {
; Address of the fifth i32 past %out, i.e. byte offset 16; this is the 16 that
; shows up in the expected output above, either as an immediate instruction
; offset or as a constant added to the base address.
19086 %gep = getelementptr i32, ptr %out, i32 4
; The operation under test: success ordering monotonic, failure ordering
; seq_cst, restricted to the "workgroup-one-as" synchronization scope.
19087 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") monotonic seq_cst
; Keep only the i32 member of the { i32, i1 } result and write it to %out so
; the returning form of the atomic is what gets checked.
19088 %val0 = extractvalue { i32, i1 } %val, 0
19089 store i32 %val0, ptr %out, align 4
19093 define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
19094 ; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19095 ; GFX7: ; %bb.0: ; %entry
19096 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
19097 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19098 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
19099 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
19100 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
19101 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19102 ; GFX7-NEXT: s_mov_b32 s6, s4
19103 ; GFX7-NEXT: s_mov_b32 s7, s5
19104 ; GFX7-NEXT: s_mov_b32 s11, s12
19105 ; GFX7-NEXT: s_mov_b32 s10, s13
19106 ; GFX7-NEXT: s_add_u32 s6, s6, s11
19107 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
19108 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19109 ; GFX7-NEXT: s_mov_b32 s7, s10
19110 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
19111 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
19112 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19113 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
19114 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
19115 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
19116 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19117 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
19118 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
19119 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19120 ; GFX7-NEXT: flat_store_dword v[0:1], v2
19121 ; GFX7-NEXT: s_endpgm
19123 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19124 ; GFX10-WGP: ; %bb.0: ; %entry
19125 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
19126 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19127 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
19128 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
19129 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
19130 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19131 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
19132 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
19133 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
19134 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
19135 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
19136 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
19137 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19138 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
19139 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
19140 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
19141 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19142 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
19143 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
19144 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
19145 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
19146 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19147 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19148 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
19149 ; GFX10-WGP-NEXT: buffer_gl0_inv
19150 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
19151 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
19152 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19153 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
19154 ; GFX10-WGP-NEXT: s_endpgm
19156 ; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19157 ; GFX10-CU: ; %bb.0: ; %entry
19158 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
19159 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19160 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
19161 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
19162 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
19163 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
19164 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
19165 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
19166 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
19167 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
19168 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
19169 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
19170 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19171 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
19172 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
19173 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
19174 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19175 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
19176 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
19177 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
19178 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19179 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
19180 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
19181 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19182 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
19183 ; GFX10-CU-NEXT: s_endpgm
19185 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19186 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
19187 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
19188 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
19189 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
19190 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
19191 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
19192 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
19193 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
19194 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
19195 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
19196 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
19197 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
19198 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
19199 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19200 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
19201 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
19202 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
19203 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19204 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
19205 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
19206 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
19207 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19208 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
19209 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
19210 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19211 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
19212 ; SKIP-CACHE-INV-NEXT: s_endpgm
19214 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19215 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
19216 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
19217 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
19218 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
19219 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19220 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
19221 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
19222 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19223 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19224 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19225 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19226 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19227 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19228 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
19229 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
19231 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19232 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
19233 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
19234 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
19235 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
19236 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19237 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
19238 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
19239 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19240 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19241 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19242 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19243 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19244 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19245 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
19246 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19247 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
19248 ; GFX90A-TGSPLIT-NEXT: s_endpgm
19250 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19251 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
19252 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
19253 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
19254 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
19255 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19256 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
19257 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
19258 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19259 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19260 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19261 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19262 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19263 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19264 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
19265 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
19267 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19268 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
19269 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
19270 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
19271 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
19272 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19273 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
19274 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
19275 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19276 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19277 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19278 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19279 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19280 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19281 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
19282 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19283 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
19284 ; GFX940-TGSPLIT-NEXT: s_endpgm
19286 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19287 ; GFX11-WGP: ; %bb.0: ; %entry
19288 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19289 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19290 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19291 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19292 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
19293 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
19294 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19295 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
19296 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19297 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19298 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19299 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19300 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19301 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19302 ; GFX11-WGP-NEXT: buffer_gl0_inv
19303 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19304 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19305 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19306 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
19307 ; GFX11-WGP-NEXT: s_endpgm
19309 ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19310 ; GFX11-CU: ; %bb.0: ; %entry
19311 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19312 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19313 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19314 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
19315 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
19316 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
19317 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19318 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
19319 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19320 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19321 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19322 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19323 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19324 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19325 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
19326 ; GFX11-CU-NEXT: s_endpgm
19328 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19329 ; GFX12-WGP: ; %bb.0: ; %entry
19330 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19331 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19332 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19333 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
19334 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
19335 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
19336 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19337 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
19338 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19339 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19340 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19341 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19342 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19343 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
19344 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19345 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19346 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
19347 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19348 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19349 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
19350 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
19351 ; GFX12-WGP-NEXT: s_endpgm
19353 ; GFX12-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
19354 ; GFX12-CU: ; %bb.0: ; %entry
19355 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19356 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19357 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19358 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
19359 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
19360 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
19361 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19362 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
19363 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19364 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19365 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19366 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19367 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19368 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
19369 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
19370 ; GFX12-CU-NEXT: s_endpgm
19371 ptr %out, i32 %in, i32 %old) {
19373 %gep = getelementptr i32, ptr %out, i32 4
19374 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acquire seq_cst
19375 %val0 = extractvalue { i32, i1 } %val, 0
19376 store i32 %val0, ptr %out, align 4
; Test kernel: a volatile, value-returning cmpxchg on a plain `ptr` with
; syncscope("workgroup-one-as"), success ordering release and failure ordering
; seq_cst. The value the cmpxchg loaded is then stored to %out.
; The expectations embedded in the signature below are autogenerated (see the
; note on the first line of this file); regenerate them with that script
; instead of editing them by hand.
; In these expectations the -WGP and -TGSPLIT prefixes have extra wait
; instructions directly before the atomic, and a wait plus an invalidate
; instruction (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or
; global_inv) directly after it; the remaining prefixes have a single combined
; wait between the atomic and the final store.
19380 define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
19381 ; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19382 ; GFX7: ; %bb.0: ; %entry
19383 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
19384 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19385 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
19386 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
19387 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
19388 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19389 ; GFX7-NEXT: s_mov_b32 s6, s4
19390 ; GFX7-NEXT: s_mov_b32 s7, s5
19391 ; GFX7-NEXT: s_mov_b32 s11, s12
19392 ; GFX7-NEXT: s_mov_b32 s10, s13
19393 ; GFX7-NEXT: s_add_u32 s6, s6, s11
19394 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
19395 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19396 ; GFX7-NEXT: s_mov_b32 s7, s10
19397 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
19398 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
19399 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19400 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
19401 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
19402 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
19403 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19404 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
19405 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
19406 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19407 ; GFX7-NEXT: flat_store_dword v[0:1], v2
19408 ; GFX7-NEXT: s_endpgm
19410 ; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19411 ; GFX10-WGP: ; %bb.0: ; %entry
19412 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
19413 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19414 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
19415 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
19416 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
19417 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19418 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
19419 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
19420 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
19421 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
19422 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
19423 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
19424 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19425 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
19426 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
19427 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
19428 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19429 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
19430 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
19431 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
19432 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
19433 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19434 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19435 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
19436 ; GFX10-WGP-NEXT: buffer_gl0_inv
19437 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
19438 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
19439 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19440 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
19441 ; GFX10-WGP-NEXT: s_endpgm
19443 ; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19444 ; GFX10-CU: ; %bb.0: ; %entry
19445 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
19446 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19447 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
19448 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
19449 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
19450 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
19451 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
19452 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
19453 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
19454 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
19455 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
19456 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
19457 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19458 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
19459 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
19460 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
19461 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19462 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
19463 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
19464 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
19465 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19466 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
19467 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
19468 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19469 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
19470 ; GFX10-CU-NEXT: s_endpgm
19472 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19473 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
19474 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
19475 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
19476 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
19477 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
19478 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
19479 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
19480 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
19481 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
19482 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
19483 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
19484 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
19485 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
19486 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19487 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
19488 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
19489 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
19490 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19491 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
19492 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
19493 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
19494 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19495 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
19496 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
19497 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19498 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
19499 ; SKIP-CACHE-INV-NEXT: s_endpgm
19501 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19502 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
19503 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
19504 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
19505 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
19506 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19507 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
19508 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
19509 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19510 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19511 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19512 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19513 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19514 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19515 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
19516 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
19518 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19519 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
19520 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
19521 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
19522 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
19523 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19524 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
19525 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
19526 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19527 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19528 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19529 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19530 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19531 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19532 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
19533 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19534 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
19535 ; GFX90A-TGSPLIT-NEXT: s_endpgm
19537 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19538 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
19539 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
19540 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
19541 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
19542 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19543 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
19544 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
19545 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19546 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19547 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19548 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19549 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19550 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19551 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
19552 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
19554 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19555 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
19556 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
19557 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
19558 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
19559 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19560 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
19561 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
19562 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19563 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19564 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19565 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19566 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19567 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19568 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
19569 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19570 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
19571 ; GFX940-TGSPLIT-NEXT: s_endpgm
19573 ; GFX11-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19574 ; GFX11-WGP: ; %bb.0: ; %entry
19575 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19576 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19577 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19578 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19579 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
19580 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
19581 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19582 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
19583 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19584 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19585 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19586 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19587 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19588 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19589 ; GFX11-WGP-NEXT: buffer_gl0_inv
19590 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19591 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19592 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19593 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
19594 ; GFX11-WGP-NEXT: s_endpgm
19596 ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19597 ; GFX11-CU: ; %bb.0: ; %entry
19598 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19599 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19600 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19601 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
19602 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
19603 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
19604 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19605 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
19606 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19607 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19608 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19609 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19610 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19611 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19612 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
19613 ; GFX11-CU-NEXT: s_endpgm
19615 ; GFX12-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19616 ; GFX12-WGP: ; %bb.0: ; %entry
19617 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19618 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19619 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19620 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
19621 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
19622 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
19623 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19624 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
19625 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19626 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19627 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19628 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19629 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19630 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
19631 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19632 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19633 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19634 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19635 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
19636 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19637 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19638 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
19639 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
19640 ; GFX12-WGP-NEXT: s_endpgm
19642 ; GFX12-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
19643 ; GFX12-CU: ; %bb.0: ; %entry
19644 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19645 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19646 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19647 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
19648 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
19649 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
19650 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19651 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
19652 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19653 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19654 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19655 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19656 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19657 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
19658 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
19659 ; GFX12-CU-NEXT: s_endpgm
19660 ptr %out, i32 %in, i32 %old) {
; %gep is the i32 at index 4 from %out, i.e. byte offset 16; this shows up in
; the expected code as "offset:16" or as an explicit add of the constant 16.
19662 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare against %old, swap in %in.
19663 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") release seq_cst
; Field 0 of the cmpxchg result is the value that was loaded from %gep; storing
; it keeps the returning form of the atomic in the generated code.
19664 %val0 = extractvalue { i32, i1 } %val, 0
19665 store i32 %val0, ptr %out, align 4
; Test kernel: a volatile, value-returning cmpxchg on a plain `ptr` with
; syncscope("workgroup-one-as"), success ordering acq_rel and failure ordering
; seq_cst. The value the cmpxchg loaded is then stored to %out.
; The expectations embedded in the signature below are autogenerated (see the
; note on the first line of this file); regenerate them with that script
; instead of editing them by hand.
; In these expectations the -WGP and -TGSPLIT prefixes have extra wait
; instructions directly before the atomic, and a wait plus an invalidate
; instruction (buffer_gl0_inv, buffer_wbinvl1_vol, buffer_inv sc0 or
; global_inv) directly after it; the remaining prefixes have a single combined
; wait between the atomic and the final store.
19669 define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
19670 ; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19671 ; GFX7: ; %bb.0: ; %entry
19672 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
19673 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19674 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
19675 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
19676 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
19677 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19678 ; GFX7-NEXT: s_mov_b32 s6, s4
19679 ; GFX7-NEXT: s_mov_b32 s7, s5
19680 ; GFX7-NEXT: s_mov_b32 s11, s12
19681 ; GFX7-NEXT: s_mov_b32 s10, s13
19682 ; GFX7-NEXT: s_add_u32 s6, s6, s11
19683 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
19684 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19685 ; GFX7-NEXT: s_mov_b32 s7, s10
19686 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
19687 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
19688 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19689 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
19690 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
19691 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
19692 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19693 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
19694 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
19695 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19696 ; GFX7-NEXT: flat_store_dword v[0:1], v2
19697 ; GFX7-NEXT: s_endpgm
19699 ; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19700 ; GFX10-WGP: ; %bb.0: ; %entry
19701 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
19702 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19703 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
19704 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
19705 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
19706 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19707 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
19708 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
19709 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
19710 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
19711 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
19712 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
19713 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19714 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
19715 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
19716 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
19717 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19718 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
19719 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
19720 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
19721 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
19722 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19723 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19724 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
19725 ; GFX10-WGP-NEXT: buffer_gl0_inv
19726 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
19727 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
19728 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19729 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
19730 ; GFX10-WGP-NEXT: s_endpgm
19732 ; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19733 ; GFX10-CU: ; %bb.0: ; %entry
19734 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
19735 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19736 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
19737 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
19738 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
19739 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
19740 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
19741 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
19742 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
19743 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
19744 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
19745 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
19746 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19747 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
19748 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
19749 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
19750 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19751 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
19752 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
19753 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
19754 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19755 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
19756 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
19757 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19758 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
19759 ; GFX10-CU-NEXT: s_endpgm
19761 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19762 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
19763 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
19764 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
19765 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
19766 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
19767 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
19768 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
19769 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
19770 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
19771 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
19772 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
19773 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
19774 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
19775 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
19776 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
19777 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
19778 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
19779 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19780 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
19781 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
19782 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
19783 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19784 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
19785 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
19786 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19787 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
19788 ; SKIP-CACHE-INV-NEXT: s_endpgm
19790 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19791 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
19792 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
19793 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
19794 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
19795 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19796 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
19797 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
19798 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19799 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19800 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19801 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19802 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19803 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19804 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
19805 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
19807 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19808 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
19809 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
19810 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
19811 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
19812 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19813 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
19814 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
19815 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19816 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19817 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19818 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19819 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
19820 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19821 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
19822 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
19823 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
19824 ; GFX90A-TGSPLIT-NEXT: s_endpgm
19826 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19827 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
19828 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
19829 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
19830 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
19831 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19832 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
19833 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
19834 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19835 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19836 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19837 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19838 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19839 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19840 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
19841 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
19843 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19844 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
19845 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
19846 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
19847 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
19848 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
19849 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
19850 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
19851 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19852 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
19853 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19854 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19855 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
19856 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
19857 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
19858 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
19859 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
19860 ; GFX940-TGSPLIT-NEXT: s_endpgm
19862 ; GFX11-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19863 ; GFX11-WGP: ; %bb.0: ; %entry
19864 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19865 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19866 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19867 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19868 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
19869 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
19870 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19871 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
19872 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19873 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19874 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19875 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
19876 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19877 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
19878 ; GFX11-WGP-NEXT: buffer_gl0_inv
19879 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
19880 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
19881 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
19882 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
19883 ; GFX11-WGP-NEXT: s_endpgm
19885 ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19886 ; GFX11-CU: ; %bb.0: ; %entry
19887 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19888 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19889 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19890 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
19891 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
19892 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
19893 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19894 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
19895 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19896 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19897 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
19898 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
19899 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
19900 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19901 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
19902 ; GFX11-CU-NEXT: s_endpgm
19904 ; GFX12-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19905 ; GFX12-WGP: ; %bb.0: ; %entry
19906 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19907 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
19908 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
19909 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
19910 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
19911 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
19912 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19913 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
19914 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19915 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19916 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19917 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19918 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19919 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
19920 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
19921 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
19922 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
19923 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
19924 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
19925 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
19926 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
19927 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
19928 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
19929 ; GFX12-WGP-NEXT: s_endpgm
19931 ; GFX12-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
19932 ; GFX12-CU: ; %bb.0: ; %entry
19933 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
19934 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
19935 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
19936 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
19937 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
19938 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
19939 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19940 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
19941 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19942 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19943 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
19944 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
19945 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
19946 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
19947 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
19948 ; GFX12-CU-NEXT: s_endpgm
19949 ptr %out, i32 %in, i32 %old) {
; %gep is the i32 at index 4 from %out, i.e. byte offset 16; this shows up in
; the expected code as "offset:16" or as an explicit add of the constant 16.
19951 %gep = getelementptr i32, ptr %out, i32 4
; Operation under test: compare against %old, swap in %in.
19952 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") acq_rel seq_cst
; Field 0 of the cmpxchg result is the value that was loaded from %gep; storing
; it keeps the returning form of the atomic in the generated code.
19953 %val0 = extractvalue { i32, i1 } %val, 0
19954 store i32 %val0, ptr %out, align 4
; Returning cmpxchg through a flat pointer with syncscope("workgroup-one-as"),
; using seq_cst for both the success and the failure ordering. The IR at the
; bottom of this function compares-and-swaps the i32 located 4 elements past
; %out (the 16-byte offset that shows up in the expected code) and then stores
; the value returned by the cmpxchg back to %out.
;
; The per-target assertion blocks below are autogenerated by
; utils/update_llc_test_checks.py (see the note at the top of this file), so
; regenerate them with that script rather than editing them by hand.
;
; What the recorded output shows: the WGP-mode runs (gfx10, gfx11, gfx12) and
; the tgsplit runs (gfx90a, gfx940) place wait instructions directly before the
; atomic and a wait plus a cache invalidate directly after it. The gfx7 run,
; the CU-mode runs, the non-tgsplit runs and the skip-cache-invalidations run
; have neither the leading waits nor the invalidate.
19958 define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
19959 ; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
19960 ; GFX7: ; %bb.0: ; %entry
19961 ; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
19962 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19963 ; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
19964 ; GFX7-NEXT: s_load_dword s8, s[6:7], 0x3
19965 ; GFX7-NEXT: s_mov_b64 s[12:13], 16
19966 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
19967 ; GFX7-NEXT: s_mov_b32 s6, s4
19968 ; GFX7-NEXT: s_mov_b32 s7, s5
19969 ; GFX7-NEXT: s_mov_b32 s11, s12
19970 ; GFX7-NEXT: s_mov_b32 s10, s13
19971 ; GFX7-NEXT: s_add_u32 s6, s6, s11
19972 ; GFX7-NEXT: s_addc_u32 s10, s7, s10
19973 ; GFX7-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
19974 ; GFX7-NEXT: s_mov_b32 s7, s10
19975 ; GFX7-NEXT: v_mov_b32_e32 v2, s9
19976 ; GFX7-NEXT: v_mov_b32_e32 v0, s8
19977 ; GFX7-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
19978 ; GFX7-NEXT: v_mov_b32_e32 v3, v0
19979 ; GFX7-NEXT: v_mov_b32_e32 v0, s6
19980 ; GFX7-NEXT: v_mov_b32_e32 v1, s7
19981 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
19982 ; GFX7-NEXT: v_mov_b32_e32 v0, s4
19983 ; GFX7-NEXT: v_mov_b32_e32 v1, s5
19984 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19985 ; GFX7-NEXT: flat_store_dword v[0:1], v2
19986 ; GFX7-NEXT: s_endpgm
19988 ; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
19989 ; GFX10-WGP: ; %bb.0: ; %entry
19990 ; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
19991 ; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
19992 ; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
19993 ; GFX10-WGP-NEXT: s_load_dword s8, s[6:7], 0xc
19994 ; GFX10-WGP-NEXT: s_mov_b64 s[12:13], 16
19995 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
19996 ; GFX10-WGP-NEXT: s_mov_b32 s6, s4
19997 ; GFX10-WGP-NEXT: s_mov_b32 s7, s5
19998 ; GFX10-WGP-NEXT: s_mov_b32 s11, s12
19999 ; GFX10-WGP-NEXT: s_mov_b32 s10, s13
20000 ; GFX10-WGP-NEXT: s_add_u32 s6, s6, s11
20001 ; GFX10-WGP-NEXT: s_addc_u32 s10, s7, s10
20002 ; GFX10-WGP-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20003 ; GFX10-WGP-NEXT: s_mov_b32 s7, s10
20004 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s9
20005 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s8
20006 ; GFX10-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20007 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, v0
20008 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s6
20009 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s7
20010 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
20011 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
20012 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20013 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
20014 ; GFX10-WGP-NEXT: buffer_gl0_inv
20015 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4
20016 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5
20017 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
20018 ; GFX10-WGP-NEXT: flat_store_dword v[0:1], v2
20019 ; GFX10-WGP-NEXT: s_endpgm
20021 ; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20022 ; GFX10-CU: ; %bb.0: ; %entry
20023 ; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
20024 ; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
20025 ; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
20026 ; GFX10-CU-NEXT: s_load_dword s8, s[6:7], 0xc
20027 ; GFX10-CU-NEXT: s_mov_b64 s[12:13], 16
20028 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
20029 ; GFX10-CU-NEXT: s_mov_b32 s6, s4
20030 ; GFX10-CU-NEXT: s_mov_b32 s7, s5
20031 ; GFX10-CU-NEXT: s_mov_b32 s11, s12
20032 ; GFX10-CU-NEXT: s_mov_b32 s10, s13
20033 ; GFX10-CU-NEXT: s_add_u32 s6, s6, s11
20034 ; GFX10-CU-NEXT: s_addc_u32 s10, s7, s10
20035 ; GFX10-CU-NEXT: ; kill: def $sgpr6 killed $sgpr6 def $sgpr6_sgpr7
20036 ; GFX10-CU-NEXT: s_mov_b32 s7, s10
20037 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s9
20038 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s8
20039 ; GFX10-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20040 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, v0
20041 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s6
20042 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s7
20043 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20044 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4
20045 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5
20046 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20047 ; GFX10-CU-NEXT: flat_store_dword v[0:1], v2
20048 ; GFX10-CU-NEXT: s_endpgm
20050 ; SKIP-CACHE-INV-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20051 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
20052 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[2:3], s[4:5]
20053 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
20054 ; SKIP-CACHE-INV-NEXT: s_load_dword s5, s[2:3], 0x2
20055 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[2:3], 0x3
20056 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[8:9], 16
20057 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
20058 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, s0
20059 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s1
20060 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s8
20061 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s9
20062 ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, s7
20063 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s6, s3, s6
20064 ; SKIP-CACHE-INV-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3
20065 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, s6
20066 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s5
20067 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
20068 ; SKIP-CACHE-INV-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20069 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, v0
20070 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
20071 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3
20072 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
20073 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
20074 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
20075 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20076 ; SKIP-CACHE-INV-NEXT: flat_store_dword v[0:1], v2
20077 ; SKIP-CACHE-INV-NEXT: s_endpgm
20079 ; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20080 ; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
20081 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
20082 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
20083 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
20084 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
20085 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s7
20086 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s6
20087 ; GFX90A-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20088 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
20089 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20090 ; GFX90A-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20091 ; GFX90A-NOTTGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20092 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20093 ; GFX90A-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2
20094 ; GFX90A-NOTTGSPLIT-NEXT: s_endpgm
20096 ; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20097 ; GFX90A-TGSPLIT: ; %bb.0: ; %entry
20098 ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
20099 ; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
20100 ; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
20101 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
20102 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v2, s7
20103 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s6
20104 ; GFX90A-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20105 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
20106 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20107 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
20108 ; GFX90A-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 glc
20109 ; GFX90A-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
20110 ; GFX90A-TGSPLIT-NEXT: buffer_wbinvl1_vol
20111 ; GFX90A-TGSPLIT-NEXT: v_pk_mov_b32 v[0:1], s[4:5], s[4:5] op_sel:[0,1]
20112 ; GFX90A-TGSPLIT-NEXT: flat_store_dword v[0:1], v2
20113 ; GFX90A-TGSPLIT-NEXT: s_endpgm
20115 ; GFX940-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20116 ; GFX940-NOTTGSPLIT: ; %bb.0: ; %entry
20117 ; GFX940-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
20118 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
20119 ; GFX940-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
20120 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
20121 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v2, s3
20122 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s2
20123 ; GFX940-NOTTGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20124 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b32_e32 v3, v0
20125 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
20126 ; GFX940-NOTTGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
20127 ; GFX940-NOTTGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
20128 ; GFX940-NOTTGSPLIT-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20129 ; GFX940-NOTTGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
20130 ; GFX940-NOTTGSPLIT-NEXT: s_endpgm
20132 ; GFX940-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20133 ; GFX940-TGSPLIT: ; %bb.0: ; %entry
20134 ; GFX940-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
20135 ; GFX940-TGSPLIT-NEXT: s_load_dword s3, s[4:5], 0x8
20136 ; GFX940-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0xc
20137 ; GFX940-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
20138 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v2, s3
20139 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v0, s2
20140 ; GFX940-TGSPLIT-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20141 ; GFX940-TGSPLIT-NEXT: v_mov_b32_e32 v3, v0
20142 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
20143 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
20144 ; GFX940-TGSPLIT-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:16 sc0
20145 ; GFX940-TGSPLIT-NEXT: s_waitcnt vmcnt(0)
20146 ; GFX940-TGSPLIT-NEXT: buffer_inv sc0
20147 ; GFX940-TGSPLIT-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
20148 ; GFX940-TGSPLIT-NEXT: flat_store_dword v[0:1], v2 sc0 sc1
20149 ; GFX940-TGSPLIT-NEXT: s_endpgm
20151 ; GFX11-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20152 ; GFX11-WGP: ; %bb.0: ; %entry
20153 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
20154 ; GFX11-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
20155 ; GFX11-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
20156 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
20157 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s3
20158 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
20159 ; GFX11-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20160 ; GFX11-WGP-NEXT: v_mov_b32_e32 v3, v0
20161 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
20162 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
20163 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
20164 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
20165 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20166 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0)
20167 ; GFX11-WGP-NEXT: buffer_gl0_inv
20168 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
20169 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
20170 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
20171 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2
20172 ; GFX11-WGP-NEXT: s_endpgm
20174 ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20175 ; GFX11-CU: ; %bb.0: ; %entry
20176 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
20177 ; GFX11-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
20178 ; GFX11-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
20179 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
20180 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s3
20181 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
20182 ; GFX11-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20183 ; GFX11-CU-NEXT: v_mov_b32_e32 v3, v0
20184 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
20185 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
20186 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc
20187 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
20188 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
20189 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
20190 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2
20191 ; GFX11-CU-NEXT: s_endpgm
20193 ; GFX12-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20194 ; GFX12-WGP: ; %bb.0: ; %entry
20195 ; GFX12-WGP-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
20196 ; GFX12-WGP-NEXT: s_load_b32 s3, s[4:5], 0x8
20197 ; GFX12-WGP-NEXT: s_load_b32 s2, s[4:5], 0xc
20198 ; GFX12-WGP-NEXT: s_wait_kmcnt 0x0
20199 ; GFX12-WGP-NEXT: v_mov_b32_e32 v2, s3
20200 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s2
20201 ; GFX12-WGP-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20202 ; GFX12-WGP-NEXT: v_mov_b32_e32 v3, v0
20203 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
20204 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
20205 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
20206 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
20207 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
20208 ; GFX12-WGP-NEXT: s_wait_storecnt 0x0
20209 ; GFX12-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SE
20210 ; GFX12-WGP-NEXT: s_wait_bvhcnt 0x0
20211 ; GFX12-WGP-NEXT: s_wait_samplecnt 0x0
20212 ; GFX12-WGP-NEXT: s_wait_loadcnt 0x0
20213 ; GFX12-WGP-NEXT: global_inv scope:SCOPE_SE
20214 ; GFX12-WGP-NEXT: v_mov_b32_e32 v0, s0
20215 ; GFX12-WGP-NEXT: v_mov_b32_e32 v1, s1
20216 ; GFX12-WGP-NEXT: s_wait_dscnt 0x0
20217 ; GFX12-WGP-NEXT: flat_store_b32 v[0:1], v2
20218 ; GFX12-WGP-NEXT: s_endpgm
20220 ; GFX12-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
20221 ; GFX12-CU: ; %bb.0: ; %entry
20222 ; GFX12-CU-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
20223 ; GFX12-CU-NEXT: s_load_b32 s3, s[4:5], 0x8
20224 ; GFX12-CU-NEXT: s_load_b32 s2, s[4:5], 0xc
20225 ; GFX12-CU-NEXT: s_wait_kmcnt 0x0
20226 ; GFX12-CU-NEXT: v_mov_b32_e32 v2, s3
20227 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s2
20228 ; GFX12-CU-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
20229 ; GFX12-CU-NEXT: v_mov_b32_e32 v3, v0
20230 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
20231 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
20232 ; GFX12-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 th:TH_ATOMIC_RETURN
20233 ; GFX12-CU-NEXT: v_mov_b32_e32 v0, s0
20234 ; GFX12-CU-NEXT: v_mov_b32_e32 v1, s1
20235 ; GFX12-CU-NEXT: s_wait_loadcnt_dscnt 0x0
20236 ; GFX12-CU-NEXT: flat_store_b32 v[0:1], v2
20237 ; GFX12-CU-NEXT: s_endpgm
; Kernel arguments and body. %gep addresses i32 element 4 of %out; field 0 of
; the cmpxchg result (the value that was read from memory) is what gets stored
; to %out.
20238 ptr %out, i32 %in, i32 %old) {
20240 %gep = getelementptr i32, ptr %out, i32 4
20241 %val = cmpxchg volatile ptr %gep, i32 %old, i32 %in syncscope("workgroup-one-as") seq_cst seq_cst
20242 %val0 = extractvalue { i32, i1 } %val, 0
20243 store i32 %val0, ptr %out, align 4