1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
; Test: volatile i32 load from an addrspace(3) pointer kernel argument (%in),
; whose result is stored to an addrspace(1) pointer (%out).
; Every check prefix expects one DS read (ds_read_b32, or ds_load_b32 under the
; GFX11 prefixes) with an s_waitcnt lgkmcnt(0) between the read and the store
; of the loaded value.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
; NOTE(review): in this view the GFX6 and GFX7 sequences stop at the store with
; no s_endpgm line, while every other prefix checks s_endpgm - confirm against
; freshly regenerated output.
10 define amdgpu_kernel void @local_volatile_load_0(
11 ; GFX6-LABEL: local_volatile_load_0:
12 ; GFX6: ; %bb.0: ; %entry
13 ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
14 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
15 ; GFX6-NEXT: s_mov_b32 m0, -1
16 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
17 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
18 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
19 ; GFX6-NEXT: ds_read_b32 v0, v0
20 ; GFX6-NEXT: s_mov_b32 s2, -1
21 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
22 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
25 ; GFX7-LABEL: local_volatile_load_0:
26 ; GFX7: ; %bb.0: ; %entry
27 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
28 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
29 ; GFX7-NEXT: s_mov_b32 m0, -1
30 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
31 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
32 ; GFX7-NEXT: ds_read_b32 v2, v0
33 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
34 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
35 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
36 ; GFX7-NEXT: flat_store_dword v[0:1], v2
39 ; GFX10-WGP-LABEL: local_volatile_load_0:
40 ; GFX10-WGP: ; %bb.0: ; %entry
41 ; GFX10-WGP-NEXT: s_clause 0x1
42 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
43 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
44 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
45 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
46 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
47 ; GFX10-WGP-NEXT: ds_read_b32 v0, v0
48 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
49 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
50 ; GFX10-WGP-NEXT: s_endpgm
52 ; GFX10-CU-LABEL: local_volatile_load_0:
53 ; GFX10-CU: ; %bb.0: ; %entry
54 ; GFX10-CU-NEXT: s_clause 0x1
55 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
56 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
57 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
58 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
59 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
60 ; GFX10-CU-NEXT: ds_read_b32 v0, v0
61 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
62 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
63 ; GFX10-CU-NEXT: s_endpgm
65 ; SKIP-CACHE-INV-LABEL: local_volatile_load_0:
66 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
67 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
68 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
69 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
70 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
71 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
72 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2
73 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
74 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
75 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
76 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
77 ; SKIP-CACHE-INV-NEXT: s_endpgm
79 ; GFX11-WGP-LABEL: local_volatile_load_0:
80 ; GFX11-WGP: ; %bb.0: ; %entry
81 ; GFX11-WGP-NEXT: s_clause 0x1
82 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
83 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
84 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
85 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
86 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2
87 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0
88 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
89 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
90 ; GFX11-WGP-NEXT: s_endpgm
92 ; GFX11-CU-LABEL: local_volatile_load_0:
93 ; GFX11-CU: ; %bb.0: ; %entry
94 ; GFX11-CU-NEXT: s_clause 0x1
95 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
96 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
97 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
98 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
99 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2
100 ; GFX11-CU-NEXT: ds_load_b32 v0, v0
101 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
103 ; GFX11-CU-NEXT: s_endpgm
104 i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
106 %val = load volatile i32, i32 addrspace(3)* %in, align 4
107 store i32 %val, i32 addrspace(1)* %out
; Test: volatile i32 load from an addrspace(3) pointer that is indexed by the
; workitem id (getelementptr inbounds on %in with %tid); the result is stored
; to an addrspace(1) pointer (%out).
; The address is expected to be formed with v_lshlrev_b32 + v_add_i32 under the
; GFX6, GFX7 and SKIP-CACHE-INV prefixes, and with a single v_lshl_add_u32 under
; the GFX10 and GFX11 prefixes. Each prefix then expects one DS read with an
; s_waitcnt lgkmcnt(0) before the store of the loaded value.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
111 define amdgpu_kernel void @local_volatile_load_1(
112 ; GFX6-LABEL: local_volatile_load_1:
113 ; GFX6: ; %bb.0: ; %entry
114 ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
115 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
116 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
117 ; GFX6-NEXT: s_mov_b32 m0, -1
118 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
119 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0
121 ; GFX6-NEXT: ds_read_b32 v0, v0
122 ; GFX6-NEXT: s_mov_b32 s2, -1
123 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
124 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
125 ; GFX6-NEXT: s_endpgm
127 ; GFX7-LABEL: local_volatile_load_1:
128 ; GFX7: ; %bb.0: ; %entry
129 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
130 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
131 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
132 ; GFX7-NEXT: s_mov_b32 m0, -1
133 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
134 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
135 ; GFX7-NEXT: ds_read_b32 v2, v0
136 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
137 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
138 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
139 ; GFX7-NEXT: flat_store_dword v[0:1], v2
140 ; GFX7-NEXT: s_endpgm
142 ; GFX10-WGP-LABEL: local_volatile_load_1:
143 ; GFX10-WGP: ; %bb.0: ; %entry
144 ; GFX10-WGP-NEXT: s_clause 0x1
145 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
146 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
147 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
148 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
149 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
150 ; GFX10-WGP-NEXT: ds_read_b32 v0, v0
151 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
152 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[0:1]
153 ; GFX10-WGP-NEXT: s_endpgm
155 ; GFX10-CU-LABEL: local_volatile_load_1:
156 ; GFX10-CU: ; %bb.0: ; %entry
157 ; GFX10-CU-NEXT: s_clause 0x1
158 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
159 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
160 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
161 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
162 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
163 ; GFX10-CU-NEXT: ds_read_b32 v0, v0
164 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[0:1]
166 ; GFX10-CU-NEXT: s_endpgm
168 ; SKIP-CACHE-INV-LABEL: local_volatile_load_1:
169 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
170 ; SKIP-CACHE-INV-NEXT: s_load_dword s2, s[0:1], 0x0
171 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2
172 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
173 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
174 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
175 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
176 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s2, v0
177 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
178 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
179 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
180 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
181 ; SKIP-CACHE-INV-NEXT: s_endpgm
183 ; GFX11-WGP-LABEL: local_volatile_load_1:
184 ; GFX11-WGP: ; %bb.0: ; %entry
185 ; GFX11-WGP-NEXT: s_clause 0x1
186 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0
187 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
188 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0
189 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
190 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
191 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0
192 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
193 ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1]
194 ; GFX11-WGP-NEXT: s_endpgm
196 ; GFX11-CU-LABEL: local_volatile_load_1:
197 ; GFX11-CU: ; %bb.0: ; %entry
198 ; GFX11-CU-NEXT: s_clause 0x1
199 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0
200 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
201 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0
202 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
203 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
204 ; GFX11-CU-NEXT: ds_load_b32 v0, v0
205 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1]
207 ; GFX11-CU-NEXT: s_endpgm
208 i32 addrspace(3)* %in, i32 addrspace(1)* %out) {
210 %tid = call i32 @llvm.amdgcn.workitem.id.x()
211 %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
212 %val = load volatile i32, i32 addrspace(3)* %val.gep, align 4
213 store i32 %val, i32 addrspace(1)* %out
; Test: a non-volatile i32 load from an addrspace(1) pointer (%in) feeds a
; volatile store to an addrspace(3) pointer kernel argument (%out).
; Every check prefix expects the value to be fetched with a scalar load
; (s_load_dword / s_load_b32), moved into a VGPR, and written with one DS write
; (ds_write_b32, or ds_store_b32 under the GFX11 prefixes) that is immediately
; followed by s_endpgm.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
217 define amdgpu_kernel void @local_volatile_store_0(
218 ; GFX6-LABEL: local_volatile_store_0:
219 ; GFX6: ; %bb.0: ; %entry
220 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
221 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
222 ; GFX6-NEXT: s_mov_b32 m0, -1
223 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
224 ; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
225 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
226 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
227 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
228 ; GFX6-NEXT: ds_write_b32 v0, v1
229 ; GFX6-NEXT: s_endpgm
231 ; GFX7-LABEL: local_volatile_store_0:
232 ; GFX7: ; %bb.0: ; %entry
233 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
234 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
235 ; GFX7-NEXT: s_mov_b32 m0, -1
236 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
237 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
238 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
239 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
240 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
241 ; GFX7-NEXT: ds_write_b32 v0, v1
242 ; GFX7-NEXT: s_endpgm
244 ; GFX10-WGP-LABEL: local_volatile_store_0:
245 ; GFX10-WGP: ; %bb.0: ; %entry
246 ; GFX10-WGP-NEXT: s_clause 0x1
247 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
248 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
249 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
250 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2
251 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
252 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
253 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
254 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
255 ; GFX10-WGP-NEXT: s_endpgm
257 ; GFX10-CU-LABEL: local_volatile_store_0:
258 ; GFX10-CU: ; %bb.0: ; %entry
259 ; GFX10-CU-NEXT: s_clause 0x1
260 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
261 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
262 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2
264 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
265 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
266 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
267 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
268 ; GFX10-CU-NEXT: s_endpgm
270 ; SKIP-CACHE-INV-LABEL: local_volatile_store_0:
271 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
272 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
273 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
274 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
275 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
276 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
277 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
278 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
279 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
280 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
281 ; SKIP-CACHE-INV-NEXT: s_endpgm
283 ; GFX11-WGP-LABEL: local_volatile_store_0:
284 ; GFX11-WGP: ; %bb.0: ; %entry
285 ; GFX11-WGP-NEXT: s_clause 0x1
286 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
287 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
288 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
290 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
291 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
292 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
293 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1
294 ; GFX11-WGP-NEXT: s_endpgm
296 ; GFX11-CU-LABEL: local_volatile_store_0:
297 ; GFX11-CU: ; %bb.0: ; %entry
298 ; GFX11-CU-NEXT: s_clause 0x1
299 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
300 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
301 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
302 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
303 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
304 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
305 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
306 ; GFX11-CU-NEXT: ds_store_b32 v0, v1
307 ; GFX11-CU-NEXT: s_endpgm
308 i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
310 %val = load i32, i32 addrspace(1)* %in, align 4
311 store volatile i32 %val, i32 addrspace(3)* %out
; Test: a non-volatile i32 load from an addrspace(1) pointer (%in) feeds a
; volatile store to an addrspace(3) pointer that is indexed by the workitem id
; (getelementptr inbounds on %out with %tid).
; The destination address is expected to be formed with v_lshlrev_b32 +
; v_add_i32 under the GFX6, GFX7 and SKIP-CACHE-INV prefixes, and with a single
; v_lshl_add_u32 under the GFX10 and GFX11 prefixes. Each prefix then expects
; one DS write (ds_write_b32 / ds_store_b32) immediately followed by s_endpgm.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
315 define amdgpu_kernel void @local_volatile_store_1(
316 ; GFX6-LABEL: local_volatile_store_1:
317 ; GFX6: ; %bb.0: ; %entry
318 ; GFX6-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x9
319 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0xb
320 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
321 ; GFX6-NEXT: s_mov_b32 m0, -1
322 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX6-NEXT: s_load_dword s1, s[2:3], 0x0
324 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
325 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
326 ; GFX6-NEXT: v_mov_b32_e32 v1, s1
327 ; GFX6-NEXT: ds_write_b32 v0, v1
328 ; GFX6-NEXT: s_endpgm
330 ; GFX7-LABEL: local_volatile_store_1:
331 ; GFX7: ; %bb.0: ; %entry
332 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
333 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2
334 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
335 ; GFX7-NEXT: s_mov_b32 m0, -1
336 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
337 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
338 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
339 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
340 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
341 ; GFX7-NEXT: ds_write_b32 v0, v1
342 ; GFX7-NEXT: s_endpgm
344 ; GFX10-WGP-LABEL: local_volatile_store_1:
345 ; GFX10-WGP: ; %bb.0: ; %entry
346 ; GFX10-WGP-NEXT: s_clause 0x1
347 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
348 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x8
349 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
350 ; GFX10-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s2
351 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
352 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
353 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
354 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
355 ; GFX10-WGP-NEXT: s_endpgm
357 ; GFX10-CU-LABEL: local_volatile_store_1:
358 ; GFX10-CU: ; %bb.0: ; %entry
359 ; GFX10-CU-NEXT: s_clause 0x1
360 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
361 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x8
362 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX10-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s2
364 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
365 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
367 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
368 ; GFX10-CU-NEXT: s_endpgm
370 ; SKIP-CACHE-INV-LABEL: local_volatile_store_1:
371 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
372 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
373 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x2
374 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
375 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
376 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
377 ; SKIP-CACHE-INV-NEXT: s_load_dword s1, s[2:3], 0x0
378 ; SKIP-CACHE-INV-NEXT: v_add_i32_e32 v0, vcc, s0, v0
379 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
380 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
381 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
382 ; SKIP-CACHE-INV-NEXT: s_endpgm
384 ; GFX11-WGP-LABEL: local_volatile_store_1:
385 ; GFX11-WGP: ; %bb.0: ; %entry
386 ; GFX11-WGP-NEXT: s_clause 0x1
387 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
388 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8
389 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
390 ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0
391 ; GFX11-WGP-NEXT: v_lshl_add_u32 v0, v0, 2, s0
392 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
393 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
394 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1
395 ; GFX11-WGP-NEXT: s_endpgm
397 ; GFX11-CU-LABEL: local_volatile_store_1:
398 ; GFX11-CU: ; %bb.0: ; %entry
399 ; GFX11-CU-NEXT: s_clause 0x1
400 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
401 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8
402 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
403 ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0
404 ; GFX11-CU-NEXT: v_lshl_add_u32 v0, v0, 2, s0
405 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
406 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
407 ; GFX11-CU-NEXT: ds_store_b32 v0, v1
408 ; GFX11-CU-NEXT: s_endpgm
409 i32 addrspace(1)* %in, i32 addrspace(3)* %out) {
411 %tid = call i32 @llvm.amdgcn.workitem.id.x()
412 %val = load i32, i32 addrspace(1)* %in, align 4
413 %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
414 store volatile i32 %val, i32 addrspace(3)* %out.gep
; Test: atomic volatile i32 load with syncscope("workgroup") acquire ordering
; from an addrspace(3) pointer (%in); the result is written with a plain store
; to a second addrspace(3) pointer (%out).
; Every check prefix expects an s_waitcnt lgkmcnt(0) after the DS read. Only
; the GFX10-WGP and GFX11-WGP prefixes additionally expect a buffer_gl0_inv
; between that wait and the DS write; the remaining prefixes (including the two
; cumode runs) expect no invalidate.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
418 define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
419 ; GFX6-LABEL: local_volatile_workgroup_acquire_load:
420 ; GFX6: ; %bb.0: ; %entry
421 ; GFX6-NEXT: s_load_dword s2, s[0:1], 0x9
422 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0xa
423 ; GFX6-NEXT: s_mov_b32 m0, -1
424 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
425 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
426 ; GFX6-NEXT: ds_read_b32 v0, v0
427 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
428 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
429 ; GFX6-NEXT: ds_write_b32 v1, v0
430 ; GFX6-NEXT: s_endpgm
432 ; GFX7-LABEL: local_volatile_workgroup_acquire_load:
433 ; GFX7: ; %bb.0: ; %entry
434 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
435 ; GFX7-NEXT: s_mov_b32 m0, -1
436 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
437 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
438 ; GFX7-NEXT: ds_read_b32 v0, v0
439 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
440 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
441 ; GFX7-NEXT: ds_write_b32 v1, v0
442 ; GFX7-NEXT: s_endpgm
444 ; GFX10-WGP-LABEL: local_volatile_workgroup_acquire_load:
445 ; GFX10-WGP: ; %bb.0: ; %entry
446 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
447 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
448 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0
449 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1
450 ; GFX10-WGP-NEXT: ds_read_b32 v0, v0
451 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
452 ; GFX10-WGP-NEXT: buffer_gl0_inv
453 ; GFX10-WGP-NEXT: ds_write_b32 v1, v0
454 ; GFX10-WGP-NEXT: s_endpgm
456 ; GFX10-CU-LABEL: local_volatile_workgroup_acquire_load:
457 ; GFX10-CU: ; %bb.0: ; %entry
458 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
459 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
460 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0
461 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1
462 ; GFX10-CU-NEXT: ds_read_b32 v0, v0
463 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX10-CU-NEXT: ds_write_b32 v1, v0
465 ; GFX10-CU-NEXT: s_endpgm
467 ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_acquire_load:
468 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
469 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
470 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
471 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
472 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
473 ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0
474 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
475 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1
476 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v1, v0
477 ; SKIP-CACHE-INV-NEXT: s_endpgm
479 ; GFX11-WGP-LABEL: local_volatile_workgroup_acquire_load:
480 ; GFX11-WGP: ; %bb.0: ; %entry
481 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
482 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
483 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0
484 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1
485 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0
486 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
487 ; GFX11-WGP-NEXT: buffer_gl0_inv
488 ; GFX11-WGP-NEXT: ds_store_b32 v1, v0
489 ; GFX11-WGP-NEXT: s_endpgm
491 ; GFX11-CU-LABEL: local_volatile_workgroup_acquire_load:
492 ; GFX11-CU: ; %bb.0: ; %entry
493 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
494 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
495 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0
496 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1
497 ; GFX11-CU-NEXT: ds_load_b32 v0, v0
498 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
499 ; GFX11-CU-NEXT: ds_store_b32 v1, v0
500 ; GFX11-CU-NEXT: s_endpgm
501 i32 addrspace(3)* %in, i32 addrspace(3)* %out) {
503 %val = load atomic volatile i32, i32 addrspace(3)* %in syncscope("workgroup") acquire, align 4
504 store i32 %val, i32 addrspace(3)* %out
; Test: atomic volatile store of the i32 kernel argument %in to an
; addrspace(3) pointer (%out) with syncscope("workgroup") release ordering.
; The GFX10-WGP and GFX11-WGP prefixes expect "s_waitcnt vmcnt(0) lgkmcnt(0)"
; plus "s_waitcnt_vscnt null, 0x0" directly before the DS write; every other
; prefix (including the two cumode runs) expects only "s_waitcnt lgkmcnt(0)"
; before the write.
; The assertions below are autogenerated (see the note at the top of the file);
; regenerate them with utils/update_llc_test_checks.py instead of hand-editing.
508 define amdgpu_kernel void @local_volatile_workgroup_release_store(
509 ; GFX6-LABEL: local_volatile_workgroup_release_store:
510 ; GFX6: ; %bb.0: ; %entry
511 ; GFX6-NEXT: s_load_dword s2, s[0:1], 0xa
512 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x9
513 ; GFX6-NEXT: s_mov_b32 m0, -1
514 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
515 ; GFX6-NEXT: v_mov_b32_e32 v0, s2
516 ; GFX6-NEXT: v_mov_b32_e32 v1, s0
517 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
518 ; GFX6-NEXT: ds_write_b32 v0, v1
519 ; GFX6-NEXT: s_endpgm
521 ; GFX7-LABEL: local_volatile_workgroup_release_store:
522 ; GFX7: ; %bb.0: ; %entry
523 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
524 ; GFX7-NEXT: s_mov_b32 m0, -1
525 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
526 ; GFX7-NEXT: v_mov_b32_e32 v0, s1
527 ; GFX7-NEXT: v_mov_b32_e32 v1, s0
528 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
529 ; GFX7-NEXT: ds_write_b32 v0, v1
530 ; GFX7-NEXT: s_endpgm
532 ; GFX10-WGP-LABEL: local_volatile_workgroup_release_store:
533 ; GFX10-WGP: ; %bb.0: ; %entry
534 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
535 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
536 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s1
537 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
538 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
539 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
540 ; GFX10-WGP-NEXT: ds_write_b32 v0, v1
541 ; GFX10-WGP-NEXT: s_endpgm
543 ; GFX10-CU-LABEL: local_volatile_workgroup_release_store:
544 ; GFX10-CU: ; %bb.0: ; %entry
545 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
546 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
547 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1
548 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
549 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
550 ; GFX10-CU-NEXT: ds_write_b32 v0, v1
551 ; GFX10-CU-NEXT: s_endpgm
553 ; SKIP-CACHE-INV-LABEL: local_volatile_workgroup_release_store:
554 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
555 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
556 ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1
557 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
558 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1
559 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0
560 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
561 ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1
562 ; SKIP-CACHE-INV-NEXT: s_endpgm
564 ; GFX11-WGP-LABEL: local_volatile_workgroup_release_store:
565 ; GFX11-WGP: ; %bb.0: ; %entry
566 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
567 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0)
568 ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1
569 ; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0
570 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
571 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0
572 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1
573 ; GFX11-WGP-NEXT: s_endpgm
575 ; GFX11-CU-LABEL: local_volatile_workgroup_release_store:
576 ; GFX11-CU: ; %bb.0: ; %entry
577 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
578 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
579 ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1
580 ; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0
581 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0)
582 ; GFX11-CU-NEXT: ds_store_b32 v0, v1
583 ; GFX11-CU-NEXT: s_endpgm
584 i32 %in, i32 addrspace(3)* %out) {
586 store atomic volatile i32 %in, i32 addrspace(3)* %out syncscope("workgroup") release, align 4
; Intrinsic declaration: takes no arguments and returns an i32. The *_load_1
; and *_store_1 kernels in this file call it to obtain %tid, the index used in
; their getelementptr.
590 declare i32 @llvm.amdgcn.workitem.id.x()