1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
; Volatile i32 load from a global (addrspace(1)) pointer; the loaded value is
; then written to %out with an ordinary (non-volatile) store.
; Expected code for every run line: the load carries glc (glc dlc on both
; gfx1010 runs) and is followed directly by s_waitcnt vmcnt(0).
; NOTE(review): the GFX6 and GFX7 sequences below end at the store with no
; s_endpgm check, unlike the other three prefixes -- confirm, or regenerate
; the assertions with utils/update_llc_test_checks.py.
8 define amdgpu_kernel void @global_volatile_load_0(
9 ; GFX6-LABEL: global_volatile_load_0:
10 ; GFX6: ; %bb.0: ; %entry
11 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
12 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
13 ; GFX6-NEXT: s_mov_b32 s2, -1
14 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX6-NEXT: s_mov_b32 s0, s4
16 ; GFX6-NEXT: s_mov_b32 s1, s5
17 ; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
18 ; GFX6-NEXT: s_waitcnt vmcnt(0)
19 ; GFX6-NEXT: s_mov_b32 s4, s6
20 ; GFX6-NEXT: s_mov_b32 s5, s7
21 ; GFX6-NEXT: s_mov_b32 s6, s2
22 ; GFX6-NEXT: s_mov_b32 s7, s3
23 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
26 ; GFX7-LABEL: global_volatile_load_0:
27 ; GFX7: ; %bb.0: ; %entry
28 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
29 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
30 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
31 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
32 ; GFX7-NEXT: flat_load_dword v0, v[0:1] glc
33 ; GFX7-NEXT: s_waitcnt vmcnt(0)
34 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
35 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
36 ; GFX7-NEXT: flat_store_dword v[2:3], v0
39 ; GFX10-WGP-LABEL: global_volatile_load_0:
40 ; GFX10-WGP: ; %bb.0: ; %entry
41 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
42 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
43 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
44 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
45 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
46 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
47 ; GFX10-WGP-NEXT: s_endpgm
49 ; GFX10-CU-LABEL: global_volatile_load_0:
50 ; GFX10-CU: ; %bb.0: ; %entry
51 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
52 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
53 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
54 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1] glc dlc
55 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
56 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
57 ; GFX10-CU-NEXT: s_endpgm
59 ; SKIP-CACHE-INV-LABEL: global_volatile_load_0:
60 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
61 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
62 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
63 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
64 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
65 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
66 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
67 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0 glc
68 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
69 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6
70 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7
71 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2
72 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
73 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
74 ; SKIP-CACHE-INV-NEXT: s_endpgm
75 i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
77 %val = load volatile i32, i32 addrspace(1)* %in, align 4
78 store i32 %val, i32 addrspace(1)* %out
; Same as the plain volatile global load test, except the load address is %in
; indexed by the workitem x id (getelementptr on %tid), so the address is
; computed per lane in vector registers.
; Expected code for every run line: the load carries glc (glc dlc on both
; gfx1010 runs) and is followed directly by s_waitcnt vmcnt(0).
; NOTE(review): the GFX6 sequence below ends at the store with no s_endpgm
; check, unlike the other prefixes -- confirm, or regenerate the assertions.
82 define amdgpu_kernel void @global_volatile_load_1(
83 ; GFX6-LABEL: global_volatile_load_1:
84 ; GFX6: ; %bb.0: ; %entry
85 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
86 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
87 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
88 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
89 ; GFX6-NEXT: s_mov_b32 s2, -1
90 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
91 ; GFX6-NEXT: s_mov_b32 s0, s6
92 ; GFX6-NEXT: s_mov_b32 s1, s7
93 ; GFX6-NEXT: s_mov_b32 s6, 0
94 ; GFX6-NEXT: s_mov_b32 s7, s3
95 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
96 ; GFX6-NEXT: s_waitcnt vmcnt(0)
97 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
100 ; GFX7-LABEL: global_volatile_load_1:
101 ; GFX7: ; %bb.0: ; %entry
102 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
103 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, 2, v0
104 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
105 ; GFX7-NEXT: v_mov_b32_e32 v3, s1
106 ; GFX7-NEXT: v_add_i32_e32 v2, vcc, s0, v2
107 ; GFX7-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
108 ; GFX7-NEXT: flat_load_dword v2, v[2:3] glc
109 ; GFX7-NEXT: s_waitcnt vmcnt(0)
110 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
111 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
112 ; GFX7-NEXT: flat_store_dword v[0:1], v2
113 ; GFX7-NEXT: s_endpgm
115 ; GFX10-WGP-LABEL: global_volatile_load_1:
116 ; GFX10-WGP: ; %bb.0: ; %entry
117 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
118 ; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
119 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 0
120 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
121 ; GFX10-WGP-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
122 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
123 ; GFX10-WGP-NEXT: global_store_dword v1, v0, s[2:3]
124 ; GFX10-WGP-NEXT: s_endpgm
126 ; GFX10-CU-LABEL: global_volatile_load_1:
127 ; GFX10-CU: ; %bb.0: ; %entry
128 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
129 ; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
130 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, 0
131 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
132 ; GFX10-CU-NEXT: global_load_dword v0, v0, s[0:1] glc dlc
133 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
134 ; GFX10-CU-NEXT: global_store_dword v1, v0, s[2:3]
135 ; GFX10-CU-NEXT: s_endpgm
137 ; SKIP-CACHE-INV-LABEL: global_volatile_load_1:
138 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
139 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
140 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
141 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
142 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
143 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
144 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
145 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s6
146 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s7
147 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
148 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
149 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
150 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
151 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
152 ; SKIP-CACHE-INV-NEXT: s_endpgm
153 i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
155 %tid = call i32 @llvm.amdgcn.workitem.id.x()
156 %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
157 %val = load volatile i32, i32 addrspace(1)* %val.gep, align 4
158 store i32 %val, i32 addrspace(1)* %out
; Ordinary i32 load from %in (selected as a scalar s_load_dword in the
; expected code) followed by a volatile store to the global pointer %out.
; Expected code: the store itself has no cache-policy modifier, but it is
; followed by a wait before s_endpgm -- s_waitcnt vmcnt(0) on the gfx600 and
; gfx700 runs, s_waitcnt_vscnt null, 0x0 on both gfx1010 runs.
162 define amdgpu_kernel void @global_volatile_store_0(
163 ; GFX6-LABEL: global_volatile_store_0:
164 ; GFX6: ; %bb.0: ; %entry
165 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
166 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
167 ; GFX6-NEXT: s_mov_b32 s6, -1
168 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
169 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
170 ; GFX6-NEXT: s_mov_b32 s4, s2
171 ; GFX6-NEXT: s_mov_b32 s5, s3
172 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
173 ; GFX6-NEXT: v_mov_b32_e32 v0, s0
174 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
175 ; GFX6-NEXT: s_waitcnt vmcnt(0)
176 ; GFX6-NEXT: s_endpgm
178 ; GFX7-LABEL: global_volatile_store_0:
179 ; GFX7: ; %bb.0: ; %entry
180 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
181 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
182 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
183 ; GFX7-NEXT: v_mov_b32_e32 v0, s2
184 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
185 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
186 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
187 ; GFX7-NEXT: flat_store_dword v[0:1], v2
188 ; GFX7-NEXT: s_waitcnt vmcnt(0)
189 ; GFX7-NEXT: s_endpgm
191 ; GFX10-WGP-LABEL: global_volatile_store_0:
192 ; GFX10-WGP: ; %bb.0: ; %entry
193 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
194 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
195 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
196 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
197 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
198 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
199 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
200 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
201 ; GFX10-WGP-NEXT: s_endpgm
203 ; GFX10-CU-LABEL: global_volatile_store_0:
204 ; GFX10-CU: ; %bb.0: ; %entry
205 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
206 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
207 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
208 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
209 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
210 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
211 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
212 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
213 ; GFX10-CU-NEXT: s_endpgm
215 ; SKIP-CACHE-INV-LABEL: global_volatile_store_0:
216 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
217 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
218 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
219 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, -1
220 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
221 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
222 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s2
223 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s3
224 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
225 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0
226 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
227 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
228 ; SKIP-CACHE-INV-NEXT: s_endpgm
229 i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
231 %val = load i32, i32 addrspace(1)* %in, align 4
232 store volatile i32 %val, i32 addrspace(1)* %out
; Same as the plain volatile global store test, except the store address is
; %out indexed by the workitem x id (getelementptr on %tid), so the address is
; computed per lane.
; Expected code: the store has no cache-policy modifier and is followed by a
; wait before s_endpgm -- s_waitcnt vmcnt(0) on the gfx600 and gfx700 runs,
; s_waitcnt_vscnt null, 0x0 on both gfx1010 runs.
236 define amdgpu_kernel void @global_volatile_store_1(
237 ; GFX6-LABEL: global_volatile_store_1:
238 ; GFX6: ; %bb.0: ; %entry
239 ; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
240 ; GFX6-NEXT: s_mov_b32 s7, 0xf000
241 ; GFX6-NEXT: s_mov_b32 s6, 0
242 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0
243 ; GFX6-NEXT: v_mov_b32_e32 v1, 0
244 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
245 ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0
246 ; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
247 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX6-NEXT: v_mov_b32_e32 v2, s0
249 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
250 ; GFX6-NEXT: s_waitcnt vmcnt(0)
251 ; GFX6-NEXT: s_endpgm
253 ; GFX7-LABEL: global_volatile_store_1:
254 ; GFX7: ; %bb.0: ; %entry
255 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
256 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0
257 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
258 ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0
259 ; GFX7-NEXT: v_mov_b32_e32 v1, s3
260 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0
261 ; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
262 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX7-NEXT: v_mov_b32_e32 v2, s0
264 ; GFX7-NEXT: flat_store_dword v[0:1], v2
265 ; GFX7-NEXT: s_waitcnt vmcnt(0)
266 ; GFX7-NEXT: s_endpgm
268 ; GFX10-WGP-LABEL: global_volatile_store_1:
269 ; GFX10-WGP: ; %bb.0: ; %entry
270 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
271 ; GFX10-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0
272 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
273 ; GFX10-WGP-NEXT: s_load_dword s0, s[0:1], 0x0
274 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
275 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s0
276 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
277 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
278 ; GFX10-WGP-NEXT: s_endpgm
280 ; GFX10-CU-LABEL: global_volatile_store_1:
281 ; GFX10-CU: ; %bb.0: ; %entry
282 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
283 ; GFX10-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0
284 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
285 ; GFX10-CU-NEXT: s_load_dword s0, s[0:1], 0x0
286 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
287 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0
288 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
289 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0
290 ; GFX10-CU-NEXT: s_endpgm
292 ; SKIP-CACHE-INV-LABEL: global_volatile_store_1:
293 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
294 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
295 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, 0xf000
296 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, 0
297 ; SKIP-CACHE-INV-NEXT: v_lshlrev_b32_e32 v0, 2, v0
298 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, 0
299 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
300 ; SKIP-CACHE-INV-NEXT: s_load_dword s0, s[0:1], 0x0
301 ; SKIP-CACHE-INV-NEXT: s_mov_b64 s[4:5], s[2:3]
302 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
303 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0
304 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v2, v[0:1], s[4:7], 0 addr64
305 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
306 ; SKIP-CACHE-INV-NEXT: s_endpgm
307 i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
309 %tid = call i32 @llvm.amdgcn.workitem.id.x()
310 %val = load i32, i32 addrspace(1)* %in, align 4
311 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
312 store volatile i32 %val, i32 addrspace(1)* %out.gep
; Load that is both atomic (syncscope("workgroup"), acquire ordering) and
; volatile, from a global pointer; the value is then stored to %out.
; Expected code differs from the non-atomic volatile load tests in this file:
;   - gfx600 / gfx700 runs: the load has no glc, and s_waitcnt vmcnt(0) is
;     placed immediately before the store that consumes the value.
;   - gfx1010 without +cumode: glc load, s_waitcnt vmcnt(0), buffer_gl0_inv.
;   - gfx1010 with +cumode: load without glc, s_waitcnt vmcnt(0), and no
;     cache invalidate.
; NOTE(review): presumably the workgroup-scope atomic lowering takes
; precedence over the volatile handling here -- confirm against the AMDGPU
; memory model documentation.
316 define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
317 ; GFX6-LABEL: global_volatile_workgroup_acquire_load:
318 ; GFX6: ; %bb.0: ; %entry
319 ; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
320 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
321 ; GFX6-NEXT: s_mov_b32 s2, -1
322 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
323 ; GFX6-NEXT: s_mov_b32 s0, s4
324 ; GFX6-NEXT: s_mov_b32 s1, s5
325 ; GFX6-NEXT: buffer_load_dword v0, off, s[0:3], 0
326 ; GFX6-NEXT: s_mov_b32 s4, s6
327 ; GFX6-NEXT: s_mov_b32 s5, s7
328 ; GFX6-NEXT: s_mov_b32 s6, s2
329 ; GFX6-NEXT: s_mov_b32 s7, s3
330 ; GFX6-NEXT: s_waitcnt vmcnt(0)
331 ; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
332 ; GFX6-NEXT: s_endpgm
334 ; GFX7-LABEL: global_volatile_workgroup_acquire_load:
335 ; GFX7: ; %bb.0: ; %entry
336 ; GFX7-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
337 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
338 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
339 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
340 ; GFX7-NEXT: flat_load_dword v0, v[0:1]
341 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
342 ; GFX7-NEXT: v_mov_b32_e32 v3, s3
343 ; GFX7-NEXT: s_waitcnt vmcnt(0)
344 ; GFX7-NEXT: flat_store_dword v[2:3], v0
345 ; GFX7-NEXT: s_endpgm
347 ; GFX10-WGP-LABEL: global_volatile_workgroup_acquire_load:
348 ; GFX10-WGP: ; %bb.0: ; %entry
349 ; GFX10-WGP-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
350 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
351 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
352 ; GFX10-WGP-NEXT: global_load_dword v1, v0, s[0:1] glc
353 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0)
354 ; GFX10-WGP-NEXT: buffer_gl0_inv
355 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[2:3]
356 ; GFX10-WGP-NEXT: s_endpgm
358 ; GFX10-CU-LABEL: global_volatile_workgroup_acquire_load:
359 ; GFX10-CU: ; %bb.0: ; %entry
360 ; GFX10-CU-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0
361 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
362 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX10-CU-NEXT: global_load_dword v1, v0, s[0:1]
364 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0)
365 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[2:3]
366 ; GFX10-CU-NEXT: s_endpgm
368 ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_acquire_load:
369 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
370 ; SKIP-CACHE-INV-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9
371 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
372 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
373 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
374 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s0, s4
375 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s1, s5
376 ; SKIP-CACHE-INV-NEXT: buffer_load_dword v0, off, s[0:3], 0
377 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s4, s6
378 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s5, s7
379 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s6, s2
380 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s7, s3
381 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0)
382 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[4:7], 0
383 ; SKIP-CACHE-INV-NEXT: s_endpgm
384 i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
386 %val = load atomic volatile i32, i32 addrspace(1)* %in syncscope("workgroup") acquire, align 4
387 store i32 %val, i32 addrspace(1)* %out
; Store that is both atomic (syncscope("workgroup"), release ordering) and
; volatile, writing the by-value i32 kernel argument %in to the global
; pointer %out.
; Expected code: all waits are placed BEFORE the store and none after it
; (contrast with the non-atomic volatile store tests in this file):
;   - gfx600 / gfx700 runs and gfx1010 with +cumode: s_waitcnt lgkmcnt(0).
;   - gfx1010 without +cumode: s_waitcnt vmcnt(0) lgkmcnt(0) followed by
;     s_waitcnt_vscnt null, 0x0.
391 define amdgpu_kernel void @global_volatile_workgroup_release_store(
392 ; GFX6-LABEL: global_volatile_workgroup_release_store:
393 ; GFX6: ; %bb.0: ; %entry
394 ; GFX6-NEXT: s_load_dword s4, s[0:1], 0x9
395 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
396 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
397 ; GFX6-NEXT: s_mov_b32 s2, -1
398 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
399 ; GFX6-NEXT: v_mov_b32_e32 v0, s4
400 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
402 ; GFX6-NEXT: s_endpgm
404 ; GFX7-LABEL: global_volatile_workgroup_release_store:
405 ; GFX7: ; %bb.0: ; %entry
406 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x0
407 ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2
408 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
409 ; GFX7-NEXT: v_mov_b32_e32 v2, s2
410 ; GFX7-NEXT: v_mov_b32_e32 v0, s0
411 ; GFX7-NEXT: v_mov_b32_e32 v1, s1
412 ; GFX7-NEXT: s_waitcnt lgkmcnt(0)
413 ; GFX7-NEXT: flat_store_dword v[0:1], v2
414 ; GFX7-NEXT: s_endpgm
416 ; GFX10-WGP-LABEL: global_volatile_workgroup_release_store:
417 ; GFX10-WGP: ; %bb.0: ; %entry
418 ; GFX10-WGP-NEXT: s_clause 0x1
419 ; GFX10-WGP-NEXT: s_load_dword s2, s[4:5], 0x0
420 ; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
421 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, 0
422 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
423 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s2
424 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
425 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0
426 ; GFX10-WGP-NEXT: global_store_dword v0, v1, s[0:1]
427 ; GFX10-WGP-NEXT: s_endpgm
429 ; GFX10-CU-LABEL: global_volatile_workgroup_release_store:
430 ; GFX10-CU: ; %bb.0: ; %entry
431 ; GFX10-CU-NEXT: s_clause 0x1
432 ; GFX10-CU-NEXT: s_load_dword s2, s[4:5], 0x0
433 ; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
434 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, 0
435 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
436 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2
437 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX10-CU-NEXT: global_store_dword v0, v1, s[0:1]
439 ; GFX10-CU-NEXT: s_endpgm
441 ; SKIP-CACHE-INV-LABEL: global_volatile_workgroup_release_store:
442 ; SKIP-CACHE-INV: ; %bb.0: ; %entry
443 ; SKIP-CACHE-INV-NEXT: s_load_dword s4, s[0:1], 0x9
444 ; SKIP-CACHE-INV-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb
445 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s3, 0xf000
446 ; SKIP-CACHE-INV-NEXT: s_mov_b32 s2, -1
447 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
448 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4
449 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0)
450 ; SKIP-CACHE-INV-NEXT: buffer_store_dword v0, off, s[0:3], 0
451 ; SKIP-CACHE-INV-NEXT: s_endpgm
452 i32 %in, i32 addrspace(1)* %out) {
454 store atomic volatile i32 %in, i32 addrspace(1)* %out syncscope("workgroup") release, align 4
; Intrinsic returning the workitem x id as an i32; the *_1 tests in this file
; call it to index their load/store address per workitem.
458 declare i32 @llvm.amdgcn.workitem.id.x()