1 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
3 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
8 declare i32 @llvm.amdgcn.workitem.id.x()
; Atomic flat load, syncscope("one-as"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
; On gfx10 the kernel descriptor must not contain memory_ordered 0; workgroup_processor_mode 0
; is expected only in the +cumode run.
10 ; GCN-LABEL: {{^}}system_one_as_unordered:
11 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
12 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
13 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
14 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
15 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
16 ; GFX89-NOT: buffer_wbinvl1_vol
17 ; GFX10-NOT: buffer_gl{{[01]}}_inv
18 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
19 ; GFX10: .amdhsa_kernel system_one_as_unordered
20 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
21 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
22 ; GFX10-NOT: .amdhsa_memory_ordered 0
23 define amdgpu_kernel void @system_one_as_unordered(
24 i32* %in, i32* %out) {
26 %val = load atomic i32, i32* %in syncscope("one-as") unordered, align 4
27 store i32 %val, i32* %out
; Atomic flat load, syncscope("one-as"), monotonic ordering.
; Expects the load to carry glc (gfx8/gfx9) or glc dlc (gfx10), with no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
31 ; GCN-LABEL: {{^}}system_one_as_monotonic:
32 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
33 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
34 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
35 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
36 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
37 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
38 ; GFX89-NOT: buffer_wbinvl1_vol
39 ; GFX10-NOT: buffer_gl{{[01]}}_inv
40 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
41 ; GFX10: .amdhsa_kernel system_one_as_monotonic
42 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
43 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
44 ; GFX10-NOT: .amdhsa_memory_ordered 0
45 define amdgpu_kernel void @system_one_as_monotonic(
46 i32* %in, i32* %out) {
48 %val = load atomic i32, i32* %in syncscope("one-as") monotonic, align 4
49 store i32 %val, i32* %out
; Atomic flat load, syncscope("one-as"), acquire ordering.
; Expects no wait before the load; the glc (gfx8/gfx9) or glc dlc (gfx10) load must be
; followed immediately by s_waitcnt vmcnt(0) and then the cache invalidate
; (buffer_wbinvl1_vol on gfx8/gfx9, buffer_gl0_inv + buffer_gl1_inv on gfx10).
53 ; GCN-LABEL: {{^}}system_one_as_acquire:
54 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
55 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
56 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
57 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
58 ; GCN-NEXT: s_waitcnt vmcnt(0){{$}}
59 ; GFX89-NEXT: buffer_wbinvl1_vol
60 ; GFX10-NEXT: buffer_gl0_inv
61 ; GFX10-NEXT: buffer_gl1_inv
62 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
63 ; GFX10: .amdhsa_kernel system_one_as_acquire
64 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
65 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
66 ; GFX10-NOT: .amdhsa_memory_ordered 0
67 define amdgpu_kernel void @system_one_as_acquire(
68 i32* %in, i32* %out) {
70 %val = load atomic i32, i32* %in syncscope("one-as") acquire, align 4
71 store i32 %val, i32* %out
; Atomic flat load, syncscope("one-as"), seq_cst ordering.
; Expects s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
; glc / glc dlc load, then s_waitcnt vmcnt(0) and the cache invalidate directly after it.
75 ; GCN-LABEL: {{^}}system_one_as_seq_cst:
76 ; GCN: s_waitcnt vmcnt(0){{$}}
77 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
78 ; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
79 ; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
80 ; GCN-NEXT: s_waitcnt vmcnt(0){{$}}
81 ; GFX89-NEXT: buffer_wbinvl1_vol
82 ; GFX10-NEXT: buffer_gl0_inv
83 ; GFX10-NEXT: buffer_gl1_inv
84 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
85 ; GFX10: .amdhsa_kernel system_one_as_seq_cst
86 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
87 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
88 ; GFX10-NOT: .amdhsa_memory_ordered 0
89 define amdgpu_kernel void @system_one_as_seq_cst(
90 i32* %in, i32* %out) {
92 %val = load atomic i32, i32* %in syncscope("one-as") seq_cst, align 4
93 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread-one-as"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
97 ; GCN-LABEL: {{^}}singlethread_one_as_unordered:
98 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
99 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
100 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
101 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
102 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
103 ; GFX89-NOT: buffer_wbinvl1_vol
104 ; GFX10-NOT: buffer_gl{{[01]}}_inv
105 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
106 ; GFX10: .amdhsa_kernel singlethread_one_as_unordered
107 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
108 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
109 ; GFX10-NOT: .amdhsa_memory_ordered 0
110 define amdgpu_kernel void @singlethread_one_as_unordered(
111 i32* %in, i32* %out) {
113 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") unordered, align 4
114 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread-one-as"), monotonic ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
118 ; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
119 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
120 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
121 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
122 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
123 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
124 ; GFX89-NOT: buffer_wbinvl1_vol
125 ; GFX10-NOT: buffer_gl{{[01]}}_inv
126 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
127 ; GFX10: .amdhsa_kernel singlethread_one_as_monotonic
128 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
129 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
130 ; GFX10-NOT: .amdhsa_memory_ordered 0
131 define amdgpu_kernel void @singlethread_one_as_monotonic(
132 i32* %in, i32* %out) {
134 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") monotonic, align 4
135 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread-one-as"), acquire ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
139 ; GCN-LABEL: {{^}}singlethread_one_as_acquire:
140 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
141 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
142 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
143 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
144 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
145 ; GFX89-NOT: buffer_wbinvl1_vol
146 ; GFX10-NOT: buffer_gl{{[01]}}_inv
147 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
148 ; GFX10: .amdhsa_kernel singlethread_one_as_acquire
149 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
150 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
151 ; GFX10-NOT: .amdhsa_memory_ordered 0
152 define amdgpu_kernel void @singlethread_one_as_acquire(
153 i32* %in, i32* %out) {
155 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") acquire, align 4
156 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread-one-as"), seq_cst ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
160 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
161 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
162 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
163 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
164 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
165 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
166 ; GFX89-NOT: buffer_wbinvl1_vol
167 ; GFX10-NOT: buffer_gl{{[01]}}_inv
168 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
169 ; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst
170 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
171 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
172 ; GFX10-NOT: .amdhsa_memory_ordered 0
173 define amdgpu_kernel void @singlethread_one_as_seq_cst(
174 i32* %in, i32* %out) {
176 %val = load atomic i32, i32* %in syncscope("singlethread-one-as") seq_cst, align 4
177 store i32 %val, i32* %out
; Atomic flat load, syncscope("agent-one-as"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
181 ; GCN-LABEL: {{^}}agent_one_as_unordered:
182 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
183 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
184 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
185 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
186 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
187 ; GFX89-NOT: buffer_wbinvl1_vol
188 ; GFX10-NOT: buffer_gl{{[01]}}_inv
189 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
190 ; GFX10: .amdhsa_kernel agent_one_as_unordered
191 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
192 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
193 ; GFX10-NOT: .amdhsa_memory_ordered 0
194 define amdgpu_kernel void @agent_one_as_unordered(
195 i32* %in, i32* %out) {
197 %val = load atomic i32, i32* %in syncscope("agent-one-as") unordered, align 4
198 store i32 %val, i32* %out
; Atomic flat load, syncscope("agent-one-as"), monotonic ordering.
; Expects the load to carry glc (gfx8/gfx9) or glc dlc (gfx10), with no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
202 ; GCN-LABEL: {{^}}agent_one_as_monotonic:
203 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
204 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
205 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
206 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
207 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
208 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
209 ; GFX89-NOT: buffer_wbinvl1_vol
210 ; GFX10-NOT: buffer_gl{{[01]}}_inv
211 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
212 ; GFX10: .amdhsa_kernel agent_one_as_monotonic
213 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
214 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
215 ; GFX10-NOT: .amdhsa_memory_ordered 0
216 define amdgpu_kernel void @agent_one_as_monotonic(
217 i32* %in, i32* %out) {
219 %val = load atomic i32, i32* %in syncscope("agent-one-as") monotonic, align 4
220 store i32 %val, i32* %out
; Atomic flat load, syncscope("agent-one-as"), acquire ordering.
; Expects no wait before the load; the glc (gfx8/gfx9) or glc dlc (gfx10) load must be
; followed immediately by s_waitcnt vmcnt(0) and then the cache invalidate
; (buffer_wbinvl1_vol on gfx8/gfx9, buffer_gl0_inv + buffer_gl1_inv on gfx10).
224 ; GCN-LABEL: {{^}}agent_one_as_acquire:
225 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
226 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
227 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
228 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
229 ; GCN-NEXT: s_waitcnt vmcnt(0){{$}}
230 ; GFX89-NEXT: buffer_wbinvl1_vol
231 ; GFX10-NEXT: buffer_gl0_inv
232 ; GFX10-NEXT: buffer_gl1_inv
233 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
234 ; GFX10: .amdhsa_kernel agent_one_as_acquire
235 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
236 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
237 ; GFX10-NOT: .amdhsa_memory_ordered 0
238 define amdgpu_kernel void @agent_one_as_acquire(
239 i32* %in, i32* %out) {
241 %val = load atomic i32, i32* %in syncscope("agent-one-as") acquire, align 4
242 store i32 %val, i32* %out
; Atomic flat load, syncscope("agent-one-as"), seq_cst ordering.
; Expects s_waitcnt vmcnt(0) (plus s_waitcnt_vscnt null, 0x0 on gfx10) directly before the
; glc / glc dlc load, then s_waitcnt vmcnt(0) and the cache invalidate directly after it.
246 ; GCN-LABEL: {{^}}agent_one_as_seq_cst:
247 ; GCN: s_waitcnt vmcnt(0){{$}}
248 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
249 ; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
250 ; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
251 ; GCN-NEXT: s_waitcnt vmcnt(0){{$}}
252 ; GFX89-NEXT: buffer_wbinvl1_vol
253 ; GFX10-NEXT: buffer_gl0_inv
254 ; GFX10-NEXT: buffer_gl1_inv
255 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
256 ; GFX10: .amdhsa_kernel agent_one_as_seq_cst
257 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
258 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
259 ; GFX10-NOT: .amdhsa_memory_ordered 0
260 define amdgpu_kernel void @agent_one_as_seq_cst(
261 i32* %in, i32* %out) {
263 %val = load atomic i32, i32* %in syncscope("agent-one-as") seq_cst, align 4
264 store i32 %val, i32* %out
; Atomic flat load, syncscope("workgroup-one-as"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
268 ; GCN-LABEL: {{^}}workgroup_one_as_unordered:
269 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
270 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
271 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
272 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
273 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
274 ; GFX89-NOT: buffer_wbinvl1_vol
275 ; GFX10-NOT: buffer_gl{{[01]}}_inv
276 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
277 ; GFX10: .amdhsa_kernel workgroup_one_as_unordered
278 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
279 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
280 ; GFX10-NOT: .amdhsa_memory_ordered 0
281 define amdgpu_kernel void @workgroup_one_as_unordered(
282 i32* %in, i32* %out) {
284 %val = load atomic i32, i32* %in syncscope("workgroup-one-as") unordered, align 4
285 store i32 %val, i32* %out
; Atomic flat load, syncscope("workgroup-one-as"), monotonic ordering.
; gfx8/gfx9 expect a plain load; gfx10 in WGP mode expects glc; gfx10 with +cumode expects
; a plain load. The +cumode load is matched positively (not with a NOT pattern) so that RET
; is bound in this function before the store check uses it.
; No vmcnt(0)/vscnt waits and no cache invalidates are expected before the store.
289 ; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
290 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
291 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
292 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
293 ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
294 ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
295 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
296 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
297 ; GFX89-NOT: buffer_wbinvl1_vol
298 ; GFX10-NOT: buffer_gl{{[01]}}_inv
299 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
300 ; GFX10: .amdhsa_kernel workgroup_one_as_monotonic
301 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
302 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
303 ; GFX10-NOT: .amdhsa_memory_ordered 0
304 define amdgpu_kernel void @workgroup_one_as_monotonic(
305 i32* %in, i32* %out) {
307 %val = load atomic i32, i32* %in syncscope("workgroup-one-as") monotonic, align 4
308 store i32 %val, i32* %out
; Atomic flat load, syncscope("workgroup-one-as"), acquire ordering.
; gfx8/gfx9 expect a plain load with no wait or invalidate after it. gfx10 in WGP mode
; expects a glc load, then s_waitcnt vmcnt(0) immediately followed by buffer_gl0_inv.
; gfx10 with +cumode expects a plain load with no vmcnt(0) wait and no buffer_gl0_inv; the
; load is matched positively (not with a NOT pattern) so that RET is bound in this function
; before the store check uses it.
312 ; GCN-LABEL: {{^}}workgroup_one_as_acquire:
313 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
314 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
315 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
316 ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
317 ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
318 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
319 ; GFX10WGP: s_waitcnt vmcnt(0){{$}}
320 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
321 ; GFX89-NOT: buffer_wbinvl1_vol
322 ; GFX10WGP-NEXT: buffer_gl0_inv
323 ; GFX10CU-NOT: buffer_gl0_inv
324 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
325 ; GFX10: .amdhsa_kernel workgroup_one_as_acquire
326 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
327 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
328 ; GFX10-NOT: .amdhsa_memory_ordered 0
329 define amdgpu_kernel void @workgroup_one_as_acquire(
330 i32* %in, i32* %out) {
332 %val = load atomic i32, i32* %in syncscope("workgroup-one-as") acquire, align 4
333 store i32 %val, i32* %out
; Atomic flat load, syncscope("workgroup-one-as"), seq_cst ordering.
; gfx8/gfx9 expect a plain load with no vmcnt(0) wait or invalidate. gfx10 in WGP mode
; expects s_waitcnt vmcnt(0) + s_waitcnt_vscnt before a glc load, then s_waitcnt vmcnt(0)
; and buffer_gl0_inv right after it. gfx10 with +cumode expects a plain load with none of
; those waits and no buffer_gl0_inv.
337 ; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
338 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
339 ; GFX10WGP: s_waitcnt vmcnt(0){{$}}
340 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0
341 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
342 ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0
343 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
344 ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
345 ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
346 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
347 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0){{$}}
348 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
349 ; GFX89-NOT: buffer_wbinvl1_vol
350 ; GFX10WGP-NEXT: buffer_gl0_inv
351 ; GFX10CU-NOT: buffer_gl0_inv
352 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
353 ; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst
354 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
355 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
356 ; GFX10-NOT: .amdhsa_memory_ordered 0
357 define amdgpu_kernel void @workgroup_one_as_seq_cst(
358 i32* %in, i32* %out) {
360 %val = load atomic i32, i32* %in syncscope("workgroup-one-as") seq_cst, align 4
361 store i32 %val, i32* %out
; Atomic flat load, syncscope("wavefront-one-as"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
365 ; GCN-LABEL: {{^}}wavefront_one_as_unordered:
366 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
367 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
368 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
369 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
370 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
371 ; GFX89-NOT: buffer_wbinvl1_vol
372 ; GFX10-NOT: buffer_gl{{[01]}}_inv
373 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
374 ; GFX10: .amdhsa_kernel wavefront_one_as_unordered
375 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
376 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
377 ; GFX10-NOT: .amdhsa_memory_ordered 0
378 define amdgpu_kernel void @wavefront_one_as_unordered(
379 i32* %in, i32* %out) {
381 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") unordered, align 4
382 store i32 %val, i32* %out
; Atomic flat load, syncscope("wavefront-one-as"), monotonic ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
386 ; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
387 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
388 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
389 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
390 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
391 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
392 ; GFX89-NOT: buffer_wbinvl1_vol
393 ; GFX10-NOT: buffer_gl{{[01]}}_inv
394 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
395 ; GFX10: .amdhsa_kernel wavefront_one_as_monotonic
396 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
397 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
398 ; GFX10-NOT: .amdhsa_memory_ordered 0
399 define amdgpu_kernel void @wavefront_one_as_monotonic(
400 i32* %in, i32* %out) {
402 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") monotonic, align 4
403 store i32 %val, i32* %out
; Atomic flat load, syncscope("wavefront-one-as"), acquire ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
407 ; GCN-LABEL: {{^}}wavefront_one_as_acquire:
408 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
409 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
410 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
411 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
412 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
413 ; GFX89-NOT: buffer_wbinvl1_vol
414 ; GFX10-NOT: buffer_gl{{[01]}}_inv
415 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
416 ; GFX10: .amdhsa_kernel wavefront_one_as_acquire
417 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
418 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
419 ; GFX10-NOT: .amdhsa_memory_ordered 0
420 define amdgpu_kernel void @wavefront_one_as_acquire(
421 i32* %in, i32* %out) {
423 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") acquire, align 4
424 store i32 %val, i32* %out
; Atomic flat load, syncscope("wavefront-one-as"), seq_cst ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
428 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
429 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
430 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
431 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
432 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
433 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
434 ; GFX89-NOT: buffer_wbinvl1_vol
435 ; GFX10-NOT: buffer_gl{{[01]}}_inv
436 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
437 ; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst
438 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
439 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
440 ; GFX10-NOT: .amdhsa_memory_ordered 0
441 define amdgpu_kernel void @wavefront_one_as_seq_cst(
442 i32* %in, i32* %out) {
444 %val = load atomic i32, i32* %in syncscope("wavefront-one-as") seq_cst, align 4
445 store i32 %val, i32* %out
; Non-atomic load from addrspace(5) tagged !nontemporal, using the pointer argument directly.
; Expects buffer_load_dword ... offen with glc slc on gfx8/gfx9 and with slc only on gfx10.
449 ; GCN-LABEL: {{^}}nontemporal_private_0:
450 ; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
451 ; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
452 ; GFX10: .amdhsa_kernel nontemporal_private_0
453 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
454 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
455 ; GFX10-NOT: .amdhsa_memory_ordered 0
456 define amdgpu_kernel void @nontemporal_private_0(
457 i32 addrspace(5)* %in, i32* %out) {
459 %val = load i32, i32 addrspace(5)* %in, align 4, !nontemporal !0
460 store i32 %val, i32* %out
; Non-atomic load from addrspace(5) tagged !nontemporal, with the address indexed by
; workitem id x through a getelementptr.
; Expects buffer_load_dword ... offen with glc slc on gfx8/gfx9 and with slc only on gfx10.
464 ; GCN-LABEL: {{^}}nontemporal_private_1:
465 ; GFX89: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
466 ; GFX10: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
467 ; GFX10: .amdhsa_kernel nontemporal_private_1
468 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
469 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
470 ; GFX10-NOT: .amdhsa_memory_ordered 0
471 define amdgpu_kernel void @nontemporal_private_1(
472 i32 addrspace(5)* %in, i32* %out) {
474 %tid = call i32 @llvm.amdgcn.workitem.id.x()
475 %val.gep = getelementptr inbounds i32, i32 addrspace(5)* %in, i32 %tid
476 %val = load i32, i32 addrspace(5)* %val.gep, align 4, !nontemporal !0
477 store i32 %val, i32* %out
; Non-atomic load from addrspace(1) tagged !nontemporal, using the pointer argument directly.
; The only instruction checked is an s_load_dword with offset 0x0 and nothing after it.
481 ; GCN-LABEL: {{^}}nontemporal_global_0:
482 ; GCN: s_load_dword s{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], 0x0{{$}}
483 ; GFX10: .amdhsa_kernel nontemporal_global_0
484 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
485 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
486 ; GFX10-NOT: .amdhsa_memory_ordered 0
487 define amdgpu_kernel void @nontemporal_global_0(
488 i32 addrspace(1)* %in, i32* %out) {
490 %val = load i32, i32 addrspace(1)* %in, align 4, !nontemporal !0
491 store i32 %val, i32* %out
; Non-atomic load from addrspace(1) tagged !nontemporal, with the address indexed by
; workitem id x through a getelementptr.
; Expects flat_load_dword glc slc on gfx8, global_load_dword with an s[N:M] operand and
; glc slc on gfx9, and the same global_load_dword form with slc only on gfx10.
495 ; GCN-LABEL: {{^}}nontemporal_global_1:
496 ; GFX8: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
497 ; GFX9: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
498 ; GFX10: global_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
499 ; GFX10: .amdhsa_kernel nontemporal_global_1
500 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
501 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
502 ; GFX10-NOT: .amdhsa_memory_ordered 0
503 define amdgpu_kernel void @nontemporal_global_1(
504 i32 addrspace(1)* %in, i32* %out) {
506 %tid = call i32 @llvm.amdgcn.workitem.id.x()
507 %val.gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 %tid
508 %val = load i32, i32 addrspace(1)* %val.gep, align 4, !nontemporal !0
509 store i32 %val, i32* %out
; Non-atomic load from addrspace(3) tagged !nontemporal, using the pointer argument directly.
; Expects a ds_read_b32 with nothing after its two register operands.
513 ; GCN-LABEL: {{^}}nontemporal_local_0:
514 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
515 ; GFX10: .amdhsa_kernel nontemporal_local_0
516 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
517 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
518 ; GFX10-NOT: .amdhsa_memory_ordered 0
519 define amdgpu_kernel void @nontemporal_local_0(
520 i32 addrspace(3)* %in, i32* %out) {
522 %val = load i32, i32 addrspace(3)* %in, align 4, !nontemporal !0
523 store i32 %val, i32* %out
; Non-atomic load from addrspace(3) tagged !nontemporal, with the address indexed by
; workitem id x through a getelementptr.
; Expects a ds_read_b32 with nothing after its two register operands.
527 ; GCN-LABEL: {{^}}nontemporal_local_1:
528 ; GCN: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
529 ; GFX10: .amdhsa_kernel nontemporal_local_1
530 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
531 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
532 ; GFX10-NOT: .amdhsa_memory_ordered 0
533 define amdgpu_kernel void @nontemporal_local_1(
534 i32 addrspace(3)* %in, i32* %out) {
536 %tid = call i32 @llvm.amdgcn.workitem.id.x()
537 %val.gep = getelementptr inbounds i32, i32 addrspace(3)* %in, i32 %tid
538 %val = load i32, i32 addrspace(3)* %val.gep, align 4, !nontemporal !0
539 store i32 %val, i32* %out
; Non-atomic flat load tagged !nontemporal, using the pointer argument directly.
; Expects flat_load_dword with glc slc on gfx8/gfx9 and with slc only on gfx10.
543 ; GCN-LABEL: {{^}}nontemporal_flat_0:
544 ; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
545 ; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
546 ; GFX10: .amdhsa_kernel nontemporal_flat_0
547 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
548 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
549 ; GFX10-NOT: .amdhsa_memory_ordered 0
550 define amdgpu_kernel void @nontemporal_flat_0(
551 i32* %in, i32* %out) {
553 %val = load i32, i32* %in, align 4, !nontemporal !0
554 store i32 %val, i32* %out
; Non-atomic flat load tagged !nontemporal, with the address indexed by workitem id x
; through a getelementptr.
; Expects flat_load_dword with glc slc on gfx8/gfx9 and with slc only on gfx10.
558 ; GCN-LABEL: {{^}}nontemporal_flat_1:
559 ; GFX89: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
560 ; GFX10: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
561 ; GFX10: .amdhsa_kernel nontemporal_flat_1
562 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
563 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
564 ; GFX10-NOT: .amdhsa_memory_ordered 0
565 define amdgpu_kernel void @nontemporal_flat_1(
566 i32* %in, i32* %out) {
568 %tid = call i32 @llvm.amdgcn.workitem.id.x()
569 %val.gep = getelementptr inbounds i32, i32* %in, i32 %tid
570 %val = load i32, i32* %val.gep, align 4, !nontemporal !0
571 store i32 %val, i32* %out
; Atomic flat load with no syncscope given, unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
575 ; GCN-LABEL: {{^}}system_unordered:
576 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
577 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
578 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
579 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
580 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
581 ; GFX89-NOT: buffer_wbinvl1_vol
582 ; GFX10-NOT: buffer_gl{{[01]}}_inv
583 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
584 ; GFX10: .amdhsa_kernel system_unordered
585 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
586 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
587 ; GFX10-NOT: .amdhsa_memory_ordered 0
588 define amdgpu_kernel void @system_unordered(
589 i32* %in, i32* %out) {
591 %val = load atomic i32, i32* %in unordered, align 4
592 store i32 %val, i32* %out
; Atomic flat load with no syncscope given, monotonic ordering.
; Expects the load to carry glc (gfx8/gfx9) or glc dlc (gfx10), with no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
596 ; GCN-LABEL: {{^}}system_monotonic:
597 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
598 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
599 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
600 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
601 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
602 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
603 ; GFX89-NOT: buffer_wbinvl1_vol
604 ; GFX10-NOT: buffer_gl{{[01]}}_inv
605 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
606 ; GFX10: .amdhsa_kernel system_monotonic
607 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
608 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
609 ; GFX10-NOT: .amdhsa_memory_ordered 0
610 define amdgpu_kernel void @system_monotonic(
611 i32* %in, i32* %out) {
613 %val = load atomic i32, i32* %in monotonic, align 4
614 store i32 %val, i32* %out
; Atomic flat load with no syncscope given, acquire ordering.
; Expects no wait before the load; the glc (gfx8/gfx9) or glc dlc (gfx10) load must be
; followed immediately by s_waitcnt vmcnt(0) lgkmcnt(0) and then the cache invalidate
; (buffer_wbinvl1_vol on gfx8/gfx9, buffer_gl0_inv + buffer_gl1_inv on gfx10).
618 ; GCN-LABEL: {{^}}system_acquire:
619 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
620 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
621 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
622 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
623 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
624 ; GFX89-NEXT: buffer_wbinvl1_vol
625 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
626 ; GFX10-NEXT: buffer_gl0_inv
627 ; GFX10-NEXT: buffer_gl1_inv
628 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
629 ; GFX10: .amdhsa_kernel system_acquire
630 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
631 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
632 ; GFX10-NOT: .amdhsa_memory_ordered 0
633 define amdgpu_kernel void @system_acquire(
634 i32* %in, i32* %out) {
636 %val = load atomic i32, i32* %in acquire, align 4
637 store i32 %val, i32* %out
; Atomic flat load with no syncscope given, seq_cst ordering.
; gfx8/gfx9 expect s_waitcnt vmcnt(0) lgkmcnt(0) directly before the glc load; gfx10 expects
; s_waitcnt lgkmcnt(0) and then s_waitcnt_vscnt null, 0x0 directly before the glc dlc load.
; After the load every target expects s_waitcnt vmcnt(0) lgkmcnt(0) and the cache invalidate.
641 ; GCN-LABEL: {{^}}system_seq_cst:
642 ; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
643 ; GFX10: s_waitcnt lgkmcnt(0){{$}}
644 ; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
645 ; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
646 ; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
647 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
648 ; GFX89-NEXT: buffer_wbinvl1_vol
649 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
650 ; GFX10-NEXT: buffer_gl0_inv
651 ; GFX10-NEXT: buffer_gl1_inv
652 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
653 ; GFX10: .amdhsa_kernel system_seq_cst
654 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
655 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
656 ; GFX10-NOT: .amdhsa_memory_ordered 0
657 define amdgpu_kernel void @system_seq_cst(
658 i32* %in, i32* %out) {
660 %val = load atomic i32, i32* %in seq_cst, align 4
661 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
665 ; GCN-LABEL: {{^}}singlethread_unordered:
666 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
667 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
668 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
669 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
670 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
671 ; GFX89-NOT: buffer_wbinvl1_vol
672 ; GFX10-NOT: buffer_gl{{[01]}}_inv
673 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
674 ; GFX10: .amdhsa_kernel singlethread_unordered
675 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
676 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
677 ; GFX10-NOT: .amdhsa_memory_ordered 0
678 define amdgpu_kernel void @singlethread_unordered(
679 i32* %in, i32* %out) {
681 %val = load atomic i32, i32* %in syncscope("singlethread") unordered, align 4
682 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread"), monotonic ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
686 ; GCN-LABEL: {{^}}singlethread_monotonic:
687 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
688 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
689 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
690 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
691 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
692 ; GFX89-NOT: buffer_wbinvl1_vol
693 ; GFX10-NOT: buffer_gl{{[01]}}_inv
694 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
695 ; GFX10: .amdhsa_kernel singlethread_monotonic
696 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
697 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
698 ; GFX10-NOT: .amdhsa_memory_ordered 0
699 define amdgpu_kernel void @singlethread_monotonic(
700 i32* %in, i32* %out) {
702 %val = load atomic i32, i32* %in syncscope("singlethread") monotonic, align 4
703 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread"), acquire ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
707 ; GCN-LABEL: {{^}}singlethread_acquire:
708 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
709 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
710 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
711 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
712 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
713 ; GFX89-NOT: buffer_wbinvl1_vol
714 ; GFX10-NOT: buffer_gl{{[01]}}_inv
715 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
716 ; GFX10: .amdhsa_kernel singlethread_acquire
717 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
718 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
719 ; GFX10-NOT: .amdhsa_memory_ordered 0
720 define amdgpu_kernel void @singlethread_acquire(
721 i32* %in, i32* %out) {
723 %val = load atomic i32, i32* %in syncscope("singlethread") acquire, align 4
724 store i32 %val, i32* %out
; Atomic flat load, syncscope("singlethread"), seq_cst ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
728 ; GCN-LABEL: {{^}}singlethread_seq_cst:
729 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
730 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
731 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
732 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
733 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
734 ; GFX89-NOT: buffer_wbinvl1_vol
735 ; GFX10-NOT: buffer_gl{{[01]}}_inv
736 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
737 ; GFX10: .amdhsa_kernel singlethread_seq_cst
738 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
739 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
740 ; GFX10-NOT: .amdhsa_memory_ordered 0
741 define amdgpu_kernel void @singlethread_seq_cst(
742 i32* %in, i32* %out) {
744 %val = load atomic i32, i32* %in syncscope("singlethread") seq_cst, align 4
745 store i32 %val, i32* %out
; Atomic flat load, syncscope("agent"), unordered ordering.
; Expects a plain flat_load_dword (nothing after the address operand), no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
749 ; GCN-LABEL: {{^}}agent_unordered:
750 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
751 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
752 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
753 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
754 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
755 ; GFX89-NOT: buffer_wbinvl1_vol
756 ; GFX10-NOT: buffer_gl{{[01]}}_inv
757 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
758 ; GFX10: .amdhsa_kernel agent_unordered
759 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
760 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
761 ; GFX10-NOT: .amdhsa_memory_ordered 0
762 define amdgpu_kernel void @agent_unordered(
763 i32* %in, i32* %out) {
765 %val = load atomic i32, i32* %in syncscope("agent") unordered, align 4
766 store i32 %val, i32* %out
; Atomic flat load, syncscope("agent"), monotonic ordering.
; Expects the load to carry glc (gfx8/gfx9) or glc dlc (gfx10), with no vmcnt(0)/vscnt
; waits and no cache invalidates before the store of the loaded value.
770 ; GCN-LABEL: {{^}}agent_monotonic:
771 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
772 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
773 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
774 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
775 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
776 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
777 ; GFX89-NOT: buffer_wbinvl1_vol
778 ; GFX10-NOT: buffer_gl{{[01]}}_inv
779 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
780 ; GFX10: .amdhsa_kernel agent_monotonic
781 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
782 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
783 ; GFX10-NOT: .amdhsa_memory_ordered 0
784 define amdgpu_kernel void @agent_monotonic(
785 i32* %in, i32* %out) {
787 %val = load atomic i32, i32* %in syncscope("agent") monotonic, align 4
788 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("agent"), acquire ordering.
; Expected: no bare `s_waitcnt vmcnt(0)` (and on GFX10 no zero-count
; s_waitcnt_vmcnt/s_waitcnt_vscnt) before the load. The load carries `glc`
; (GFX8/GFX9) or `glc dlc` (GFX10) and must be followed immediately by
; `s_waitcnt vmcnt(0) lgkmcnt(0)` and then the cache invalidate:
; buffer_wbinvl1_vol on GFX8/GFX9, buffer_gl0_inv + buffer_gl1_inv on GFX10.
; The loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
792 ; GCN-LABEL: {{^}}agent_acquire:
793 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
794 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
795 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
796 ; GFX10: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
797 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
798 ; GFX89-NEXT: buffer_wbinvl1_vol
799 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
800 ; GFX10-NEXT: buffer_gl0_inv
801 ; GFX10-NEXT: buffer_gl1_inv
802 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
803 ; GFX10: .amdhsa_kernel agent_acquire
804 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
805 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
806 ; GFX10-NOT: .amdhsa_memory_ordered 0
807 define amdgpu_kernel void @agent_acquire(
808 i32* %in, i32* %out) {
809 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
810 %val = load atomic i32, i32* %in syncscope("agent") acquire, align 4
811 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("agent"), seq_cst ordering.
; Expected before the load: `s_waitcnt vmcnt(0) lgkmcnt(0)` on GFX8/GFX9;
; `s_waitcnt lgkmcnt(0)` and then `s_waitcnt_vscnt null, 0x0` on GFX10.
; The load (`glc` on GFX8/GFX9, `glc dlc` on GFX10) must be the very next
; instruction, followed immediately by `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; the cache invalidate (buffer_wbinvl1_vol, or buffer_gl0_inv +
; buffer_gl1_inv on GFX10). The loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
815 ; GCN-LABEL: {{^}}agent_seq_cst:
816 ; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
817 ; GFX10: s_waitcnt lgkmcnt(0){{$}}
818 ; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
819 ; GFX89-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
820 ; GFX10-NEXT: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc dlc{{$}}
821 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
822 ; GFX89-NEXT: buffer_wbinvl1_vol
823 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
824 ; GFX10-NEXT: buffer_gl0_inv
825 ; GFX10-NEXT: buffer_gl1_inv
826 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
827 ; GFX10: .amdhsa_kernel agent_seq_cst
828 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
829 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
830 ; GFX10-NOT: .amdhsa_memory_ordered 0
831 define amdgpu_kernel void @agent_seq_cst(
832 i32* %in, i32* %out) {
833 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
834 %val = load atomic i32, i32* %in syncscope("agent") seq_cst, align 4
835 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("workgroup"), unordered ordering.
; Expected on every target: a flat_load_dword with no cache modifier, with
; no bare `s_waitcnt vmcnt(0)`, no zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt
; and no cache invalidate around it; the loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
839 ; GCN-LABEL: {{^}}workgroup_unordered:
840 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
841 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
842 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
843 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
844 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
845 ; GFX89-NOT: buffer_wbinvl1_vol
846 ; GFX10-NOT: buffer_gl{{[01]}}_inv
847 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
848 ; GFX10: .amdhsa_kernel workgroup_unordered
849 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
850 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
851 ; GFX10-NOT: .amdhsa_memory_ordered 0
852 define amdgpu_kernel void @workgroup_unordered(
853 i32* %in, i32* %out) {
854 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
855 %val = load atomic i32, i32* %in syncscope("workgroup") unordered, align 4
856 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("workgroup"), monotonic ordering.
; Expected: GFX8/GFX9 and the GFX10 +cumode run emit the load with no cache
; modifier; the default GFX10 (WGP) run emits it with `glc`. No target may
; surround the load with a bare `s_waitcnt vmcnt(0)`, a zero-count
; s_waitcnt_vmcnt/s_waitcnt_vscnt, or a cache invalidate. The loaded
; register is then stored.
; The CU-mode load is checked positively (anchored with {{$}}, so a trailing
; `glc` still fails) so that RET is bound inside this function for the store
; check below instead of falling back to a capture from an earlier function.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
860 ; GCN-LABEL: {{^}}workgroup_monotonic:
861 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
862 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
863 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
864 ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
865 ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
866 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
867 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
868 ; GFX89-NOT: buffer_wbinvl1_vol
869 ; GFX10-NOT: buffer_gl{{[01]}}_inv
870 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
871 ; GFX10: .amdhsa_kernel workgroup_monotonic
872 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
873 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
874 ; GFX10-NOT: .amdhsa_memory_ordered 0
875 define amdgpu_kernel void @workgroup_monotonic(
876 i32* %in, i32* %out) {
877 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
878 %val = load atomic i32, i32* %in syncscope("workgroup") monotonic, align 4
879 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("workgroup"), acquire ordering.
; Expected: GFX8/GFX9 and the GFX10 +cumode run emit the load with no cache
; modifier and no cache invalidate after it. The default GFX10 (WGP) run
; emits the load with `glc`, then `s_waitcnt vmcnt(0) lgkmcnt(0)` followed
; immediately by buffer_gl0_inv. The loaded register is then stored.
; The CU-mode load is checked positively (anchored with {{$}}, so a trailing
; `glc` still fails) so that RET is bound inside this function for the store
; check below instead of falling back to a capture from an earlier function.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
883 ; GCN-LABEL: {{^}}workgroup_acquire:
884 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
885 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
886 ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
887 ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
888 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
889 ; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
890 ; GFX89-NOT: buffer_wbinvl1_vol
891 ; GFX10WGP-NEXT: buffer_gl0_inv
892 ; GFX10CU-NOT: buffer_gl0_inv
893 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
894 ; GFX10: .amdhsa_kernel workgroup_acquire
895 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
896 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
897 ; GFX10-NOT: .amdhsa_memory_ordered 0
898 define amdgpu_kernel void @workgroup_acquire(
899 i32* %in, i32* %out) {
900 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
901 %val = load atomic i32, i32* %in syncscope("workgroup") acquire, align 4
902 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("workgroup"), seq_cst ordering.
; Expected on GFX8/GFX9: load with no cache modifier, no bare
; `s_waitcnt vmcnt(0)` and no buffer_wbinvl1_vol.
; Expected on the default GFX10 (WGP) run: `s_waitcnt vmcnt(0) lgkmcnt(0)`
; immediately followed by `s_waitcnt_vscnt null, 0x0`, then the load with
; `glc`, immediately followed by `s_waitcnt vmcnt(0) lgkmcnt(0)` and
; buffer_gl0_inv.
; Expected on the GFX10 +cumode run: load with no cache modifier, none of
; those waits before it and no buffer_gl0_inv after it.
; The loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
906 ; GCN-LABEL: {{^}}workgroup_seq_cst:
907 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
908 ; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
909 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0
910 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
911 ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0
912 ; GFX89: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
913 ; GFX10WGP: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
914 ; GFX10CU: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
915 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
916 ; GFX10WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
917 ; GFX89-NOT: buffer_wbinvl1_vol
918 ; GFX10WGP-NEXT: buffer_gl0_inv
919 ; GFX10CU-NOT: buffer_gl0_inv
920 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
921 ; GFX10: .amdhsa_kernel workgroup_seq_cst
922 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
923 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
924 ; GFX10-NOT: .amdhsa_memory_ordered 0
925 define amdgpu_kernel void @workgroup_seq_cst(
926 i32* %in, i32* %out) {
927 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
928 %val = load atomic i32, i32* %in syncscope("workgroup") seq_cst, align 4
929 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("wavefront"), unordered ordering.
; Expected on every target: a flat_load_dword with no cache modifier, with
; no bare `s_waitcnt vmcnt(0)`, no zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt
; and no cache invalidate around it; the loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
933 ; GCN-LABEL: {{^}}wavefront_unordered:
934 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
935 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
936 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
937 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
938 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
939 ; GFX89-NOT: buffer_wbinvl1_vol
940 ; GFX10-NOT: buffer_gl{{[01]}}_inv
941 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
942 ; GFX10: .amdhsa_kernel wavefront_unordered
943 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
944 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
945 ; GFX10-NOT: .amdhsa_memory_ordered 0
946 define amdgpu_kernel void @wavefront_unordered(
947 i32* %in, i32* %out) {
948 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
949 %val = load atomic i32, i32* %in syncscope("wavefront") unordered, align 4
950 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("wavefront"), monotonic ordering.
; Expected on every target: a flat_load_dword with no cache modifier, with
; no bare `s_waitcnt vmcnt(0)`, no zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt
; and no cache invalidate around it; the loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
954 ; GCN-LABEL: {{^}}wavefront_monotonic:
955 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
956 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
957 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
958 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
959 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
960 ; GFX89-NOT: buffer_wbinvl1_vol
961 ; GFX10-NOT: buffer_gl{{[01]}}_inv
962 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
963 ; GFX10: .amdhsa_kernel wavefront_monotonic
964 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
965 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
966 ; GFX10-NOT: .amdhsa_memory_ordered 0
967 define amdgpu_kernel void @wavefront_monotonic(
968 i32* %in, i32* %out) {
969 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
970 %val = load atomic i32, i32* %in syncscope("wavefront") monotonic, align 4
971 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("wavefront"), acquire ordering.
; Expected on every target: a flat_load_dword with no cache modifier, with
; no bare `s_waitcnt vmcnt(0)`, no zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt
; and no cache invalidate around it; the loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
975 ; GCN-LABEL: {{^}}wavefront_acquire:
976 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
977 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
978 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
979 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
980 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
981 ; GFX89-NOT: buffer_wbinvl1_vol
982 ; GFX10-NOT: buffer_gl{{[01]}}_inv
983 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
984 ; GFX10: .amdhsa_kernel wavefront_acquire
985 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
986 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
987 ; GFX10-NOT: .amdhsa_memory_ordered 0
988 define amdgpu_kernel void @wavefront_acquire(
989 i32* %in, i32* %out) {
990 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
991 %val = load atomic i32, i32* %in syncscope("wavefront") acquire, align 4
992 store i32 %val, i32* %out
; Test: atomic i32 load, syncscope("wavefront"), seq_cst ordering.
; Expected on every target: a flat_load_dword with no cache modifier, with
; no bare `s_waitcnt vmcnt(0)`, no zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt
; and no cache invalidate around it; the loaded register is then stored.
; The trailing GFX10 checks inspect the kernel descriptor: the +cumode run
; must emit `.amdhsa_workgroup_processor_mode 0`, the default (WGP) run must
; not, and neither may emit `.amdhsa_memory_ordered 0`.
996 ; GCN-LABEL: {{^}}wavefront_seq_cst:
997 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
998 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
999 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
1000 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
1001 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
1002 ; GFX89-NOT: buffer_wbinvl1_vol
1003 ; GFX10-NOT: buffer_gl{{[01]}}_inv
1004 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
1005 ; GFX10: .amdhsa_kernel wavefront_seq_cst
1006 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
1007 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
1008 ; GFX10-NOT: .amdhsa_memory_ordered 0
1009 define amdgpu_kernel void @wavefront_seq_cst(
1010 i32* %in, i32* %out) {
1011 ; Atomic load from %in, followed by a plain (non-atomic) store to %out.
1012 %val = load atomic i32, i32* %in syncscope("wavefront") seq_cst, align 4
1013 store i32 %val, i32* %out