1 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
2 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX8,GFX89 %s
3 ; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX9,GFX89 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3 -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10WGP %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+code-object-v3,+cumode -verify-machineinstrs -amdgpu-enable-global-sgpr-addr < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10CU %s
8 declare i32 @llvm.amdgcn.workitem.id.x()
; Test case: atomic i32 store with syncscope("one-as") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
10 ; GCN-LABEL: {{^}}system_one_as_unordered:
11 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
12 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
13 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
14 ; GFX10: .amdhsa_kernel system_one_as_unordered
15 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
16 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
17 ; GFX10-NOT: .amdhsa_memory_ordered 0
18 define amdgpu_kernel void @system_one_as_unordered(
21 store atomic i32 %in, i32* %out syncscope("one-as") unordered, align 4
; Test case: atomic i32 store with syncscope("one-as") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
25 ; GCN-LABEL: {{^}}system_one_as_monotonic:
26 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
27 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
28 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
29 ; GFX10: .amdhsa_kernel system_one_as_monotonic
30 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
31 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
32 ; GFX10-NOT: .amdhsa_memory_ordered 0
33 define amdgpu_kernel void @system_one_as_monotonic(
36 store atomic i32 %in, i32* %out syncscope("one-as") monotonic, align 4
; Test case: atomic i32 store with syncscope("one-as") and release ordering.
; Expected: "s_waitcnt vmcnt(0)" and then the flat_store_dword on the following line;
; the gfx1010 prefixes expect "s_waitcnt_vscnt null, 0x0" on the line between the two.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
40 ; GCN-LABEL: {{^}}system_one_as_release:
41 ; GCN: s_waitcnt vmcnt(0){{$}}
42 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
43 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
44 ; GFX10: .amdhsa_kernel system_one_as_release
45 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
46 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
47 ; GFX10-NOT: .amdhsa_memory_ordered 0
48 define amdgpu_kernel void @system_one_as_release(
51 store atomic i32 %in, i32* %out syncscope("one-as") release, align 4
; Test case: atomic i32 store with syncscope("one-as") and seq_cst ordering.
; Expected: "s_waitcnt vmcnt(0)" and then the flat_store_dword on the following line;
; the gfx1010 prefixes expect "s_waitcnt_vscnt null, 0x0" on the line between the two.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
55 ; GCN-LABEL: {{^}}system_one_as_seq_cst:
56 ; GCN: s_waitcnt vmcnt(0){{$}}
57 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
58 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
59 ; GFX10: .amdhsa_kernel system_one_as_seq_cst
60 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
61 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
62 ; GFX10-NOT: .amdhsa_memory_ordered 0
63 define amdgpu_kernel void @system_one_as_seq_cst(
66 store atomic i32 %in, i32* %out syncscope("one-as") seq_cst, align 4
; Test case: atomic i32 store with syncscope("singlethread-one-as") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
70 ; GCN-LABEL: {{^}}singlethread_one_as_unordered:
71 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
72 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
73 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
74 ; GFX10: .amdhsa_kernel singlethread_one_as_unordered
75 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
76 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
77 ; GFX10-NOT: .amdhsa_memory_ordered 0
78 define amdgpu_kernel void @singlethread_one_as_unordered(
81 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") unordered, align 4
; Test case: atomic i32 store with syncscope("singlethread-one-as") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
85 ; GCN-LABEL: {{^}}singlethread_one_as_monotonic:
86 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
87 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
88 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
89 ; GFX10: .amdhsa_kernel singlethread_one_as_monotonic
90 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
91 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
92 ; GFX10-NOT: .amdhsa_memory_ordered 0
93 define amdgpu_kernel void @singlethread_one_as_monotonic(
96 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") monotonic, align 4
; Test case: atomic i32 store with syncscope("singlethread-one-as") and release ordering.
; Expected: even with release ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
100 ; GCN-LABEL: {{^}}singlethread_one_as_release:
101 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
102 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
103 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
104 ; GFX10: .amdhsa_kernel singlethread_one_as_release
105 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
106 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
107 ; GFX10-NOT: .amdhsa_memory_ordered 0
108 define amdgpu_kernel void @singlethread_one_as_release(
109 i32 %in, i32* %out) {
111 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") release, align 4
; Test case: atomic i32 store with syncscope("singlethread-one-as") and seq_cst ordering.
; Expected: even with seq_cst ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
115 ; GCN-LABEL: {{^}}singlethread_one_as_seq_cst:
116 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
117 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
118 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
119 ; GFX10: .amdhsa_kernel singlethread_one_as_seq_cst
120 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
121 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
122 ; GFX10-NOT: .amdhsa_memory_ordered 0
123 define amdgpu_kernel void @singlethread_one_as_seq_cst(
124 i32 %in, i32* %out) {
126 store atomic i32 %in, i32* %out syncscope("singlethread-one-as") seq_cst, align 4
; Test case: atomic i32 store with syncscope("agent-one-as") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
130 ; GCN-LABEL: {{^}}agent_one_as_unordered:
131 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
132 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
133 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
134 ; GFX10: .amdhsa_kernel agent_one_as_unordered
135 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
136 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
137 ; GFX10-NOT: .amdhsa_memory_ordered 0
138 define amdgpu_kernel void @agent_one_as_unordered(
139 i32 %in, i32* %out) {
141 store atomic i32 %in, i32* %out syncscope("agent-one-as") unordered, align 4
; Test case: atomic i32 store with syncscope("agent-one-as") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
145 ; GCN-LABEL: {{^}}agent_one_as_monotonic:
146 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
147 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
148 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
149 ; GFX10: .amdhsa_kernel agent_one_as_monotonic
150 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
151 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
152 ; GFX10-NOT: .amdhsa_memory_ordered 0
153 define amdgpu_kernel void @agent_one_as_monotonic(
154 i32 %in, i32* %out) {
156 store atomic i32 %in, i32* %out syncscope("agent-one-as") monotonic, align 4
; Test case: atomic i32 store with syncscope("agent-one-as") and release ordering.
; Expected: "s_waitcnt vmcnt(0)" and then the flat_store_dword on the following line;
; the gfx1010 prefixes expect "s_waitcnt_vscnt null, 0x0" on the line between the two.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
160 ; GCN-LABEL: {{^}}agent_one_as_release:
161 ; GCN: s_waitcnt vmcnt(0){{$}}
162 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
163 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
164 ; GFX10: .amdhsa_kernel agent_one_as_release
165 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
166 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
167 ; GFX10-NOT: .amdhsa_memory_ordered 0
168 define amdgpu_kernel void @agent_one_as_release(
169 i32 %in, i32* %out) {
171 store atomic i32 %in, i32* %out syncscope("agent-one-as") release, align 4
; Test case: atomic i32 store with syncscope("agent-one-as") and seq_cst ordering.
; Expected: "s_waitcnt vmcnt(0)" and then the flat_store_dword on the following line;
; the gfx1010 prefixes expect "s_waitcnt_vscnt null, 0x0" on the line between the two.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
175 ; GCN-LABEL: {{^}}agent_one_as_seq_cst:
176 ; GCN: s_waitcnt vmcnt(0){{$}}
177 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
178 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
179 ; GFX10: .amdhsa_kernel agent_one_as_seq_cst
180 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
181 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
182 ; GFX10-NOT: .amdhsa_memory_ordered 0
183 define amdgpu_kernel void @agent_one_as_seq_cst(
184 i32 %in, i32* %out) {
186 store atomic i32 %in, i32* %out syncscope("agent-one-as") seq_cst, align 4
; Test case: atomic i32 store with syncscope("workgroup-one-as") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
190 ; GCN-LABEL: {{^}}workgroup_one_as_unordered:
191 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
192 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
193 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
194 ; GFX10: .amdhsa_kernel workgroup_one_as_unordered
195 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
196 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
197 ; GFX10-NOT: .amdhsa_memory_ordered 0
198 define amdgpu_kernel void @workgroup_one_as_unordered(
199 i32 %in, i32* %out) {
201 store atomic i32 %in, i32* %out syncscope("workgroup-one-as") unordered, align 4
; Test case: atomic i32 store with syncscope("workgroup-one-as") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
205 ; GCN-LABEL: {{^}}workgroup_one_as_monotonic:
206 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
207 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
208 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
209 ; GFX10: .amdhsa_kernel workgroup_one_as_monotonic
210 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
211 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
212 ; GFX10-NOT: .amdhsa_memory_ordered 0
213 define amdgpu_kernel void @workgroup_one_as_monotonic(
214 i32 %in, i32* %out) {
216 store atomic i32 %in, i32* %out syncscope("workgroup-one-as") monotonic, align 4
; Test case: atomic i32 store with syncscope("workgroup-one-as") and release ordering.
; Expected: gfx803/gfx900 and the gfx1010 +cumode run have no "s_waitcnt vmcnt(0)" before the
; flat_store_dword (+cumode also has no "s_waitcnt_vscnt null, 0x0"); the gfx1010 run without
; +cumode expects "s_waitcnt vmcnt(0)" directly followed by "s_waitcnt_vscnt null, 0x0".
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
220 ; GCN-LABEL: {{^}}workgroup_one_as_release:
221 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
222 ; GFX10WGP: s_waitcnt vmcnt(0){{$}}
223 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
224 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
225 ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
226 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
227 ; GFX10: .amdhsa_kernel workgroup_one_as_release
228 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
229 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
230 ; GFX10-NOT: .amdhsa_memory_ordered 0
231 define amdgpu_kernel void @workgroup_one_as_release(
232 i32 %in, i32* %out) {
234 store atomic i32 %in, i32* %out syncscope("workgroup-one-as") release, align 4
; Test case: atomic i32 store with syncscope("workgroup-one-as") and seq_cst ordering.
; Expected: gfx803/gfx900 and the gfx1010 +cumode run have no "s_waitcnt vmcnt(0)" before the
; flat_store_dword (+cumode also has no "s_waitcnt_vscnt null, 0x0"); the gfx1010 run without
; +cumode expects "s_waitcnt vmcnt(0)" directly followed by "s_waitcnt_vscnt null, 0x0".
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
238 ; GCN-LABEL: {{^}}workgroup_one_as_seq_cst:
239 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
240 ; GFX10WGP: s_waitcnt vmcnt(0){{$}}
241 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
242 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
243 ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
244 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
245 ; GFX10: .amdhsa_kernel workgroup_one_as_seq_cst
246 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
247 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
248 ; GFX10-NOT: .amdhsa_memory_ordered 0
249 define amdgpu_kernel void @workgroup_one_as_seq_cst(
250 i32 %in, i32* %out) {
252 store atomic i32 %in, i32* %out syncscope("workgroup-one-as") seq_cst, align 4
; Test case: atomic i32 store with syncscope("wavefront-one-as") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
256 ; GCN-LABEL: {{^}}wavefront_one_as_unordered:
257 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
258 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
259 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
260 ; GFX10: .amdhsa_kernel wavefront_one_as_unordered
261 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
262 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
263 ; GFX10-NOT: .amdhsa_memory_ordered 0
264 define amdgpu_kernel void @wavefront_one_as_unordered(
265 i32 %in, i32* %out) {
267 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") unordered, align 4
; Test case: atomic i32 store with syncscope("wavefront-one-as") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
271 ; GCN-LABEL: {{^}}wavefront_one_as_monotonic:
272 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
273 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
274 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
275 ; GFX10: .amdhsa_kernel wavefront_one_as_monotonic
276 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
277 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
278 ; GFX10-NOT: .amdhsa_memory_ordered 0
279 define amdgpu_kernel void @wavefront_one_as_monotonic(
280 i32 %in, i32* %out) {
282 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") monotonic, align 4
; Test case: atomic i32 store with syncscope("wavefront-one-as") and release ordering.
; Expected: even with release ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
286 ; GCN-LABEL: {{^}}wavefront_one_as_release:
287 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
288 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
289 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
290 ; GFX10: .amdhsa_kernel wavefront_one_as_release
291 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
292 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
293 ; GFX10-NOT: .amdhsa_memory_ordered 0
294 define amdgpu_kernel void @wavefront_one_as_release(
295 i32 %in, i32* %out) {
297 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") release, align 4
; Test case: atomic i32 store with syncscope("wavefront-one-as") and seq_cst ordering.
; Expected: even with seq_cst ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
301 ; GCN-LABEL: {{^}}wavefront_one_as_seq_cst:
302 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
303 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
304 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
305 ; GFX10: .amdhsa_kernel wavefront_one_as_seq_cst
306 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
307 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
308 ; GFX10-NOT: .amdhsa_memory_ordered 0
309 define amdgpu_kernel void @wavefront_one_as_seq_cst(
310 i32 %in, i32* %out) {
312 store atomic i32 %in, i32* %out syncscope("wavefront-one-as") seq_cst, align 4
; Test case: loads an i32 through a flat pointer and stores it to an addrspace(5) pointer
; with !nontemporal metadata attached to the store.
; Expected: buffer_store_dword ... offen with "glc slc" on gfx803/gfx900 and only "slc" on gfx1010.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
316 ; GCN-LABEL: {{^}}nontemporal_private_0:
317 ; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
318 ; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
319 ; GFX10: .amdhsa_kernel nontemporal_private_0
320 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
321 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
322 ; GFX10-NOT: .amdhsa_memory_ordered 0
323 define amdgpu_kernel void @nontemporal_private_0(
324 i32* %in, i32 addrspace(5)* %out) {
326 %val = load i32, i32* %in, align 4
327 store i32 %val, i32 addrspace(5)* %out, !nontemporal !0
; Test case: same as the plain addrspace(5) nontemporal store, but the destination is
; %out indexed by the workitem id x (getelementptr on %tid).
; Expected: buffer_store_dword ... offen with "glc slc" on gfx803/gfx900 and only "slc" on gfx1010.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
331 ; GCN-LABEL: {{^}}nontemporal_private_1:
332 ; GFX89: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen glc slc{{$}}
333 ; GFX10: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen slc{{$}}
334 ; GFX10: .amdhsa_kernel nontemporal_private_1
335 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
336 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
337 ; GFX10-NOT: .amdhsa_memory_ordered 0
338 define amdgpu_kernel void @nontemporal_private_1(
339 i32* %in, i32 addrspace(5)* %out) {
341 %tid = call i32 @llvm.amdgcn.workitem.id.x()
342 %val = load i32, i32* %in, align 4
343 %out.gep = getelementptr inbounds i32, i32 addrspace(5)* %out, i32 %tid
344 store i32 %val, i32 addrspace(5)* %out.gep, !nontemporal !0
; Test case: loads an i32 through a flat pointer and stores it to an addrspace(1) pointer
; with !nontemporal metadata attached to the store.
; Expected: gfx803 uses flat_store_dword with "glc slc"; gfx900 uses global_store_dword ... off
; with "glc slc"; gfx1010 uses global_store_dword ... off with only "slc".
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
348 ; GCN-LABEL: {{^}}nontemporal_global_0:
349 ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
350 ; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off glc slc{{$}}
351 ; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, off slc{{$}}
352 ; GFX10: .amdhsa_kernel nontemporal_global_0
353 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
354 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
355 ; GFX10-NOT: .amdhsa_memory_ordered 0
356 define amdgpu_kernel void @nontemporal_global_0(
357 i32* %in, i32 addrspace(1)* %out) {
359 %val = load i32, i32* %in, align 4
360 store i32 %val, i32 addrspace(1)* %out, !nontemporal !0
; Test case: nontemporal store to an addrspace(1) pointer indexed by the workitem id x.
; Expected: gfx803 uses flat_store_dword with "glc slc"; gfx900 and gfx1010 use global_store_dword
; with an s[N:M] register-pair operand in place of "off", with "glc slc" and "slc" respectively.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
364 ; GCN-LABEL: {{^}}nontemporal_global_1:
365 ; GFX8: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
366 ; GFX9: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] glc slc{{$}}
367 ; GFX10: global_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}] slc{{$}}
368 ; GFX10: .amdhsa_kernel nontemporal_global_1
369 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
370 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
371 ; GFX10-NOT: .amdhsa_memory_ordered 0
372 define amdgpu_kernel void @nontemporal_global_1(
373 i32* %in, i32 addrspace(1)* %out) {
375 %tid = call i32 @llvm.amdgcn.workitem.id.x()
376 %val = load i32, i32* %in, align 4
377 %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
378 store i32 %val, i32 addrspace(1)* %out.gep, !nontemporal !0
; Test case: loads an i32 through a flat pointer and stores it to an addrspace(3) pointer
; with !nontemporal metadata attached to the store.
; Expected on every target: ds_write_b32 with nothing after its two register operands.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
382 ; GCN-LABEL: {{^}}nontemporal_local_0:
383 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
384 ; GFX10: .amdhsa_kernel nontemporal_local_0
385 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
386 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
387 ; GFX10-NOT: .amdhsa_memory_ordered 0
388 define amdgpu_kernel void @nontemporal_local_0(
389 i32* %in, i32 addrspace(3)* %out) {
391 %val = load i32, i32* %in, align 4
392 store i32 %val, i32 addrspace(3)* %out, !nontemporal !0
; Test case: nontemporal store to an addrspace(3) pointer indexed by the workitem id x.
; Expected on every target: ds_write_b32 with nothing after its two register operands.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
396 ; GCN-LABEL: {{^}}nontemporal_local_1:
397 ; GCN: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}{{$}}
398 ; GFX10: .amdhsa_kernel nontemporal_local_1
399 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
400 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
401 ; GFX10-NOT: .amdhsa_memory_ordered 0
402 define amdgpu_kernel void @nontemporal_local_1(
403 i32* %in, i32 addrspace(3)* %out) {
405 %tid = call i32 @llvm.amdgcn.workitem.id.x()
406 %val = load i32, i32* %in, align 4
407 %out.gep = getelementptr inbounds i32, i32 addrspace(3)* %out, i32 %tid
408 store i32 %val, i32 addrspace(3)* %out.gep, !nontemporal !0
; Test case: loads an i32 through a flat pointer and stores it through another flat pointer
; with !nontemporal metadata attached to the store.
; Expected: flat_store_dword with "glc slc" on gfx803/gfx900 and only "slc" on gfx1010.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
412 ; GCN-LABEL: {{^}}nontemporal_flat_0:
413 ; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
414 ; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}}
415 ; GFX10: .amdhsa_kernel nontemporal_flat_0
416 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
417 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
418 ; GFX10-NOT: .amdhsa_memory_ordered 0
419 define amdgpu_kernel void @nontemporal_flat_0(
420 i32* %in, i32* %out) {
422 %val = load i32, i32* %in, align 4
423 store i32 %val, i32* %out, !nontemporal !0
; Test case: nontemporal store through a flat pointer indexed by the workitem id x.
; Expected: flat_store_dword with "glc slc" on gfx803/gfx900 and only "slc" on gfx1010.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
427 ; GCN-LABEL: {{^}}nontemporal_flat_1:
428 ; GFX89: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc slc{{$}}
429 ; GFX10: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} slc{{$}}
430 ; GFX10: .amdhsa_kernel nontemporal_flat_1
431 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
432 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
433 ; GFX10-NOT: .amdhsa_memory_ordered 0
434 define amdgpu_kernel void @nontemporal_flat_1(
435 i32* %in, i32* %out) {
437 %tid = call i32 @llvm.amdgcn.workitem.id.x()
438 %val = load i32, i32* %in, align 4
439 %out.gep = getelementptr inbounds i32, i32* %out, i32 %tid
440 store i32 %val, i32* %out.gep, !nontemporal !0
; Test case: atomic i32 store with no syncscope (named "system" by this test) and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
444 ; GCN-LABEL: {{^}}system_unordered:
445 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
446 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
447 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
448 ; GFX10: .amdhsa_kernel system_unordered
449 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
450 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
451 ; GFX10-NOT: .amdhsa_memory_ordered 0
452 define amdgpu_kernel void @system_unordered(
453 i32 %in, i32* %out) {
455 store atomic i32 %in, i32* %out unordered, align 4
; Test case: atomic i32 store with no syncscope (named "system" by this test) and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
459 ; GCN-LABEL: {{^}}system_monotonic:
460 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
461 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
462 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
463 ; GFX10: .amdhsa_kernel system_monotonic
464 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
465 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
466 ; GFX10-NOT: .amdhsa_memory_ordered 0
467 define amdgpu_kernel void @system_monotonic(
468 i32 %in, i32* %out) {
470 store atomic i32 %in, i32* %out monotonic, align 4
; Test case: atomic i32 store with no syncscope (named "system" by this test) and release ordering.
; Expected: gfx803/gfx900 match "s_waitcnt vmcnt(0) lgkmcnt(0)"; gfx1010 matches
; "s_waitcnt lgkmcnt(0)" and later "s_waitcnt_vscnt null, 0x0"; the flat_store_dword must be
; on the line right after the last of those matches.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
474 ; GCN-LABEL: {{^}}system_release:
475 ; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
476 ; GFX10: s_waitcnt lgkmcnt(0){{$}}
477 ; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
478 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
479 ; GFX10: .amdhsa_kernel system_release
480 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
481 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
482 ; GFX10-NOT: .amdhsa_memory_ordered 0
483 define amdgpu_kernel void @system_release(
484 i32 %in, i32* %out) {
486 store atomic i32 %in, i32* %out release, align 4
; Test case: atomic i32 store with no syncscope (named "system" by this test) and seq_cst ordering.
; Expected: gfx803/gfx900 match "s_waitcnt vmcnt(0) lgkmcnt(0)"; gfx1010 matches
; "s_waitcnt lgkmcnt(0)" and later "s_waitcnt_vscnt null, 0x0"; the flat_store_dword must be
; on the line right after the last of those matches.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
490 ; GCN-LABEL: {{^}}system_seq_cst:
491 ; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
492 ; GFX10: s_waitcnt lgkmcnt(0){{$}}
493 ; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
494 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
495 ; GFX10: .amdhsa_kernel system_seq_cst
496 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
497 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
498 ; GFX10-NOT: .amdhsa_memory_ordered 0
499 define amdgpu_kernel void @system_seq_cst(
500 i32 %in, i32* %out) {
502 store atomic i32 %in, i32* %out seq_cst, align 4
; Test case: atomic i32 store with syncscope("singlethread") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
506 ; GCN-LABEL: {{^}}singlethread_unordered:
507 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
508 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
509 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
510 ; GFX10: .amdhsa_kernel singlethread_unordered
511 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
512 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
513 ; GFX10-NOT: .amdhsa_memory_ordered 0
514 define amdgpu_kernel void @singlethread_unordered(
515 i32 %in, i32* %out) {
517 store atomic i32 %in, i32* %out syncscope("singlethread") unordered, align 4
; Test case: atomic i32 store with syncscope("singlethread") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
521 ; GCN-LABEL: {{^}}singlethread_monotonic:
522 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
523 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
524 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
525 ; GFX10: .amdhsa_kernel singlethread_monotonic
526 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
527 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
528 ; GFX10-NOT: .amdhsa_memory_ordered 0
529 define amdgpu_kernel void @singlethread_monotonic(
530 i32 %in, i32* %out) {
532 store atomic i32 %in, i32* %out syncscope("singlethread") monotonic, align 4
; Test case: atomic i32 store with syncscope("singlethread") and release ordering.
; Expected: even with release ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
536 ; GCN-LABEL: {{^}}singlethread_release:
537 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
538 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
539 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
540 ; GFX10: .amdhsa_kernel singlethread_release
541 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
542 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
543 ; GFX10-NOT: .amdhsa_memory_ordered 0
544 define amdgpu_kernel void @singlethread_release(
545 i32 %in, i32* %out) {
547 store atomic i32 %in, i32* %out syncscope("singlethread") release, align 4
; Test case: atomic i32 store with syncscope("singlethread") and seq_cst ordering.
; Expected: even with seq_cst ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
551 ; GCN-LABEL: {{^}}singlethread_seq_cst:
552 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
553 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
554 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
555 ; GFX10: .amdhsa_kernel singlethread_seq_cst
556 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
557 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
558 ; GFX10-NOT: .amdhsa_memory_ordered 0
559 define amdgpu_kernel void @singlethread_seq_cst(
560 i32 %in, i32* %out) {
562 store atomic i32 %in, i32* %out syncscope("singlethread") seq_cst, align 4
; Test case: atomic i32 store with syncscope("agent") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
566 ; GCN-LABEL: {{^}}agent_unordered:
567 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
568 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
569 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
570 ; GFX10: .amdhsa_kernel agent_unordered
571 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
572 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
573 ; GFX10-NOT: .amdhsa_memory_ordered 0
574 define amdgpu_kernel void @agent_unordered(
575 i32 %in, i32* %out) {
577 store atomic i32 %in, i32* %out syncscope("agent") unordered, align 4
; Test case: atomic i32 store with syncscope("agent") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
581 ; GCN-LABEL: {{^}}agent_monotonic:
582 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
583 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
584 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
585 ; GFX10: .amdhsa_kernel agent_monotonic
586 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
587 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
588 ; GFX10-NOT: .amdhsa_memory_ordered 0
589 define amdgpu_kernel void @agent_monotonic(
590 i32 %in, i32* %out) {
592 store atomic i32 %in, i32* %out syncscope("agent") monotonic, align 4
; Test case: atomic i32 store with syncscope("agent") and release ordering.
; Expected: gfx803/gfx900 match "s_waitcnt vmcnt(0) lgkmcnt(0)"; gfx1010 matches
; "s_waitcnt lgkmcnt(0)" and later "s_waitcnt_vscnt null, 0x0"; the flat_store_dword must be
; on the line right after the last of those matches.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
596 ; GCN-LABEL: {{^}}agent_release:
597 ; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
598 ; GFX10: s_waitcnt lgkmcnt(0){{$}}
599 ; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
600 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
601 ; GFX10: .amdhsa_kernel agent_release
602 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
603 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
604 ; GFX10-NOT: .amdhsa_memory_ordered 0
605 define amdgpu_kernel void @agent_release(
606 i32 %in, i32* %out) {
608 store atomic i32 %in, i32* %out syncscope("agent") release, align 4
; Test case: atomic i32 store with syncscope("agent") and seq_cst ordering.
; Expected: gfx803/gfx900 match "s_waitcnt vmcnt(0) lgkmcnt(0)"; gfx1010 matches
; "s_waitcnt lgkmcnt(0)" and later "s_waitcnt_vscnt null, 0x0"; the flat_store_dword must be
; on the line right after the last of those matches.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
612 ; GCN-LABEL: {{^}}agent_seq_cst:
613 ; GFX89: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
614 ; GFX10: s_waitcnt lgkmcnt(0){{$}}
615 ; GFX10: s_waitcnt_vscnt null, 0x0{{$}}
616 ; GCN-NEXT: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
617 ; GFX10: .amdhsa_kernel agent_seq_cst
618 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
619 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
620 ; GFX10-NOT: .amdhsa_memory_ordered 0
621 define amdgpu_kernel void @agent_seq_cst(
622 i32 %in, i32* %out) {
624 store atomic i32 %in, i32* %out syncscope("agent") seq_cst, align 4
; Test case: atomic i32 store with syncscope("workgroup") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
628 ; GCN-LABEL: {{^}}workgroup_unordered:
629 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
630 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
631 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
632 ; GFX10: .amdhsa_kernel workgroup_unordered
633 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
634 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
635 ; GFX10-NOT: .amdhsa_memory_ordered 0
636 define amdgpu_kernel void @workgroup_unordered(
637 i32 %in, i32* %out) {
639 store atomic i32 %in, i32* %out syncscope("workgroup") unordered, align 4
; Test case: atomic i32 store with syncscope("workgroup") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
643 ; GCN-LABEL: {{^}}workgroup_monotonic:
644 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
645 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
646 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
647 ; GFX10: .amdhsa_kernel workgroup_monotonic
648 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
649 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
650 ; GFX10-NOT: .amdhsa_memory_ordered 0
651 define amdgpu_kernel void @workgroup_monotonic(
652 i32 %in, i32* %out) {
654 store atomic i32 %in, i32* %out syncscope("workgroup") monotonic, align 4
; Test case: atomic i32 store with syncscope("workgroup") and release ordering.
; Expected: gfx803/gfx900 and the gfx1010 +cumode run have no bare "s_waitcnt vmcnt(0)" line before
; the flat_store_dword (+cumode also has no "s_waitcnt_vscnt null, 0x0"); the gfx1010 run without
; +cumode expects "s_waitcnt vmcnt(0) lgkmcnt(0)" directly followed by "s_waitcnt_vscnt null, 0x0".
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
658 ; GCN-LABEL: {{^}}workgroup_release:
659 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
660 ; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
661 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
662 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
663 ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
664 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
665 ; GFX10: .amdhsa_kernel workgroup_release
666 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
667 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
668 ; GFX10-NOT: .amdhsa_memory_ordered 0
669 define amdgpu_kernel void @workgroup_release(
670 i32 %in, i32* %out) {
672 store atomic i32 %in, i32* %out syncscope("workgroup") release, align 4
; Test case: atomic i32 store with syncscope("workgroup") and seq_cst ordering.
; Expected: gfx803/gfx900 and the gfx1010 +cumode run have no bare "s_waitcnt vmcnt(0)" line before
; the flat_store_dword (+cumode also has no "s_waitcnt_vscnt null, 0x0"); the gfx1010 run without
; +cumode expects "s_waitcnt vmcnt(0) lgkmcnt(0)" directly followed by "s_waitcnt_vscnt null, 0x0".
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
676 ; GCN-LABEL: {{^}}workgroup_seq_cst:
677 ; GFX89-NOT: s_waitcnt vmcnt(0){{$}}
678 ; GFX10WGP: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
679 ; GFX10WGP-NEXT: s_waitcnt_vscnt null, 0x0{{$}}
680 ; GFX10CU-NOT: s_waitcnt vmcnt(0){{$}}
681 ; GFX10CU-NOT: s_waitcnt_vscnt null, 0x0{{$}}
682 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
683 ; GFX10: .amdhsa_kernel workgroup_seq_cst
684 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
685 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
686 ; GFX10-NOT: .amdhsa_memory_ordered 0
687 define amdgpu_kernel void @workgroup_seq_cst(
688 i32 %in, i32* %out) {
690 store atomic i32 %in, i32* %out syncscope("workgroup") seq_cst, align 4
; Test case: atomic i32 store with syncscope("wavefront") and unordered ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
694 ; GCN-LABEL: {{^}}wavefront_unordered:
695 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
696 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
697 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
698 ; GFX10: .amdhsa_kernel wavefront_unordered
699 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
700 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
701 ; GFX10-NOT: .amdhsa_memory_ordered 0
702 define amdgpu_kernel void @wavefront_unordered(
703 i32 %in, i32* %out) {
705 store atomic i32 %in, i32* %out syncscope("wavefront") unordered, align 4
; Test case: atomic i32 store with syncscope("wavefront") and monotonic ordering.
; Expected: a flat_store_dword without cache modifiers and no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
709 ; GCN-LABEL: {{^}}wavefront_monotonic:
710 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
711 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
712 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
713 ; GFX10: .amdhsa_kernel wavefront_monotonic
714 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
715 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
716 ; GFX10-NOT: .amdhsa_memory_ordered 0
717 define amdgpu_kernel void @wavefront_monotonic(
718 i32 %in, i32* %out) {
720 store atomic i32 %in, i32* %out syncscope("wavefront") monotonic, align 4
; Test case: atomic i32 store with syncscope("wavefront") and release ordering.
; Expected: even with release ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
724 ; GCN-LABEL: {{^}}wavefront_release:
725 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
726 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
727 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
728 ; GFX10: .amdhsa_kernel wavefront_release
729 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
730 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
731 ; GFX10-NOT: .amdhsa_memory_ordered 0
732 define amdgpu_kernel void @wavefront_release(
733 i32 %in, i32* %out) {
735 store atomic i32 %in, i32* %out syncscope("wavefront") release, align 4
; Test case: atomic i32 store with syncscope("wavefront") and seq_cst ordering.
; Expected: even with seq_cst ordering, a flat_store_dword with no "s_waitcnt vmcnt(0)" before it;
; the gfx1010 prefixes also reject a zero-count s_waitcnt_vmcnt/s_waitcnt_vscnt before the store.
; gfx1010 kernel descriptor: ".amdhsa_workgroup_processor_mode 0" only with +cumode; ".amdhsa_memory_ordered 0" is not expected.
739 ; GCN-LABEL: {{^}}wavefront_seq_cst:
740 ; GCN-NOT: s_waitcnt vmcnt(0){{$}}
741 ; GFX10-NOT: s_waitcnt_v{{[ms]}}cnt {{[^,]+, (0x)*0$}}
742 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}}{{$}}
743 ; GFX10: .amdhsa_kernel wavefront_seq_cst
744 ; GFX10WGP-NOT: .amdhsa_workgroup_processor_mode 0
745 ; GFX10CU: .amdhsa_workgroup_processor_mode 0
746 ; GFX10-NOT: .amdhsa_memory_ordered 0
747 define amdgpu_kernel void @wavefront_seq_cst(
748 i32 %in, i32* %out) {
750 store atomic i32 %in, i32* %out syncscope("wavefront") seq_cst, align 4