; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9PLUS,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX9PLUS %s
; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-back-off-barrier -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10PLUS,GFX9PLUS %s
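
; Check that a global load issued before the barrier is covered by a
; vmcnt(0) wait ahead of s_barrier (global accesses lower to flat
; instructions on GFX8).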
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9PLUS: global_load_{{dword|b32}}
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9PLUS: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp1
  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
  store i32 %tmp4, ptr addrspace(1) %tmp7, align 4
  ret void
}
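
; A global store before the barrier: GFX8/9 track stores in vmcnt,
; while GFX10+ use the separate vscnt store counter and wait with
; s_waitcnt_vscnt.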
; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9PLUS: global_store_{{dword|b32}}
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9: s_waitcnt vmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
  store i32 0, ptr addrspace(1) %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp7
  store i32 1, ptr addrspace(1) %tmp8, align 4
  ret void
}
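
; Both a store and a load precede the barrier, so GFX10+ need a
; vmcnt(0) wait for the load as well as a vscnt wait for the store.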
; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9PLUS: global_load_{{dword|b32}}
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9PLUS: s_waitcnt vmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp4
  store i32 0, ptr addrspace(1) %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp1
  %tmp7 = load i32, ptr addrspace(1) %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp9
  store i32 %tmp7, ptr addrspace(1) %tmp10, align 4
  ret void
}
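
; Flat accesses increment lgkmcnt as well as vmcnt, so the wait before
; the barrier covers both counters.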
; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_{{dword|b32}}
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
  %tmp4 = load i32, ptr %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp6
  store i32 %tmp4, ptr %tmp7, align 4
  ret void
}
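
; A flat store before the barrier: GFX8/9 wait on vmcnt and lgkmcnt,
; while GFX10+ split this into an lgkmcnt wait plus s_waitcnt_vscnt.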
; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_{{dword|b32}}
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, ptr %arg, i64 %tmp4
  store i32 0, ptr %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, ptr %arg, i64 %tmp7
  store i32 1, ptr %tmp8, align 4
  ret void
}
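
; Flat store plus flat load before the barrier; GFX10+ additionally
; wait on vscnt for the store.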
; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_{{dword|b32}}
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, ptr %arg, i64 %tmp4
  store i32 0, ptr %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
  %tmp7 = load i32, ptr %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, ptr %arg, i64 %tmp9
  store i32 %tmp7, ptr %tmp10, align 4
  ret void
}
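
; Same as above, but with workgroup-scope fences: GFX8/9 emit separate
; lgkmcnt(0) and vmcnt(0) waits before the barrier.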
; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_{{dword|b32}}
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt vmcnt(0){{$}}
; GFX10PLUS: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, ptr %arg, i64 %tmp4
  store i32 0, ptr %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
  %tmp7 = load i32, ptr %tmp6, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, ptr %arg, i64 %tmp9
  store i32 %tmp7, ptr %tmp10, align 4
  ret void
}
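
; With no barrier, the use of the loaded value forces a vmcnt(0) wait
; immediately before the dependent store.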
; GCN-LABEL: load_vmcnt_global:
; GFX8: flat_load_dword
; GFX9PLUS: global_load_{{dword|b32}}
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9PLUS: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_{{dword|b32}}
define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp1
  %tmp4 = load i32, ptr addrspace(1) %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tmp6
  store i32 %tmp4, ptr addrspace(1) %tmp7, align 4
  ret void
}
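
; Flat variant of the above: the dependent store waits on both vmcnt
; and lgkmcnt.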
; GCN-LABEL: load_vmcnt_flat:
; GCN: flat_load_{{dword|b32}}
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_{{dword|b32}}
define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, ptr %arg, i64 %tmp1
  %tmp4 = load i32, ptr %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, ptr %arg, i64 %tmp6
  store i32 %tmp4, ptr %tmp7, align 4
  ret void
}
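
; A scratch store in a non-kernel function: GFX8/9 wait on vmcnt(0)
; before returning via s_setpc_b64.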
; GCN-LABEL: store_vscnt_private:
; GCN: {{buffer|scratch}}_store_{{dword|b32}}
; GFX8_9: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(ptr addrspace(5) %p) {
  store i32 0, ptr addrspace(5) %p
  ret void
}
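
; Same for a global store (flat on GFX8).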
; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9PLUS: global_store_{{dword|b32}}
; GFX8_9: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(ptr addrspace(1) %p) {
  store i32 0, ptr addrspace(1) %p
  ret void
}
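
; Same for a flat store; on GFX10+ the checked wait before the return
; is lgkmcnt(0).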
; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_{{dword|b32}}
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10PLUS: s_waitcnt lgkmcnt(0){{$}}
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(ptr %p) {
  store i32 0, ptr %p
  ret void
}
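
; An empty callee still gets the full entry wait on vmcnt, expcnt and
; lgkmcnt in its prologue.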
; GCN-LABEL: function_prologue:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()