; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
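
; GFX10 splits VMEM store tracking out of vmcnt into a separate VSCNT counter,
; waited on with s_waitcnt_vscnt. These tests check that the waitcnt insertion
; pass (SIInsertWaitcnts) emits the right wait(s) before an s_barrier, before a
; use of loaded data, and before returning from a callable function.
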
; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
  store i32 1, i32 addrspace(1)* %tmp8, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp9
  store i32 %tmp7, i32 addrspace(1)* %tmp10, align 4
  ret void
}

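; Flat accesses may target LDS as well as global memory, so a flat load/store
; increments lgkmcnt in addition to vmcnt; the wait inserted before the
; barrier in the flat tests below must therefore cover both counters.
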
; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
  store i32 1, i32* %tmp8, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GFX8_9: s_waitcnt lgkmcnt(0){{$}}
; GFX8_9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

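; The load tests below have no barrier; the wait is required only to satisfy
; the data dependence between the load and the store that consumes its result.
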
; GCN-LABEL: load_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; GCN-LABEL: load_vmcnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

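; The remaining tests use callable functions rather than kernels: outstanding
; stores must be waited on before s_setpc_b64 returns control to the caller.
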
; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
  store i32 0, i32 addrspace(5)* %p
  ret void
}

; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
  store i32 0, i32 addrspace(1)* %p
  ret void
}

; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
  store i32 0, i32* %p
  ret void
}

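; An empty callable function still gets a conservative wait on all counters in
; its prologue, since the caller's outstanding memory operations are unknown
; at function entry.
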
; GCN-LABEL: function_prologue:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()