; RUN: llc -march=amdgcn -mcpu=gfx802 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9_10,GFX8_9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -asm-verbose=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10,GFX9_10 %s
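
; These tests check where the waitcnt insertion pass (SIInsertWaitcnts)
; places counter waits around barriers, loads, and stores. A note on the
; expected patterns: GFX10 tracks outstanding stores in a separate VScnt
; counter, waited on with s_waitcnt_vscnt, while GFX8/GFX9 track them in
; vmcnt. On GFX8 global accesses are lowered to flat instructions, which
; increment both vmcnt and lgkmcnt, so GFX8 waits on both counters; GFX9+
; has dedicated global instructions that only increment vmcnt.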

; GCN-LABEL: barrier_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; GCN-LABEL: barrier_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp7
  store i32 1, i32 addrspace(1)* %tmp8, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp4
  store i32 0, i32 addrspace(1)* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp9
  store i32 %tmp7, i32 addrspace(1)* %tmp10, align 4
  ret void
}
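
; The next tests use flat pointers. A note on the expectations: flat
; accesses may target LDS as well as global memory, so they increment
; lgkmcnt in addition to vmcnt on these targets, and the inserted wait
; must cover both counters on all of GFX8, GFX9, and GFX10.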

; GCN-LABEL: barrier_vmcnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}

; GCN-LABEL: barrier_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp6 = add nuw nsw i64 %tmp2, 4294967296
  %tmp7 = lshr exact i64 %tmp6, 32
  %tmp8 = getelementptr inbounds i32, i32* %arg, i64 %tmp7
  store i32 1, i32* %tmp8, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("singlethread") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("singlethread") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}

; GCN-LABEL: barrier_vmcnt_vscnt_flat_workgroup:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_barrier
define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = add nuw nsw i64 %tmp2, 8589934592
  %tmp4 = lshr exact i64 %tmp3, 32
  %tmp5 = getelementptr inbounds i32, i32* %arg, i64 %tmp4
  store i32 0, i32* %tmp5, align 4
  %tmp6 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp7 = load i32, i32* %tmp6, align 4
  fence syncscope("workgroup") release
  tail call void @llvm.amdgcn.s.barrier()
  fence syncscope("workgroup") acquire
  %tmp8 = add nuw nsw i64 %tmp2, 4294967296
  %tmp9 = lshr exact i64 %tmp8, 32
  %tmp10 = getelementptr inbounds i32, i32* %arg, i64 %tmp9
  store i32 %tmp7, i32* %tmp10, align 4
  ret void
}
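
; The following tests have no barrier. Here the expectation is that the
; wait on the load is placed immediately before the first use of the
; loaded value (the dependent store), not earlier.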

; GCN-LABEL: load_vmcnt_global:
; GFX8: flat_load_dword
; GFX9_10: global_load_dword
; GFX8: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX9_10: s_waitcnt vmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_global(i32 addrspace(1)* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp1
  %tmp4 = load i32, i32 addrspace(1)* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 %tmp6
  store i32 %tmp4, i32 addrspace(1)* %tmp7, align 4
  ret void
}

; GCN-LABEL: load_vmcnt_flat:
; GCN: flat_load_dword
; GCN: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GCN-NEXT: {{global|flat}}_store_dword
define amdgpu_kernel void @load_vmcnt_flat(i32* %arg) {
bb:
  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
  %tmp1 = zext i32 %tmp to i64
  %tmp2 = shl nuw nsw i64 %tmp1, 32
  %tmp3 = getelementptr inbounds i32, i32* %arg, i64 %tmp1
  %tmp4 = load i32, i32* %tmp3, align 4
  %tmp5 = add nuw nsw i64 %tmp2, 4294967296
  %tmp6 = lshr exact i64 %tmp5, 32
  %tmp7 = getelementptr inbounds i32, i32* %arg, i64 %tmp6
  store i32 %tmp4, i32* %tmp7, align 4
  ret void
}
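
; The store_vscnt tests use callable (non-kernel) functions that return
; through s_setpc_b64. The pending store has to be waited on before the
; return: GFX8/GFX9 wait on vmcnt, while GFX10 waits on the separate store
; counter with s_waitcnt_vscnt.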

; GCN-LABEL: store_vscnt_private:
; GCN: buffer_store_dword
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_private(i32 addrspace(5)* %p) {
  store i32 0, i32 addrspace(5)* %p
  ret void
}

; GCN-LABEL: store_vscnt_global:
; GFX8: flat_store_dword
; GFX9_10: global_store_dword
; GFX8_9: s_waitcnt vmcnt(0)
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_global(i32 addrspace(1)* %p) {
  store i32 0, i32 addrspace(1)* %p
  ret void
}

; GCN-LABEL: store_vscnt_flat:
; GCN: flat_store_dword
; GFX8_9: s_waitcnt vmcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @store_vscnt_flat(i32* %p) {
  store i32 0, i32* %p
  ret void
}
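
; A callable function cannot know which memory operations are still
; outstanding in its caller, so the prologue is expected to wait on all
; counters before proceeding.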

; GCN-LABEL: function_prologue:
; GCN: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0){{$}}
; GFX10: s_waitcnt_vscnt null, 0x0
; GCN-NEXT: s_setpc_b64
define void @function_prologue() {
  ret void
}

declare void @llvm.amdgcn.s.barrier()
declare i32 @llvm.amdgcn.workitem.id.x()