1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -march=amdgcn -mcpu=gfx900 -amdgpu-aa -amdgpu-aa-wrapper -amdgpu-annotate-uniform -S < %s | FileCheck %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -amdgpu-atomic-optimizer-strategy=None < %s | FileCheck -check-prefix=GCN %s
5 ; Check that a barrier or fence between loads is not considered a clobber
6 ; for the purpose of converting vector loads into scalar loads.
; Module-level i32 global in address space 3. It is the pointer operand of the
; store, atomicrmw and cmpxchg instructions in the no_alias_* kernels of this file.
8 @LDS = linkonce_odr hidden local_unnamed_addr addrspace(3) global i32 undef
; simple_barrier: two loads from %arg (element offsets 0 and 1) separated by a
; workgroup release fence, an s.barrier call, a workgroup acquire fence and a
; wave.barrier call. The opt-output assertions expect !amdgpu.noclobber on both
; loads; the llc-output assertions expect no global_load_dword ahead of the
; final global_store_dword.
10 ; GCN-LABEL: {{^}}simple_barrier:
12 ; GCN: s_waitcnt lgkmcnt(0)
14 ; GCN: s_waitcnt lgkmcnt(0)
16 ; GCN-NOT: global_load_dword
18 ; GCN-NOT: global_load_dword
19 ; GCN: global_store_dword
20 define amdgpu_kernel void @simple_barrier(ptr addrspace(1) %arg) {
21 ; CHECK-LABEL: @simple_barrier(
23 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
24 ; CHECK-NEXT: fence syncscope("workgroup") release
25 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
26 ; CHECK-NEXT: fence syncscope("workgroup") acquire
27 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
28 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
29 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
30 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
31 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
32 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
33 ; CHECK-NEXT: ret void
36 %i = load i32, ptr addrspace(1) %arg, align 4
37 fence syncscope("workgroup") release
38 tail call void @llvm.amdgcn.s.barrier()
39 fence syncscope("workgroup") acquire
40 tail call void @llvm.amdgcn.wave.barrier()
41 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
42 %i2 = load i32, ptr addrspace(1) %i1, align 4
44 %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
45 store i32 %i3, ptr addrspace(1) %i4, align 4
; memory_phi_no_clobber: a load from %arg, then a branch on undef whose two arms
; contain only an s.barrier call and a workgroup release fence respectively,
; then a second load from %arg + 1 after the arms rejoin. The opt-output
; assertions expect !amdgpu.noclobber on both loads; the llc-output assertions
; expect no global_load_dword ahead of the final global_store_dword.
49 ; GCN-LABEL: {{^}}memory_phi_no_clobber:
51 ; GCN: s_waitcnt lgkmcnt(0)
52 ; GCN: s_waitcnt lgkmcnt(0)
54 ; GCN-NOT: global_load_dword
56 ; GCN-NOT: global_load_dword
57 ; GCN: global_store_dword
58 define amdgpu_kernel void @memory_phi_no_clobber(ptr addrspace(1) %arg) {
59 ; CHECK-LABEL: @memory_phi_no_clobber(
61 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
62 ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
64 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
65 ; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
67 ; CHECK-NEXT: fence syncscope("workgroup") release
68 ; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
70 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
71 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
72 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
73 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
74 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
75 ; CHECK-NEXT: ret void
78 %i = load i32, ptr addrspace(1) %arg, align 4
79 br i1 undef, label %if.then, label %if.else
82 tail call void @llvm.amdgcn.s.barrier()
86 fence syncscope("workgroup") release
90 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
91 %i2 = load i32, ptr addrspace(1) %i1, align 4
93 %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
94 store i32 %i3, ptr addrspace(1) %i4, align 4
; memory_phi_clobber1: after the first load the kernel branches on undef; the
; first listed arm stores 1 to %arg + 3 and the other arm only calls s.barrier.
; The opt-output assertions expect the load from %arg + 1 after the rejoin to
; carry no !amdgpu.noclobber; the llc-output assertions expect a
; global_load_dword between two global_store_dword instructions.
98 ; GCN-LABEL: {{^}}memory_phi_clobber1:
101 ; GCN: global_store_dword
102 ; GCN: global_load_dword
103 ; GCN: global_store_dword
104 define amdgpu_kernel void @memory_phi_clobber1(ptr addrspace(1) %arg) {
105 ; CHECK-LABEL: @memory_phi_clobber1(
107 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
108 ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
110 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
111 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4
112 ; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
114 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
115 ; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
117 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
118 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
119 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
120 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
121 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
122 ; CHECK-NEXT: ret void
125 %i = load i32, ptr addrspace(1) %arg, align 4
126 br i1 undef, label %if.then, label %if.else
129 %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
130 store i32 1, ptr addrspace(1) %gep, align 4
134 tail call void @llvm.amdgcn.s.barrier()
138 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
139 %i2 = load i32, ptr addrspace(1) %i1, align 4
140 %i3 = add i32 %i2, %i
141 %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
142 store i32 %i3, ptr addrspace(1) %i4, align 4
; memory_phi_clobber2: same shape as memory_phi_clobber1 with the arms swapped:
; the first listed arm only calls s.barrier and the other stores 1 to %arg + 3.
; The opt-output assertions again expect no !amdgpu.noclobber on the load from
; %arg + 1; the llc-output assertions expect a global_load_dword ahead of the
; final global_store_dword.
146 ; GCN-LABEL: {{^}}memory_phi_clobber2:
147 ; GCN-DAG: s_load_dword s
148 ; GCN-DAG: global_store_dword
150 ; GCN: global_load_dword
151 ; GCN: global_store_dword
152 define amdgpu_kernel void @memory_phi_clobber2(ptr addrspace(1) %arg) {
153 ; CHECK-LABEL: @memory_phi_clobber2(
155 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
156 ; CHECK-NEXT: br i1 undef, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]], !amdgpu.uniform !0
158 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
159 ; CHECK-NEXT: br label [[IF_END:%.*]], !amdgpu.uniform !0
161 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3
162 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[GEP]], align 4
163 ; CHECK-NEXT: br label [[IF_END]], !amdgpu.uniform !0
165 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
166 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
167 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
168 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
169 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
170 ; CHECK-NEXT: ret void
173 %i = load i32, ptr addrspace(1) %arg, align 4
174 br i1 undef, label %if.then, label %if.else
177 tail call void @llvm.amdgcn.s.barrier()
181 %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
182 store i32 1, ptr addrspace(1) %gep, align 4
186 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
187 %i2 = load i32, ptr addrspace(1) %i1, align 4
188 %i3 = add i32 %i2, %i
189 %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
190 store i32 %i3, ptr addrspace(1) %i4, align 4
; no_clobbering_loop1: loads %arg once, then loops: load %arg + 1, add the two
; values, store the sum to %arg + 2, call wave.barrier, and branch back while
; %cc is true. The opt-output assertions expect !amdgpu.noclobber on both
; loads; the llc-output assertions expect two s_load_dword and no
; global_load_dword ahead of the global_store_dword.
194 ; GCN-LABEL: {{^}}no_clobbering_loop1:
195 ; GCN: s_load_dword s
196 ; GCN: s_load_dword s
197 ; GCN-NOT: global_load_dword
198 ; GCN: global_store_dword
199 define amdgpu_kernel void @no_clobbering_loop1(ptr addrspace(1) %arg, i1 %cc) {
200 ; CHECK-LABEL: @no_clobbering_loop1(
202 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
203 ; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
205 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
206 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
207 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
208 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2
209 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
210 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
211 ; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
213 ; CHECK-NEXT: ret void
216 %i = load i32, ptr addrspace(1) %arg, align 4
220 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
221 %i2 = load i32, ptr addrspace(1) %i1, align 4
222 %i3 = add i32 %i2, %i
223 %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
224 store i32 %i3, ptr addrspace(1) %i4, align 4
225 tail call void @llvm.amdgcn.wave.barrier()
226 br i1 %cc, label %while.cond, label %end
; no_clobbering_loop2: %arg and %out are both noalias. The loop keeps a counter
; %c and an accumulator %acc (seeded with the pre-loop load of %arg); each
; iteration loads %arg[%c], adds it to %acc, calls wave.barrier, increments %c
; and compares it with %n. The only store, to %out, happens after the loop.
; The opt-output assertions expect !amdgpu.noclobber on both loads; the
; llc-output assertions expect two s_load_dword and no global_load_dword.
232 ; GCN-LABEL: {{^}}no_clobbering_loop2:
233 ; GCN: s_load_dword s
234 ; GCN: s_load_dword s
235 ; GCN-NOT: global_load_dword
236 ; GCN: global_store_dword
237 define amdgpu_kernel void @no_clobbering_loop2(ptr addrspace(1) noalias %arg, ptr addrspace(1) noalias %out, i32 %n) {
238 ; CHECK-LABEL: @no_clobbering_loop2(
240 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
241 ; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
243 ; CHECK-NEXT: [[C:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[INC:%.*]], [[WHILE_COND]] ]
244 ; CHECK-NEXT: [[ACC:%.*]] = phi i32 [ [[I]], [[BB]] ], [ [[I3:%.*]], [[WHILE_COND]] ]
245 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i32 [[C]], !amdgpu.uniform !0
246 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4, !amdgpu.noclobber !0
247 ; CHECK-NEXT: [[I3]] = add i32 [[I2]], [[ACC]]
248 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
249 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[C]], 1
250 ; CHECK-NEXT: [[CC:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
251 ; CHECK-NEXT: br i1 [[CC]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
253 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[OUT:%.*]], align 4
254 ; CHECK-NEXT: ret void
257 %i = load i32, ptr addrspace(1) %arg, align 4
261 %c = phi i32 [ 0, %bb ], [ %inc, %while.cond ]
262 %acc = phi i32 [ %i, %bb ], [ %i3, %while.cond ]
263 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %c
264 %i2 = load i32, ptr addrspace(1) %i1, align 4
265 %i3 = add i32 %i2, %acc
266 tail call void @llvm.amdgcn.wave.barrier()
267 %inc = add nuw nsw i32 %c, 1
268 %cc = icmp eq i32 %inc, %n
269 br i1 %cc, label %while.cond, label %end
272 store i32 %i3, ptr addrspace(1) %out, align 4
; clobbering_loop: same loop shape as no_clobbering_loop1, but the in-loop store
; targets %out + 1, and %out is a separate pointer argument with no noalias
; attribute. The opt-output assertions expect !amdgpu.noclobber only on the
; pre-loop load, not on the in-loop load from %arg + 1; the llc-output
; assertions expect one s_load_dword followed by a global_load_dword.
276 ; GCN-LABEL: {{^}}clobbering_loop:
277 ; GCN: s_load_dword s
278 ; GCN: global_load_dword
279 ; GCN: global_store_dword
280 define amdgpu_kernel void @clobbering_loop(ptr addrspace(1) %arg, ptr addrspace(1) %out, i1 %cc) {
281 ; CHECK-LABEL: @clobbering_loop(
283 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
284 ; CHECK-NEXT: br label [[WHILE_COND:%.*]], !amdgpu.uniform !0
286 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 1, !amdgpu.uniform !0
287 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
288 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
289 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[OUT:%.*]], i64 1
290 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
291 ; CHECK-NEXT: tail call void @llvm.amdgcn.wave.barrier()
292 ; CHECK-NEXT: br i1 [[CC:%.*]], label [[WHILE_COND]], label [[END:%.*]], !amdgpu.uniform !0
294 ; CHECK-NEXT: ret void
297 %i = load i32, ptr addrspace(1) %arg, align 4
301 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 1
302 %i2 = load i32, ptr addrspace(1) %i1, align 4
303 %i3 = add i32 %i2, %i
304 %i4 = getelementptr inbounds i32, ptr addrspace(1) %out, i64 1
305 store i32 %i3, ptr addrspace(1) %i4, align 4
306 tail call void @llvm.amdgcn.wave.barrier()
307 br i1 %cc, label %while.cond, label %end
; clobber_by_atomic_load: a plain load from %arg, a seq_cst atomic load from
; %arg + 2, then a plain load from %arg + 3 whose sum with the first value is
; stored to %arg + 4. The opt-output assertions expect !amdgpu.noclobber on the
; first load and on the atomic load, but not on the load that follows the
; atomic load. The llc-output assertions expect an s_load_dword, a
; global_load_dword with glc, and then a further global_load_dword.
313 ; GCN-LABEL: {{^}}clobber_by_atomic_load:
314 ; GCN: s_load_dword s
315 ; GCN: global_load_dword {{.*}} glc
316 ; GCN: global_load_dword
317 ; GCN: global_store_dword
318 define amdgpu_kernel void @clobber_by_atomic_load(ptr addrspace(1) %arg) {
319 ; CHECK-LABEL: @clobber_by_atomic_load(
321 ; CHECK-NEXT: [[I:%.*]] = load i32, ptr addrspace(1) [[ARG:%.*]], align 4, !amdgpu.noclobber !0
322 ; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 2, !amdgpu.uniform !0
323 ; CHECK-NEXT: [[VAL:%.*]] = load atomic i32, ptr addrspace(1) [[GEP]] seq_cst, align 4, !amdgpu.noclobber !0
324 ; CHECK-NEXT: [[I1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 3, !amdgpu.uniform !0
325 ; CHECK-NEXT: [[I2:%.*]] = load i32, ptr addrspace(1) [[I1]], align 4
326 ; CHECK-NEXT: [[I3:%.*]] = add i32 [[I2]], [[I]]
327 ; CHECK-NEXT: [[I4:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[ARG]], i64 4
328 ; CHECK-NEXT: store i32 [[I3]], ptr addrspace(1) [[I4]], align 4
329 ; CHECK-NEXT: ret void
332 %i = load i32, ptr addrspace(1) %arg, align 4
333 %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 2
334 %val = load atomic i32, ptr addrspace(1) %gep seq_cst, align 4
335 %i1 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 3
336 %i2 = load i32, ptr addrspace(1) %i1, align 4
337 %i3 = add i32 %i2, %i
338 %i4 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 4
339 store i32 %i3, ptr addrspace(1) %i4, align 4
; no_alias_store: stores 0 to the addrspace(3) global @LDS, runs a workgroup
; release fence / s.barrier / workgroup acquire fence sequence, then loads from
; the addrspace(1) pointer %in and stores the value to %out. The opt-output
; assertions expect !amdgpu.noclobber on the load; the llc-output assertions
; expect an s_load_dword and no global_load_dword.
343 ; GCN-LABEL: {{^}}no_alias_store:
346 ; GCN: s_load_dword s
347 ; GCN-NOT: global_load_dword
348 ; GCN: global_store_dword
349 define protected amdgpu_kernel void @no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
350 ; CHECK-LABEL: @no_alias_store(
352 ; CHECK-NEXT: store i32 0, ptr addrspace(3) @LDS, align 4
353 ; CHECK-NEXT: fence syncscope("workgroup") release
354 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
355 ; CHECK-NEXT: fence syncscope("workgroup") acquire
356 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
357 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
358 ; CHECK-NEXT: ret void
361 store i32 0, ptr addrspace(3) @LDS, align 4
362 fence syncscope("workgroup") release
363 tail call void @llvm.amdgcn.s.barrier()
364 fence syncscope("workgroup") acquire
365 %ld = load i32, ptr addrspace(1) %in, align 4
366 store i32 %ld, ptr addrspace(1) %out, align 4
; may_alias_store: counterpart of no_alias_store in which the initial store of 0
; goes to the addrspace(1) pointer %out instead of @LDS. The opt-output
; assertions expect the later load from %in to carry no !amdgpu.noclobber; the
; llc-output assertions expect a global_load_dword between two
; global_store_dword instructions.
370 ; GCN-LABEL: {{^}}may_alias_store:
371 ; GCN: global_store_dword
373 ; GCN: global_load_dword
374 ; GCN: global_store_dword
375 define protected amdgpu_kernel void @may_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
376 ; CHECK-LABEL: @may_alias_store(
378 ; CHECK-NEXT: store i32 0, ptr addrspace(1) [[OUT:%.*]], align 4
379 ; CHECK-NEXT: fence syncscope("workgroup") release
380 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
381 ; CHECK-NEXT: fence syncscope("workgroup") acquire
382 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
383 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
384 ; CHECK-NEXT: ret void
387 store i32 0, ptr addrspace(1) %out, align 4
388 fence syncscope("workgroup") release
389 tail call void @llvm.amdgcn.s.barrier()
390 fence syncscope("workgroup") acquire
391 %ld = load i32, ptr addrspace(1) %in, align 4
392 store i32 %ld, ptr addrspace(1) %out, align 4
; no_alias_volatile_store: same as no_alias_store except that the store of 0 to
; @LDS is volatile. The opt-output assertions still expect !amdgpu.noclobber on
; the load from %in; the llc-output assertions expect an s_load_dword and no
; global_load_dword.
396 ; GCN-LABEL: {{^}}no_alias_volatile_store:
399 ; GCN: s_load_dword s
400 ; GCN-NOT: global_load_dword
401 ; GCN: global_store_dword
402 define protected amdgpu_kernel void @no_alias_volatile_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
403 ; CHECK-LABEL: @no_alias_volatile_store(
405 ; CHECK-NEXT: store volatile i32 0, ptr addrspace(3) @LDS, align 4
406 ; CHECK-NEXT: fence syncscope("workgroup") release
407 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
408 ; CHECK-NEXT: fence syncscope("workgroup") acquire
409 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
410 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
411 ; CHECK-NEXT: ret void
414 store volatile i32 0, ptr addrspace(3) @LDS, align 4
415 fence syncscope("workgroup") release
416 tail call void @llvm.amdgcn.s.barrier()
417 fence syncscope("workgroup") acquire
418 %ld = load i32, ptr addrspace(1) %in, align 4
419 store i32 %ld, ptr addrspace(1) %out, align 4
; no_alias_atomic_rmw_relaxed: a monotonic atomicrmw add on @LDS followed
; directly (no fence or barrier) by a load from %in and a store to %out. The
; opt-output assertions expect !amdgpu.noclobber on the load; the llc-output
; assertions expect an s_load_dword and no global_load_dword.
423 ; GCN-LABEL: {{^}}no_alias_atomic_rmw_relaxed:
425 ; GCN: s_load_dword s
426 ; GCN-NOT: global_load_dword
427 ; GCN: global_store_dword
428 define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(ptr addrspace(1) %in, ptr addrspace(1) %out) {
429 ; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
431 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic, align 4
432 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
433 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
434 ; CHECK-NEXT: ret void
437 %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 monotonic
438 %ld = load i32, ptr addrspace(1) %in, align 4
439 store i32 %ld, ptr addrspace(1) %out, align 4
; no_alias_atomic_cmpxchg: a seq_cst/monotonic cmpxchg on @LDS (expected value
; 7, new value %swap), then the workgroup fence / s.barrier / fence sequence,
; then a load from %in and a store to %out. The opt-output assertions expect
; !amdgpu.noclobber on the load; the llc-output assertions expect an
; s_load_dword and no global_load_dword.
443 ; GCN-LABEL: {{^}}no_alias_atomic_cmpxchg:
445 ; GCN: s_load_dword s
446 ; GCN-NOT: global_load_dword
447 ; GCN: global_store_dword
448 define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
449 ; CHECK-LABEL: @no_alias_atomic_cmpxchg(
451 ; CHECK-NEXT: [[UNUSED:%.*]] = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
452 ; CHECK-NEXT: fence syncscope("workgroup") release
453 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
454 ; CHECK-NEXT: fence syncscope("workgroup") acquire
455 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
456 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
457 ; CHECK-NEXT: ret void
460 %unused = cmpxchg ptr addrspace(3) @LDS, i32 7, i32 %swap seq_cst monotonic
461 fence syncscope("workgroup") release
462 tail call void @llvm.amdgcn.s.barrier()
463 fence syncscope("workgroup") acquire
464 %ld = load i32, ptr addrspace(1) %in, align 4
465 store i32 %ld, ptr addrspace(1) %out, align 4
; no_alias_atomic_rmw: a seq_cst atomicrmw add on @LDS, then the workgroup
; fence / s.barrier / fence sequence, then a load from %in and a store to %out.
; The opt-output assertions expect !amdgpu.noclobber on the load; the
; llc-output assertions expect an s_load_dword and no global_load_dword.
469 ; GCN-LABEL: {{^}}no_alias_atomic_rmw:
471 ; GCN: s_load_dword s
472 ; GCN-NOT: global_load_dword
473 ; GCN: global_store_dword
474 define protected amdgpu_kernel void @no_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
475 ; CHECK-LABEL: @no_alias_atomic_rmw(
477 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
478 ; CHECK-NEXT: fence syncscope("workgroup") release
479 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
480 ; CHECK-NEXT: fence syncscope("workgroup") acquire
481 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
482 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
483 ; CHECK-NEXT: ret void
486 %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
487 fence syncscope("workgroup") release
488 tail call void @llvm.amdgcn.s.barrier()
489 fence syncscope("workgroup") acquire
490 %ld = load i32, ptr addrspace(1) %in, align 4
491 store i32 %ld, ptr addrspace(1) %out, align 4
; may_alias_atomic_cmpxchg: counterpart of no_alias_atomic_cmpxchg in which the
; cmpxchg operates on the addrspace(1) pointer %out instead of @LDS. The
; opt-output assertions expect the later load from %in to carry no
; !amdgpu.noclobber; the llc-output assertions expect a global_atomic_cmpswap
; followed by a global_load_dword.
495 ; GCN-LABEL: {{^}}may_alias_atomic_cmpxchg:
496 ; GCN: global_atomic_cmpswap
497 ; GCN: global_load_dword
498 ; GCN: global_store_dword
499 define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(ptr addrspace(1) %in, ptr addrspace(1) %out, i32 %swap) {
500 ; CHECK-LABEL: @may_alias_atomic_cmpxchg(
502 ; CHECK-NEXT: [[UNUSED:%.*]] = cmpxchg ptr addrspace(1) [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
503 ; CHECK-NEXT: fence syncscope("workgroup") release
504 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
505 ; CHECK-NEXT: fence syncscope("workgroup") acquire
506 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
507 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
508 ; CHECK-NEXT: ret void
511 %unused = cmpxchg ptr addrspace(1) %out, i32 7, i32 %swap seq_cst monotonic
512 fence syncscope("workgroup") release
513 tail call void @llvm.amdgcn.s.barrier()
514 fence syncscope("workgroup") acquire
515 %ld = load i32, ptr addrspace(1) %in, align 4
516 store i32 %ld, ptr addrspace(1) %out, align 4
; may_alias_atomic_rmw: counterpart of no_alias_atomic_rmw in which the seq_cst
; atomicrmw add (syncscope "agent") operates on the addrspace(1) pointer %out.
; The opt-output assertions expect the later load from %in to carry no
; !amdgpu.noclobber; the llc-output assertions expect a global_atomic_add
; followed by a global_load_dword.
520 ; GCN-LABEL: {{^}}may_alias_atomic_rmw:
521 ; GCN: global_atomic_add
522 ; GCN: global_load_dword
523 ; GCN: global_store_dword
524 define protected amdgpu_kernel void @may_alias_atomic_rmw(ptr addrspace(1) %in, ptr addrspace(1) %out) {
525 ; CHECK-LABEL: @may_alias_atomic_rmw(
527 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(1) [[OUT:%.*]], i32 5 syncscope("agent") seq_cst, align 4
528 ; CHECK-NEXT: fence syncscope("workgroup") release
529 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
530 ; CHECK-NEXT: fence syncscope("workgroup") acquire
531 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
532 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
533 ; CHECK-NEXT: ret void
536 %unused = atomicrmw add ptr addrspace(1) %out, i32 5 syncscope("agent") seq_cst
537 fence syncscope("workgroup") release
538 tail call void @llvm.amdgcn.s.barrier()
539 fence syncscope("workgroup") acquire
540 %ld = load i32, ptr addrspace(1) %in, align 4
541 store i32 %ld, ptr addrspace(1) %out, align 4
; no_alias_atomic_rmw_then_clobber: stores 1 to %out and 2 to the noalias
; pointer %noalias, then performs a seq_cst atomicrmw add on @LDS and the
; workgroup fence / s.barrier / fence sequence before loading from %in. The
; opt-output assertions expect the load to carry no !amdgpu.noclobber; the
; llc-output assertions expect two global_store_dword, then a
; global_load_dword, then the final global_store_dword.
545 ; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_clobber:
546 ; GCN: global_store_dword
547 ; GCN: global_store_dword
549 ; GCN: global_load_dword
550 ; GCN: global_store_dword
551 define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
552 ; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
554 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[OUT:%.*]], align 4
555 ; CHECK-NEXT: store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
556 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
557 ; CHECK-NEXT: fence syncscope("workgroup") release
558 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
559 ; CHECK-NEXT: fence syncscope("workgroup") acquire
560 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4
561 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT]], align 4
562 ; CHECK-NEXT: ret void
565 store i32 1, ptr addrspace(1) %out, align 4
566 store i32 2, ptr addrspace(1) %noalias, align 4
567 %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
568 fence syncscope("workgroup") release
569 tail call void @llvm.amdgcn.s.barrier()
570 fence syncscope("workgroup") acquire
571 %ld = load i32, ptr addrspace(1) %in, align 4
572 store i32 %ld, ptr addrspace(1) %out, align 4
; no_alias_atomic_rmw_then_no_alias_store: like
; no_alias_atomic_rmw_then_clobber but without the initial store to %out; the
; only store ahead of the load targets the noalias pointer %noalias. The
; opt-output assertions expect !amdgpu.noclobber on the load from %in; the
; llc-output assertions expect a global_store_dword, then an s_load_dword, and
; no global_load_dword ahead of the final global_store_dword.
576 ; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store:
577 ; GCN: global_store_dword
579 ; GCN: s_load_dword s
580 ; GCN-NOT: global_load_dword
581 ; GCN: global_store_dword
582 define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(ptr addrspace(1) %in, ptr addrspace(1) %out, ptr addrspace(1) noalias %noalias) {
583 ; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
585 ; CHECK-NEXT: store i32 2, ptr addrspace(1) [[NOALIAS:%.*]], align 4
586 ; CHECK-NEXT: [[UNUSED:%.*]] = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst, align 4
587 ; CHECK-NEXT: fence syncscope("workgroup") release
588 ; CHECK-NEXT: tail call void @llvm.amdgcn.s.barrier()
589 ; CHECK-NEXT: fence syncscope("workgroup") acquire
590 ; CHECK-NEXT: [[LD:%.*]] = load i32, ptr addrspace(1) [[IN:%.*]], align 4, !amdgpu.noclobber !0
591 ; CHECK-NEXT: store i32 [[LD]], ptr addrspace(1) [[OUT:%.*]], align 4
592 ; CHECK-NEXT: ret void
595 store i32 2, ptr addrspace(1) %noalias, align 4
596 %unused = atomicrmw add ptr addrspace(3) @LDS, i32 5 seq_cst
597 fence syncscope("workgroup") release
598 tail call void @llvm.amdgcn.s.barrier()
599 fence syncscope("workgroup") acquire
600 %ld = load i32, ptr addrspace(1) %in, align 4
601 store i32 %ld, ptr addrspace(1) %out, align 4
; Declarations of the two AMDGPU barrier intrinsics called by the kernels in
; this file; both take no arguments and return void.
605 declare void @llvm.amdgcn.s.barrier()
606 declare void @llvm.amdgcn.wave.barrier()