1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR %s
; Volatile seq_cst `atomicrmw add` of %in at a constant offset of 4 i32 elements
; from %out; the atomic result is unused.
; Expected: %in is multiplied by ctpop of the 64-bit ballot mask, and only the
; lane whose mbcnt.lo/mbcnt.hi value is 0 branches to the single atomic, which
; adds that product.
4 define amdgpu_kernel void @atomic_add_i32_offset(ptr addrspace(1) %out, i32 %in) {
5 ; IR-LABEL: @atomic_add_i32_offset(
7 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
8 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
9 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
10 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
11 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
12 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
13 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
14 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
15 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
16 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
17 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
18 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
20 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
21 ; IR-NEXT: br label [[TMP12]]
; Input: address is %out + 4 elements; %val has no users.
26 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
27 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
; Same add pattern as the constant-offset case, but with a negative GEP index
; of -1024 i32 elements; the atomic result is unused.
; Expected: the GEP is kept as written and the atomic is rewritten to a single
; add of %in * ctpop(ballot), guarded by the mbcnt == 0 comparison.
31 define amdgpu_kernel void @atomic_add_i32_max_neg_offset(ptr addrspace(1) %out, i32 %in) {
32 ; IR-LABEL: @atomic_add_i32_max_neg_offset(
34 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 -1024
35 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
36 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
37 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
38 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
39 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
40 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
41 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
42 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
43 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
44 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
45 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
47 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
48 ; IR-NEXT: br label [[TMP12]]
; Input: address is %out - 1024 elements; %val has no users.
53 %gep = getelementptr i32, ptr addrspace(1) %out, i64 -1024
54 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
; Add pattern with a larger constant GEP index of 9000 i32 elements; the atomic
; result is unused.
; Expected: same rewrite as the other no-return add tests (ballot, mbcnt.lo/hi,
; ctpop, mul, compare with 0, one guarded atomic on the unchanged GEP).
58 define amdgpu_kernel void @atomic_add_i32_soffset(ptr addrspace(1) %out, i32 %in) {
59 ; IR-LABEL: @atomic_add_i32_soffset(
61 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 9000
62 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
63 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
64 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
65 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
66 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
67 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
68 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
69 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
70 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
71 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
72 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
74 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
75 ; IR-NEXT: br label [[TMP12]]
; Input: address is %out + 9000 elements; %val has no users.
80 %gep = getelementptr i32, ptr addrspace(1) %out, i64 9000
81 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
; Add pattern with a very large constant GEP index (47224239175595 i32
; elements, a value that does not fit in 32 bits); the atomic result is unused.
; Expected: the 64-bit index is preserved verbatim and the atomic is rewritten
; to one guarded add of %in * ctpop(ballot).
85 define amdgpu_kernel void @atomic_add_i32_huge_offset(ptr addrspace(1) %out, i32 %in) {
86 ; IR-LABEL: @atomic_add_i32_huge_offset(
88 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 47224239175595
89 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
90 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
91 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
92 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
93 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
94 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
95 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
96 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
97 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
98 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
99 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
101 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
102 ; IR-NEXT: br label [[TMP12]]
; Input: address is %out + 47224239175595 elements; %val has no users.
107 %gep = getelementptr i32, ptr addrspace(1) %out, i64 47224239175595
109 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
; Returning variant: volatile seq_cst `atomicrmw add` at %out + 4 elements whose
; old value is stored to %out2.
; Expected: one guarded atomic adds %in * ctpop(ballot); afterwards a phi
; (poison from the entry edge, the atomic result otherwise) is passed through
; readfirstlane, and the stored value is that result plus %in * mbcnt value.
113 define amdgpu_kernel void @atomic_add_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
114 ; IR-LABEL: @atomic_add_i32_ret_offset(
116 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
117 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
118 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
119 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
120 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
121 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
122 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
123 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
124 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
125 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
126 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
127 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
129 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
130 ; IR-NEXT: br label [[TMP12]]
132 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
133 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
134 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
135 ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
136 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: the old value returned by the atomic is written to %out2.
140 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
141 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
142 store i32 %val, ptr addrspace(1) %out2
; Add on an address built from two GEPs: %out indexed by the i64 argument
; %index, then a constant offset of 4 elements; the atomic result is unused.
; Expected: both GEPs are kept ahead of the inserted ballot/mbcnt/ctpop
; sequence, and the single guarded atomic adds %in * ctpop(ballot) at [[GEP]].
146 define amdgpu_kernel void @atomic_add_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
147 ; IR-LABEL: @atomic_add_i32_addr64_offset(
149 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
150 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
151 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
152 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
153 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
154 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
155 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
156 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
157 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
158 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
159 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
160 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
161 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
163 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
164 ; IR-NEXT: br label [[TMP12]]
; Input: address is (%out + %index elements) + 4 elements; %val has no users.
169 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
170 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
171 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
; Returning add on an address built from %out indexed by %index plus a constant
; offset of 4 elements; the old value is stored to %out2.
; Expected: one guarded atomic adds %in * ctpop(ballot); the stored value is
; readfirstlane(phi of the atomic result) + %in * mbcnt value.
175 define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
176 ; IR-LABEL: @atomic_add_i32_ret_addr64_offset(
178 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
179 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
180 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
181 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
182 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
183 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
184 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
185 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
186 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
187 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
188 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
189 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
190 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
192 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
193 ; IR-NEXT: br label [[TMP12]]
195 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
196 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
197 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
198 ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
199 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: indexed address plus constant offset; the old value goes to %out2.
203 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
204 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
205 %val = atomicrmw volatile add ptr addrspace(1) %gep, i32 %in seq_cst
206 store i32 %val, ptr addrspace(1) %out2
; Baseline add: volatile seq_cst `atomicrmw add` of %in directly on %out (no
; GEP); the atomic result is unused.
; Expected: ballot/mbcnt/ctpop sequence, then one guarded atomic on %out that
; adds %in * ctpop(ballot).
210 define amdgpu_kernel void @atomic_add_i32(ptr addrspace(1) %out, i32 %in) {
211 ; IR-LABEL: @atomic_add_i32(
213 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
214 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
215 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
216 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
217 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
218 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
219 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
220 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
221 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
222 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
223 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
225 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
226 ; IR-NEXT: br label [[TMP12]]
; Input: the atomic operates on %out itself; %val has no users.
231 %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
; Returning baseline add: `atomicrmw add` directly on %out with the old value
; stored to %out2.
; Expected: one guarded atomic adds %in * ctpop(ballot); the stored value is
; readfirstlane(phi of the atomic result) + %in * mbcnt value.
235 define amdgpu_kernel void @atomic_add_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
236 ; IR-LABEL: @atomic_add_i32_ret(
238 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
239 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
240 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
241 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
242 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
243 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
244 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
245 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
246 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
247 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
248 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
250 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
251 ; IR-NEXT: br label [[TMP12]]
253 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
254 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
255 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
256 ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
257 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: the atomic operates on %out itself; the old value goes to %out2.
261 %val = atomicrmw volatile add ptr addrspace(1) %out, i32 %in seq_cst
262 store i32 %val, ptr addrspace(1) %out2
; Add on %out indexed by the i64 argument %index (single GEP, no constant
; offset); the atomic result is unused.
; Expected: the GEP is kept, followed by the ballot/mbcnt/ctpop sequence and
; one guarded atomic at [[PTR]] that adds %in * ctpop(ballot).
266 define amdgpu_kernel void @atomic_add_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
267 ; IR-LABEL: @atomic_add_i32_addr64(
269 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
270 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
271 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
272 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
273 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
274 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
275 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
276 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
277 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
278 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
279 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
280 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
282 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
283 ; IR-NEXT: br label [[TMP12]]
; Input: address is %out + %index elements; %val has no users.
288 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
289 %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning add on %out indexed by %index; the old value is stored to %out2.
; Expected: one guarded atomic at [[PTR]] adds %in * ctpop(ballot); the stored
; value is readfirstlane(phi of the atomic result) + %in * mbcnt value.
293 define amdgpu_kernel void @atomic_add_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
294 ; IR-LABEL: @atomic_add_i32_ret_addr64(
296 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
297 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
298 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
299 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
300 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
301 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
302 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
303 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
304 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
305 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
306 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
307 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
309 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile add ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
310 ; IR-NEXT: br label [[TMP12]]
312 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
313 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
314 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
315 ; IR-NEXT: [[TMP16:%.*]] = add i32 [[TMP14]], [[TMP15]]
316 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: address is %out + %index elements; the old value goes to %out2.
320 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
321 %val = atomicrmw volatile add ptr addrspace(1) %ptr, i32 %in seq_cst
322 store i32 %val, ptr addrspace(1) %out2
; Volatile seq_cst `atomicrmw and` of %in at %out + 4 elements; the atomic
; result is unused.
; Expected: unlike the add tests there is no ctpop/mul step; the single guarded
; atomic (taken when the mbcnt value is 0) uses %in unchanged as its operand.
326 define amdgpu_kernel void @atomic_and_i32_offset(ptr addrspace(1) %out, i32 %in) {
327 ; IR-LABEL: @atomic_and_i32_offset(
329 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
330 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
331 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
332 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
333 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
334 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
335 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
336 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
337 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
339 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
340 ; IR-NEXT: br label [[TMP9]]
; Input: address is %out + 4 elements; %val has no users.
345 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
346 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
; Returning `atomicrmw and` at %out + 4 elements; the old value is stored to
; %out2.
; Expected: one guarded atomic with %in as operand; the stored value is
; readfirstlane(phi of the atomic result) ANDed with a select that yields -1
; when the mbcnt == 0 comparison is true and %in otherwise.
350 define amdgpu_kernel void @atomic_and_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
351 ; IR-LABEL: @atomic_and_i32_ret_offset(
353 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
354 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
355 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
356 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
357 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
358 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
359 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
360 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
361 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
363 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
364 ; IR-NEXT: br label [[TMP9]]
366 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
367 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
368 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
369 ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
370 ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: the old value returned by the atomic is written to %out2.
374 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
375 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
376 store i32 %val, ptr addrspace(1) %out2
; `atomicrmw and` on an address built from %out indexed by %index plus a
; constant offset of 4 elements; the atomic result is unused.
; Expected: both GEPs are kept, then ballot/mbcnt and one guarded atomic at
; [[GEP]] with %in unchanged as operand.
380 define amdgpu_kernel void @atomic_and_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
381 ; IR-LABEL: @atomic_and_i32_addr64_offset(
383 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
384 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
385 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
386 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
387 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
388 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
389 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
390 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
391 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
392 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
394 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
395 ; IR-NEXT: br label [[TMP9]]
; Input: address is (%out + %index elements) + 4 elements; %val has no users.
400 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
401 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
402 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
; Returning `atomicrmw and` on %out indexed by %index plus a constant offset of
; 4 elements; the old value is stored to %out2.
; Expected: one guarded atomic with %in as operand; the stored value is
; readfirstlane(phi of the atomic result) ANDed with select(cmp, -1, %in).
406 define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
407 ; IR-LABEL: @atomic_and_i32_ret_addr64_offset(
409 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
410 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
411 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
412 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
413 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
414 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
415 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
416 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
417 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
418 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
420 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
421 ; IR-NEXT: br label [[TMP9]]
423 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
424 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
425 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
426 ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
427 ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: indexed address plus constant offset; the old value goes to %out2.
431 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
432 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
433 %val = atomicrmw volatile and ptr addrspace(1) %gep, i32 %in seq_cst
434 store i32 %val, ptr addrspace(1) %out2
; Baseline and: volatile seq_cst `atomicrmw and` of %in directly on %out; the
; atomic result is unused.
; Expected: ballot/mbcnt sequence and one guarded atomic on %out with %in
; unchanged as operand (no ctpop/mul).
438 define amdgpu_kernel void @atomic_and_i32(ptr addrspace(1) %out, i32 %in) {
439 ; IR-LABEL: @atomic_and_i32(
441 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
442 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
443 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
444 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
445 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
446 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
447 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
448 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
450 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
451 ; IR-NEXT: br label [[TMP9]]
; Input: the atomic operates on %out itself; %val has no users.
456 %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
; Returning baseline and: `atomicrmw and` directly on %out with the old value
; stored to %out2.
; Expected: one guarded atomic with %in as operand; the stored value is
; readfirstlane(phi of the atomic result) ANDed with select(cmp, -1, %in).
460 define amdgpu_kernel void @atomic_and_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
461 ; IR-LABEL: @atomic_and_i32_ret(
463 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
464 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
465 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
466 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
467 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
468 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
469 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
470 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
472 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] seq_cst, align 4
473 ; IR-NEXT: br label [[TMP9]]
475 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
476 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
477 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
478 ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
479 ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: the atomic operates on %out itself; the old value goes to %out2.
483 %val = atomicrmw volatile and ptr addrspace(1) %out, i32 %in seq_cst
484 store i32 %val, ptr addrspace(1) %out2
; `atomicrmw and` on %out indexed by the i64 argument %index (single GEP); the
; atomic result is unused.
; Expected: the GEP is kept, then ballot/mbcnt and one guarded atomic at
; [[PTR]] with %in unchanged as operand.
488 define amdgpu_kernel void @atomic_and_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
489 ; IR-LABEL: @atomic_and_i32_addr64(
491 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
492 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
493 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
494 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
495 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
496 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
497 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
498 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
499 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
501 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
502 ; IR-NEXT: br label [[TMP9]]
; Input: address is %out + %index elements; %val has no users.
507 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
508 %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
; Returning `atomicrmw and` on %out indexed by %index; the old value is stored
; to %out2.
; Expected: one guarded atomic at [[PTR]] with %in as operand; the stored value
; is readfirstlane(phi of the atomic result) ANDed with select(cmp, -1, %in).
512 define amdgpu_kernel void @atomic_and_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
513 ; IR-LABEL: @atomic_and_i32_ret_addr64(
515 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
516 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
517 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
518 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
519 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
520 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
521 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
522 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
523 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
525 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile and ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] seq_cst, align 4
526 ; IR-NEXT: br label [[TMP9]]
528 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
529 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
530 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -1, i32 [[IN]]
531 ; IR-NEXT: [[TMP13:%.*]] = and i32 [[TMP11]], [[TMP12]]
532 ; IR-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: address is %out + %index elements; the old value goes to %out2.
536 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
537 %val = atomicrmw volatile and ptr addrspace(1) %ptr, i32 %in seq_cst
538 store i32 %val, ptr addrspace(1) %out2
; Volatile seq_cst `atomicrmw sub` of %in at %out + 4 elements; the atomic
; result is unused.
; Expected: same shape as the add tests: %in is multiplied by ctpop of the
; ballot mask and one guarded atomic subtracts that product.
542 define amdgpu_kernel void @atomic_sub_i32_offset(ptr addrspace(1) %out, i32 %in) {
543 ; IR-LABEL: @atomic_sub_i32_offset(
545 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
546 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
547 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
548 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
549 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
550 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
551 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
552 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
553 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
554 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
555 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
556 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
558 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
559 ; IR-NEXT: br label [[TMP12]]
; Input: address is %out + 4 elements; %val has no users.
564 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
565 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
; Returning `atomicrmw sub` at %out + 4 elements; the old value is stored to
; %out2.
; Expected: one guarded atomic subtracts %in * ctpop(ballot); the stored value
; is readfirstlane(phi of the atomic result) minus %in * mbcnt value.
569 define amdgpu_kernel void @atomic_sub_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
570 ; IR-LABEL: @atomic_sub_i32_ret_offset(
572 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
573 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
574 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
575 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
576 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
577 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
578 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
579 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
580 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
581 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
582 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
583 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
585 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
586 ; IR-NEXT: br label [[TMP12]]
588 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
589 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
590 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
591 ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
592 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: the old value returned by the atomic is written to %out2.
596 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
597 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
598 store i32 %val, ptr addrspace(1) %out2
; `atomicrmw sub` on an address built from %out indexed by %index plus a
; constant offset of 4 elements; the atomic result is unused.
; Expected: both GEPs are kept, then ballot/mbcnt/ctpop and one guarded atomic
; at [[GEP]] that subtracts %in * ctpop(ballot).
602 define amdgpu_kernel void @atomic_sub_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
603 ; IR-LABEL: @atomic_sub_i32_addr64_offset(
605 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
606 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
607 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
608 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
609 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
610 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
611 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
612 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
613 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
614 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
615 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
616 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
617 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
619 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
620 ; IR-NEXT: br label [[TMP12]]
; Input: address is (%out + %index elements) + 4 elements; %val has no users.
625 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
626 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
627 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
; Returning `atomicrmw sub` on %out indexed by %index plus a constant offset of
; 4 elements; the old value is stored to %out2.
; Expected: one guarded atomic subtracts %in * ctpop(ballot); the stored value
; is readfirstlane(phi of the atomic result) minus %in * mbcnt value.
631 define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
632 ; IR-LABEL: @atomic_sub_i32_ret_addr64_offset(
634 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
635 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
636 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
637 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
638 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
639 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
640 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
641 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
642 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
643 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
644 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
645 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
646 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
648 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[GEP]], i32 [[TMP8]] seq_cst, align 4
649 ; IR-NEXT: br label [[TMP12]]
651 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
652 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
653 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
654 ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
655 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: indexed address plus constant offset; the old value goes to %out2.
659 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
660 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
661 %val = atomicrmw volatile sub ptr addrspace(1) %gep, i32 %in seq_cst
662 store i32 %val, ptr addrspace(1) %out2
; Baseline sub: volatile seq_cst `atomicrmw sub` of %in directly on %out; the
; atomic result is unused.
; Expected: ballot/mbcnt/ctpop sequence, then one guarded atomic on %out that
; subtracts %in * ctpop(ballot).
666 define amdgpu_kernel void @atomic_sub_i32(ptr addrspace(1) %out, i32 %in) {
667 ; IR-LABEL: @atomic_sub_i32(
669 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
670 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
671 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
672 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
673 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
674 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
675 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
676 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
677 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
678 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
679 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
681 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
682 ; IR-NEXT: br label [[TMP12]]
; Input: the atomic operates on %out itself; %val has no users.
687 %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
; Test: seq_cst `atomicrmw volatile sub` of %in directly on %out, with the
; returned %val stored to %out2.
; The checks expect a single atomicrmw sub of %in * ctpop(ballot) behind a
; branch on `mbcnt.hi(...) == 0`, and the stored value to be
; readfirstlane(phi of the atomic result) minus %in * mbcnt.
691 define amdgpu_kernel void @atomic_sub_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
692 ; IR-LABEL: @atomic_sub_i32_ret(
694 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
695 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
696 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
697 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
698 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
699 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
700 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
701 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
702 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
703 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
704 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
706 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[OUT:%.*]], i32 [[TMP8]] seq_cst, align 4
707 ; IR-NEXT: br label [[TMP12]]
709 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
710 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
711 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
712 ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
713 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
717 %val = atomicrmw volatile sub ptr addrspace(1) %out, i32 %in seq_cst
718 store i32 %val, ptr addrspace(1) %out2
; Test: seq_cst `atomicrmw volatile sub` of %in on %out indexed by the i64
; %index argument; no store of %val appears among the lines shown.
; The checks expect the getelementptr to stay ahead of the ballot/mbcnt
; sequence, and a single atomicrmw sub of %in * ctpop(ballot) on that pointer
; behind a branch on `mbcnt.hi(...) == 0`.
722 define amdgpu_kernel void @atomic_sub_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
723 ; IR-LABEL: @atomic_sub_i32_addr64(
725 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
726 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
727 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
728 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
729 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
730 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
731 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
732 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
733 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
734 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
735 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
736 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
738 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
739 ; IR-NEXT: br label [[TMP12]]
744 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
745 %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
; Test: seq_cst `atomicrmw volatile sub` of %in on %out indexed by the i64
; %index argument, with the returned %val stored to %out2.
; The checks expect a single atomicrmw sub of %in * ctpop(ballot) behind a
; branch on `mbcnt.hi(...) == 0`, and the stored value to be
; readfirstlane(phi of the atomic result) minus %in * mbcnt.
749 define amdgpu_kernel void @atomic_sub_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
750 ; IR-LABEL: @atomic_sub_i32_ret_addr64(
752 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
753 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
754 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
755 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
756 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
757 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
758 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
759 ; IR-NEXT: [[TMP6:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP0]])
760 ; IR-NEXT: [[TMP7:%.*]] = trunc i64 [[TMP6]] to i32
761 ; IR-NEXT: [[TMP8:%.*]] = mul i32 [[IN:%.*]], [[TMP7]]
762 ; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP5]], 0
763 ; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
765 ; IR-NEXT: [[TMP11:%.*]] = atomicrmw volatile sub ptr addrspace(1) [[PTR]], i32 [[TMP8]] seq_cst, align 4
766 ; IR-NEXT: br label [[TMP12]]
768 ; IR-NEXT: [[TMP13:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP11]], [[TMP10]] ]
769 ; IR-NEXT: [[TMP14:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP13]])
770 ; IR-NEXT: [[TMP15:%.*]] = mul i32 [[IN]], [[TMP5]]
771 ; IR-NEXT: [[TMP16:%.*]] = sub i32 [[TMP14]], [[TMP15]]
772 ; IR-NEXT: store i32 [[TMP16]], ptr addrspace(1) [[OUT2:%.*]], align 4
776 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
777 %val = atomicrmw volatile sub ptr addrspace(1) %ptr, i32 %in seq_cst
778 store i32 %val, ptr addrspace(1) %out2
; Test: seq_cst `atomicrmw volatile max` of %in on %out offset by 4 i32
; elements; no store of %val appears among the lines shown.
; The checks expect the atomicrmw to keep %in as its operand (no ctpop/mul)
; and to be placed behind a branch on `mbcnt.hi(...) == 0`.
; NOTE(review): this atomicrmw carries no syncscope, while the other max tests
; nearby use syncscope("workgroup") -- confirm that the difference is intended.
782 define amdgpu_kernel void @atomic_max_i32_offset(ptr addrspace(1) %out, i32 %in) {
783 ; IR-LABEL: @atomic_max_i32_offset(
785 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
786 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
787 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
788 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
789 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
790 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
791 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
792 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
793 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
795 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] seq_cst, align 4
796 ; IR-NEXT: br label [[TMP9]]
801 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
802 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in on %out offset
; by 4 i32 elements, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the signed maximum
; (icmp sgt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, -2147483648, %in)`.
806 define amdgpu_kernel void @atomic_max_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
807 ; IR-LABEL: @atomic_max_i32_ret_offset(
809 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
810 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
811 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
812 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
813 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
814 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
815 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
816 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
817 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
819 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
820 ; IR-NEXT: br label [[TMP9]]
822 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
823 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
824 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
825 ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
826 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
827 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
831 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
832 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
833 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in on %out
; indexed by %index and then offset by 4 i32 elements; no store of %val
; appears among the lines shown.
; The checks expect both getelementptrs ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
837 define amdgpu_kernel void @atomic_max_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
838 ; IR-LABEL: @atomic_max_i32_addr64_offset(
840 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
841 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
842 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
843 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
844 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
845 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
846 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
847 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
848 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
849 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
851 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
852 ; IR-NEXT: br label [[TMP9]]
857 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
858 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
859 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in on %out
; indexed by %index and then offset by 4 i32 elements, with the returned %val
; stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the signed maximum
; (icmp sgt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, -2147483648, %in)`.
863 define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
864 ; IR-LABEL: @atomic_max_i32_ret_addr64_offset(
866 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
867 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
868 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
869 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
870 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
871 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
872 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
873 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
874 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
875 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
877 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
878 ; IR-NEXT: br label [[TMP9]]
880 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
881 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
882 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
883 ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
884 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
885 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
889 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
890 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
891 %val = atomicrmw volatile max ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
892 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in directly on
; %out; no store of %val appears among the lines shown.
; The checks expect the atomicrmw to keep %in as its operand (no ctpop/mul)
; and to be placed behind a branch on `mbcnt.hi(...) == 0`.
896 define amdgpu_kernel void @atomic_max_i32(ptr addrspace(1) %out, i32 %in) {
897 ; IR-LABEL: @atomic_max_i32(
899 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
900 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
901 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
902 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
903 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
904 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
905 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
906 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
908 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
909 ; IR-NEXT: br label [[TMP9]]
914 %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in directly on
; %out, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the signed maximum
; (icmp sgt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, -2147483648, %in)`.
918 define amdgpu_kernel void @atomic_max_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
919 ; IR-LABEL: @atomic_max_i32_ret(
921 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
922 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
923 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
924 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
925 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
926 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
927 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
928 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
930 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
931 ; IR-NEXT: br label [[TMP9]]
933 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
934 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
935 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
936 ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
937 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
938 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
942 %val = atomicrmw volatile max ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
943 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in on %out
; indexed by the i64 %index argument; no store of %val appears among the
; lines shown.
; The checks expect the getelementptr ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
947 define amdgpu_kernel void @atomic_max_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
948 ; IR-LABEL: @atomic_max_i32_addr64(
950 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
951 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
952 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
953 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
954 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
955 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
956 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
957 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
958 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
960 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
961 ; IR-NEXT: br label [[TMP9]]
966 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
967 %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile max` of %in on %out
; indexed by the i64 %index argument, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the signed maximum
; (icmp sgt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, -2147483648, %in)`.
971 define amdgpu_kernel void @atomic_max_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
972 ; IR-LABEL: @atomic_max_i32_ret_addr64(
974 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
975 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
976 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
977 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
978 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
979 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
980 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
981 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
982 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
984 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile max ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
985 ; IR-NEXT: br label [[TMP9]]
987 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
988 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
989 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 -2147483648, i32 [[IN]]
990 ; IR-NEXT: [[TMP13:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
991 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
992 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
996 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
997 %val = atomicrmw volatile max ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
998 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in on %out
; offset by 4 i32 elements; no store of %val appears among the lines shown.
; The checks expect the getelementptr ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
1002 define amdgpu_kernel void @atomic_umax_i32_offset(ptr addrspace(1) %out, i32 %in) {
1003 ; IR-LABEL: @atomic_umax_i32_offset(
1005 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
1006 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1007 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1008 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1009 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1010 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1011 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1012 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1013 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1015 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1016 ; IR-NEXT: br label [[TMP9]]
1021 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1022 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in on %out
; offset by 4 i32 elements, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the unsigned maximum
; (icmp ugt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, 0, %in)`.
1026 define amdgpu_kernel void @atomic_umax_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1027 ; IR-LABEL: @atomic_umax_i32_ret_offset(
1029 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
1030 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1031 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1032 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1033 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1034 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1035 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1036 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1037 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1039 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1040 ; IR-NEXT: br label [[TMP9]]
1042 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1043 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1044 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1045 ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1046 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1047 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1051 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1052 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1053 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in on %out
; indexed by %index and then offset by 4 i32 elements; no store of %val
; appears among the lines shown.
; The checks expect both getelementptrs ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
1057 define amdgpu_kernel void @atomic_umax_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
1058 ; IR-LABEL: @atomic_umax_i32_addr64_offset(
1060 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1061 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
1062 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1063 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1064 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1065 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1066 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1067 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1068 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1069 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1071 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1072 ; IR-NEXT: br label [[TMP9]]
1077 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1078 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1079 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in on %out
; indexed by %index and then offset by 4 i32 elements, with the returned %val
; stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the unsigned maximum
; (icmp ugt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, 0, %in)`.
1083 define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1084 ; IR-LABEL: @atomic_umax_i32_ret_addr64_offset(
1086 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1087 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
1088 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1089 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1090 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1091 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1092 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1093 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1094 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1095 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1097 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1098 ; IR-NEXT: br label [[TMP9]]
1100 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1101 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1102 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1103 ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1104 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1105 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1109 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1110 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1111 %val = atomicrmw volatile umax ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1112 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in directly on
; %out; no store of %val appears among the lines shown.
; The checks expect the atomicrmw to keep %in as its operand (no ctpop/mul)
; and to be placed behind a branch on `mbcnt.hi(...) == 0`.
1116 define amdgpu_kernel void @atomic_umax_i32(ptr addrspace(1) %out, i32 %in) {
1117 ; IR-LABEL: @atomic_umax_i32(
1119 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1120 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1121 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1122 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1123 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1124 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1125 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1126 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1128 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1129 ; IR-NEXT: br label [[TMP9]]
1134 %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in directly on
; %out, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the unsigned maximum
; (icmp ugt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, 0, %in)`.
1138 define amdgpu_kernel void @atomic_umax_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1139 ; IR-LABEL: @atomic_umax_i32_ret(
1141 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1142 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1143 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1144 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1145 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1146 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1147 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1148 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1150 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1151 ; IR-NEXT: br label [[TMP9]]
1153 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1154 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1155 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1156 ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1157 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1158 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1162 %val = atomicrmw volatile umax ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
1163 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in on %out
; indexed by the i64 %index argument; no store of %val appears among the
; lines shown.
; The checks expect the getelementptr ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
1167 define amdgpu_kernel void @atomic_umax_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
1168 ; IR-LABEL: @atomic_umax_i32_addr64(
1170 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1171 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1172 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1173 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1174 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1175 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1176 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1177 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1178 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1180 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1181 ; IR-NEXT: br label [[TMP9]]
1186 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1187 %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile umax` of %in on %out
; indexed by the i64 %index argument, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the unsigned maximum
; (icmp ugt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, 0, %in)`.
1191 define amdgpu_kernel void @atomic_umax_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1192 ; IR-LABEL: @atomic_umax_i32_ret_addr64(
1194 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1195 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1196 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1197 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1198 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1199 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1200 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1201 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1202 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1204 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile umax ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1205 ; IR-NEXT: br label [[TMP9]]
1207 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1208 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1209 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 0, i32 [[IN]]
1210 ; IR-NEXT: [[TMP13:%.*]] = icmp ugt i32 [[TMP11]], [[TMP12]]
1211 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1212 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1216 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1217 %val = atomicrmw volatile umax ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
1218 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile min` of %in on %out
; offset by 4 i32 elements; no store of %val appears among the lines shown.
; The checks expect the getelementptr ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
1222 define amdgpu_kernel void @atomic_min_i32_offset(ptr addrspace(1) %out, i32 %in) {
1223 ; IR-LABEL: @atomic_min_i32_offset(
1225 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
1226 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1227 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1228 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1229 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1230 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1231 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1232 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1233 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1235 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1236 ; IR-NEXT: br label [[TMP9]]
1241 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1242 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile min` of %in on %out
; offset by 4 i32 elements, with the returned %val stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the signed minimum
; (icmp slt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, 2147483647, %in)`.
1246 define amdgpu_kernel void @atomic_min_i32_ret_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1247 ; IR-LABEL: @atomic_min_i32_ret_offset(
1249 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 4
1250 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1251 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1252 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1253 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1254 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1255 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1256 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1257 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1259 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1260 ; IR-NEXT: br label [[TMP9]]
1262 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1263 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1264 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
1265 ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
1266 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1267 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1271 %gep = getelementptr i32, ptr addrspace(1) %out, i64 4
1272 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1273 store i32 %val, ptr addrspace(1) %out2
; Test: workgroup-scope seq_cst `atomicrmw volatile min` of %in on %out
; indexed by %index and then offset by 4 i32 elements; no store of %val
; appears among the lines shown.
; The checks expect both getelementptrs ahead of the ballot/mbcnt sequence and
; the atomicrmw (operand still %in) behind a branch on `mbcnt.hi(...) == 0`.
1277 define amdgpu_kernel void @atomic_min_i32_addr64_offset(ptr addrspace(1) %out, i32 %in, i64 %index) {
1278 ; IR-LABEL: @atomic_min_i32_addr64_offset(
1280 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1281 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
1282 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1283 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1284 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1285 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1286 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1287 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1288 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1289 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1291 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1292 ; IR-NEXT: br label [[TMP9]]
1297 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1298 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1299 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
; Test: workgroup-scope seq_cst `atomicrmw volatile min` of %in on %out
; indexed by %index and then offset by 4 i32 elements, with the returned %val
; stored to %out2.
; The checks expect the atomicrmw (operand still %in) behind a branch on
; `mbcnt.hi(...) == 0`, and the stored value to be the signed minimum
; (icmp slt + select) of readfirstlane(phi of the atomic result) and
; `select(mbcnt == 0, 2147483647, %in)`.
1303 define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1304 ; IR-LABEL: @atomic_min_i32_ret_addr64_offset(
1306 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1307 ; IR-NEXT: [[GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 4
1308 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1309 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1310 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1311 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1312 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1313 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1314 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1315 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1317 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[GEP]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1318 ; IR-NEXT: br label [[TMP9]]
1320 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1321 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1322 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
1323 ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
1324 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1325 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
1329 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1330 %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 4
1331 %val = atomicrmw volatile min ptr addrspace(1) %gep, i32 %in syncscope("workgroup") seq_cst
1332 store i32 %val, ptr addrspace(1) %out2
; Test kernel: a signed "atomicrmw volatile min" (syncscope("workgroup"),
; seq_cst) applied directly to %out with operand %in; no use of the result
; %val appears in the visible lines.
;
; The autogenerated assertions below expect the pass under test to:
;   * split a ballot(i1 true) mask into two i32 halves, feed them through
;     mbcnt.lo / mbcnt.hi, and compare the result against 0,
;   * branch on that comparison so the atomicrmw sits in its own block,
;   * keep the atomicrmw operands (%out, %in), ordering and syncscope as
;     written in the input, with "align 4" printed explicitly.
; No phi / readfirstlane sequence is expected after the guarded atomic here.
1336 define amdgpu_kernel void @atomic_min_i32(ptr addrspace(1) %out, i32 %in) {
1337 ; IR-LABEL: @atomic_min_i32(
1339 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1340 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1341 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1342 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1343 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1344 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1345 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1346 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1348 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1349 ; IR-NEXT: br label [[TMP9]]
; Input: the atomic targets the kernel argument pointer itself (no GEP).
1354 %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
; Test kernel: a signed "atomicrmw volatile min" (syncscope("workgroup"),
; seq_cst) applied directly to %out with operand %in; the returned value is
; stored to %out2.
;
; The autogenerated assertions below expect the pass under test to:
;   * split a ballot(i1 true) mask into two i32 halves, feed them through
;     mbcnt.lo / mbcnt.hi, and compare the result against 0,
;   * branch on that comparison so the atomicrmw sits in its own block,
;   * merge the atomic's result with poison in a phi, pass it through
;     readfirstlane, and combine it with either 2147483647 (the largest
;     signed i32, so a signed min against it is a no-op) or %in by means of
;     an "icmp slt" + select pair,
;   * store that combined value to %out2.
1358 define amdgpu_kernel void @atomic_min_i32_ret(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in) {
1359 ; IR-LABEL: @atomic_min_i32_ret(
1361 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1362 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1363 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1364 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1365 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1366 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1367 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1368 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1370 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[OUT:%.*]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1371 ; IR-NEXT: br label [[TMP9]]
1373 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1374 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1375 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
1376 ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
1377 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1378 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: the atomic targets the kernel argument pointer itself (no GEP).
1382 %val = atomicrmw volatile min ptr addrspace(1) %out, i32 %in syncscope("workgroup") seq_cst
; The atomic's result is used, which is why the expected output above
; contains the phi / readfirstlane / select sequence before the store.
1383 store i32 %val, ptr addrspace(1) %out2
; Test kernel: a signed "atomicrmw volatile min" (syncscope("workgroup"),
; seq_cst) on the i32 element at %out[%index]; no use of the result %val
; appears in the visible lines.
;
; The autogenerated assertions below expect the pass under test to:
;   * leave the getelementptr in place,
;   * split a ballot(i1 true) mask into two i32 halves, feed them through
;     mbcnt.lo / mbcnt.hi, and compare the result against 0,
;   * branch on that comparison so the atomicrmw sits in its own block,
;   * keep the atomicrmw operands, ordering and syncscope as written in the
;     input, with "align 4" printed explicitly.
; No phi / readfirstlane sequence is expected after the guarded atomic here.
1387 define amdgpu_kernel void @atomic_min_i32_addr64(ptr addrspace(1) %out, i32 %in, i64 %index) {
1388 ; IR-LABEL: @atomic_min_i32_addr64(
1390 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1391 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1392 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1393 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1394 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1395 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1396 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1397 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1398 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1400 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1401 ; IR-NEXT: br label [[TMP9]]
; Input: index %out by the runtime %index (in units of i32).
1406 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1407 %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
; Test kernel: a signed "atomicrmw volatile min" (syncscope("workgroup"),
; seq_cst) on the i32 element at %out[%index], whose returned value is
; stored to %out2.
;
; The autogenerated assertions below expect the pass under test to:
;   * leave the getelementptr in place,
;   * split a ballot(i1 true) mask into two i32 halves, feed them through
;     mbcnt.lo / mbcnt.hi, and compare the result against 0,
;   * branch on that comparison so the atomicrmw sits in its own block,
;   * merge the atomic's result with poison in a phi, pass it through
;     readfirstlane, and combine it with either 2147483647 (the largest
;     signed i32, so a signed min against it is a no-op) or %in by means of
;     an "icmp slt" + select pair,
;   * store that combined value to %out2.
1411 define amdgpu_kernel void @atomic_min_i32_ret_addr64(ptr addrspace(1) %out, ptr addrspace(1) %out2, i32 %in, i64 %index) {
1412 ; IR-LABEL: @atomic_min_i32_ret_addr64(
1414 ; IR-NEXT: [[PTR:%.*]] = getelementptr i32, ptr addrspace(1) [[OUT:%.*]], i64 [[INDEX:%.*]]
1415 ; IR-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
1416 ; IR-NEXT: [[TMP1:%.*]] = trunc i64 [[TMP0]] to i32
1417 ; IR-NEXT: [[TMP2:%.*]] = lshr i64 [[TMP0]], 32
1418 ; IR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
1419 ; IR-NEXT: [[TMP4:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP1]], i32 0)
1420 ; IR-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP3]], i32 [[TMP4]])
1421 ; IR-NEXT: [[TMP6:%.*]] = icmp eq i32 [[TMP5]], 0
1422 ; IR-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP9:%.*]]
1424 ; IR-NEXT: [[TMP8:%.*]] = atomicrmw volatile min ptr addrspace(1) [[PTR]], i32 [[IN:%.*]] syncscope("workgroup") seq_cst, align 4
1425 ; IR-NEXT: br label [[TMP9]]
1427 ; IR-NEXT: [[TMP10:%.*]] = phi i32 [ poison, [[ENTRY:%.*]] ], [ [[TMP8]], [[TMP7]] ]
1428 ; IR-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP10]])
1429 ; IR-NEXT: [[TMP12:%.*]] = select i1 [[TMP6]], i32 2147483647, i32 [[IN]]
1430 ; IR-NEXT: [[TMP13:%.*]] = icmp slt i32 [[TMP11]], [[TMP12]]
1431 ; IR-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i32 [[TMP11]], i32 [[TMP12]]
1432 ; IR-NEXT: store i32 [[TMP14]], ptr addrspace(1) [[OUT2:%.*]], align 4
; Input: index %out by the runtime %index (in units of i32).
1436 %ptr = getelementptr i32, ptr addrspace(1) %out, i64 %index
1437 %val = atomicrmw volatile min ptr addrspace(1) %ptr, i32 %in syncscope("workgroup") seq_cst
; The atomic's result is used, which is why the expected output above
; contains the phi / readfirstlane / select sequence before the store.
1438 store i32 %val, ptr addrspace(1) %out2