1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx90a -o - %s | FileCheck %s
4 %S = type <{ float, double }>
6 ; The results of these atomic ops should not be used as uniform values.
; @add: monotonic `atomicrmw add` of 1 on the addrspace(1) pointer %p. The
; returned old value is zero-extended and used to index the %S array at %q,
; and 1.0 is stored to field 0 of that element. The expected code keeps the
; atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12) to
; form the store address.
8 define protected amdgpu_kernel void @add(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
11 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
13 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
14 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
15 ; CHECK-NEXT: global_atomic_add v2, v0, v1, s[0:1] glc
16 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
17 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
18 ; CHECK-NEXT: s_waitcnt vmcnt(0)
19 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
20 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
21 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
22 ; CHECK-NEXT: s_endpgm
23 %n32 = atomicrmw add i32 addrspace(1)* %p, i32 1 monotonic
24 %n64 = zext i32 %n32 to i64
25 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
26 store float 1.0, float addrspace(1)* %p1
; @sub: monotonic `atomicrmw sub` of 1 on the addrspace(1) pointer %p. The
; returned old value is zero-extended and used to index the %S array at %q,
; and 1.0 is stored to field 0 of that element. The expected code keeps the
; atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
30 define protected amdgpu_kernel void @sub(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
33 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
34 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
35 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
36 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
37 ; CHECK-NEXT: global_atomic_sub v2, v0, v1, s[0:1] glc
38 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
39 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
40 ; CHECK-NEXT: s_waitcnt vmcnt(0)
41 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
42 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
43 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
44 ; CHECK-NEXT: s_endpgm
45 %n32 = atomicrmw sub i32 addrspace(1)* %p, i32 1 monotonic
46 %n64 = zext i32 %n32 to i64
47 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
48 store float 1.0, float addrspace(1)* %p1
; @and: monotonic `atomicrmw and` with 1 on the addrspace(1) pointer %p. The
; returned old value is zero-extended and used to index the %S array at %q,
; and 1.0 is stored to field 0 of that element. The expected code keeps the
; atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
52 define protected amdgpu_kernel void @and(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
55 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
56 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
57 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
58 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
59 ; CHECK-NEXT: global_atomic_and v2, v0, v1, s[0:1] glc
60 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
61 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
62 ; CHECK-NEXT: s_waitcnt vmcnt(0)
63 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
64 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
65 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
66 ; CHECK-NEXT: s_endpgm
67 %n32 = atomicrmw and i32 addrspace(1)* %p, i32 1 monotonic
68 %n64 = zext i32 %n32 to i64
69 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
70 store float 1.0, float addrspace(1)* %p1
; @or: monotonic `atomicrmw or` with 1 on the addrspace(1) pointer %p. The
; returned old value is zero-extended and used to index the %S array at %q,
; and 1.0 is stored to field 0 of that element. The expected code keeps the
; atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
74 define protected amdgpu_kernel void @or(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
77 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
78 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
79 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
80 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
81 ; CHECK-NEXT: global_atomic_or v2, v0, v1, s[0:1] glc
82 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
83 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
84 ; CHECK-NEXT: s_waitcnt vmcnt(0)
85 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
86 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
87 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
88 ; CHECK-NEXT: s_endpgm
89 %n32 = atomicrmw or i32 addrspace(1)* %p, i32 1 monotonic
90 %n64 = zext i32 %n32 to i64
91 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
92 store float 1.0, float addrspace(1)* %p1
; @xor: monotonic `atomicrmw xor` with 1 on the addrspace(1) pointer %p. The
; returned old value is zero-extended and used to index the %S array at %q,
; and 1.0 is stored to field 0 of that element. The expected code keeps the
; atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
96 define protected amdgpu_kernel void @xor(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
99 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
100 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
101 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
102 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
103 ; CHECK-NEXT: global_atomic_xor v2, v0, v1, s[0:1] glc
104 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
105 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
106 ; CHECK-NEXT: s_waitcnt vmcnt(0)
107 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
108 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
109 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
110 ; CHECK-NEXT: s_endpgm
111 %n32 = atomicrmw xor i32 addrspace(1)* %p, i32 1 monotonic
112 %n64 = zext i32 %n32 to i64
113 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
114 store float 1.0, float addrspace(1)* %p1
; @nand: monotonic `atomicrmw nand` with 1 on the addrspace(1) pointer %p. The
; returned old value is zero-extended and used to index the %S array at %q,
; and 1.0 is stored to field 0 of that element.
; The expected code has no single nand instruction: it is a compare-and-swap
; loop (%atomicrmw.start / %atomicrmw.end) around global_atomic_cmpswap. The
; new value is formed as (~old | -2), i.e. ~(old & 1). The loop result stays
; in a VGPR (v0) and feeds v_mad_u64_u32 (scale 12).
118 define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
121 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
122 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
123 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
124 ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
125 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
126 ; CHECK-NEXT: v_mov_b32_e32 v0, s6
127 ; CHECK-NEXT: BB5_1: ; %atomicrmw.start
128 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
129 ; CHECK-NEXT: v_mov_b32_e32 v1, v0
130 ; CHECK-NEXT: v_not_b32_e32 v0, v1
131 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
132 ; CHECK-NEXT: v_or_b32_e32 v0, -2, v0
133 ; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
134 ; CHECK-NEXT: s_waitcnt vmcnt(0)
135 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
136 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
137 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
138 ; CHECK-NEXT: s_cbranch_execnz BB5_1
139 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
140 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
141 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
142 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
143 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
144 ; CHECK-NEXT: s_endpgm
145 %n32 = atomicrmw nand i32 addrspace(1)* %p, i32 1 monotonic
146 %n64 = zext i32 %n32 to i64
147 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
148 store float 1.0, float addrspace(1)* %p1
; @max: monotonic signed `atomicrmw max` with 1 on the addrspace(1) pointer
; %p; the expected instruction is global_atomic_smax. The returned old value
; is zero-extended and used to index the %S array at %q, and 1.0 is stored to
; field 0 of that element. The expected code keeps the atomic result in a
; VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
152 define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
155 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
156 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
157 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
158 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
159 ; CHECK-NEXT: global_atomic_smax v2, v0, v1, s[0:1] glc
160 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
161 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
162 ; CHECK-NEXT: s_waitcnt vmcnt(0)
163 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
164 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
165 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
166 ; CHECK-NEXT: s_endpgm
167 %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic
168 %n64 = zext i32 %n32 to i64
169 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
170 store float 1.0, float addrspace(1)* %p1
; @min: monotonic signed `atomicrmw min` with 1 on the addrspace(1) pointer
; %p; the expected instruction is global_atomic_smin. The returned old value
; is zero-extended and used to index the %S array at %q, and 1.0 is stored to
; field 0 of that element. The expected code keeps the atomic result in a
; VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
174 define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
177 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
178 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
179 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
180 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
181 ; CHECK-NEXT: global_atomic_smin v2, v0, v1, s[0:1] glc
182 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
183 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
184 ; CHECK-NEXT: s_waitcnt vmcnt(0)
185 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
186 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
187 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
188 ; CHECK-NEXT: s_endpgm
189 %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic
190 %n64 = zext i32 %n32 to i64
191 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
192 store float 1.0, float addrspace(1)* %p1
; @umax: monotonic unsigned `atomicrmw umax` with 1 on the addrspace(1)
; pointer %p. The returned old value is zero-extended and used to index the
; %S array at %q, and 1.0 is stored to field 0 of that element. The expected
; code keeps the atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32
; (scale 12).
196 define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
199 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
200 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
201 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
202 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
203 ; CHECK-NEXT: global_atomic_umax v2, v0, v1, s[0:1] glc
204 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
205 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
206 ; CHECK-NEXT: s_waitcnt vmcnt(0)
207 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
208 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
209 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
210 ; CHECK-NEXT: s_endpgm
211 %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic
212 %n64 = zext i32 %n32 to i64
213 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
214 store float 1.0, float addrspace(1)* %p1
; @umin: monotonic unsigned `atomicrmw umin` with 1 on the addrspace(1)
; pointer %p. The returned old value is zero-extended and used to index the
; %S array at %q, and 1.0 is stored to field 0 of that element. The expected
; code keeps the atomic result in a VGPR (v2) and feeds it to v_mad_u64_u32
; (scale 12).
218 define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
221 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
222 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
223 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
224 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
225 ; CHECK-NEXT: global_atomic_umin v2, v0, v1, s[0:1] glc
226 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
227 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
228 ; CHECK-NEXT: s_waitcnt vmcnt(0)
229 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
230 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
231 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
232 ; CHECK-NEXT: s_endpgm
233 %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic
234 %n64 = zext i32 %n32 to i64
235 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
236 store float 1.0, float addrspace(1)* %p1
; @cmpxchg: monotonic/monotonic `cmpxchg` on the addrspace(1) pointer %p with
; compare value 1 and new value 2. Element 0 of the result pair (the loaded
; value) is zero-extended and used to index the %S array at %q, and 1.0 is
; stored to field 0 of that element. The expected code keeps the
; global_atomic_cmpswap result in a VGPR (v2) and feeds it to v_mad_u64_u32
; (scale 12).
240 define protected amdgpu_kernel void @cmpxchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
241 ; CHECK-LABEL: cmpxchg:
243 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
244 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
245 ; CHECK-NEXT: v_mov_b32_e32 v0, 2
246 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
247 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
248 ; CHECK-NEXT: global_atomic_cmpswap v2, v2, v[0:1], s[0:1] glc
249 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
250 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
251 ; CHECK-NEXT: s_waitcnt vmcnt(0)
252 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
253 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
254 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
255 ; CHECK-NEXT: s_endpgm
256 %agg = cmpxchg i32 addrspace(1)* %p, i32 1, i32 2 monotonic monotonic
257 %n32 = extractvalue {i32, i1} %agg, 0
258 %n64 = zext i32 %n32 to i64
259 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
260 store float 1.0, float addrspace(1)* %p1
; @xchg: monotonic `atomicrmw xchg` with 1 on the addrspace(1) pointer %p;
; the expected instruction is global_atomic_swap. The returned old value is
; zero-extended and used to index the %S array at %q, and 1.0 is stored to
; field 0 of that element. The expected code keeps the atomic result in a
; VGPR (v2) and feeds it to v_mad_u64_u32 (scale 12).
264 define protected amdgpu_kernel void @xchg(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
267 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
268 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
269 ; CHECK-NEXT: v_mov_b32_e32 v1, 1
270 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
271 ; CHECK-NEXT: global_atomic_swap v2, v0, v1, s[0:1] glc
272 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
273 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
274 ; CHECK-NEXT: s_waitcnt vmcnt(0)
275 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
276 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
277 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
278 ; CHECK-NEXT: s_endpgm
279 %n32 = atomicrmw xchg i32 addrspace(1)* %p, i32 1 monotonic
280 %n64 = zext i32 %n32 to i64
281 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
282 store float 1.0, float addrspace(1)* %p1
; @inc: calls the llvm.amdgcn.atomic.inc intrinsic on the addrspace(1)
; pointer %p with a value operand of 0. The returned value is zero-extended
; and used to index the %S array at %q, and 1.0 is stored to field 0 of that
; element. The expected code keeps the global_atomic_inc result in a VGPR
; (v2) and feeds it to v_mad_u64_u32 (scale 12).
286 define protected amdgpu_kernel void @inc(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
289 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
290 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
291 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
292 ; CHECK-NEXT: global_atomic_inc v2, v0, v0, s[0:1] glc
293 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
294 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
295 ; CHECK-NEXT: s_waitcnt vmcnt(0)
296 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
297 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
298 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
299 ; CHECK-NEXT: s_endpgm
300 %n32 = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false)
301 %n64 = zext i32 %n32 to i64
302 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
303 store float 1.0, float addrspace(1)* %p1
; @dec: calls the llvm.amdgcn.atomic.dec intrinsic on the addrspace(1)
; pointer %p with a value operand of 0. The returned value is zero-extended
; and used to index the %S array at %q, and 1.0 is stored to field 0 of that
; element. The expected code keeps the global_atomic_dec result in a VGPR
; (v2) and feeds it to v_mad_u64_u32 (scale 12).
307 define protected amdgpu_kernel void @dec(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
310 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
311 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
312 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
313 ; CHECK-NEXT: global_atomic_dec v2, v0, v0, s[0:1] glc
314 ; CHECK-NEXT: v_mov_b32_e32 v0, s2
315 ; CHECK-NEXT: v_mov_b32_e32 v1, s3
316 ; CHECK-NEXT: s_waitcnt vmcnt(0)
317 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
318 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
319 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
320 ; CHECK-NEXT: s_endpgm
321 %n32 = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %p, i32 0, i32 0, i32 0, i1 false)
322 %n64 = zext i32 %n32 to i64
323 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
324 store float 1.0, float addrspace(1)* %p1
; @fadd: monotonic `atomicrmw fadd` of 1.0 on the float addrspace(1) pointer
; %p. The returned old value is converted with fptoui, zero-extended, and
; used to index the %S array at %q, and 1.0 is stored to field 0 of that
; element.
; The expected code is a compare-and-swap loop (%atomicrmw.start /
; %atomicrmw.end) around global_atomic_cmpswap with v_add_f32 computing the
; new value. The loop result stays in a VGPR (v0) through v_cvt_u32_f32 and
; v_mad_u64_u32 (scale 12).
328 define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1)* %q) {
331 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
332 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
333 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
334 ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
335 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
336 ; CHECK-NEXT: v_mov_b32_e32 v0, s6
337 ; CHECK-NEXT: BB14_1: ; %atomicrmw.start
338 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
339 ; CHECK-NEXT: v_mov_b32_e32 v1, v0
340 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
341 ; CHECK-NEXT: v_add_f32_e32 v0, 1.0, v1
342 ; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
343 ; CHECK-NEXT: s_waitcnt vmcnt(0)
344 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
345 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
346 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
347 ; CHECK-NEXT: s_cbranch_execnz BB14_1
348 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
349 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
350 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
351 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
352 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
353 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
354 ; CHECK-NEXT: s_endpgm
355 %f32 = atomicrmw fadd float addrspace(1)* %p, float 1.0 monotonic
356 %n32 = fptoui float %f32 to i32
357 %n64 = zext i32 %n32 to i64
358 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
359 store float 1.0, float addrspace(1)* %p1
; @fsub: monotonic `atomicrmw fsub` of 1.0 on the float addrspace(1) pointer
; %p. The returned old value is converted with fptoui, zero-extended, and
; used to index the %S array at %q, and 1.0 is stored to field 0 of that
; element.
; The expected code is a compare-and-swap loop (%atomicrmw.start /
; %atomicrmw.end) around global_atomic_cmpswap; the subtraction appears as
; v_add_f32 with -1.0. The loop result stays in a VGPR (v0) through
; v_cvt_u32_f32 and v_mad_u64_u32 (scale 12).
363 define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1)* %q) {
366 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
367 ; CHECK-NEXT: s_mov_b64 s[4:5], 0
368 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
369 ; CHECK-NEXT: s_load_dword s6, s[0:1], 0x0
370 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
371 ; CHECK-NEXT: v_mov_b32_e32 v0, s6
372 ; CHECK-NEXT: BB15_1: ; %atomicrmw.start
373 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
374 ; CHECK-NEXT: v_mov_b32_e32 v1, v0
375 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
376 ; CHECK-NEXT: v_add_f32_e32 v0, -1.0, v1
377 ; CHECK-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
378 ; CHECK-NEXT: s_waitcnt vmcnt(0)
379 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
380 ; CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
381 ; CHECK-NEXT: s_andn2_b64 exec, exec, s[4:5]
382 ; CHECK-NEXT: s_cbranch_execnz BB15_1
383 ; CHECK-NEXT: ; %bb.2: ; %atomicrmw.end
384 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
385 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
386 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
387 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[2:3]
388 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
389 ; CHECK-NEXT: s_endpgm
390 %f32 = atomicrmw fsub float addrspace(1)* %p, float 1.0 monotonic
391 %n32 = fptoui float %f32 to i32
392 %n64 = zext i32 %n32 to i64
393 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
394 store float 1.0, float addrspace(1)* %p1
; @fmin: calls the llvm.amdgcn.global.atomic.fmin.f64 intrinsic on the double
; addrspace(1) pointer %p with operand 1.0. The returned double is converted
; with fptoui, zero-extended, and used to index the %S array at %q, and 1.0
; is stored to field 0 of that element. The expected code keeps the
; global_atomic_min_f64 result in VGPRs (v[0:1]) through v_cvt_u32_f64 and
; v_mad_u64_u32 (scale 12).
398 define protected amdgpu_kernel void @fmin(double addrspace(1)* %p, %S addrspace(1)* %q) {
401 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
402 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
403 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
404 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
405 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
406 ; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc
407 ; CHECK-NEXT: v_mov_b32_e32 v2, s2
408 ; CHECK-NEXT: v_mov_b32_e32 v3, s3
409 ; CHECK-NEXT: s_waitcnt vmcnt(0)
410 ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
411 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
412 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
413 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
414 ; CHECK-NEXT: s_endpgm
415 %f64 = call double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)* %p, double 1.0)
416 %n32 = fptoui double %f64 to i32
417 %n64 = zext i32 %n32 to i64
418 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
419 store float 1.0, float addrspace(1)* %p1
; @fmax: calls the llvm.amdgcn.global.atomic.fmax.f64 intrinsic on the double
; addrspace(1) pointer %p with operand 1.0. The returned double is converted
; with fptoui, zero-extended, and used to index the %S array at %q, and 1.0
; is stored to field 0 of that element. The expected code keeps the
; global_atomic_max_f64 result in VGPRs (v[0:1]) through v_cvt_u32_f64 and
; v_mad_u64_u32 (scale 12).
423 define protected amdgpu_kernel void @fmax(double addrspace(1)* %p, %S addrspace(1)* %q) {
426 ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
427 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
428 ; CHECK-NEXT: v_mov_b32_e32 v2, 0
429 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
430 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
431 ; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc
432 ; CHECK-NEXT: v_mov_b32_e32 v2, s2
433 ; CHECK-NEXT: v_mov_b32_e32 v3, s3
434 ; CHECK-NEXT: s_waitcnt vmcnt(0)
435 ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
436 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
437 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
438 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
439 ; CHECK-NEXT: s_endpgm
440 %f64 = call double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)* %p, double 1.0)
441 %n32 = fptoui double %f64 to i32
442 %n64 = zext i32 %n32 to i64
443 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
444 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.swap: calls llvm.amdgcn.raw.buffer.atomic.swap.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_swap result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
448 define protected amdgpu_kernel void @buffer.atomic.swap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
449 ; CHECK-LABEL: buffer.atomic.swap:
451 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
452 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
453 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
454 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
455 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
456 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
457 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
458 ; CHECK-NEXT: buffer_atomic_swap v0, v1, s[4:7], 0 offen glc
459 ; CHECK-NEXT: s_waitcnt vmcnt(0)
460 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
461 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
462 ; CHECK-NEXT: s_endpgm
463 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
464 %n64 = zext i32 %n32 to i64
465 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
466 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.add: calls llvm.amdgcn.raw.buffer.atomic.add.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_add result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
470 define protected amdgpu_kernel void @buffer.atomic.add(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
471 ; CHECK-LABEL: buffer.atomic.add:
473 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
474 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
475 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
476 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
477 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
478 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
479 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
480 ; CHECK-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 offen glc
481 ; CHECK-NEXT: s_waitcnt vmcnt(0)
482 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
483 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
484 ; CHECK-NEXT: s_endpgm
485 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
486 %n64 = zext i32 %n32 to i64
487 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
488 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.sub: calls llvm.amdgcn.raw.buffer.atomic.sub.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_sub result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
492 define protected amdgpu_kernel void @buffer.atomic.sub(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
493 ; CHECK-LABEL: buffer.atomic.sub:
495 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
496 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
497 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
498 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
499 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
500 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
501 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
502 ; CHECK-NEXT: buffer_atomic_sub v0, v1, s[4:7], 0 offen glc
503 ; CHECK-NEXT: s_waitcnt vmcnt(0)
504 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
505 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
506 ; CHECK-NEXT: s_endpgm
507 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
508 %n64 = zext i32 %n32 to i64
509 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
510 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.smin: calls llvm.amdgcn.raw.buffer.atomic.smin.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_smin result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
514 define protected amdgpu_kernel void @buffer.atomic.smin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
515 ; CHECK-LABEL: buffer.atomic.smin:
517 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
518 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
519 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
520 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
521 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
522 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
523 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
524 ; CHECK-NEXT: buffer_atomic_smin v0, v1, s[4:7], 0 offen glc
525 ; CHECK-NEXT: s_waitcnt vmcnt(0)
526 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
527 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
528 ; CHECK-NEXT: s_endpgm
529 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
530 %n64 = zext i32 %n32 to i64
531 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
532 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.smax: calls llvm.amdgcn.raw.buffer.atomic.smax.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_smax result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
536 define protected amdgpu_kernel void @buffer.atomic.smax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
537 ; CHECK-LABEL: buffer.atomic.smax:
539 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
540 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
541 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
542 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
543 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
544 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
545 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
546 ; CHECK-NEXT: buffer_atomic_smax v0, v1, s[4:7], 0 offen glc
547 ; CHECK-NEXT: s_waitcnt vmcnt(0)
548 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
549 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
550 ; CHECK-NEXT: s_endpgm
551 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
552 %n64 = zext i32 %n32 to i64
553 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
554 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.umin: calls llvm.amdgcn.raw.buffer.atomic.umin.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_umin result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
558 define protected amdgpu_kernel void @buffer.atomic.umin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
559 ; CHECK-LABEL: buffer.atomic.umin:
561 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
562 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
563 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
564 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
565 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
566 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
567 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
568 ; CHECK-NEXT: buffer_atomic_umin v0, v1, s[4:7], 0 offen glc
569 ; CHECK-NEXT: s_waitcnt vmcnt(0)
570 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
571 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
572 ; CHECK-NEXT: s_endpgm
573 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
574 %n64 = zext i32 %n32 to i64
575 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
576 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.umax: calls llvm.amdgcn.raw.buffer.atomic.umax.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_umax result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
580 define protected amdgpu_kernel void @buffer.atomic.umax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
581 ; CHECK-LABEL: buffer.atomic.umax:
583 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
584 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
585 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
586 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
587 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
588 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
589 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
590 ; CHECK-NEXT: buffer_atomic_umax v0, v1, s[4:7], 0 offen glc
591 ; CHECK-NEXT: s_waitcnt vmcnt(0)
592 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
593 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
594 ; CHECK-NEXT: s_endpgm
595 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
596 %n64 = zext i32 %n32 to i64
597 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
598 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.and: calls llvm.amdgcn.raw.buffer.atomic.and.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_and result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
602 define protected amdgpu_kernel void @buffer.atomic.and(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
603 ; CHECK-LABEL: buffer.atomic.and:
605 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
606 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
607 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
608 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
609 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
610 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
611 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
612 ; CHECK-NEXT: buffer_atomic_and v0, v1, s[4:7], 0 offen glc
613 ; CHECK-NEXT: s_waitcnt vmcnt(0)
614 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
615 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
616 ; CHECK-NEXT: s_endpgm
617 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
618 %n64 = zext i32 %n32 to i64
619 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
620 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.or: calls llvm.amdgcn.raw.buffer.atomic.or.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_or result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
624 define protected amdgpu_kernel void @buffer.atomic.or(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
625 ; CHECK-LABEL: buffer.atomic.or:
627 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
628 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
629 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
630 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
631 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
632 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
633 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
634 ; CHECK-NEXT: buffer_atomic_or v0, v1, s[4:7], 0 offen glc
635 ; CHECK-NEXT: s_waitcnt vmcnt(0)
636 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
637 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
638 ; CHECK-NEXT: s_endpgm
639 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
640 %n64 = zext i32 %n32 to i64
641 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
642 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.xor: calls llvm.amdgcn.raw.buffer.atomic.xor.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_xor result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
646 define protected amdgpu_kernel void @buffer.atomic.xor(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
647 ; CHECK-LABEL: buffer.atomic.xor:
649 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
650 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
651 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
652 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
653 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
654 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
655 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
656 ; CHECK-NEXT: buffer_atomic_xor v0, v1, s[4:7], 0 offen glc
657 ; CHECK-NEXT: s_waitcnt vmcnt(0)
658 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
659 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
660 ; CHECK-NEXT: s_endpgm
661 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
662 %n64 = zext i32 %n32 to i64
663 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
664 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.inc: calls llvm.amdgcn.raw.buffer.atomic.inc.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_inc result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
668 define protected amdgpu_kernel void @buffer.atomic.inc(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
669 ; CHECK-LABEL: buffer.atomic.inc:
671 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
672 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
673 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
674 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
675 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
676 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
677 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
678 ; CHECK-NEXT: buffer_atomic_inc v0, v1, s[4:7], 0 offen glc
679 ; CHECK-NEXT: s_waitcnt vmcnt(0)
680 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
681 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
682 ; CHECK-NEXT: s_endpgm
683 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
684 %n64 = zext i32 %n32 to i64
685 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
686 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.dec: calls llvm.amdgcn.raw.buffer.atomic.dec.i32 with data
; operand 1 on resource %rsrc, passing %vindex as the third argument. The
; returned value is zero-extended and used to index the %S array at %q, and
; 1.0 is stored to field 0 of that element. The expected code keeps the
; buffer_atomic_dec result in a VGPR (v0) and feeds it to v_mad_u64_u32
; (scale 12).
690 define protected amdgpu_kernel void @buffer.atomic.dec(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
691 ; CHECK-LABEL: buffer.atomic.dec:
693 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
694 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
695 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
696 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
697 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
698 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
699 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
700 ; CHECK-NEXT: buffer_atomic_dec v0, v1, s[4:7], 0 offen glc
701 ; CHECK-NEXT: s_waitcnt vmcnt(0)
702 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
703 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
704 ; CHECK-NEXT: s_endpgm
705 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32 1, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
706 %n64 = zext i32 %n32 to i64
707 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
708 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.cmpswap: calls llvm.amdgcn.raw.buffer.atomic.cmpswap.i32 with
; leading operands 1 and 2 on resource %rsrc, passing %vindex as the fourth
; argument. The returned value is zero-extended and used to index the %S
; array at %q, and 1.0 is stored to field 0 of that element. The expected
; code passes the operand pair in v[0:1], keeps the buffer_atomic_cmpswap
; result in a VGPR (v0), and feeds it to v_mad_u64_u32 (scale 12).
712 define protected amdgpu_kernel void @buffer.atomic.cmpswap(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
713 ; CHECK-LABEL: buffer.atomic.cmpswap:
715 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
716 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
717 ; CHECK-NEXT: v_mov_b32_e32 v1, 2
718 ; CHECK-NEXT: v_mov_b32_e32 v0, 1
719 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
720 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
721 ; CHECK-NEXT: v_mov_b32_e32 v2, s2
722 ; CHECK-NEXT: buffer_atomic_cmpswap v[0:1], v2, s[4:7], 0 offen glc
723 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
724 ; CHECK-NEXT: s_waitcnt vmcnt(0)
725 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
726 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
727 ; CHECK-NEXT: s_endpgm
728 %n32 = call i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32 1, i32 2, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
729 %n64 = zext i32 %n32 to i64
730 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
731 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.fadd: calls llvm.amdgcn.raw.buffer.atomic.fadd.f32 with data
; operand 1.0 on resource %rsrc, passing %vindex as the third argument. The
; returned float is converted with fptoui, zero-extended, and used to index
; the %S array at %q, and 1.0 is stored to field 0 of that element. The
; expected code keeps the buffer_atomic_add_f32 result in a VGPR (v0)
; through v_cvt_u32_f32 and v_mad_u64_u32 (scale 12).
735 define protected amdgpu_kernel void @buffer.atomic.fadd(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
736 ; CHECK-LABEL: buffer.atomic.fadd:
738 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
739 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
740 ; CHECK-NEXT: v_mov_b32_e32 v0, 1.0
741 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
742 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
743 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
744 ; CHECK-NEXT: v_mov_b32_e32 v1, s2
745 ; CHECK-NEXT: buffer_atomic_add_f32 v0, v1, s[4:7], 0 offen glc
746 ; CHECK-NEXT: s_waitcnt vmcnt(0)
747 ; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
748 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
749 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
750 ; CHECK-NEXT: s_endpgm
751 %f32 = call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
752 %n32 = fptoui float %f32 to i32
753 %n64 = zext i32 %n32 to i64
754 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
755 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.fmin: calls llvm.amdgcn.raw.buffer.atomic.fmin.f64 with data
; operand 1.0 on resource %rsrc, passing %vindex as the third argument. The
; returned double is converted with fptoui, zero-extended, and used to index
; the %S array at %q, and 1.0 is stored to field 0 of that element. The
; expected code keeps the buffer_atomic_min_f64 result in VGPRs (v[0:1])
; through v_cvt_u32_f64 and v_mad_u64_u32 (scale 12).
759 define protected amdgpu_kernel void @buffer.atomic.fmin(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
760 ; CHECK-LABEL: buffer.atomic.fmin:
762 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
763 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
764 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
765 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
766 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
767 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
768 ; CHECK-NEXT: v_mov_b32_e32 v2, s2
769 ; CHECK-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen glc
770 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
771 ; CHECK-NEXT: s_waitcnt vmcnt(0)
772 ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
773 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
774 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
775 ; CHECK-NEXT: s_endpgm
776 %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
777 %n32 = fptoui double %f64 to i32
778 %n64 = zext i32 %n32 to i64
779 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
780 store float 1.0, float addrspace(1)* %p1
; @buffer.atomic.fmax: calls llvm.amdgcn.raw.buffer.atomic.fmax.f64 with data
; operand 1.0 on resource %rsrc, passing %vindex as the third argument. The
; returned double is converted with fptoui, zero-extended, and used to index
; the %S array at %q, and 1.0 is stored to field 0 of that element. The
; expected code keeps the buffer_atomic_max_f64 result in VGPRs (v[0:1])
; through v_cvt_u32_f64 and v_mad_u64_u32 (scale 12).
784 define protected amdgpu_kernel void @buffer.atomic.fmax(<4 x i32> inreg %rsrc, i32 %vindex, %S addrspace(1)* %q) {
785 ; CHECK-LABEL: buffer.atomic.fmax:
787 ; CHECK-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
788 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x34
789 ; CHECK-NEXT: v_mov_b32_e32 v0, 0
790 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000
791 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c
792 ; CHECK-NEXT: s_waitcnt lgkmcnt(0)
793 ; CHECK-NEXT: v_mov_b32_e32 v2, s2
794 ; CHECK-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen glc
795 ; CHECK-NEXT: v_mov_b32_e32 v2, 1.0
796 ; CHECK-NEXT: s_waitcnt vmcnt(0)
797 ; CHECK-NEXT: v_cvt_u32_f64_e32 v0, v[0:1]
798 ; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[0:1], v0, 12, s[0:1]
799 ; CHECK-NEXT: global_store_dword v[0:1], v2, off
800 ; CHECK-NEXT: s_endpgm
801 %f64 = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double 1.0, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
802 %n32 = fptoui double %f64 to i32
803 %n64 = zext i32 %n32 to i64
804 %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
805 store float 1.0, float addrspace(1)* %p1
; Declarations of the AMDGPU intrinsics called by the kernels in this file:
; the global-memory atomic inc/dec and f64 fmin/fmax intrinsics first, then
; the raw buffer atomic intrinsics.
809 declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
810 declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)*, i32, i32 immarg, i32 immarg, i1 immarg)
811 declare double @llvm.amdgcn.global.atomic.fmin.f64.p1f64.f64(double addrspace(1)*, double)
812 declare double @llvm.amdgcn.global.atomic.fmax.f64.p1f64.f64(double addrspace(1)*, double)
813 declare i32 @llvm.amdgcn.raw.buffer.atomic.swap.i32(i32, <4 x i32>, i32, i32, i32)
814 declare i32 @llvm.amdgcn.raw.buffer.atomic.add.i32(i32, <4 x i32>, i32, i32, i32)
815 declare i32 @llvm.amdgcn.raw.buffer.atomic.sub.i32(i32, <4 x i32>, i32, i32, i32)
816 declare i32 @llvm.amdgcn.raw.buffer.atomic.smin.i32(i32, <4 x i32>, i32, i32, i32)
817 declare i32 @llvm.amdgcn.raw.buffer.atomic.smax.i32(i32, <4 x i32>, i32, i32, i32)
818 declare i32 @llvm.amdgcn.raw.buffer.atomic.umin.i32(i32, <4 x i32>, i32, i32, i32)
819 declare i32 @llvm.amdgcn.raw.buffer.atomic.umax.i32(i32, <4 x i32>, i32, i32, i32)
820 declare i32 @llvm.amdgcn.raw.buffer.atomic.and.i32(i32, <4 x i32>, i32, i32, i32)
821 declare i32 @llvm.amdgcn.raw.buffer.atomic.or.i32(i32, <4 x i32>, i32, i32, i32)
822 declare i32 @llvm.amdgcn.raw.buffer.atomic.xor.i32(i32, <4 x i32>, i32, i32, i32)
823 declare i32 @llvm.amdgcn.raw.buffer.atomic.inc.i32(i32, <4 x i32>, i32, i32, i32)
824 declare i32 @llvm.amdgcn.raw.buffer.atomic.dec.i32(i32, <4 x i32>, i32, i32, i32)
825 declare i32 @llvm.amdgcn.raw.buffer.atomic.cmpswap.i32(i32, i32, <4 x i32>, i32, i32, i32)
826 declare float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float, <4 x i32>, i32, i32, i32)
827 declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32)
828 declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32)