1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -march=amdgcn -mcpu=gfx90a -verify-machineinstrs | FileCheck %s -check-prefix=GFX90A
3 ; RUN: llc < %s -march=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
; Declarations of f64 atomic intrinsics: the legacy buffer fadd form, the
; raw/struct buffer fadd/fmin/fmax forms taking either a <4 x i32> or a
; ptr addrspace(8) resource, the global and flat fadd/fmin/fmax forms, and
; ds.fadd.
5 declare double @llvm.amdgcn.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i1)
6 declare double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
7 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
8 declare double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double, <4 x i32>, i32, i32, i32 immarg)
9 declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
10 declare double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
11 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
12 declare double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double, <4 x i32>, i32, i32, i32 immarg)
13 declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
14 declare double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32, i32 immarg)
15 declare double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32, i32 immarg)
16 declare double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double, <4 x i32>, i32, i32, i32 immarg)
17 declare double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double, ptr addrspace(8), i32, i32, i32 immarg)
18 declare double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
19 declare double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
20 declare double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
21 declare double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
22 declare double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
23 declare double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
24 declare double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
25 declare double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) nocapture, double, i32, i32, i1)
; Legacy llvm.amdgcn.buffer.atomic.fadd.f64 called with a zero offset operand
; and a zero trailing i1 operand; %ret is not used in the lines shown.
; The checks for both gfx90a and gfx940 expect buffer_atomic_add_f64 with
; idxen and without a glc/sc0 modifier.
27 define amdgpu_kernel void @buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
28 ; GFX90A-LABEL: buffer_atomic_add_noret_f64:
29 ; GFX90A: ; %bb.0: ; %main_body
30 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
31 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
32 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
33 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
34 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
35 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
36 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
37 ; GFX90A-NEXT: s_endpgm
39 ; GFX940-LABEL: buffer_atomic_add_noret_f64:
40 ; GFX940: ; %bb.0: ; %main_body
41 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
42 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
43 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
44 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
45 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
46 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
47 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
48 ; GFX940-NEXT: s_endpgm
50 %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
; Legacy llvm.amdgcn.buffer.atomic.fadd.f64 whose result is stored (to
; ptr undef). %rsrc is passed inreg, and the checks expect the atomic to use
; s[0:3] directly, with glc on gfx90a and sc0 on gfx940, followed by a wait on
; vmcnt before the store.
54 define amdgpu_ps void @buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
55 ; GFX90A-LABEL: buffer_atomic_add_rtn_f64:
56 ; GFX90A: ; %bb.0: ; %main_body
57 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
58 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
59 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
60 ; GFX90A-NEXT: s_endpgm
62 ; GFX940-LABEL: buffer_atomic_add_rtn_f64:
63 ; GFX940: ; %bb.0: ; %main_body
64 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
65 ; GFX940-NEXT: s_waitcnt vmcnt(0)
66 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
67 ; GFX940-NEXT: s_endpgm
69 %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
70 store double %ret, ptr undef
; Legacy llvm.amdgcn.buffer.atomic.fadd.f64 called with offset operand 4 and
; trailing i1 operand 1; the result is stored to %out.
; The checks expect "idxen offset:4" on both targets, with "glc slc" on gfx90a
; and "sc0 nt" on gfx940.
74 define amdgpu_kernel void @buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
75 ; GFX90A-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
76 ; GFX90A: ; %bb.0: ; %main_body
77 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
78 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
79 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
80 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
81 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
82 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
83 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
84 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
85 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
86 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
87 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
88 ; GFX90A-NEXT: s_endpgm
90 ; GFX940-LABEL: buffer_atomic_add_rtn_f64_off4_slc:
91 ; GFX940: ; %bb.0: ; %main_body
92 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
93 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
94 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
95 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
96 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
97 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
98 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
99 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
100 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
101 ; GFX940-NEXT: s_waitcnt vmcnt(0)
102 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
103 ; GFX940-NEXT: s_endpgm
105 %ret = call double @llvm.amdgcn.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
106 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.raw.buffer.atomic.fadd.f64 with zero fourth and trailing
; operands; %ret is not used in the lines shown.
; The argument named %vindex is passed as the third operand, and the checks
; expect it as the VGPR of an "offen" access, with no glc/sc0 modifier.
110 define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
111 ; GFX90A-LABEL: raw_buffer_atomic_add_noret_f64:
112 ; GFX90A: ; %bb.0: ; %main_body
113 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
114 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
115 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
116 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
117 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
118 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
119 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen
120 ; GFX90A-NEXT: s_endpgm
122 ; GFX940-LABEL: raw_buffer_atomic_add_noret_f64:
123 ; GFX940: ; %bb.0: ; %main_body
124 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
125 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
126 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
127 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
128 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
129 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
130 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen
131 ; GFX940-NEXT: s_endpgm
133 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
; llvm.amdgcn.raw.buffer.atomic.fadd.f64 whose result is stored (to
; ptr undef). %rsrc is passed inreg; the checks expect an "offen" atomic on
; s[0:3] with glc on gfx90a and sc0 on gfx940, then a vmcnt wait and the store.
137 define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
138 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64:
139 ; GFX90A: ; %bb.0: ; %main_body
140 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
141 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
142 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
143 ; GFX90A-NEXT: s_endpgm
145 ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64:
146 ; GFX940: ; %bb.0: ; %main_body
147 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0
148 ; GFX940-NEXT: s_waitcnt vmcnt(0)
149 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
150 ; GFX940-NEXT: s_endpgm
152 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
153 store double %ret, ptr undef
; llvm.amdgcn.raw.buffer.atomic.fadd.f64 called with fourth operand 4 and
; trailing immarg 2; the result is stored to %out.
; The checks expect the literal 4 in the scalar-offset position
; ("s[4:7], 4 offen"), with "glc slc" on gfx90a and "sc0 nt" on gfx940.
157 define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
158 ; GFX90A-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
159 ; GFX90A: ; %bb.0: ; %main_body
160 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
161 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
162 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
163 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
164 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
165 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
166 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
167 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc
168 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
169 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
170 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
171 ; GFX90A-NEXT: s_endpgm
173 ; GFX940-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
174 ; GFX940: ; %bb.0: ; %main_body
175 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
176 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
177 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
178 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
179 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
180 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
181 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
182 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt
183 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
184 ; GFX940-NEXT: s_waitcnt vmcnt(0)
185 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
186 ; GFX940-NEXT: s_endpgm
188 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
189 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64 (resource passed as
; ptr addrspace(8)) with zero fourth and trailing operands; %ret is not used in
; the lines shown. The checks expect an "offen" buffer_atomic_add_f64 with no
; glc/sc0 modifier on both targets.
193 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
194 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
195 ; GFX90A: ; %bb.0: ; %main_body
196 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
197 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
198 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
199 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
200 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
201 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
202 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen
203 ; GFX90A-NEXT: s_endpgm
205 ; GFX940-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
206 ; GFX940: ; %bb.0: ; %main_body
207 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
208 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
209 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
210 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
211 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
212 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
213 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 offen
214 ; GFX940-NEXT: s_endpgm
216 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
; llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64 whose result is stored (to
; ptr undef). The ptr addrspace(8) resource is passed inreg; the checks expect
; an "offen" atomic on s[0:3] with glc on gfx90a and sc0 on gfx940.
220 define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
221 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
222 ; GFX90A: ; %bb.0: ; %main_body
223 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen glc
224 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
225 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
226 ; GFX90A-NEXT: s_endpgm
228 ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
229 ; GFX940: ; %bb.0: ; %main_body
230 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 offen sc0
231 ; GFX940-NEXT: s_waitcnt vmcnt(0)
232 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
233 ; GFX940-NEXT: s_endpgm
235 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
236 store double %ret, ptr undef
; llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64 called with fourth operand 4 and
; trailing immarg 2; the result is stored to %out.
; The checks expect "s[4:7], 4 offen" on both targets, with "glc slc" on
; gfx90a and "sc0 nt" on gfx940.
240 define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
241 ; GFX90A-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
242 ; GFX90A: ; %bb.0: ; %main_body
243 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
244 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
245 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
246 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
247 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
249 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
250 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen glc slc
251 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
252 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
253 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
254 ; GFX90A-NEXT: s_endpgm
256 ; GFX940-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
257 ; GFX940: ; %bb.0: ; %main_body
258 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
259 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
260 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
261 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
262 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
263 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
264 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
265 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt
266 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
267 ; GFX940-NEXT: s_waitcnt vmcnt(0)
268 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
269 ; GFX940-NEXT: s_endpgm
271 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
272 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.struct.buffer.atomic.fadd.f64 with %vindex as the third operand
; and zeros for the remaining three; %ret is not used in the lines shown.
; The checks expect an "idxen" buffer_atomic_add_f64 with no glc/sc0 modifier
; on both targets.
276 define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
277 ; GFX90A-LABEL: struct_buffer_atomic_add_noret_f64:
278 ; GFX90A: ; %bb.0: ; %main_body
279 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
280 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
281 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
282 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
283 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
284 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
285 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
286 ; GFX90A-NEXT: s_endpgm
288 ; GFX940-LABEL: struct_buffer_atomic_add_noret_f64:
289 ; GFX940: ; %bb.0: ; %main_body
290 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
291 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
292 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
293 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
294 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
295 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
296 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
297 ; GFX940-NEXT: s_endpgm
299 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
; llvm.amdgcn.struct.buffer.atomic.fadd.f64 whose result is stored (to
; ptr undef). %rsrc is passed inreg; the checks expect an "idxen" atomic on
; s[0:3] with glc on gfx90a and sc0 on gfx940.
303 define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
304 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64:
305 ; GFX90A: ; %bb.0: ; %main_body
306 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
307 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
308 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
309 ; GFX90A-NEXT: s_endpgm
311 ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64:
312 ; GFX940: ; %bb.0: ; %main_body
313 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
314 ; GFX940-NEXT: s_waitcnt vmcnt(0)
315 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
316 ; GFX940-NEXT: s_endpgm
318 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
319 store double %ret, ptr undef
; llvm.amdgcn.struct.buffer.atomic.fadd.f64 called with fourth operand 4,
; fifth operand 0 and trailing immarg 2; the result is stored to %out.
; The checks expect "0 idxen offset:4" on both targets, with "glc slc" on
; gfx90a and "sc0 nt" on gfx940.
323 define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
324 ; GFX90A-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
325 ; GFX90A: ; %bb.0: ; %main_body
326 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
327 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
328 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
329 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
330 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
331 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
332 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
333 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
334 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
335 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
336 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
337 ; GFX90A-NEXT: s_endpgm
339 ; GFX940-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
340 ; GFX940: ; %bb.0: ; %main_body
341 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
342 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
343 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
344 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
345 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
346 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
347 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
348 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
349 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
350 ; GFX940-NEXT: s_waitcnt vmcnt(0)
351 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
352 ; GFX940-NEXT: s_endpgm
354 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fadd.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
355 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64 (resource passed as
; ptr addrspace(8)) with %vindex as the third operand and zeros for the
; remaining three; %ret is not used in the lines shown. The checks expect an
; "idxen" buffer_atomic_add_f64 with no glc/sc0 modifier on both targets.
359 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
360 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
361 ; GFX90A: ; %bb.0: ; %main_body
362 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
363 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
364 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
365 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
366 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
367 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
368 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
369 ; GFX90A-NEXT: s_endpgm
371 ; GFX940-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
372 ; GFX940: ; %bb.0: ; %main_body
373 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
374 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
375 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
376 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
377 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
378 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
379 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen
380 ; GFX940-NEXT: s_endpgm
382 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
; llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64 whose result is stored (to
; ptr undef). The ptr addrspace(8) resource is passed inreg; the checks expect
; an "idxen" atomic on s[0:3] with glc on gfx90a and sc0 on gfx940.
386 define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
387 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
388 ; GFX90A: ; %bb.0: ; %main_body
389 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen glc
390 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
391 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
392 ; GFX90A-NEXT: s_endpgm
394 ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
395 ; GFX940: ; %bb.0: ; %main_body
396 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[0:3], 0 idxen sc0
397 ; GFX940-NEXT: s_waitcnt vmcnt(0)
398 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
399 ; GFX940-NEXT: s_endpgm
401 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
402 store double %ret, ptr undef
; llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64 called with fourth operand 4,
; fifth operand 0 and trailing immarg 2; the result is stored to %out.
; The checks expect "0 idxen offset:4" on both targets, with "glc slc" on
; gfx90a and "sc0 nt" on gfx940.
406 define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
407 ; GFX90A-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
408 ; GFX90A: ; %bb.0: ; %main_body
409 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
410 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
411 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
412 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
413 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
414 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
415 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
416 ; GFX90A-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
417 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
418 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
419 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
420 ; GFX90A-NEXT: s_endpgm
422 ; GFX940-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
423 ; GFX940: ; %bb.0: ; %main_body
424 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
425 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
426 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
427 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
428 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
429 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
430 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
431 ; GFX940-NEXT: buffer_atomic_add_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
432 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
433 ; GFX940-NEXT: s_waitcnt vmcnt(0)
434 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
435 ; GFX940-NEXT: s_endpgm
437 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
438 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.raw.buffer.atomic.fmin.f64 with zero fourth and trailing
; operands; %ret is not used in the lines shown.
; The checks expect an "offen" buffer_atomic_min_f64 with no glc/sc0 modifier
; on both targets.
442 define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
443 ; GFX90A-LABEL: raw_buffer_atomic_min_noret_f64:
444 ; GFX90A: ; %bb.0: ; %main_body
445 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
446 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
447 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
448 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
449 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
450 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
451 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
452 ; GFX90A-NEXT: s_endpgm
454 ; GFX940-LABEL: raw_buffer_atomic_min_noret_f64:
455 ; GFX940: ; %bb.0: ; %main_body
456 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
457 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
458 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
459 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
460 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
461 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
462 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
463 ; GFX940-NEXT: s_endpgm
465 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
; llvm.amdgcn.raw.buffer.atomic.fmin.f64 whose result is stored (to
; ptr undef). %rsrc is passed inreg; the checks expect an "offen"
; buffer_atomic_min_f64 on s[0:3] with glc on gfx90a and sc0 on gfx940.
469 define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
470 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64:
471 ; GFX90A: ; %bb.0: ; %main_body
472 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc
473 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
474 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
475 ; GFX90A-NEXT: s_endpgm
477 ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64:
478 ; GFX940: ; %bb.0: ; %main_body
479 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0
480 ; GFX940-NEXT: s_waitcnt vmcnt(0)
481 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
482 ; GFX940-NEXT: s_endpgm
484 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
485 store double %ret, ptr undef
; llvm.amdgcn.raw.buffer.atomic.fmin.f64 called with fourth operand 4 and
; trailing immarg 2; the result is stored to %out.
; The checks expect "s[4:7], 4 offen" on both targets, with "glc slc" on
; gfx90a and "sc0 nt" on gfx940.
489 define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
490 ; GFX90A-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
491 ; GFX90A: ; %bb.0: ; %main_body
492 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
493 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
494 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
495 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
496 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
497 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
498 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
499 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc
500 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
501 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
502 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
503 ; GFX90A-NEXT: s_endpgm
505 ; GFX940-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
506 ; GFX940: ; %bb.0: ; %main_body
507 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
508 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
509 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
510 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
511 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
512 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
513 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
514 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt
515 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
516 ; GFX940-NEXT: s_waitcnt vmcnt(0)
517 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
518 ; GFX940-NEXT: s_endpgm
520 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
521 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64 (resource passed as
; ptr addrspace(8)) with zero fourth and trailing operands; %ret is not used in
; the lines shown. The checks expect an "offen" buffer_atomic_min_f64 with no
; glc/sc0 modifier on both targets.
525 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
526 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
527 ; GFX90A: ; %bb.0: ; %main_body
528 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
529 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
530 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
531 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
532 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
533 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
534 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
535 ; GFX90A-NEXT: s_endpgm
537 ; GFX940-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
538 ; GFX940: ; %bb.0: ; %main_body
539 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
540 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
541 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
542 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
543 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
544 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
545 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 offen
546 ; GFX940-NEXT: s_endpgm
548 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
; llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64 whose result is stored (to
; ptr undef). The ptr addrspace(8) resource is passed inreg; the checks expect
; an "offen" buffer_atomic_min_f64 on s[0:3] with glc on gfx90a and sc0 on
; gfx940.
552 define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
553 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
554 ; GFX90A: ; %bb.0: ; %main_body
555 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen glc
556 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
557 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
558 ; GFX90A-NEXT: s_endpgm
560 ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
561 ; GFX940: ; %bb.0: ; %main_body
562 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 offen sc0
563 ; GFX940-NEXT: s_waitcnt vmcnt(0)
564 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
565 ; GFX940-NEXT: s_endpgm
567 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
568 store double %ret, ptr undef
; llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64 called with fourth operand 4 and
; trailing immarg 2; the result is stored to %out.
; The checks expect "s[4:7], 4 offen" on both targets, with "glc slc" on
; gfx90a and "sc0 nt" on gfx940.
572 define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
573 ; GFX90A-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
574 ; GFX90A: ; %bb.0: ; %main_body
575 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
576 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
577 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
578 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
579 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
580 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
581 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
582 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen glc slc
583 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
584 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
585 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
586 ; GFX90A-NEXT: s_endpgm
588 ; GFX940-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
589 ; GFX940: ; %bb.0: ; %main_body
590 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
591 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
592 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
593 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
594 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
595 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
596 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
597 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt
598 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
599 ; GFX940-NEXT: s_waitcnt vmcnt(0)
600 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
601 ; GFX940-NEXT: s_endpgm
603 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
604 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.struct.buffer.atomic.fmin.f64 with %vindex as the third operand
; and zeros for the remaining three; %ret is not used in the lines shown.
; The checks expect an "idxen" buffer_atomic_min_f64 with no glc/sc0 modifier
; on both targets.
608 define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
609 ; GFX90A-LABEL: struct_buffer_atomic_min_noret_f64:
610 ; GFX90A: ; %bb.0: ; %main_body
611 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
612 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
613 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
614 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
615 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
616 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
617 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen
618 ; GFX90A-NEXT: s_endpgm
620 ; GFX940-LABEL: struct_buffer_atomic_min_noret_f64:
621 ; GFX940: ; %bb.0: ; %main_body
622 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
623 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
624 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
625 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
626 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
627 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
628 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen
629 ; GFX940-NEXT: s_endpgm
631 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
; llvm.amdgcn.struct.buffer.atomic.fmin.f64 whose result is stored (to
; ptr undef). %rsrc is passed inreg; the checks expect an "idxen"
; buffer_atomic_min_f64 on s[0:3] with glc on gfx90a and sc0 on gfx940.
635 define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
636 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64:
637 ; GFX90A: ; %bb.0: ; %main_body
638 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc
639 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
640 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
641 ; GFX90A-NEXT: s_endpgm
643 ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64:
644 ; GFX940: ; %bb.0: ; %main_body
645 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0
646 ; GFX940-NEXT: s_waitcnt vmcnt(0)
647 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
648 ; GFX940-NEXT: s_endpgm
650 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
651 store double %ret, ptr undef
; llvm.amdgcn.struct.buffer.atomic.fmin.f64 called with fourth operand 4,
; fifth operand 0 and trailing immarg 2; the result is stored to %out.
; The checks expect "0 idxen offset:4" on both targets, with "glc slc" on
; gfx90a and "sc0 nt" on gfx940.
655 define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
656 ; GFX90A-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
657 ; GFX90A: ; %bb.0: ; %main_body
658 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
659 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
660 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
661 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
662 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
663 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
664 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
665 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
666 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
667 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
668 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
669 ; GFX90A-NEXT: s_endpgm
671 ; GFX940-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
672 ; GFX940: ; %bb.0: ; %main_body
673 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
674 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
675 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
676 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
677 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
678 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
679 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
680 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
681 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
682 ; GFX940-NEXT: s_waitcnt vmcnt(0)
683 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
684 ; GFX940-NEXT: s_endpgm
686 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmin.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
687 store double %ret, ptr addrspace(1) %out, align 8
; struct.ptr buffer f64 fmin (resource passed as ptr addrspace(8)) in a kernel.
; No use of %ret is visible here, and the checks expect buffer_atomic_min_f64
; with idxen and without glc/sc0 on both targets.
691 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
692 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
693 ; GFX90A: ; %bb.0: ; %main_body
694 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
695 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
696 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
697 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
698 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
699 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
700 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen
701 ; GFX90A-NEXT: s_endpgm
703 ; GFX940-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
704 ; GFX940: ; %bb.0: ; %main_body
705 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
706 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
707 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
708 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
709 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
710 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
711 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen
712 ; GFX940-NEXT: s_endpgm
714 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
; Returning struct.ptr buffer f64 fmin (ptr addrspace(8) resource, inreg).
; %ret is stored through an undef pointer, so the checks expect the
; value-returning encoding: idxen glc on gfx90a, idxen sc0 on gfx940.
718 define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
719 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
720 ; GFX90A: ; %bb.0: ; %main_body
721 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen glc
722 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
723 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
724 ; GFX90A-NEXT: s_endpgm
726 ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
727 ; GFX940: ; %bb.0: ; %main_body
728 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[0:3], 0 idxen sc0
729 ; GFX940-NEXT: s_waitcnt vmcnt(0)
730 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
731 ; GFX940-NEXT: s_endpgm
733 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
734 store double %ret, ptr undef
; Returning struct.ptr buffer f64 fmin called with the constant 4 as its
; fourth operand and 2 as the trailing immarg. The checks expect `offset:4`
; with `glc slc` on gfx90a and `sc0 nt` on gfx940; the result is stored to %out.
738 define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
739 ; GFX90A-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
740 ; GFX90A: ; %bb.0: ; %main_body
741 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
742 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
743 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
744 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
745 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
746 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
747 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
748 ; GFX90A-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
749 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
750 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
751 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
752 ; GFX90A-NEXT: s_endpgm
754 ; GFX940-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
755 ; GFX940: ; %bb.0: ; %main_body
756 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
757 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
758 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
759 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
760 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
761 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
762 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
763 ; GFX940-NEXT: buffer_atomic_min_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
764 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
765 ; GFX940-NEXT: s_waitcnt vmcnt(0)
766 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
767 ; GFX940-NEXT: s_endpgm
769 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmin.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
770 store double %ret, ptr addrspace(1) %out, align 8
; raw buffer f64 fmax in a kernel; no use of %ret is visible here and the
; checks expect buffer_atomic_max_f64 without glc/sc0 on both targets.
; NOTE(review): despite its name, %vindex is the third call operand of the raw
; intrinsic and appears as the `offen` VGPR in the checks, not as an index.
774 define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
775 ; GFX90A-LABEL: raw_buffer_atomic_max_noret_f64:
776 ; GFX90A: ; %bb.0: ; %main_body
777 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
778 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
779 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
780 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
781 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
782 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
783 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
784 ; GFX90A-NEXT: s_endpgm
786 ; GFX940-LABEL: raw_buffer_atomic_max_noret_f64:
787 ; GFX940: ; %bb.0: ; %main_body
788 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
789 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
790 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
791 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
792 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
793 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
794 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
795 ; GFX940-NEXT: s_endpgm
797 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
; Returning raw buffer f64 fmax: %ret is stored (through an undef pointer), so
; the checks expect the value-returning encoding of buffer_atomic_max_f64,
; i.e. offen glc on gfx90a and offen sc0 on gfx940.
801 define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
802 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64:
803 ; GFX90A: ; %bb.0: ; %main_body
804 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc
805 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
806 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
807 ; GFX90A-NEXT: s_endpgm
809 ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64:
810 ; GFX940: ; %bb.0: ; %main_body
811 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
812 ; GFX940-NEXT: s_waitcnt vmcnt(0)
813 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
814 ; GFX940-NEXT: s_endpgm
816 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0)
817 store double %ret, ptr undef
; Returning raw buffer f64 fmax called with the constant 4 as its fourth
; operand and 2 as the trailing immarg. In the checks the 4 lands in the scalar
; offset position (`s[4:7], 4 offen`) rather than as `offset:4`, with `glc slc`
; on gfx90a and `sc0 nt` on gfx940; the result is stored to %out.
821 define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
822 ; GFX90A-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
823 ; GFX90A: ; %bb.0: ; %main_body
824 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
825 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
826 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
827 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
828 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
829 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
830 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
831 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc
832 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
833 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
834 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
835 ; GFX90A-NEXT: s_endpgm
837 ; GFX940-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
838 ; GFX940: ; %bb.0: ; %main_body
839 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
840 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
841 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
842 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
843 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
844 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
845 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
846 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt
847 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
848 ; GFX940-NEXT: s_waitcnt vmcnt(0)
849 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
850 ; GFX940-NEXT: s_endpgm
852 %ret = call double @llvm.amdgcn.raw.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 2)
853 store double %ret, ptr addrspace(1) %out, align 8
; raw.ptr buffer f64 fmax (resource passed as ptr addrspace(8)) in a kernel.
; No use of %ret is visible here, and the checks expect buffer_atomic_max_f64
; with offen and without glc/sc0 on both targets.
857 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
858 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
859 ; GFX90A: ; %bb.0: ; %main_body
860 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
861 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
862 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
863 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
864 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
865 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
866 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
867 ; GFX90A-NEXT: s_endpgm
869 ; GFX940-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
870 ; GFX940: ; %bb.0: ; %main_body
871 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
872 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
873 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
874 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
875 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
876 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
877 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 offen
878 ; GFX940-NEXT: s_endpgm
880 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
; Returning raw.ptr buffer f64 fmax (ptr addrspace(8) resource, inreg). %ret is
; stored through an undef pointer, so the checks expect the value-returning
; encoding: offen glc on gfx90a, offen sc0 on gfx940.
884 define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
885 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
886 ; GFX90A: ; %bb.0: ; %main_body
887 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen glc
888 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
889 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
890 ; GFX90A-NEXT: s_endpgm
892 ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
893 ; GFX940: ; %bb.0: ; %main_body
894 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen sc0
895 ; GFX940-NEXT: s_waitcnt vmcnt(0)
896 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
897 ; GFX940-NEXT: s_endpgm
899 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0)
900 store double %ret, ptr undef
; Returning raw.ptr buffer f64 fmax called with the constant 4 as its fourth
; operand and 2 as the trailing immarg. In the checks the 4 lands in the scalar
; offset position (`s[4:7], 4 offen`) with `glc slc` on gfx90a and `sc0 nt` on
; gfx940; the result is stored to %out.
904 define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
905 ; GFX90A-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
906 ; GFX90A: ; %bb.0: ; %main_body
907 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
908 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
909 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
910 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
911 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
912 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
913 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
914 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen glc slc
915 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
916 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
917 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
918 ; GFX90A-NEXT: s_endpgm
920 ; GFX940-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
921 ; GFX940: ; %bb.0: ; %main_body
922 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
923 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
924 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
925 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
926 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
927 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
928 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
929 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 4 offen sc0 nt
930 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
931 ; GFX940-NEXT: s_waitcnt vmcnt(0)
932 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
933 ; GFX940-NEXT: s_endpgm
935 %ret = call double @llvm.amdgcn.raw.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 2)
936 store double %ret, ptr addrspace(1) %out, align 8
; struct buffer f64 fmax in a kernel; no use of %ret is visible here and the
; checks expect buffer_atomic_max_f64 with idxen and without glc/sc0 on both
; targets.
940 define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, double %data, i32 %vindex) {
941 ; GFX90A-LABEL: struct_buffer_atomic_max_noret_f64:
942 ; GFX90A: ; %bb.0: ; %main_body
943 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
944 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
945 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
946 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
947 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
948 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
949 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen
950 ; GFX90A-NEXT: s_endpgm
952 ; GFX940-LABEL: struct_buffer_atomic_max_noret_f64:
953 ; GFX940: ; %bb.0: ; %main_body
954 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
955 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
956 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
957 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
958 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
959 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
960 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen
961 ; GFX940-NEXT: s_endpgm
963 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
; Returning struct buffer f64 fmax: %ret is stored (through an undef pointer),
; so the checks expect the value-returning encoding of buffer_atomic_max_f64,
; i.e. idxen glc on gfx90a and idxen sc0 on gfx940.
967 define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, double %data, i32 %vindex) {
968 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64:
969 ; GFX90A: ; %bb.0: ; %main_body
970 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc
971 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
972 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
973 ; GFX90A-NEXT: s_endpgm
975 ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64:
976 ; GFX940: ; %bb.0: ; %main_body
977 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0
978 ; GFX940-NEXT: s_waitcnt vmcnt(0)
979 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
980 ; GFX940-NEXT: s_endpgm
982 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
983 store double %ret, ptr undef
; Returning struct buffer f64 fmax called with the constant 4 as its fourth
; operand and 2 as the trailing immarg. The checks expect `offset:4` with
; `glc slc` on gfx90a and `sc0 nt` on gfx940; the result is stored to %out.
987 define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
988 ; GFX90A-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
989 ; GFX90A: ; %bb.0: ; %main_body
990 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
991 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
992 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
993 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
994 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
995 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
996 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
997 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
998 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
999 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1000 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
1001 ; GFX90A-NEXT: s_endpgm
1003 ; GFX940-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
1004 ; GFX940: ; %bb.0: ; %main_body
1005 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1006 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
1007 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1008 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
1009 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1010 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1011 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
1012 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
1013 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1014 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1015 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
1016 ; GFX940-NEXT: s_endpgm
1018 %ret = call double @llvm.amdgcn.struct.buffer.atomic.fmax.f64(double %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
1019 store double %ret, ptr addrspace(1) %out, align 8
; struct.ptr buffer f64 fmax (resource passed as ptr addrspace(8)) in a kernel.
; No use of %ret is visible here, and the checks expect buffer_atomic_max_f64
; with idxen and without glc/sc0 on both targets.
1023 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8) %rsrc, double %data, i32 %vindex) {
1024 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
1025 ; GFX90A: ; %bb.0: ; %main_body
1026 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1027 ; GFX90A-NEXT: s_load_dword s8, s[0:1], 0x3c
1028 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1029 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1030 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1031 ; GFX90A-NEXT: v_mov_b32_e32 v2, s8
1032 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen
1033 ; GFX90A-NEXT: s_endpgm
1035 ; GFX940-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
1036 ; GFX940: ; %bb.0: ; %main_body
1037 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1038 ; GFX940-NEXT: s_load_dword s8, s[0:1], 0x3c
1039 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1040 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1041 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1042 ; GFX940-NEXT: v_mov_b32_e32 v2, s8
1043 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen
1044 ; GFX940-NEXT: s_endpgm
1046 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
; Returning struct.ptr buffer f64 fmax (ptr addrspace(8) resource, inreg).
; %ret is stored through an undef pointer, so the checks expect the
; value-returning encoding: idxen glc on gfx90a, idxen sc0 on gfx940.
1050 define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg %rsrc, double %data, i32 %vindex) {
1051 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
1052 ; GFX90A: ; %bb.0: ; %main_body
1053 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen glc
1054 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1055 ; GFX90A-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
1056 ; GFX90A-NEXT: s_endpgm
1058 ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
1059 ; GFX940: ; %bb.0: ; %main_body
1060 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 idxen sc0
1061 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1062 ; GFX940-NEXT: flat_store_dwordx2 v[0:1], v[0:1] sc0 sc1
1063 ; GFX940-NEXT: s_endpgm
1065 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 0, i32 0)
1066 store double %ret, ptr undef
; Returning struct.ptr buffer f64 fmax called with the constant 4 as its
; fourth operand and 2 as the trailing immarg. The checks expect `offset:4`
; with `glc slc` on gfx90a and `sc0 nt` on gfx940; the result is stored to %out.
1070 define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrspace(8) %rsrc, double %data, i32 %vindex, ptr addrspace(1) %out) {
1071 ; GFX90A-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
1072 ; GFX90A: ; %bb.0: ; %main_body
1073 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1074 ; GFX90A-NEXT: s_load_dword s10, s[0:1], 0x3c
1075 ; GFX90A-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1076 ; GFX90A-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
1077 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1078 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1079 ; GFX90A-NEXT: v_mov_b32_e32 v2, s10
1080 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 glc slc
1081 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1082 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1083 ; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
1084 ; GFX90A-NEXT: s_endpgm
1086 ; GFX940-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
1087 ; GFX940: ; %bb.0: ; %main_body
1088 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
1089 ; GFX940-NEXT: s_load_dword s10, s[0:1], 0x3c
1090 ; GFX940-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
1091 ; GFX940-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x44
1092 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1093 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1094 ; GFX940-NEXT: v_mov_b32_e32 v2, s10
1095 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[4:7], 0 idxen offset:4 sc0 nt
1096 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1097 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1098 ; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] sc0 sc1
1099 ; GFX940-NEXT: s_endpgm
1101 %ret = call double @llvm.amdgcn.struct.ptr.buffer.atomic.fmax.f64(double %data, ptr addrspace(8) %rsrc, i32 %vindex, i32 4, i32 0, i32 2)
1102 store double %ret, ptr addrspace(1) %out, align 8
; llvm.amdgcn.global.atomic.fadd.f64 intrinsic in a kernel; no use of %ret is
; visible here. Both targets are expected to emit global_atomic_add_f64 without
; glc/sc0, with %data copied from s[2:3] into v[0:1].
1106 define amdgpu_kernel void @global_atomic_fadd_f64_noret(ptr addrspace(1) %ptr, double %data) {
1107 ; GFX90A-LABEL: global_atomic_fadd_f64_noret:
1108 ; GFX90A: ; %bb.0: ; %main_body
1109 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1110 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1111 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1112 ; GFX90A-NEXT: v_mov_b32_e32 v0, s2
1113 ; GFX90A-NEXT: v_mov_b32_e32 v1, s3
1114 ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1115 ; GFX90A-NEXT: s_endpgm
1117 ; GFX940-LABEL: global_atomic_fadd_f64_noret:
1118 ; GFX940: ; %bb.0: ; %main_body
1119 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1120 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1121 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1122 ; GFX940-NEXT: v_mov_b32_e32 v0, s2
1123 ; GFX940-NEXT: v_mov_b32_e32 v1, s3
1124 ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1125 ; GFX940-NEXT: s_endpgm
1127 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
; llvm.amdgcn.global.atomic.fmin.f64 intrinsic in a kernel; no use of %ret is
; visible here. Both targets are expected to emit global_atomic_min_f64 without
; glc/sc0.
1131 define amdgpu_kernel void @global_atomic_fmin_f64_noret(ptr addrspace(1) %ptr, double %data) {
1132 ; GFX90A-LABEL: global_atomic_fmin_f64_noret:
1133 ; GFX90A: ; %bb.0: ; %main_body
1134 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1135 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1136 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1137 ; GFX90A-NEXT: v_mov_b32_e32 v0, s2
1138 ; GFX90A-NEXT: v_mov_b32_e32 v1, s3
1139 ; GFX90A-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
1140 ; GFX90A-NEXT: s_endpgm
1142 ; GFX940-LABEL: global_atomic_fmin_f64_noret:
1143 ; GFX940: ; %bb.0: ; %main_body
1144 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1145 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1146 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1147 ; GFX940-NEXT: v_mov_b32_e32 v0, s2
1148 ; GFX940-NEXT: v_mov_b32_e32 v1, s3
1149 ; GFX940-NEXT: global_atomic_min_f64 v2, v[0:1], s[0:1]
1150 ; GFX940-NEXT: s_endpgm
1152 %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
; llvm.amdgcn.global.atomic.fmax.f64 intrinsic in a kernel; no use of %ret is
; visible here. Both targets are expected to emit global_atomic_max_f64 without
; glc/sc0.
1156 define amdgpu_kernel void @global_atomic_fmax_f64_noret(ptr addrspace(1) %ptr, double %data) {
1157 ; GFX90A-LABEL: global_atomic_fmax_f64_noret:
1158 ; GFX90A: ; %bb.0: ; %main_body
1159 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1160 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1161 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1162 ; GFX90A-NEXT: v_mov_b32_e32 v0, s2
1163 ; GFX90A-NEXT: v_mov_b32_e32 v1, s3
1164 ; GFX90A-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
1165 ; GFX90A-NEXT: s_endpgm
1167 ; GFX940-LABEL: global_atomic_fmax_f64_noret:
1168 ; GFX940: ; %bb.0: ; %main_body
1169 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1170 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1171 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1172 ; GFX940-NEXT: v_mov_b32_e32 v0, s2
1173 ; GFX940-NEXT: v_mov_b32_e32 v1, s3
1174 ; GFX940-NEXT: global_atomic_max_f64 v2, v[0:1], s[0:1]
1175 ; GFX940-NEXT: s_endpgm
1177 %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
; `atomicrmw fadd double 4.0, seq_cst` on a global pointer with no syncscope
; given, under attribute group #1 (defined outside this chunk). The gfx90a
; checks expect expansion into a global_atomic_cmpswap_x2 retry loop bracketed
; by buffer_wbl2 / buffer_invl2; the gfx940 checks expect a single
; global_atomic_add_f64 with sc1.
1181 define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
1182 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
1183 ; GFX90A: ; %bb.0: ; %main_body
1184 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1185 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1186 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0
1187 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1188 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
1189 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1190 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
1191 ; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
1192 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1193 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1194 ; GFX90A-NEXT: buffer_wbl2
1195 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1196 ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
1197 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX90A-NEXT: buffer_invl2
1199 ; GFX90A-NEXT: buffer_wbinvl1_vol
1200 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1201 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1202 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1203 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1204 ; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
1205 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1206 ; GFX90A-NEXT: s_endpgm
1208 ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
1209 ; GFX940: ; %bb.0: ; %main_body
1210 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1211 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1212 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1213 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1214 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1215 ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
1216 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1217 ; GFX940-NEXT: buffer_inv sc0 sc1
1218 ; GFX940-NEXT: s_endpgm
1220 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
; `atomicrmw fadd double 4.0, syncscope("agent") seq_cst` on a global pointer,
; under attribute group #1 (defined outside this chunk). Both targets are
; expected to emit a single global_atomic_add_f64 with no retry loop; gfx90a
; follows it with buffer_wbinvl1_vol, gfx940 brackets it with
; buffer_wbl2 sc1 / buffer_inv sc1.
1224 define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 {
1225 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
1226 ; GFX90A: ; %bb.0: ; %main_body
1227 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1228 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
1229 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1230 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
1231 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1232 ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1233 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1234 ; GFX90A-NEXT: buffer_wbinvl1_vol
1235 ; GFX90A-NEXT: s_endpgm
1237 ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent:
1238 ; GFX940: ; %bb.0: ; %main_body
1239 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1240 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1241 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1242 ; GFX940-NEXT: buffer_wbl2 sc1
1243 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1244 ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1245 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1246 ; GFX940-NEXT: buffer_inv sc1
1247 ; GFX940-NEXT: s_endpgm
1249 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
; `atomicrmw fadd double 4.0, syncscope("one-as") seq_cst` on a global pointer,
; under attribute group #1 (defined outside this chunk). Note that the function
; name says "system" while the syncscope string is "one-as". The gfx90a checks
; expect a global_atomic_cmpswap_x2 retry loop whose pre-atomic wait is
; `s_waitcnt vmcnt(0)` only; the gfx940 checks expect a single
; global_atomic_add_f64 with sc1.
1253 define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
1254 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
1255 ; GFX90A: ; %bb.0: ; %main_body
1256 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1257 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1258 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0
1259 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1260 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
1261 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1262 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
1263 ; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
1264 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1265 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1266 ; GFX90A-NEXT: buffer_wbl2
1267 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1268 ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
1269 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1270 ; GFX90A-NEXT: buffer_invl2
1271 ; GFX90A-NEXT: buffer_wbinvl1_vol
1272 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1273 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1274 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1275 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1276 ; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
1277 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1278 ; GFX90A-NEXT: s_endpgm
1280 ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
1281 ; GFX940: ; %bb.0: ; %main_body
1282 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1283 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1284 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1285 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1286 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1287 ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
1288 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1289 ; GFX940-NEXT: buffer_inv sc0 sc1
1290 ; GFX940-NEXT: s_endpgm
1292 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
; `atomicrmw fadd double 4.0, syncscope("agent") seq_cst` on a global pointer,
; compiled under attribute group #0 rather than #1 (both groups are defined
; outside this chunk). Both targets are expected to emit a single
; global_atomic_add_f64 with no retry loop.
1296 define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 {
1297 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
1298 ; GFX90A: ; %bb.0: ; %main_body
1299 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1300 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
1301 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1302 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
1303 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1304 ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1305 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1306 ; GFX90A-NEXT: buffer_wbinvl1_vol
1307 ; GFX90A-NEXT: s_endpgm
1309 ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush:
1310 ; GFX940: ; %bb.0: ; %main_body
1311 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1312 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1313 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1314 ; GFX940-NEXT: buffer_wbl2 sc1
1315 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1316 ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1317 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1318 ; GFX940-NEXT: buffer_inv sc1
1319 ; GFX940-NEXT: s_endpgm
1321 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
; Non-kernel function returning double that calls the
; llvm.amdgcn.global.atomic.fadd.f64 intrinsic. The checks expect the
; value-returning form of global_atomic_add_f64 (glc on gfx90a, sc0 on gfx940)
; writing v[0:1], followed by s_setpc_b64.
1325 define double @global_atomic_fadd_f64_rtn(ptr addrspace(1) %ptr, double %data) {
1326 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn:
1327 ; GFX90A: ; %bb.0: ; %main_body
1328 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1329 ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
1330 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1331 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1333 ; GFX940-LABEL: global_atomic_fadd_f64_rtn:
1334 ; GFX940: ; %bb.0: ; %main_body
1335 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1336 ; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
1337 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1338 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1340 %ret = call double @llvm.amdgcn.global.atomic.fadd.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
; Non-kernel function returning double: `atomicrmw fadd double 4.0, seq_cst`
; with no syncscope given, under attribute group #1 (defined outside this
; chunk). The visible atomicrmw adds the constant 4.0, not %data. The gfx90a
; checks expect a global_atomic_cmpswap_x2 retry loop with the old value
; copied into v[0:1] at the end; the gfx940 checks expect a single
; global_atomic_add_f64 with sc0 sc1.
1344 define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %data) #1 {
1345 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
1346 ; GFX90A: ; %bb.0: ; %main_body
1347 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1348 ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
1349 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1350 ; GFX90A-NEXT: .LBB47_1: ; %atomicrmw.start
1351 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1352 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1353 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1354 ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
1355 ; GFX90A-NEXT: buffer_wbl2
1356 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1357 ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
1358 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1359 ; GFX90A-NEXT: buffer_invl2
1360 ; GFX90A-NEXT: buffer_wbinvl1_vol
1361 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
1362 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1363 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
1364 ; GFX90A-NEXT: s_cbranch_execnz .LBB47_1
1365 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1366 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1367 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2
1368 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3
1369 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1371 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat:
1372 ; GFX940: ; %bb.0: ; %main_body
1373 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1374 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
1375 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1376 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1377 ; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
1378 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1379 ; GFX940-NEXT: buffer_inv sc0 sc1
1380 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1382 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
; Non-kernel function returning double:
; `atomicrmw fadd double 4.0, syncscope("agent") seq_cst`, under attribute
; group #1 (defined outside this chunk). The visible atomicrmw adds the
; constant 4.0, not %data. Both targets are expected to emit a single
; value-returning global_atomic_add_f64 (glc on gfx90a, sc0 on gfx940) with no
; retry loop.
1386 define double @global_atomic_fadd_f64_rtn_pat_agent(ptr addrspace(1) %ptr, double %data) #1 {
1387 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
1388 ; GFX90A: ; %bb.0: ; %main_body
1389 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1390 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1391 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
1392 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1393 ; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
1394 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1395 ; GFX90A-NEXT: buffer_wbinvl1_vol
1396 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1398 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
1399 ; GFX940: ; %bb.0: ; %main_body
1400 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1401 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
1402 ; GFX940-NEXT: buffer_wbl2 sc1
1403 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1404 ; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0
1405 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1406 ; GFX940-NEXT: buffer_inv sc1
1407 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1409 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
; Returning `atomicrmw fadd` of the constant 4.0 on a global (addrspace 1)
; pointer, seq_cst. Despite the `_system` suffix in the name, the instruction
; is written with syncscope("one-as").
; Expected gfx90a code is a global_atomic_cmpswap_x2 retry loop bracketed by
; buffer_wbl2 and buffer_invl2; expected gfx940 code is a single
; global_atomic_add_f64 with sc0 sc1.
1413 define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, double %data) #1 {
1414 ; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
1415 ; GFX90A: ; %bb.0: ; %main_body
1416 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1417 ; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
1418 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1419 ; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
1420 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1421 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1422 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1423 ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
1424 ; GFX90A-NEXT: buffer_wbl2
1425 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1426 ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc
1427 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1428 ; GFX90A-NEXT: buffer_invl2
1429 ; GFX90A-NEXT: buffer_wbinvl1_vol
1430 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
1431 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1432 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
1433 ; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
1434 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1435 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1436 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2
1437 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3
1438 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1440 ; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system:
1441 ; GFX940: ; %bb.0: ; %main_body
1442 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1443 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
1444 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1445 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1446 ; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
1447 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1448 ; GFX940-NEXT: buffer_inv sc0 sc1
1449 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1451 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
; Calls the llvm.amdgcn.global.atomic.fmax.f64 intrinsic on a global
; (addrspace 1) pointer with %data as the operand. Both targets are expected
; to select global_atomic_max_f64 in its returning form (glc on gfx90a,
; sc0 on gfx940).
1455 define double @global_atomic_fmax_f64_rtn(ptr addrspace(1) %ptr, double %data) {
1456 ; GFX90A-LABEL: global_atomic_fmax_f64_rtn:
1457 ; GFX90A: ; %bb.0: ; %main_body
1458 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1459 ; GFX90A-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off glc
1460 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1461 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1463 ; GFX940-LABEL: global_atomic_fmax_f64_rtn:
1464 ; GFX940: ; %bb.0: ; %main_body
1465 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1466 ; GFX940-NEXT: global_atomic_max_f64 v[0:1], v[0:1], v[2:3], off sc0
1467 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1468 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1470 %ret = call double @llvm.amdgcn.global.atomic.fmax.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
; Calls the llvm.amdgcn.global.atomic.fmin.f64 intrinsic on a global
; (addrspace 1) pointer with %data as the operand. Both targets are expected
; to select global_atomic_min_f64 in its returning form (glc on gfx90a,
; sc0 on gfx940).
1474 define double @global_atomic_fmin_f64_rtn(ptr addrspace(1) %ptr, double %data) {
1475 ; GFX90A-LABEL: global_atomic_fmin_f64_rtn:
1476 ; GFX90A: ; %bb.0: ; %main_body
1477 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1478 ; GFX90A-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off glc
1479 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1480 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1482 ; GFX940-LABEL: global_atomic_fmin_f64_rtn:
1483 ; GFX940: ; %bb.0: ; %main_body
1484 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1485 ; GFX940-NEXT: global_atomic_min_f64 v[0:1], v[0:1], v[2:3], off sc0
1486 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1487 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1489 %ret = call double @llvm.amdgcn.global.atomic.fmin.f64.p1.f64(ptr addrspace(1) %ptr, double %data)
; Kernel performing `atomicrmw fadd` of the constant 4.0 on a global
; (addrspace 1) pointer with syncscope("agent") seq_cst. The definition
; references no attribute group.
; Expected gfx90a code is a global_atomic_cmpswap_x2 retry loop, while gfx940
; is expected to use the non-returning global_atomic_add_f64.
1493 define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
1494 ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
1495 ; GFX90A: ; %bb.0: ; %main_body
1496 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1497 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1498 ; GFX90A-NEXT: v_mov_b32_e32 v4, 0
1499 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1500 ; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
1501 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1502 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
1503 ; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
1504 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1505 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1506 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1507 ; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
1508 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1509 ; GFX90A-NEXT: buffer_wbinvl1_vol
1510 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1511 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1512 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1513 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1514 ; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
1515 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1516 ; GFX90A-NEXT: s_endpgm
1518 ; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
1519 ; GFX940: ; %bb.0: ; %main_body
1520 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1521 ; GFX940-NEXT: v_mov_b32_e32 v2, 0
1522 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1523 ; GFX940-NEXT: buffer_wbl2 sc1
1524 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1525 ; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
1526 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1527 ; GFX940-NEXT: buffer_inv sc1
1528 ; GFX940-NEXT: s_endpgm
1530 %ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
; Kernel performing `atomicrmw fadd` of the constant 4.0 on a flat (generic)
; pointer, seq_cst with no explicit syncscope.
; Expected gfx90a code is a flat_atomic_cmpswap_x2 retry loop bracketed by
; buffer_wbl2 and buffer_invl2; expected gfx940 code is a single non-returning
; flat_atomic_add_f64 with sc1.
1534 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
1535 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
1536 ; GFX90A: ; %bb.0: ; %main_body
1537 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1538 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1539 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1540 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1541 ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1542 ; GFX90A-NEXT: .LBB53_1: ; %atomicrmw.start
1543 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1544 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1545 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1546 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
1547 ; GFX90A-NEXT: buffer_wbl2
1548 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1549 ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
1550 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1551 ; GFX90A-NEXT: buffer_invl2
1552 ; GFX90A-NEXT: buffer_wbinvl1_vol
1553 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1554 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1555 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1556 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1557 ; GFX90A-NEXT: s_cbranch_execnz .LBB53_1
1558 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1559 ; GFX90A-NEXT: s_endpgm
1561 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
1562 ; GFX940: ; %bb.0: ; %main_body
1563 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1564 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1565 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1566 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
1567 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1568 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1569 ; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
1570 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1571 ; GFX940-NEXT: buffer_inv sc0 sc1
1572 ; GFX940-NEXT: s_endpgm
1574 %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
; Kernel performing `atomicrmw fadd` of the constant 4.0 on a flat (generic)
; pointer with syncscope("agent") seq_cst. Both gfx90a and gfx940 are expected
; to emit a single non-returning flat_atomic_add_f64 with no compare-and-swap
; loop.
1578 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
1579 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
1580 ; GFX90A: ; %bb.0: ; %main_body
1581 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1582 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
1583 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
1584 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1585 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
1586 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1587 ; GFX90A-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
1588 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1589 ; GFX90A-NEXT: buffer_wbinvl1_vol
1590 ; GFX90A-NEXT: s_endpgm
1592 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
1593 ; GFX940: ; %bb.0: ; %main_body
1594 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1595 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1596 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1597 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
1598 ; GFX940-NEXT: buffer_wbl2 sc1
1599 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1600 ; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
1601 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1602 ; GFX940-NEXT: buffer_inv sc1
1603 ; GFX940-NEXT: s_endpgm
1605 %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
; Kernel performing `atomicrmw fadd` of the constant 4.0 on a flat (generic)
; pointer, seq_cst. Despite the `_system` suffix in the name, the instruction
; is written with syncscope("one-as").
; Expected gfx90a code is a flat_atomic_cmpswap_x2 retry loop; expected gfx940
; code is a single flat_atomic_add_f64 with sc1. In both expectations the
; waits immediately around the atomic are vmcnt(0) only.
1609 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
1610 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
1611 ; GFX90A: ; %bb.0: ; %main_body
1612 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1613 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1614 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1615 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1616 ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1617 ; GFX90A-NEXT: .LBB55_1: ; %atomicrmw.start
1618 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1619 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1620 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1621 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
1622 ; GFX90A-NEXT: buffer_wbl2
1623 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1624 ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
1625 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1626 ; GFX90A-NEXT: buffer_invl2
1627 ; GFX90A-NEXT: buffer_wbinvl1_vol
1628 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1629 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1630 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1631 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1632 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1633 ; GFX90A-NEXT: s_cbranch_execnz .LBB55_1
1634 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1635 ; GFX90A-NEXT: s_endpgm
1637 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
1638 ; GFX940: ; %bb.0: ; %main_body
1639 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1640 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1641 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1642 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
1643 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1644 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1645 ; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1] sc1
1646 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1647 ; GFX940-NEXT: buffer_inv sc0 sc1
1648 ; GFX940-NEXT: s_endpgm
1650 %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
; Returning `atomicrmw fadd` of the constant 4.0 on a flat (generic) pointer,
; seq_cst with no explicit syncscope.
; Expected gfx90a code is a flat_atomic_cmpswap_x2 retry loop bracketed by
; buffer_wbl2 and buffer_invl2; expected gfx940 code is a single returning
; flat_atomic_add_f64 with sc0 sc1.
1654 define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
1655 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
1656 ; GFX90A: ; %bb.0: ; %main_body
1657 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1658 ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1659 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1660 ; GFX90A-NEXT: .LBB56_1: ; %atomicrmw.start
1661 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1662 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1663 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1664 ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
1665 ; GFX90A-NEXT: buffer_wbl2
1666 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1667 ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
1668 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1669 ; GFX90A-NEXT: buffer_invl2
1670 ; GFX90A-NEXT: buffer_wbinvl1_vol
1671 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
1672 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1673 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
1674 ; GFX90A-NEXT: s_cbranch_execnz .LBB56_1
1675 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1676 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1677 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2
1678 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3
1679 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1681 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat:
1682 ; GFX940: ; %bb.0: ; %main_body
1683 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1684 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
1685 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1686 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1687 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
1688 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1689 ; GFX940-NEXT: buffer_inv sc0 sc1
1690 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1692 %ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
; Returning `atomicrmw fadd` of the constant 4.0 on a flat (generic) pointer
; with syncscope("agent") seq_cst. Both gfx90a and gfx940 are expected to emit
; a single returning flat_atomic_add_f64 (glc on gfx90a, sc0 on gfx940) with
; no compare-and-swap loop.
1696 define double @flat_atomic_fadd_f64_rtn_pat_agent(ptr %ptr) #1 {
1697 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
1698 ; GFX90A: ; %bb.0: ; %main_body
1699 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1700 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1701 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
1702 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1703 ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
1704 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1705 ; GFX90A-NEXT: buffer_wbinvl1_vol
1706 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1708 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
1709 ; GFX940: ; %bb.0: ; %main_body
1710 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1711 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
1712 ; GFX940-NEXT: buffer_wbl2 sc1
1713 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1714 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
1715 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1716 ; GFX940-NEXT: buffer_inv sc1
1717 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1719 %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
; Returning `atomicrmw fadd` of the constant 4.0 on a flat (generic) pointer,
; seq_cst. Despite the `_system` suffix in the name, the instruction is
; written with syncscope("one-as").
; Expected gfx90a code is a flat_atomic_cmpswap_x2 retry loop; expected gfx940
; code is a single returning flat_atomic_add_f64 with sc0 sc1. Both
; expectations wait on vmcnt(0) around the atomic and on lgkmcnt(0) separately
; afterwards.
1723 define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
1724 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
1725 ; GFX90A: ; %bb.0: ; %main_body
1726 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1727 ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1728 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1729 ; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
1730 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1731 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1732 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
1733 ; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
1734 ; GFX90A-NEXT: buffer_wbl2
1735 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1736 ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc
1737 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1738 ; GFX90A-NEXT: buffer_invl2
1739 ; GFX90A-NEXT: buffer_wbinvl1_vol
1740 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1741 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
1742 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1743 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
1744 ; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
1745 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1746 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1747 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2
1748 ; GFX90A-NEXT: v_mov_b32_e32 v1, v3
1749 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1751 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
1752 ; GFX940: ; %bb.0: ; %main_body
1753 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1754 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
1755 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
1756 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1757 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
1758 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1759 ; GFX940-NEXT: buffer_inv sc0 sc1
1760 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1761 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1763 %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
; Kernel calling the llvm.amdgcn.flat.atomic.fadd.f64 intrinsic on a flat
; (generic) pointer with %data as the operand. Both targets are expected to
; select the non-returning flat_atomic_add_f64 with no cache-control bits and
; no surrounding fences.
1767 define amdgpu_kernel void @flat_atomic_fadd_f64_noret(ptr %ptr, double %data) {
1768 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret:
1769 ; GFX90A: ; %bb.0: ; %main_body
1770 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1771 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1772 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0
1773 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1
1774 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2
1775 ; GFX90A-NEXT: v_mov_b32_e32 v3, s3
1776 ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
1777 ; GFX90A-NEXT: s_endpgm
1779 ; GFX940-LABEL: flat_atomic_fadd_f64_noret:
1780 ; GFX940: ; %bb.0: ; %main_body
1781 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1782 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1783 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
1784 ; GFX940-NEXT: v_mov_b32_e32 v1, s1
1785 ; GFX940-NEXT: v_mov_b32_e32 v2, s2
1786 ; GFX940-NEXT: v_mov_b32_e32 v3, s3
1787 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
1788 ; GFX940-NEXT: s_endpgm
1790 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
; Calls the llvm.amdgcn.flat.atomic.fadd.f64 intrinsic on a flat (generic)
; pointer with %data as the operand. Both targets are expected to select
; flat_atomic_add_f64 in its returning form (glc on gfx90a, sc0 on gfx940).
1794 define double @flat_atomic_fadd_f64_rtn(ptr %ptr, double %data) {
1795 ; GFX90A-LABEL: flat_atomic_fadd_f64_rtn:
1796 ; GFX90A: ; %bb.0: ; %main_body
1797 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1798 ; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
1799 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1800 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1802 ; GFX940-LABEL: flat_atomic_fadd_f64_rtn:
1803 ; GFX940: ; %bb.0: ; %main_body
1804 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1805 ; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0
1806 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1807 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1809 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p0.f64(ptr %ptr, double %data)
; Kernel performing `atomicrmw fadd` of the constant 4.0 on a flat (generic)
; pointer with syncscope("agent") seq_cst. The definition references no
; attribute group.
; Expected gfx90a code is a flat_atomic_cmpswap_x2 retry loop, while gfx940 is
; expected to use the non-returning flat_atomic_add_f64.
1813 define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
1814 ; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
1815 ; GFX90A: ; %bb.0: ; %main_body
1816 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1817 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1818 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1819 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
1820 ; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
1821 ; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
1822 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1823 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1824 ; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
1825 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
1826 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1827 ; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
1828 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1829 ; GFX90A-NEXT: buffer_wbinvl1_vol
1830 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
1831 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1832 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1833 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1834 ; GFX90A-NEXT: s_cbranch_execnz .LBB61_1
1835 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1836 ; GFX90A-NEXT: s_endpgm
1838 ; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
1839 ; GFX940: ; %bb.0: ; %main_body
1840 ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1841 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
1842 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1843 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
1844 ; GFX940-NEXT: buffer_wbl2 sc1
1845 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1846 ; GFX940-NEXT: flat_atomic_add_f64 v[2:3], v[0:1]
1847 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1848 ; GFX940-NEXT: buffer_inv sc1
1849 ; GFX940-NEXT: s_endpgm
1851 %ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
; Kernel calling the llvm.amdgcn.flat.atomic.fmin.f64 intrinsic on a flat
; (generic) pointer with %data as the operand. Both targets are expected to
; select the non-returning flat_atomic_min_f64 with no cache-control bits.
1855 define amdgpu_kernel void @flat_atomic_fmin_f64_noret(ptr %ptr, double %data) {
1856 ; GFX90A-LABEL: flat_atomic_fmin_f64_noret:
1857 ; GFX90A: ; %bb.0: ; %main_body
1858 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1859 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0
1861 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1
1862 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2
1863 ; GFX90A-NEXT: v_mov_b32_e32 v3, s3
1864 ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
1865 ; GFX90A-NEXT: s_endpgm
1867 ; GFX940-LABEL: flat_atomic_fmin_f64_noret:
1868 ; GFX940: ; %bb.0: ; %main_body
1869 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1870 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1871 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
1872 ; GFX940-NEXT: v_mov_b32_e32 v1, s1
1873 ; GFX940-NEXT: v_mov_b32_e32 v2, s2
1874 ; GFX940-NEXT: v_mov_b32_e32 v3, s3
1875 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[2:3]
1876 ; GFX940-NEXT: s_endpgm
1878 %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
; Calls the llvm.amdgcn.flat.atomic.fmin.f64 intrinsic on a flat (generic)
; pointer with %data as the operand. Both targets are expected to select
; flat_atomic_min_f64 in its returning form (glc on gfx90a, sc0 on gfx940).
1882 define double @flat_atomic_fmin_f64_rtn(ptr %ptr, double %data) {
1883 ; GFX90A-LABEL: flat_atomic_fmin_f64_rtn:
1884 ; GFX90A: ; %bb.0: ; %main_body
1885 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1886 ; GFX90A-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] glc
1887 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1888 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1890 ; GFX940-LABEL: flat_atomic_fmin_f64_rtn:
1891 ; GFX940: ; %bb.0: ; %main_body
1892 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1893 ; GFX940-NEXT: flat_atomic_min_f64 v[0:1], v[0:1], v[2:3] sc0
1894 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1895 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1897 %ret = call double @llvm.amdgcn.flat.atomic.fmin.f64.p0.f64(ptr %ptr, double %data)
; Kernel calling the llvm.amdgcn.flat.atomic.fmax.f64 intrinsic on a flat
; (generic) pointer with %data as the operand. Both targets are expected to
; select the non-returning flat_atomic_max_f64 with no cache-control bits.
1901 define amdgpu_kernel void @flat_atomic_fmax_f64_noret(ptr %ptr, double %data) {
1902 ; GFX90A-LABEL: flat_atomic_fmax_f64_noret:
1903 ; GFX90A: ; %bb.0: ; %main_body
1904 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1905 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1906 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0
1907 ; GFX90A-NEXT: v_mov_b32_e32 v1, s1
1908 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2
1909 ; GFX90A-NEXT: v_mov_b32_e32 v3, s3
1910 ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
1911 ; GFX90A-NEXT: s_endpgm
1913 ; GFX940-LABEL: flat_atomic_fmax_f64_noret:
1914 ; GFX940: ; %bb.0: ; %main_body
1915 ; GFX940-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
1916 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1917 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
1918 ; GFX940-NEXT: v_mov_b32_e32 v1, s1
1919 ; GFX940-NEXT: v_mov_b32_e32 v2, s2
1920 ; GFX940-NEXT: v_mov_b32_e32 v3, s3
1921 ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[2:3]
1922 ; GFX940-NEXT: s_endpgm
1924 %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
; Calls the llvm.amdgcn.flat.atomic.fmax.f64 intrinsic on a flat (generic)
; pointer with %data as the operand. Both targets are expected to select
; flat_atomic_max_f64 in its returning form (glc on gfx90a, sc0 on gfx940).
1928 define double @flat_atomic_fmax_f64_rtn(ptr %ptr, double %data) {
1929 ; GFX90A-LABEL: flat_atomic_fmax_f64_rtn:
1930 ; GFX90A: ; %bb.0: ; %main_body
1931 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1932 ; GFX90A-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] glc
1933 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1934 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1936 ; GFX940-LABEL: flat_atomic_fmax_f64_rtn:
1937 ; GFX940: ; %bb.0: ; %main_body
1938 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1939 ; GFX940-NEXT: flat_atomic_max_f64 v[0:1], v[0:1], v[2:3] sc0
1940 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1941 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1943 %ret = call double @llvm.amdgcn.flat.atomic.fmax.f64.p0.f64(ptr %ptr, double %data)
; Kernel calling the llvm.amdgcn.ds.fadd.f64 intrinsic on an addrspace(3)
; pointer with %data as the operand and all three trailing immediates zero.
; Both targets are expected to select the non-returning ds_add_f64.
1947 define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, double %data) {
1948 ; GFX90A-LABEL: local_atomic_fadd_f64_noret:
1949 ; GFX90A: ; %bb.0: ; %main_body
1950 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
1951 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1952 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1953 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4
1954 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
1955 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
1956 ; GFX90A-NEXT: s_endpgm
1958 ; GFX940-LABEL: local_atomic_fadd_f64_noret:
1959 ; GFX940: ; %bb.0: ; %main_body
1960 ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
1961 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
1962 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1963 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
1964 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
1965 ; GFX940-NEXT: ds_add_f64 v2, v[0:1]
1966 ; GFX940-NEXT: s_endpgm
1968 %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
; Calls the llvm.amdgcn.ds.fadd.f64 intrinsic on an addrspace(3) pointer with
; %data as the operand and all three trailing immediates zero. Both targets
; are expected to select the returning ds_add_rtn_f64, after moving %data
; from v[1:2] into v[2:3].
1972 define double @local_atomic_fadd_f64_rtn(ptr addrspace(3) %ptr, double %data) {
1973 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn:
1974 ; GFX90A: ; %bb.0: ; %main_body
1975 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1976 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
1977 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1
1978 ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
1979 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1980 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1982 ; GFX940-LABEL: local_atomic_fadd_f64_rtn:
1983 ; GFX940: ; %bb.0: ; %main_body
1984 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1985 ; GFX940-NEXT: v_mov_b32_e32 v3, v2
1986 ; GFX940-NEXT: v_mov_b32_e32 v2, v1
1987 ; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
1988 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1989 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1991 %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
; Kernel calling the flat fadd intrinsic in its addrspace(3) overload
; (llvm.amdgcn.flat.atomic.fadd.f64.p3.f64) with %data as the operand. Both
; targets are expected to select the non-returning ds_add_f64 rather than a
; flat instruction.
1995 define amdgpu_kernel void @local_atomic_fadd_f64_noret_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
1996 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic:
1997 ; GFX90A: ; %bb.0: ; %main_body
1998 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
1999 ; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
2000 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2001 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4
2002 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[2:3], s[2:3] op_sel:[0,1]
2003 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
2004 ; GFX90A-NEXT: s_endpgm
2006 ; GFX940-LABEL: local_atomic_fadd_f64_noret_from_flat_intrinsic:
2007 ; GFX940: ; %bb.0: ; %main_body
2008 ; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
2009 ; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c
2010 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2011 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
2012 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
2013 ; GFX940-NEXT: ds_add_f64 v2, v[0:1]
2014 ; GFX940-NEXT: s_endpgm
2016 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
; Calls the flat fadd intrinsic in its addrspace(3) overload
; (llvm.amdgcn.flat.atomic.fadd.f64.p3.f64) with %data as the operand. Both
; targets are expected to select the returning ds_add_rtn_f64 rather than a
; flat instruction.
2020 define double @local_atomic_fadd_f64_rtn_from_flat_intrinsic(ptr addrspace(3) %ptr, double %data) {
2021 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
2022 ; GFX90A: ; %bb.0: ; %main_body
2023 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2024 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
2025 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1
2026 ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2027 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2028 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
2030 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_from_flat_intrinsic:
2031 ; GFX940: ; %bb.0: ; %main_body
2032 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2033 ; GFX940-NEXT: v_mov_b32_e32 v3, v2
2034 ; GFX940-NEXT: v_mov_b32_e32 v2, v1
2035 ; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2036 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2037 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2039 %ret = call double @llvm.amdgcn.flat.atomic.fadd.f64.p3.f64(ptr addrspace(3) %ptr, double %data)
; Kernel performing `atomicrmw fadd` of the constant 4.0 on an addrspace(3)
; pointer, seq_cst, under attribute group #1 (defined outside this view).
; Both targets are expected to select the non-returning ds_add_f64 with
; s_waitcnt lgkmcnt(0) before and after it.
2043 define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
2044 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
2045 ; GFX90A: ; %bb.0: ; %main_body
2046 ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
2047 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
2048 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
2049 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2050 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0
2051 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2052 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
2053 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2054 ; GFX90A-NEXT: s_endpgm
2056 ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
2057 ; GFX940: ; %bb.0: ; %main_body
2058 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
2059 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
2060 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2061 ; GFX940-NEXT: v_mov_b32_e32 v2, s0
2062 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2063 ; GFX940-NEXT: ds_add_f64 v2, v[0:1]
2064 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2065 ; GFX940-NEXT: s_endpgm
2067 %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
; Kernel performing `atomicrmw fadd` of the constant 4.0 on an addrspace(3)
; pointer, seq_cst, under attribute group #0 (defined outside this view).
; Both targets are expected to select the non-returning ds_add_f64; the
; expected instruction sequence matches that of
; local_atomic_fadd_f64_noret_pat.
2071 define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 {
2072 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
2073 ; GFX90A: ; %bb.0: ; %main_body
2074 ; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
2075 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
2076 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
2077 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2078 ; GFX90A-NEXT: v_mov_b32_e32 v2, s0
2079 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2080 ; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
2081 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2082 ; GFX90A-NEXT: s_endpgm
2084 ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
2085 ; GFX940: ; %bb.0: ; %main_body
2086 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
2087 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
2088 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2089 ; GFX940-NEXT: v_mov_b32_e32 v2, s0
2090 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2091 ; GFX940-NEXT: ds_add_f64 v2, v[0:1]
2092 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2093 ; GFX940-NEXT: s_endpgm
2095 %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
; Kernel performing `atomicrmw fadd` of the constant 4.0 on an addrspace(3)
; pointer, seq_cst, under attribute group #4 (defined outside this view).
; Unlike the other local `_pat` kernels, both targets are expected to expand
; this into a ds_read_b64 followed by a ds_cmpst_rtn_b64 retry loop instead of
; a single ds_add_f64.
2099 define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
2100 ; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
2101 ; GFX90A: ; %bb.0: ; %main_body
2102 ; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24
2103 ; GFX90A-NEXT: s_mov_b64 s[0:1], 0
2104 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2105 ; GFX90A-NEXT: v_mov_b32_e32 v0, s2
2106 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0
2107 ; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start
2108 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
2109 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2110 ; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
2111 ; GFX90A-NEXT: v_mov_b32_e32 v4, s2
2112 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2113 ; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
2114 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2115 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
2116 ; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2117 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
2118 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
2119 ; GFX90A-NEXT: s_cbranch_execnz .LBB72_1
2120 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
2121 ; GFX90A-NEXT: s_endpgm
2123 ; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
2124 ; GFX940: ; %bb.0: ; %main_body
2125 ; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24
2126 ; GFX940-NEXT: s_mov_b64 s[0:1], 0
2127 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2128 ; GFX940-NEXT: v_mov_b32_e32 v0, s2
2129 ; GFX940-NEXT: ds_read_b64 v[0:1], v0
2130 ; GFX940-NEXT: .LBB72_1: ; %atomicrmw.start
2131 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
2132 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2133 ; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
2134 ; GFX940-NEXT: v_mov_b32_e32 v4, s2
2135 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2136 ; GFX940-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
2137 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2138 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
2139 ; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2140 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
2141 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
2142 ; GFX940-NEXT: s_cbranch_execnz .LBB72_1
2143 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
2144 ; GFX940-NEXT: s_endpgm
2146 %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
; Returning `atomicrmw fadd` of the constant 4.0 on an addrspace(3) pointer,
; seq_cst, under attribute group #1 (defined outside this view); %data is not
; referenced by the atomicrmw. Both targets are expected to select the
; returning ds_add_rtn_f64.
2150 define double @local_atomic_fadd_f64_rtn_pat(ptr addrspace(3) %ptr, double %data) #1 {
2151 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_pat:
2152 ; GFX90A: ; %bb.0: ; %main_body
2153 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2154 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
2155 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
2156 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2157 ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2158 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2159 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
2161 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_pat:
2162 ; GFX940: ; %bb.0: ; %main_body
2163 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2164 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
2165 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2166 ; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2167 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2168 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2170 %ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
; Calls the llvm.amdgcn.ds.fadd.f64 intrinsic on a ptr addrspace(3) operand with
; %data as the addend and the three trailing operands all zero. Attribute group
; #2 combines "denormal-fp-math"="ieee,ieee" with
; "amdgpu-unsafe-fp-atomics"="true". The autogenerated checks expect both gfx90a
; and gfx940 to copy v1/v2 into v[2:3] (presumably the incoming %data; confirm
; against the calling convention) and then issue a single ds_add_rtn_f64.
2174 define double @local_atomic_fadd_f64_rtn_ieee_unsafe(ptr addrspace(3) %ptr, double %data) #2 {
2175 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
2176 ; GFX90A: ; %bb.0: ; %main_body
2177 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2178 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
2179 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1
2180 ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2181 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
2184 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_unsafe:
2185 ; GFX940: ; %bb.0: ; %main_body
2186 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2187 ; GFX940-NEXT: v_mov_b32_e32 v3, v2
2188 ; GFX940-NEXT: v_mov_b32_e32 v2, v1
2189 ; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2190 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2191 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2193 %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
; Same llvm.amdgcn.ds.fadd.f64 intrinsic call as
; local_atomic_fadd_f64_rtn_ieee_unsafe, but under attribute group #3, which
; sets only "denormal-fp-math"="ieee,ieee" and does not set
; "amdgpu-unsafe-fp-atomics". The autogenerated checks expect the same
; instruction sequence as the unsafe variant on both gfx90a and gfx940: two
; 32-bit register copies into v[2:3] followed by a single ds_add_rtn_f64.
2197 define double @local_atomic_fadd_f64_rtn_ieee_safe(ptr addrspace(3) %ptr, double %data) #3 {
2198 ; GFX90A-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
2199 ; GFX90A: ; %bb.0: ; %main_body
2200 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2201 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
2202 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1
2203 ; GFX90A-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2204 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
2205 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
2207 ; GFX940-LABEL: local_atomic_fadd_f64_rtn_ieee_safe:
2208 ; GFX940: ; %bb.0: ; %main_body
2209 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2210 ; GFX940-NEXT: v_mov_b32_e32 v3, v2
2211 ; GFX940-NEXT: v_mov_b32_e32 v2, v1
2212 ; GFX940-NEXT: ds_add_rtn_f64 v[0:1], v0, v[2:3]
2213 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2214 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2216 %ret = call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %ptr, double %data, i32 0, i32 0, i1 0)
; Function attribute groups, selected by the `#N` suffix on each define.
; They vary two knobs: the "denormal-fp-math" mode and whether
; "amdgpu-unsafe-fp-atomics" is set.
; #0: preserve-sign denormal mode, unsafe FP atomics enabled.
2220 attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
; #1: unsafe FP atomics enabled, no explicit denormal mode.
2221 attributes #1 = { "amdgpu-unsafe-fp-atomics"="true" }
; #2: ieee denormal mode, unsafe FP atomics enabled.
2222 attributes #2 = { "denormal-fp-math"="ieee,ieee" "amdgpu-unsafe-fp-atomics"="true" }
; #3: ieee denormal mode, unsafe FP atomics not set.
2223 attributes #3 = { "denormal-fp-math"="ieee,ieee" }
; #4: preserve-sign denormal mode, unsafe FP atomics not set.
2224 attributes #4 = { "denormal-fp-math"="preserve-sign,preserve-sign" }