1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-SDAG
3 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12-GISEL
; Declarations of the AMDGPU packed 16-bit fadd atomic intrinsics called by the
; test functions in this file.
; The buffer v2bf16 variants are declared with <2 x bfloat> operands, while the
; global/ds/flat v2bf16 variants are declared with <2 x i16> operands.
; The ds v2f16 variant takes three extra operands (i32, i32, i1); the ds v2bf16
; variant takes only the pointer and the data.
5 declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg)
6 declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
7 declare <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32)
8 declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
9 declare <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
10 declare <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
11 declare <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32, i32, i1)
12 declare <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
13 declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
14 declare <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
; Kernel returning void that calls llvm.amdgcn.ds.fadd.v2f16 on an addrspace(3)
; pointer with the three trailing operands set to 0.
; Both run configurations expect the same sequence: the kernel arguments are
; loaded, then ds_pk_add_f16 (the form without a result register) is issued
; between global_wb and global_inv at SCOPE_SE.
16 define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
17 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_noret:
18 ; GFX12-SDAG: ; %bb.0:
19 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
20 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
21 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
22 ; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
23 ; GFX12-SDAG-NEXT: ds_pk_add_f16 v0, v1
24 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
25 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
26 ; GFX12-SDAG-NEXT: s_endpgm
28 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_noret:
29 ; GFX12-GISEL: ; %bb.0:
30 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
31 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
32 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
33 ; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
34 ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
35 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
36 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
37 ; GFX12-GISEL-NEXT: s_endpgm
38 %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
; Kernel returning void that calls llvm.amdgcn.ds.fadd.v2bf16 on an
; addrspace(3) pointer; the data operand is typed <2 x i16>.
; NOTE(review): the two run configurations expect different mnemonics here --
; ds_pk_add_bf16 with -global-isel=0 and ds_pk_add_f16 with -global-isel=1.
; The assertions are autogenerated, so this mirrors what llc emitted; confirm
; whether the -global-isel=1 selection is intended.
42 define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
43 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_noret:
44 ; GFX12-SDAG: ; %bb.0:
45 ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
46 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
47 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
48 ; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
49 ; GFX12-SDAG-NEXT: ds_pk_add_bf16 v0, v1
50 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
51 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
52 ; GFX12-SDAG-NEXT: s_endpgm
54 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_noret:
55 ; GFX12-GISEL: ; %bb.0:
56 ; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
57 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
58 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
59 ; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
60 ; GFX12-GISEL-NEXT: ds_pk_add_f16 v0, v1
61 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
62 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
63 ; GFX12-GISEL-NEXT: s_endpgm
64 %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
; Non-kernel function with a <2 x half> return type that calls
; llvm.amdgcn.ds.fadd.v2f16 on an addrspace(3) pointer.
; Both run configurations expect ds_pk_add_rtn_f16 writing v0, followed by a
; wait on dscnt before returning via s_setpc_b64.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
68 define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half> %data) {
69 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2f16_rtn:
70 ; GFX12-SDAG: ; %bb.0:
71 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
72 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
73 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
74 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
75 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
76 ; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
77 ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
78 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
79 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
80 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
81 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
83 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2f16_rtn:
84 ; GFX12-GISEL: ; %bb.0:
85 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
86 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
87 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
88 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
89 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
90 ; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
91 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
92 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
93 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
94 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
95 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
96 %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
; Non-kernel function with a <2 x i16> return type that calls
; llvm.amdgcn.ds.fadd.v2bf16 on an addrspace(3) pointer.
; NOTE(review): the two run configurations expect different mnemonics here --
; ds_pk_add_rtn_bf16 with -global-isel=0 and ds_pk_add_rtn_f16 with
; -global-isel=1. The assertions are autogenerated; confirm whether the
; -global-isel=1 selection is intended.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
100 define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16> %data) {
101 ; GFX12-SDAG-LABEL: local_atomic_fadd_v2bf16_rtn:
102 ; GFX12-SDAG: ; %bb.0:
103 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
104 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
105 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
106 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
107 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
108 ; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
109 ; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
110 ; GFX12-SDAG-NEXT: ds_pk_add_rtn_bf16 v0, v0, v1
111 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
112 ; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
113 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
115 ; GFX12-GISEL-LABEL: local_atomic_fadd_v2bf16_rtn:
116 ; GFX12-GISEL: ; %bb.0:
117 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
118 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
119 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
120 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
121 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
122 ; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
123 ; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
124 ; GFX12-GISEL-NEXT: ds_pk_add_rtn_f16 v0, v0, v1
125 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
126 ; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
127 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
128 %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
; Kernel returning void that calls llvm.amdgcn.flat.atomic.fadd.v2f16 on a
; generic (unqualified) pointer.
; Both run configurations expect the pointer and data to be copied from SGPRs
; into v[0:1] and v2, followed by flat_atomic_pk_add_f16 without a result
; register or TH_ATOMIC_RETURN.
132 define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %data) {
133 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_noret:
134 ; GFX12-SDAG: ; %bb.0:
135 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
136 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
137 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
138 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
139 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
140 ; GFX12-SDAG-NEXT: s_endpgm
142 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_noret:
143 ; GFX12-GISEL: ; %bb.0:
144 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
145 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
146 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
147 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
148 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v[0:1], v2
149 ; GFX12-GISEL-NEXT: s_endpgm
150 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
; Non-kernel function with a <2 x half> return type that calls
; llvm.amdgcn.flat.atomic.fadd.v2f16 on a generic pointer.
; Both run configurations expect flat_atomic_pk_add_f16 with a v0 destination
; and th:TH_ATOMIC_RETURN, then a combined loadcnt/dscnt wait before returning.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
154 define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
155 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2f16_rtn:
156 ; GFX12-SDAG: ; %bb.0:
157 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
158 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
159 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
160 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
161 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
162 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
163 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
164 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
166 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2f16_rtn:
167 ; GFX12-GISEL: ; %bb.0:
168 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
169 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
170 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
171 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
172 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
173 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
174 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
175 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
176 %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
; Kernel returning void that calls llvm.amdgcn.flat.atomic.fadd.v2bf16 on a
; generic pointer; the data operand is typed <2 x i16>.
; Both run configurations expect flat_atomic_pk_add_bf16 without a result
; register or TH_ATOMIC_RETURN.
180 define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %data) {
181 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_noret:
182 ; GFX12-SDAG: ; %bb.0:
183 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
184 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
185 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
186 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
187 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
188 ; GFX12-SDAG-NEXT: s_endpgm
190 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_noret:
191 ; GFX12-GISEL: ; %bb.0:
192 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
193 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
194 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
195 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
196 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2
197 ; GFX12-GISEL-NEXT: s_endpgm
198 %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
; Non-kernel function with a <2 x i16> return type that calls
; llvm.amdgcn.flat.atomic.fadd.v2bf16 on a generic pointer.
; Both run configurations expect flat_atomic_pk_add_bf16 with a v0 destination
; and th:TH_ATOMIC_RETURN, then a combined loadcnt/dscnt wait before returning.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
202 define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
203 ; GFX12-SDAG-LABEL: flat_atomic_fadd_v2bf16_rtn:
204 ; GFX12-SDAG: ; %bb.0:
205 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
206 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
207 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
208 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
209 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
210 ; GFX12-SDAG-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
211 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
212 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
214 ; GFX12-GISEL-LABEL: flat_atomic_fadd_v2bf16_rtn:
215 ; GFX12-GISEL: ; %bb.0:
216 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
217 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
218 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
219 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
220 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
221 ; GFX12-GISEL-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
222 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
223 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
224 %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
; Kernel returning void that calls llvm.amdgcn.global.atomic.fadd.v2bf16 on an
; addrspace(1) pointer; the data operand is typed <2 x i16>.
; Both run configurations expect global_atomic_pk_add_bf16 addressed through
; the SGPR pair s[0:1] with a zero VGPR offset. The expected output differs
; only in register assignment: -global-isel=0 puts the zero in v0 and the data
; in v1, while -global-isel=1 uses v1 for the zero and v0 for the data.
228 define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr, <2 x i16> %data) {
229 ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_noret:
230 ; GFX12-SDAG: ; %bb.0:
231 ; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
232 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
233 ; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
234 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v1, s[0:1]
235 ; GFX12-SDAG-NEXT: s_nop 0
236 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
237 ; GFX12-SDAG-NEXT: s_endpgm
239 ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_noret:
240 ; GFX12-GISEL: ; %bb.0:
241 ; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
242 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
243 ; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
244 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v1, v0, s[0:1]
245 ; GFX12-GISEL-NEXT: s_nop 0
246 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
247 ; GFX12-GISEL-NEXT: s_endpgm
248 %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
; Non-kernel function with a <2 x i16> return type that calls
; llvm.amdgcn.global.atomic.fadd.v2bf16 on an addrspace(1) pointer.
; Both run configurations expect global_atomic_pk_add_bf16 with a v0
; destination, a VGPR-pair address with `off`, and th:TH_ATOMIC_RETURN,
; followed by a loadcnt wait before returning.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
252 define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16> %data) {
253 ; GFX12-SDAG-LABEL: global_atomic_fadd_v2bf16_rtn:
254 ; GFX12-SDAG: ; %bb.0:
255 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
256 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
257 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
258 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
259 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
260 ; GFX12-SDAG-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
261 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
262 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
264 ; GFX12-GISEL-LABEL: global_atomic_fadd_v2bf16_rtn:
265 ; GFX12-GISEL: ; %bb.0:
266 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
267 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
268 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
269 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
270 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
271 ; GFX12-GISEL-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
272 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
273 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
274 %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
; Non-kernel function returning void that calls
; llvm.amdgcn.global.atomic.fadd.v2f16 on an addrspace(1) pointer.
; Both run configurations expect global_atomic_pk_add_f16 with a VGPR-pair
; address and `off`, without a result register or TH_ATOMIC_RETURN.
; The expected block comment `%main_body` implies the entry block carries that
; label, although the label line itself is not visible in this excerpt.
278 define void @global_atomic_pk_add_v2f16(ptr addrspace(1) %ptr, <2 x half> %data) {
279 ; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16:
280 ; GFX12-SDAG: ; %bb.0: ; %main_body
281 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
282 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
283 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
284 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
285 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
286 ; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
287 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
289 ; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16:
290 ; GFX12-GISEL: ; %bb.0: ; %main_body
291 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
292 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
293 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
294 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
295 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
296 ; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off
297 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
299 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; Non-kernel function with a <2 x half> return type that calls
; llvm.amdgcn.global.atomic.fadd.v2f16 on an addrspace(1) pointer.
; Both run configurations expect global_atomic_pk_add_f16 with a v0
; destination and th:TH_ATOMIC_RETURN, followed by a loadcnt wait.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
303 define <2 x half> @global_atomic_pk_add_v2f16_rtn(ptr addrspace(1) %ptr, <2 x half> %data) {
304 ; GFX12-SDAG-LABEL: global_atomic_pk_add_v2f16_rtn:
305 ; GFX12-SDAG: ; %bb.0: ; %main_body
306 ; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
307 ; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
308 ; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
309 ; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
310 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
311 ; GFX12-SDAG-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
312 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
313 ; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
315 ; GFX12-GISEL-LABEL: global_atomic_pk_add_v2f16_rtn:
316 ; GFX12-GISEL: ; %bb.0: ; %main_body
317 ; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
318 ; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
319 ; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
320 ; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
321 ; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
322 ; GFX12-GISEL-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
323 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
324 ; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
326 %ret = call <2 x half> @llvm.amdgcn.global.atomic.fadd.v2f16.p1.v2f16(ptr addrspace(1) %ptr, <2 x half> %data)
; amdgpu_ps function returning void that calls
; llvm.amdgcn.raw.buffer.atomic.fadd.v2f16 with the constant 92 as the third
; operand (the %voffset argument is unused here). The expected output encodes
; that constant as `offset:92` with `off` in place of a VGPR address.
; NOTE(review): for this v2f16 intrinsic the -global-isel=1 checks expect
; buffer_atomic_pk_add_bf16, whereas the -global-isel=0 checks expect
; buffer_atomic_pk_add_f16. The assertions are autogenerated; confirm whether
; the -global-isel=1 selection is intended.
330 define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
331 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
332 ; GFX12-SDAG: ; %bb.0:
333 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
334 ; GFX12-SDAG-NEXT: s_nop 0
335 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
336 ; GFX12-SDAG-NEXT: s_endpgm
338 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
339 ; GFX12-GISEL: ; %bb.0:
340 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92
341 ; GFX12-GISEL-NEXT: s_nop 0
342 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
343 ; GFX12-GISEL-NEXT: s_endpgm
344 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
; amdgpu_ps function returning void that calls
; llvm.amdgcn.raw.buffer.atomic.fadd.v2f16 with %voffset as the third operand.
; The expected output uses v1 as the address operand with the `offen` modifier
; and no TH_ATOMIC_RETURN.
; NOTE(review): for this v2f16 intrinsic the -global-isel=1 checks expect
; buffer_atomic_pk_add_bf16, whereas the -global-isel=0 checks expect
; buffer_atomic_pk_add_f16. The assertions are autogenerated; confirm whether
; the -global-isel=1 selection is intended.
348 define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
349 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_noret:
350 ; GFX12-SDAG: ; %bb.0:
351 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
352 ; GFX12-SDAG-NEXT: s_nop 0
353 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
354 ; GFX12-SDAG-NEXT: s_endpgm
356 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
357 ; GFX12-GISEL: ; %bb.0:
358 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
359 ; GFX12-GISEL-NEXT: s_nop 0
360 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
361 ; GFX12-GISEL-NEXT: s_endpgm
362 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
; amdgpu_ps function with a <2 x half> return type that calls
; llvm.amdgcn.raw.buffer.atomic.fadd.v2f16 with the constant 92 as the third
; operand (the %voffset argument is unused here). The expected output uses
; `offset:92` with th:TH_ATOMIC_RETURN and waits on loadcnt before the epilog.
; NOTE(review): for this v2f16 intrinsic the -global-isel=1 checks expect
; buffer_atomic_pk_add_bf16, whereas the -global-isel=0 checks expect
; buffer_atomic_pk_add_f16. The assertions are autogenerated; confirm whether
; the -global-isel=1 selection is intended.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
366 define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
367 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
368 ; GFX12-SDAG: ; %bb.0:
369 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
370 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
371 ; GFX12-SDAG-NEXT: ; return to shader part epilog
373 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
374 ; GFX12-GISEL: ; %bb.0:
375 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
376 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
377 ; GFX12-GISEL-NEXT: ; return to shader part epilog
378 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
; amdgpu_ps function with a <2 x half> return type that calls
; llvm.amdgcn.raw.buffer.atomic.fadd.v2f16 with %voffset as the third operand.
; The expected output uses v1 with `offen` and th:TH_ATOMIC_RETURN, then waits
; on loadcnt before the epilog.
; NOTE(review): for this v2f16 intrinsic the -global-isel=1 checks expect
; buffer_atomic_pk_add_bf16, whereas the -global-isel=0 checks expect
; buffer_atomic_pk_add_f16. The assertions are autogenerated; confirm whether
; the -global-isel=1 selection is intended.
; (The return statement is not visible in this excerpt; presumably %ret is
; returned.)
382 define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
383 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2f16_ret:
384 ; GFX12-SDAG: ; %bb.0:
385 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
386 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
387 ; GFX12-SDAG-NEXT: ; return to shader part epilog
389 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret:
390 ; GFX12-GISEL: ; %bb.0:
391 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
392 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
393 ; GFX12-GISEL-NEXT: ; return to shader part epilog
394 %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
; amdgpu_ps function with a float return type that calls
; llvm.amdgcn.struct.buffer.atomic.fadd.v2f16 with %vindex and %voffset, then
; bitcasts the <2 x half> result to float. The expected output passes the
; index/offset pair in v[1:2] with `idxen offen` and th:TH_ATOMIC_RETURN.
; NOTE(review): for this v2f16 intrinsic the -global-isel=1 checks expect
; buffer_atomic_pk_add_bf16, whereas the -global-isel=0 checks expect
; buffer_atomic_pk_add_f16. The assertions are autogenerated; confirm whether
; the -global-isel=1 selection is intended.
; (The return statement is not visible in this excerpt; presumably %r is
; returned.)
398 define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
399 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_ret:
400 ; GFX12-SDAG: ; %bb.0:
401 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
402 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
403 ; GFX12-SDAG-NEXT: ; return to shader part epilog
405 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret:
406 ; GFX12-GISEL: ; %bb.0:
407 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
408 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
409 ; GFX12-GISEL-NEXT: ; return to shader part epilog
410 %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
411 %r = bitcast <2 x half> %orig to float
; amdgpu_ps function returning void that calls
; llvm.amdgcn.struct.buffer.atomic.fadd.v2f16 with %vindex and %voffset.
; The expected output passes the index/offset pair in v[1:2] with
; `idxen offen` and no TH_ATOMIC_RETURN.
; NOTE(review): for this v2f16 intrinsic the -global-isel=1 checks expect
; buffer_atomic_pk_add_bf16, whereas the -global-isel=0 checks expect
; buffer_atomic_pk_add_f16. The assertions are autogenerated; confirm whether
; the -global-isel=1 selection is intended.
415 define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
416 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2f16_noret:
417 ; GFX12-SDAG: ; %bb.0:
418 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
419 ; GFX12-SDAG-NEXT: s_nop 0
420 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
421 ; GFX12-SDAG-NEXT: s_endpgm
423 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
424 ; GFX12-GISEL: ; %bb.0:
425 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
426 ; GFX12-GISEL-NEXT: s_nop 0
427 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
428 ; GFX12-GISEL-NEXT: s_endpgm
429 %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
; amdgpu_ps function with a float return type that calls
; llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16 and stores the <2 x bfloat>
; result through a null pointer, which keeps the returned value live.
; Both run configurations expect buffer_atomic_pk_add_bf16 with
; th:TH_ATOMIC_RETURN, a flat_store_b32 of v0 to address 0 held in v[1:2],
; and v0 then set to 1.0 before the epilog.
; (The return statement is not visible in this excerpt; the `v0, 1.0` move
; suggests the function returns the constant 1.0.)
433 define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
434 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_ret:
435 ; GFX12-SDAG: ; %bb.0:
436 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
437 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
438 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
439 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
440 ; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0
441 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
442 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
443 ; GFX12-SDAG-NEXT: ; return to shader part epilog
445 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret:
446 ; GFX12-GISEL: ; %bb.0:
447 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
448 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
449 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
450 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
451 ; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0
452 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
453 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
454 ; GFX12-GISEL-NEXT: ; return to shader part epilog
455 %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
456 store <2 x bfloat> %orig, ptr null
; amdgpu_ps function returning void that calls
; llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16 with %vindex and %voffset.
; Both run configurations expect buffer_atomic_pk_add_bf16 with the
; index/offset pair in v[1:2], `idxen offen`, and no TH_ATOMIC_RETURN.
460 define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
461 ; GFX12-SDAG-LABEL: struct_buffer_atomic_add_v2bf16_noret:
462 ; GFX12-SDAG: ; %bb.0:
463 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
464 ; GFX12-SDAG-NEXT: s_nop 0
465 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
466 ; GFX12-SDAG-NEXT: s_endpgm
468 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret:
469 ; GFX12-GISEL: ; %bb.0:
470 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
471 ; GFX12-GISEL-NEXT: s_nop 0
472 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
473 ; GFX12-GISEL-NEXT: s_endpgm
474 %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
; amdgpu_ps function returning void that calls
; llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16 with %voffset as the third operand.
; Both run configurations expect buffer_atomic_pk_add_bf16 with v1 as the
; address operand, `offen`, and no TH_ATOMIC_RETURN.
478 define amdgpu_ps void @raw_buffer_atomic_add_v2bf16(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
479 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16:
480 ; GFX12-SDAG: ; %bb.0:
481 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
482 ; GFX12-SDAG-NEXT: s_nop 0
483 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
484 ; GFX12-SDAG-NEXT: s_endpgm
486 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16:
487 ; GFX12-GISEL: ; %bb.0:
488 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
489 ; GFX12-GISEL-NEXT: s_nop 0
490 ; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
491 ; GFX12-GISEL-NEXT: s_endpgm
492 %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
; amdgpu_ps function with a float return type that calls
; llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16 and stores the <2 x bfloat> result
; through a null pointer, which keeps the returned value live.
; Both run configurations expect buffer_atomic_pk_add_bf16 with `offen` and
; th:TH_ATOMIC_RETURN, a flat_store_b32 of v0 to address 0 held in v[1:2],
; and v0 then set to 1.0 before the epilog.
; (The return statement is not visible in this excerpt; the `v0, 1.0` move
; suggests the function returns the constant 1.0.)
496 define amdgpu_ps float @raw_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
497 ; GFX12-SDAG-LABEL: raw_buffer_atomic_add_v2bf16_ret:
498 ; GFX12-SDAG: ; %bb.0:
499 ; GFX12-SDAG-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
500 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
501 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
502 ; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
503 ; GFX12-SDAG-NEXT: flat_store_b32 v[1:2], v0
504 ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
505 ; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
506 ; GFX12-SDAG-NEXT: ; return to shader part epilog
508 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2bf16_ret:
509 ; GFX12-GISEL: ; %bb.0:
510 ; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
511 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
512 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
513 ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
514 ; GFX12-GISEL-NEXT: flat_store_b32 v[1:2], v0
515 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
516 ; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
517 ; GFX12-GISEL-NEXT: ; return to shader part epilog
518 %orig = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
519 store <2 x bfloat> %orig, ptr null