; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX1200 %s

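; Natural mapping: SGPR rsrc and soffset, VGPR value, vindex and voffset.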
define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b32 s11, s17
; GFX90A-NEXT:    s_mov_b32 s10, s16
; GFX90A-NEXT:    s_mov_b32 s9, s7
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret float %ret
}

; Natural mapping, no voffset
define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 inreg %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    s_mov_b32 s11, s17
; GFX90A-NEXT:    s_mov_b32 s10, s16
; GFX90A-NEXT:    s_mov_b32 s9, s7
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v1, s[8:11], s18 idxen glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__0_voffset__sgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v1, s[0:3], s6 idxen th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 0, i32 %soffset, i32 0)
  ret float %ret
}

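; Natural mapping, slc cachepolicy bit set (nt on gfx940, TH_ATOMIC_NT on gfx12)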
define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc(float %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b32 s11, s17
; GFX90A-NEXT:    s_mov_b32 s10, s16
; GFX90A-NEXT:    s_mov_b32 s9, s7
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[8:11], s18 idxen offen glc slc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[2:3], s[0:3], s6 idxen offen sc0 nt
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset_slc:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_NT_RETURN
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 2)
  ret float %ret
}

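; Natural mapping, packed <2 x half> operand (buffer_atomic_pk_add_f16)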
define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset(<2 x half> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b32 s11, s17
; GFX90A-NEXT:    s_mov_b32 s10, s16
; GFX90A-NEXT:    s_mov_b32 s9, s7
; GFX90A-NEXT:    s_mov_b32 s8, s6
; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v[2:3], s[8:11], s18 idxen offen glc
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v[2:3], s[0:3], s6 idxen offen sc0
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__sgpr_rsrc__vgpr_voffset__sgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s6 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret <2 x half> %ret
}

; Test waterfall loop
define float @struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
; GFX90A-NEXT:    v_mov_b32_e32 v8, v5
; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
; GFX90A-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_readfirstlane_b32 s8, v2
; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
; GFX90A-NEXT:    v_readfirstlane_b32 s10, v4
; GFX90A-NEXT:    v_readfirstlane_b32 s11, v5
; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
; GFX90A-NEXT:    v_readfirstlane_b32 s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_atomic_add_f32 v0, v[8:9], s[8:11], s12 idxen offen glc
; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT:    ; implicit-def: $vgpr7
; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
; GFX90A-NEXT:  ; %bb.2:
; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v9, v6
; GFX940-NEXT:    v_mov_b32_e32 v8, v5
; GFX940-NEXT:    v_mov_b32_e32 v5, v4
; GFX940-NEXT:    v_mov_b32_e32 v4, v3
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    s_mov_b64 s[2:3], exec
; GFX940-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    v_readfirstlane_b32 s4, v2
; GFX940-NEXT:    v_readfirstlane_b32 s5, v3
; GFX940-NEXT:    v_readfirstlane_b32 s6, v4
; GFX940-NEXT:    v_readfirstlane_b32 s7, v5
; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; GFX940-NEXT:    v_readfirstlane_b32 s8, v7
; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[4:5]
; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
; GFX940-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    buffer_atomic_add_f32 v0, v[8:9], s[4:7], s8 idxen offen sc0
; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT:    ; implicit-def: $vgpr7
; GFX940-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
; GFX940-NEXT:  ; %bb.2:
; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_f32_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_mov_b32 s2, exec_lo
; GFX1200-NEXT:  .LBB4_1: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1200-NEXT:    v_readfirstlane_b32 s5, v2
; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
; GFX1200-NEXT:    v_readfirstlane_b32 s3, v7
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-NEXT:    v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT:    s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1200-NEXT:    s_and_b32 s0, s0, s1
; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    buffer_atomic_add_f32 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT:    ; implicit-def: $vgpr7
; GFX1200-NEXT:    ; implicit-def: $vgpr5_vgpr6
; GFX1200-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    s_cbranch_execnz .LBB4_1
; GFX1200-NEXT:  ; %bb.2:
; GFX1200-NEXT:    s_mov_b32 exec_lo, s2
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret float %ret
}

; Test waterfall loop
define <2 x half> @struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset) #0 {
; GFX90A-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX90A:       ; %bb.0:
; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT:    v_mov_b32_e32 v9, v6
; GFX90A-NEXT:    v_mov_b32_e32 v8, v5
; GFX90A-NEXT:    v_mov_b32_e32 v5, v4
; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
; GFX90A-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT:    v_readfirstlane_b32 s8, v2
; GFX90A-NEXT:    v_readfirstlane_b32 s9, v3
; GFX90A-NEXT:    v_readfirstlane_b32 s10, v4
; GFX90A-NEXT:    v_readfirstlane_b32 s11, v5
; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[2:3]
; GFX90A-NEXT:    v_cmp_eq_u64_e64 s[4:5], s[10:11], v[4:5]
; GFX90A-NEXT:    v_readfirstlane_b32 s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s12, v7
; GFX90A-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], s[4:5]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    buffer_atomic_pk_add_f16 v0, v[8:9], s[8:11], s12 idxen offen glc
; GFX90A-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX90A-NEXT:    ; implicit-def: $vgpr7
; GFX90A-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX90A-NEXT:    s_xor_b64 exec, exec, s[4:5]
; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
; GFX90A-NEXT:  ; %bb.2:
; GFX90A-NEXT:    s_mov_b64 exec, s[6:7]
; GFX90A-NEXT:    s_waitcnt vmcnt(0)
; GFX90A-NEXT:    s_setpc_b64 s[30:31]
;
; GFX940-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX940:       ; %bb.0:
; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT:    v_mov_b32_e32 v9, v6
; GFX940-NEXT:    v_mov_b32_e32 v8, v5
; GFX940-NEXT:    v_mov_b32_e32 v5, v4
; GFX940-NEXT:    v_mov_b32_e32 v4, v3
; GFX940-NEXT:    v_mov_b32_e32 v3, v2
; GFX940-NEXT:    v_mov_b32_e32 v2, v1
; GFX940-NEXT:    s_mov_b64 s[2:3], exec
; GFX940-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT:    v_readfirstlane_b32 s4, v2
; GFX940-NEXT:    v_readfirstlane_b32 s5, v3
; GFX940-NEXT:    v_readfirstlane_b32 s6, v4
; GFX940-NEXT:    v_readfirstlane_b32 s7, v5
; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, s[4:5], v[2:3]
; GFX940-NEXT:    v_readfirstlane_b32 s8, v7
; GFX940-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[6:7], v[4:5]
; GFX940-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v7
; GFX940-NEXT:    s_and_b64 s[0:1], s[0:1], vcc
; GFX940-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    buffer_atomic_pk_add_f16 v0, v[8:9], s[4:7], s8 idxen offen sc0
; GFX940-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
; GFX940-NEXT:    ; implicit-def: $vgpr7
; GFX940-NEXT:    ; implicit-def: $vgpr8_vgpr9
; GFX940-NEXT:    s_xor_b64 exec, exec, s[0:1]
; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
; GFX940-NEXT:  ; %bb.2:
; GFX940-NEXT:    s_mov_b64 exec, s[2:3]
; GFX940-NEXT:    s_waitcnt vmcnt(0)
; GFX940-NEXT:    s_setpc_b64 s[30:31]
;
; GFX1200-LABEL: struct_ptr_buffer_atomic_add_v2f16_rtn__vgpr_val__vgpr_rsrc__vgpr_voffset__vgpr_soffset:
; GFX1200:       ; %bb.0:
; GFX1200-NEXT:    s_wait_loadcnt_dscnt 0x0
; GFX1200-NEXT:    s_wait_expcnt 0x0
; GFX1200-NEXT:    s_wait_samplecnt 0x0
; GFX1200-NEXT:    s_wait_bvhcnt 0x0
; GFX1200-NEXT:    s_wait_kmcnt 0x0
; GFX1200-NEXT:    s_mov_b32 s2, exec_lo
; GFX1200-NEXT:  .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
; GFX1200-NEXT:    v_readfirstlane_b32 s5, v2
; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
; GFX1200-NEXT:    v_readfirstlane_b32 s3, v7
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1200-NEXT:    v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[1:2]
; GFX1200-NEXT:    v_cmp_eq_u64_e64 s0, s[6:7], v[3:4]
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX1200-NEXT:    v_cmp_eq_u32_e64 s1, s3, v7
; GFX1200-NEXT:    s_and_b32 s0, vcc_lo, s0
; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX1200-NEXT:    s_and_b32 s0, s0, s1
; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
; GFX1200-NEXT:    s_and_saveexec_b32 s0, s0
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    buffer_atomic_pk_add_f16 v0, v[5:6], s[4:7], s3 idxen offen th:TH_ATOMIC_RETURN
; GFX1200-NEXT:    ; implicit-def: $vgpr1_vgpr2_vgpr3_vgpr4
; GFX1200-NEXT:    ; implicit-def: $vgpr7
; GFX1200-NEXT:    ; implicit-def: $vgpr5_vgpr6
; GFX1200-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
; GFX1200-NEXT:    s_cbranch_execnz .LBB5_1
; GFX1200-NEXT:  ; %bb.2:
; GFX1200-NEXT:    s_mov_b32 exec_lo, s2
; GFX1200-NEXT:    s_wait_loadcnt 0x0
; GFX1200-NEXT:    s_setpc_b64 s[30:31]
  %ret = call <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
  ret <2 x half> %ret
}

declare float @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.f32(float, ptr addrspace(8), i32, i32, i32, i32 immarg)
declare <2 x half> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2f16(<2 x half>, ptr addrspace(8), i32, i32, i32, i32 immarg)

attributes #0 = { nounwind }