1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
; Returning `atomicrmw fadd` of float 4.0 on a global (addrspace(1)) pointer,
; seq_cst ordering with no syncscope. The result is stored so that the value
; returned by the atomic stays live.
;
; Expected lowering per the checks below: every tested target (gfx900, gfx908,
; gfx90a, gfx1010, gfx1100) expands the operation into a global_atomic_cmpswap
; retry loop (%atomicrmw.start) rather than a native float-add atomic.
; In the expected code only the lane whose mbcnt value is 0 enters the loop,
; with the addend scaled by the number of active lanes (s_bcnt1 of the saved
; exec mask, times 4.0). After the loop each lane rebuilds its own result from
; the readfirstlane'd old value plus its mbcnt value times 4.0.
;
; NOTE(review): attribute group #0 is defined outside this excerpt - confirm
; what it sets before relying on it.
8 define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 {
9 ; GFX900-LABEL: global_atomic_fadd_ret_f32:
11 ; GFX900-NEXT: s_mov_b64 s[4:5], exec
12 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
13 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
15 ; GFX900-NEXT: ; implicit-def: $vgpr1
16 ; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
17 ; GFX900-NEXT: s_cbranch_execz .LBB0_4
18 ; GFX900-NEXT: ; %bb.1:
19 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
20 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
21 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
22 ; GFX900-NEXT: s_mov_b64 s[4:5], 0
23 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
24 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
25 ; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
26 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
27 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
28 ; GFX900-NEXT: v_mov_b32_e32 v1, s6
29 ; GFX900-NEXT: .LBB0_2: ; %atomicrmw.start
30 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
31 ; GFX900-NEXT: v_mov_b32_e32 v5, v1
32 ; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
33 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
34 ; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
35 ; GFX900-NEXT: s_waitcnt vmcnt(0)
36 ; GFX900-NEXT: buffer_wbinvl1_vol
37 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
38 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
39 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
40 ; GFX900-NEXT: s_cbranch_execnz .LBB0_2
41 ; GFX900-NEXT: ; %bb.3: ; %Flow
42 ; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
43 ; GFX900-NEXT: .LBB0_4: ; %Flow1
44 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
45 ; GFX900-NEXT: v_readfirstlane_b32 s0, v1
46 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
47 ; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0
48 ; GFX900-NEXT: global_store_dword v[0:1], v0, off
49 ; GFX900-NEXT: s_endpgm
51 ; GFX908-LABEL: global_atomic_fadd_ret_f32:
53 ; GFX908-NEXT: s_mov_b64 s[4:5], exec
54 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
55 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
56 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
57 ; GFX908-NEXT: ; implicit-def: $vgpr1
58 ; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
59 ; GFX908-NEXT: s_cbranch_execz .LBB0_4
60 ; GFX908-NEXT: ; %bb.1:
61 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
62 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
63 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
64 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
65 ; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
66 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
67 ; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
68 ; GFX908-NEXT: v_mov_b32_e32 v3, 0
69 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX908-NEXT: v_mov_b32_e32 v1, s6
71 ; GFX908-NEXT: .LBB0_2: ; %atomicrmw.start
72 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
73 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
74 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
75 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
76 ; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
77 ; GFX908-NEXT: s_waitcnt vmcnt(0)
78 ; GFX908-NEXT: buffer_wbinvl1_vol
79 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
80 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
81 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
82 ; GFX908-NEXT: s_cbranch_execnz .LBB0_2
83 ; GFX908-NEXT: ; %bb.3: ; %Flow
84 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
85 ; GFX908-NEXT: .LBB0_4: ; %Flow1
86 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
87 ; GFX908-NEXT: v_readfirstlane_b32 s0, v1
88 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
89 ; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0
90 ; GFX908-NEXT: global_store_dword v[0:1], v0, off
91 ; GFX908-NEXT: s_endpgm
93 ; GFX90A-LABEL: global_atomic_fadd_ret_f32:
95 ; GFX90A-NEXT: s_mov_b64 s[4:5], exec
96 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
97 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
98 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
99 ; GFX90A-NEXT: ; implicit-def: $vgpr1
100 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
101 ; GFX90A-NEXT: s_cbranch_execz .LBB0_4
102 ; GFX90A-NEXT: ; %bb.1:
103 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
104 ; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
105 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
106 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
107 ; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v1
108 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x0
110 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0
111 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
112 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6
113 ; GFX90A-NEXT: .LBB0_2: ; %atomicrmw.start
114 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
115 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1
116 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
117 ; GFX90A-NEXT: buffer_wbl2
118 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
119 ; GFX90A-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
120 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
121 ; GFX90A-NEXT: buffer_invl2
122 ; GFX90A-NEXT: buffer_wbinvl1_vol
123 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
124 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
125 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
126 ; GFX90A-NEXT: s_cbranch_execnz .LBB0_2
127 ; GFX90A-NEXT: ; %bb.3: ; %Flow
128 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
129 ; GFX90A-NEXT: .LBB0_4: ; %Flow1
130 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
131 ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
132 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
133 ; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0
134 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off
135 ; GFX90A-NEXT: s_endpgm
137 ; GFX10-LABEL: global_atomic_fadd_ret_f32:
139 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
140 ; GFX10-NEXT: s_mov_b32 s3, 0
141 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
142 ; GFX10-NEXT: ; implicit-def: $vgpr1
143 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
144 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
145 ; GFX10-NEXT: s_cbranch_execz .LBB0_4
146 ; GFX10-NEXT: ; %bb.1:
147 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
148 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
149 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
150 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
151 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
152 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
153 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
154 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
155 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
156 ; GFX10-NEXT: .LBB0_2: ; %atomicrmw.start
157 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
158 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
159 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
160 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
161 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
162 ; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
163 ; GFX10-NEXT: s_waitcnt vmcnt(0)
164 ; GFX10-NEXT: buffer_gl0_inv
165 ; GFX10-NEXT: buffer_gl1_inv
166 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
167 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
168 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
169 ; GFX10-NEXT: s_cbranch_execnz .LBB0_2
170 ; GFX10-NEXT: ; %bb.3: ; %Flow
171 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
172 ; GFX10-NEXT: .LBB0_4: ; %Flow1
173 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
174 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
175 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
176 ; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0
177 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
178 ; GFX10-NEXT: s_endpgm
180 ; GFX11-LABEL: global_atomic_fadd_ret_f32:
182 ; GFX11-NEXT: s_mov_b32 s4, exec_lo
183 ; GFX11-NEXT: s_mov_b32 s3, 0
184 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
185 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
186 ; GFX11-NEXT: ; implicit-def: $vgpr1
187 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
188 ; GFX11-NEXT: s_cbranch_execz .LBB0_4
189 ; GFX11-NEXT: ; %bb.1:
190 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
191 ; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4
192 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
193 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
194 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
195 ; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0
196 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
197 ; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
198 ; GFX11-NEXT: .LBB0_2: ; %atomicrmw.start
199 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
200 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
201 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
202 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
203 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
204 ; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
205 ; GFX11-NEXT: s_waitcnt vmcnt(0)
206 ; GFX11-NEXT: buffer_gl0_inv
207 ; GFX11-NEXT: buffer_gl1_inv
208 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
209 ; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
210 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
211 ; GFX11-NEXT: s_cbranch_execnz .LBB0_2
212 ; GFX11-NEXT: ; %bb.3: ; %Flow
213 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
214 ; GFX11-NEXT: .LBB0_4: ; %Flow1
215 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
216 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
217 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
218 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0
219 ; GFX11-NEXT: v_add_f32_e32 v0, s0, v0
220 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
221 ; GFX11-NEXT: s_nop 0
222 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
223 ; GFX11-NEXT: s_endpgm
224 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
225 store float %result, ptr addrspace(1) undef
; Returning `atomicrmw fadd` of float 4.0 on a global (addrspace(1)) pointer
; with syncscope("agent") seq_cst ordering, compiled under attribute group #2.
; The result is stored so that the value returned by the atomic stays live.
;
; Expected lowering per the checks below:
;   - gfx90a and gfx1100 select a returning global_atomic_add_f32 (glc).
;   - gfx900, gfx908 and gfx1010 fall back to a global_atomic_cmpswap retry
;     loop (%atomicrmw.start).
; On every target the per-lane result is rebuilt with a separate v_mul_f32
; followed by v_add_f32 on the readfirstlane'd old value.
;
; NOTE(review): attribute group #2 is defined outside this excerpt; going by
; the function name it presumably relates to IEEE mode - confirm.
229 define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr) #2 {
230 ; GFX900-LABEL: global_atomic_fadd_ret_f32_ieee:
232 ; GFX900-NEXT: s_mov_b64 s[4:5], exec
233 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
234 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
235 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
236 ; GFX900-NEXT: ; implicit-def: $vgpr1
237 ; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
238 ; GFX900-NEXT: s_cbranch_execz .LBB1_4
239 ; GFX900-NEXT: ; %bb.1:
240 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
241 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
242 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
243 ; GFX900-NEXT: s_mov_b64 s[4:5], 0
244 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
245 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
246 ; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
247 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
248 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
249 ; GFX900-NEXT: v_mov_b32_e32 v1, s6
250 ; GFX900-NEXT: .LBB1_2: ; %atomicrmw.start
251 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
252 ; GFX900-NEXT: v_mov_b32_e32 v5, v1
253 ; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
254 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
255 ; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
256 ; GFX900-NEXT: s_waitcnt vmcnt(0)
257 ; GFX900-NEXT: buffer_wbinvl1_vol
258 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
259 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
260 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
261 ; GFX900-NEXT: s_cbranch_execnz .LBB1_2
262 ; GFX900-NEXT: ; %bb.3: ; %Flow
263 ; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
264 ; GFX900-NEXT: .LBB1_4: ; %Flow1
265 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
266 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
267 ; GFX900-NEXT: v_readfirstlane_b32 s0, v1
268 ; GFX900-NEXT: v_mul_f32_e32 v0, 4.0, v0
269 ; GFX900-NEXT: v_add_f32_e32 v0, s0, v0
270 ; GFX900-NEXT: global_store_dword v[0:1], v0, off
271 ; GFX900-NEXT: s_endpgm
273 ; GFX908-LABEL: global_atomic_fadd_ret_f32_ieee:
275 ; GFX908-NEXT: s_mov_b64 s[4:5], exec
276 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
277 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
278 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
279 ; GFX908-NEXT: ; implicit-def: $vgpr1
280 ; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
281 ; GFX908-NEXT: s_cbranch_execz .LBB1_4
282 ; GFX908-NEXT: ; %bb.1:
283 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
284 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
285 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
286 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
287 ; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
288 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
289 ; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
290 ; GFX908-NEXT: v_mov_b32_e32 v3, 0
291 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
292 ; GFX908-NEXT: v_mov_b32_e32 v1, s6
293 ; GFX908-NEXT: .LBB1_2: ; %atomicrmw.start
294 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
295 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
296 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
297 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
298 ; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
299 ; GFX908-NEXT: s_waitcnt vmcnt(0)
300 ; GFX908-NEXT: buffer_wbinvl1_vol
301 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
302 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
303 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
304 ; GFX908-NEXT: s_cbranch_execnz .LBB1_2
305 ; GFX908-NEXT: ; %bb.3: ; %Flow
306 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
307 ; GFX908-NEXT: .LBB1_4: ; %Flow1
308 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
309 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
310 ; GFX908-NEXT: v_readfirstlane_b32 s0, v1
311 ; GFX908-NEXT: v_mul_f32_e32 v0, 4.0, v0
312 ; GFX908-NEXT: v_add_f32_e32 v0, s0, v0
313 ; GFX908-NEXT: global_store_dword v[0:1], v0, off
314 ; GFX908-NEXT: s_endpgm
316 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_ieee:
318 ; GFX90A-NEXT: s_mov_b64 s[4:5], exec
319 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
320 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
321 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
322 ; GFX90A-NEXT: ; implicit-def: $vgpr1
323 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
324 ; GFX90A-NEXT: s_cbranch_execz .LBB1_2
325 ; GFX90A-NEXT: ; %bb.1:
326 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
327 ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
328 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
329 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0
330 ; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v2
331 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
332 ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] glc
333 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
334 ; GFX90A-NEXT: buffer_wbinvl1_vol
335 ; GFX90A-NEXT: .LBB1_2:
336 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
337 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
338 ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
339 ; GFX90A-NEXT: v_mul_f32_e32 v0, 4.0, v0
340 ; GFX90A-NEXT: v_add_f32_e32 v0, s0, v0
341 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off
342 ; GFX90A-NEXT: s_endpgm
344 ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
346 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
347 ; GFX10-NEXT: s_mov_b32 s3, 0
348 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
349 ; GFX10-NEXT: ; implicit-def: $vgpr1
350 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
351 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
352 ; GFX10-NEXT: s_cbranch_execz .LBB1_4
353 ; GFX10-NEXT: ; %bb.1:
354 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
355 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
356 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
357 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
358 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
359 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
360 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
361 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
362 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
363 ; GFX10-NEXT: .LBB1_2: ; %atomicrmw.start
364 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
365 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
366 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
367 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
368 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
369 ; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
370 ; GFX10-NEXT: s_waitcnt vmcnt(0)
371 ; GFX10-NEXT: buffer_gl0_inv
372 ; GFX10-NEXT: buffer_gl1_inv
373 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
374 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
375 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
376 ; GFX10-NEXT: s_cbranch_execnz .LBB1_2
377 ; GFX10-NEXT: ; %bb.3: ; %Flow
378 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
379 ; GFX10-NEXT: .LBB1_4: ; %Flow1
380 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
381 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
382 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
383 ; GFX10-NEXT: v_mul_f32_e32 v0, 4.0, v0
384 ; GFX10-NEXT: v_add_f32_e32 v0, s0, v0
385 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
386 ; GFX10-NEXT: s_endpgm
388 ; GFX11-LABEL: global_atomic_fadd_ret_f32_ieee:
390 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
391 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
392 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
393 ; GFX11-NEXT: ; implicit-def: $vgpr1
394 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
395 ; GFX11-NEXT: s_cbranch_execz .LBB1_2
396 ; GFX11-NEXT: ; %bb.1:
397 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
398 ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3
399 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s3
400 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1
401 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
402 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
403 ; GFX11-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] glc
404 ; GFX11-NEXT: s_waitcnt vmcnt(0)
405 ; GFX11-NEXT: buffer_gl0_inv
406 ; GFX11-NEXT: buffer_gl1_inv
407 ; GFX11-NEXT: .LBB1_2:
408 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
409 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
410 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
411 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0
412 ; GFX11-NEXT: v_add_f32_e32 v0, s0, v0
413 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
414 ; GFX11-NEXT: s_nop 0
415 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
416 ; GFX11-NEXT: s_endpgm
417 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
418 store float %result, ptr addrspace(1) undef
; Non-returning `atomicrmw fadd` of float 4.0 on a global (addrspace(1))
; pointer with syncscope("agent") seq_cst ordering. %result has no visible
; use, and the expected code ends at s_endpgm without any store.
;
; Expected lowering per the checks below:
;   - gfx908, gfx90a and gfx1100 select global_atomic_add_f32 without glc
;     (no value returned).
;   - gfx900 and gfx1010 fall back to a global_atomic_cmpswap retry loop
;     (%atomicrmw.start).
; In the expected code only the lane whose mbcnt value is 0 issues the atomic,
; with the addend scaled by the number of active lanes (s_bcnt1 of the saved
; exec mask, times 4.0).
;
; NOTE(review): attribute group #0 is defined outside this excerpt - confirm
; what it sets before relying on it.
422 define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #0 {
423 ; GFX900-LABEL: global_atomic_fadd_noret_f32:
425 ; GFX900-NEXT: s_mov_b64 s[2:3], exec
426 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
427 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
428 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
429 ; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
430 ; GFX900-NEXT: s_cbranch_execz .LBB2_3
431 ; GFX900-NEXT: ; %bb.1:
432 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
433 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
434 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
435 ; GFX900-NEXT: s_mov_b64 s[2:3], 0
436 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0
437 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
438 ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
439 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
440 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
441 ; GFX900-NEXT: v_mov_b32_e32 v1, s4
442 ; GFX900-NEXT: .LBB2_2: ; %atomicrmw.start
443 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
444 ; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
445 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
446 ; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
447 ; GFX900-NEXT: s_waitcnt vmcnt(0)
448 ; GFX900-NEXT: buffer_wbinvl1_vol
449 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
450 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
451 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
452 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
453 ; GFX900-NEXT: s_cbranch_execnz .LBB2_2
454 ; GFX900-NEXT: .LBB2_3:
455 ; GFX900-NEXT: s_endpgm
457 ; GFX908-LABEL: global_atomic_fadd_noret_f32:
459 ; GFX908-NEXT: s_mov_b64 s[2:3], exec
460 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
461 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
462 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
463 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
464 ; GFX908-NEXT: s_cbranch_execz .LBB2_2
465 ; GFX908-NEXT: ; %bb.1:
466 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
467 ; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
468 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
469 ; GFX908-NEXT: v_mov_b32_e32 v0, 0
470 ; GFX908-NEXT: v_mul_f32_e32 v1, 4.0, v1
471 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
472 ; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
473 ; GFX908-NEXT: s_waitcnt vmcnt(0)
474 ; GFX908-NEXT: buffer_wbinvl1_vol
475 ; GFX908-NEXT: .LBB2_2:
476 ; GFX908-NEXT: s_endpgm
478 ; GFX90A-LABEL: global_atomic_fadd_noret_f32:
480 ; GFX90A-NEXT: s_mov_b64 s[2:3], exec
481 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
482 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
483 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
484 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
485 ; GFX90A-NEXT: s_cbranch_execz .LBB2_2
486 ; GFX90A-NEXT: ; %bb.1:
487 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
488 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
489 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
490 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
491 ; GFX90A-NEXT: v_mul_f32_e32 v1, 4.0, v1
492 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
493 ; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
494 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
495 ; GFX90A-NEXT: buffer_wbinvl1_vol
496 ; GFX90A-NEXT: .LBB2_2:
497 ; GFX90A-NEXT: s_endpgm
499 ; GFX10-LABEL: global_atomic_fadd_noret_f32:
501 ; GFX10-NEXT: s_mov_b32 s3, exec_lo
502 ; GFX10-NEXT: s_mov_b32 s2, 0
503 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
504 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
505 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
506 ; GFX10-NEXT: s_cbranch_execz .LBB2_3
507 ; GFX10-NEXT: ; %bb.1:
508 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
509 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
510 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
511 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
512 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0
513 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
514 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
515 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
516 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
517 ; GFX10-NEXT: .LBB2_2: ; %atomicrmw.start
518 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
519 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
520 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
521 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
522 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
523 ; GFX10-NEXT: s_waitcnt vmcnt(0)
524 ; GFX10-NEXT: buffer_gl0_inv
525 ; GFX10-NEXT: buffer_gl1_inv
526 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
527 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
528 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
529 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
530 ; GFX10-NEXT: s_cbranch_execnz .LBB2_2
531 ; GFX10-NEXT: .LBB2_3:
532 ; GFX10-NEXT: s_endpgm
534 ; GFX11-LABEL: global_atomic_fadd_noret_f32:
536 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
537 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
538 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
539 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
540 ; GFX11-NEXT: s_cbranch_execz .LBB2_2
541 ; GFX11-NEXT: ; %bb.1:
542 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
543 ; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2
544 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
545 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
546 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
547 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
548 ; GFX11-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
549 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
550 ; GFX11-NEXT: buffer_gl0_inv
551 ; GFX11-NEXT: buffer_gl1_inv
552 ; GFX11-NEXT: .LBB2_2:
553 ; GFX11-NEXT: s_endpgm
554 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
; Non-returning `atomicrmw fadd` of float 4.0 on a global (addrspace(1))
; pointer with syncscope("agent") seq_cst ordering, compiled under attribute
; group #2. %result has no visible use, and the expected code ends at s_endpgm
; without any store.
;
; Expected lowering per the checks below:
;   - gfx908, gfx90a and gfx1100 select global_atomic_add_f32 without glc
;     (no value returned).
;   - gfx900 and gfx1010 fall back to a global_atomic_cmpswap retry loop
;     (%atomicrmw.start).
; In the expected code only the lane whose mbcnt value is 0 issues the atomic,
; with the addend scaled by the number of active lanes (s_bcnt1 of the saved
; exec mask, times 4.0).
;
; NOTE(review): attribute group #2 is defined outside this excerpt; going by
; the function name it presumably relates to IEEE mode - confirm.
558 define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %ptr) #2 {
559 ; GFX900-LABEL: global_atomic_fadd_noret_f32_ieee:
561 ; GFX900-NEXT: s_mov_b64 s[2:3], exec
562 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
563 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
564 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
565 ; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
566 ; GFX900-NEXT: s_cbranch_execz .LBB3_3
567 ; GFX900-NEXT: ; %bb.1:
568 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
569 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
570 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
571 ; GFX900-NEXT: s_mov_b64 s[2:3], 0
572 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0
573 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
574 ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
575 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
576 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
577 ; GFX900-NEXT: v_mov_b32_e32 v1, s4
578 ; GFX900-NEXT: .LBB3_2: ; %atomicrmw.start
579 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
580 ; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
581 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
582 ; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
583 ; GFX900-NEXT: s_waitcnt vmcnt(0)
584 ; GFX900-NEXT: buffer_wbinvl1_vol
585 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
586 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
587 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
588 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
589 ; GFX900-NEXT: s_cbranch_execnz .LBB3_2
590 ; GFX900-NEXT: .LBB3_3:
591 ; GFX900-NEXT: s_endpgm
593 ; GFX908-LABEL: global_atomic_fadd_noret_f32_ieee:
595 ; GFX908-NEXT: s_mov_b64 s[2:3], exec
596 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
597 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
598 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
599 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
600 ; GFX908-NEXT: s_cbranch_execz .LBB3_2
601 ; GFX908-NEXT: ; %bb.1:
602 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
603 ; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
604 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
605 ; GFX908-NEXT: v_mov_b32_e32 v0, 0
606 ; GFX908-NEXT: v_mul_f32_e32 v1, 4.0, v1
607 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
608 ; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
609 ; GFX908-NEXT: s_waitcnt vmcnt(0)
610 ; GFX908-NEXT: buffer_wbinvl1_vol
611 ; GFX908-NEXT: .LBB3_2:
612 ; GFX908-NEXT: s_endpgm
614 ; GFX90A-LABEL: global_atomic_fadd_noret_f32_ieee:
616 ; GFX90A-NEXT: s_mov_b64 s[2:3], exec
617 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
618 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
619 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
620 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
621 ; GFX90A-NEXT: s_cbranch_execz .LBB3_2
622 ; GFX90A-NEXT: ; %bb.1:
623 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
624 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
625 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
626 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
627 ; GFX90A-NEXT: v_mul_f32_e32 v1, 4.0, v1
628 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
629 ; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
630 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
631 ; GFX90A-NEXT: buffer_wbinvl1_vol
632 ; GFX90A-NEXT: .LBB3_2:
633 ; GFX90A-NEXT: s_endpgm
635 ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
637 ; GFX10-NEXT: s_mov_b32 s3, exec_lo
638 ; GFX10-NEXT: s_mov_b32 s2, 0
639 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
640 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
641 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
642 ; GFX10-NEXT: s_cbranch_execz .LBB3_3
643 ; GFX10-NEXT: ; %bb.1:
644 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
645 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
646 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
647 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
648 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0
649 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
650 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
651 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
652 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
653 ; GFX10-NEXT: .LBB3_2: ; %atomicrmw.start
654 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
655 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
656 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
657 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
658 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
659 ; GFX10-NEXT: s_waitcnt vmcnt(0)
660 ; GFX10-NEXT: buffer_gl0_inv
661 ; GFX10-NEXT: buffer_gl1_inv
662 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
663 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
664 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
665 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
666 ; GFX10-NEXT: s_cbranch_execnz .LBB3_2
667 ; GFX10-NEXT: .LBB3_3:
668 ; GFX10-NEXT: s_endpgm
670 ; GFX11-LABEL: global_atomic_fadd_noret_f32_ieee:
672 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
673 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
674 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
675 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
676 ; GFX11-NEXT: s_cbranch_execz .LBB3_2
677 ; GFX11-NEXT: ; %bb.1:
678 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
679 ; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2
680 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
681 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
682 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
683 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
684 ; GFX11-NEXT: global_atomic_add_f32 v1, v0, s[0:1]
685 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
686 ; GFX11-NEXT: buffer_gl0_inv
687 ; GFX11-NEXT: buffer_gl1_inv
688 ; GFX11-NEXT: .LBB3_2:
689 ; GFX11-NEXT: s_endpgm
690 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
694 define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %ptr) #0 {
695 ; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
697 ; GFX900-NEXT: s_mov_b64 s[4:5], exec
698 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
699 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
700 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
701 ; GFX900-NEXT: ; implicit-def: $vgpr1
702 ; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
703 ; GFX900-NEXT: s_cbranch_execz .LBB4_4
704 ; GFX900-NEXT: ; %bb.1:
705 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
706 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
707 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
708 ; GFX900-NEXT: s_mov_b64 s[4:5], 0
709 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
710 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
711 ; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
712 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
713 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
714 ; GFX900-NEXT: v_mov_b32_e32 v1, s6
715 ; GFX900-NEXT: .LBB4_2: ; %atomicrmw.start
716 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
717 ; GFX900-NEXT: v_mov_b32_e32 v5, v1
718 ; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
719 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
720 ; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
721 ; GFX900-NEXT: s_waitcnt vmcnt(0)
722 ; GFX900-NEXT: buffer_wbinvl1_vol
723 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
724 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
725 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
726 ; GFX900-NEXT: s_cbranch_execnz .LBB4_2
727 ; GFX900-NEXT: ; %bb.3: ; %Flow
728 ; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
729 ; GFX900-NEXT: .LBB4_4: ; %Flow1
730 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
731 ; GFX900-NEXT: v_readfirstlane_b32 s0, v1
732 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
733 ; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0
734 ; GFX900-NEXT: global_store_dword v[0:1], v0, off
735 ; GFX900-NEXT: s_endpgm
737 ; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
739 ; GFX908-NEXT: s_mov_b64 s[4:5], exec
740 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
741 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
742 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
743 ; GFX908-NEXT: ; implicit-def: $vgpr1
744 ; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
745 ; GFX908-NEXT: s_cbranch_execz .LBB4_4
746 ; GFX908-NEXT: ; %bb.1:
747 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
748 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
749 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
750 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
751 ; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
752 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
753 ; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
754 ; GFX908-NEXT: v_mov_b32_e32 v3, 0
755 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
756 ; GFX908-NEXT: v_mov_b32_e32 v1, s6
757 ; GFX908-NEXT: .LBB4_2: ; %atomicrmw.start
758 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
759 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
760 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
761 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
762 ; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
763 ; GFX908-NEXT: s_waitcnt vmcnt(0)
764 ; GFX908-NEXT: buffer_wbinvl1_vol
765 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
766 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
767 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
768 ; GFX908-NEXT: s_cbranch_execnz .LBB4_2
769 ; GFX908-NEXT: ; %bb.3: ; %Flow
770 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
771 ; GFX908-NEXT: .LBB4_4: ; %Flow1
772 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
773 ; GFX908-NEXT: v_readfirstlane_b32 s0, v1
774 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
775 ; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0
776 ; GFX908-NEXT: global_store_dword v[0:1], v0, off
777 ; GFX908-NEXT: s_endpgm
779 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
781 ; GFX90A-NEXT: s_mov_b64 s[4:5], exec
782 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
783 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
784 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
785 ; GFX90A-NEXT: ; implicit-def: $vgpr1
786 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
787 ; GFX90A-NEXT: s_cbranch_execz .LBB4_2
788 ; GFX90A-NEXT: ; %bb.1:
789 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
790 ; GFX90A-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
791 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v2, s4
792 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0
793 ; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v2
794 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
795 ; GFX90A-NEXT: global_atomic_add_f32 v1, v1, v2, s[0:1] glc
796 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
797 ; GFX90A-NEXT: buffer_wbinvl1_vol
798 ; GFX90A-NEXT: .LBB4_2:
799 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
800 ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
801 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
802 ; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0
803 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off
804 ; GFX90A-NEXT: s_endpgm
806 ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
808 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
809 ; GFX10-NEXT: s_mov_b32 s3, 0
810 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
811 ; GFX10-NEXT: ; implicit-def: $vgpr1
812 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
813 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
814 ; GFX10-NEXT: s_cbranch_execz .LBB4_4
815 ; GFX10-NEXT: ; %bb.1:
816 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
817 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
818 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
819 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
820 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
821 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
822 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
823 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
824 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
825 ; GFX10-NEXT: .LBB4_2: ; %atomicrmw.start
826 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
827 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
828 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
829 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
830 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
831 ; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
832 ; GFX10-NEXT: s_waitcnt vmcnt(0)
833 ; GFX10-NEXT: buffer_gl0_inv
834 ; GFX10-NEXT: buffer_gl1_inv
835 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
836 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
837 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
838 ; GFX10-NEXT: s_cbranch_execnz .LBB4_2
839 ; GFX10-NEXT: ; %bb.3: ; %Flow
840 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
841 ; GFX10-NEXT: .LBB4_4: ; %Flow1
842 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
843 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
844 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
845 ; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0
846 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
847 ; GFX10-NEXT: s_endpgm
849 ; GFX11-LABEL: global_atomic_fadd_ret_f32_agent:
851 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
852 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
853 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
854 ; GFX11-NEXT: ; implicit-def: $vgpr1
855 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
856 ; GFX11-NEXT: s_cbranch_execz .LBB4_2
857 ; GFX11-NEXT: ; %bb.1:
858 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
859 ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3
860 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s3
861 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1
862 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
863 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
864 ; GFX11-NEXT: global_atomic_add_f32 v1, v2, v1, s[0:1] glc
865 ; GFX11-NEXT: s_waitcnt vmcnt(0)
866 ; GFX11-NEXT: buffer_gl0_inv
867 ; GFX11-NEXT: buffer_gl1_inv
868 ; GFX11-NEXT: .LBB4_2:
869 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
870 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
871 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
872 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0
873 ; GFX11-NEXT: v_add_f32_e32 v0, s0, v0
874 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
875 ; GFX11-NEXT: s_nop 0
876 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
877 ; GFX11-NEXT: s_endpgm
878 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
879 store float %result, ptr addrspace(1) undef
; Returning f32 `atomicrmw fadd` of 4.0 on a global (addrspace(1)) pointer with
; syncscope("one-as") and seq_cst ordering; the old value is stored so the
; result is live. Uses attribute group #0.
; Every prefix below expects a global_atomic_cmpswap retry loop
; (%atomicrmw.start) instead of a native global_atomic_add_f32. The expected
; code counts the active lanes (s_bcnt1 on the saved exec mask), scales the
; count by 4.0, and lets only the lane whose mbcnt is 0 enter the loop.
; NOTE(review): the function name says "_system" while the scope string is
; "one-as" - presumably the system-scope, single-address-space variant; confirm
; against the AMDGPU memory model documentation.
883 define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %ptr) #0 {
884 ; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
886 ; GFX900-NEXT: s_mov_b64 s[4:5], exec
887 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
888 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
889 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
890 ; GFX900-NEXT: ; implicit-def: $vgpr1
891 ; GFX900-NEXT: s_and_saveexec_b64 s[2:3], vcc
892 ; GFX900-NEXT: s_cbranch_execz .LBB5_4
893 ; GFX900-NEXT: ; %bb.1:
894 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
895 ; GFX900-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
896 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
897 ; GFX900-NEXT: s_mov_b64 s[4:5], 0
898 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v1
899 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
900 ; GFX900-NEXT: s_load_dword s6, s[0:1], 0x0
901 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
902 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
903 ; GFX900-NEXT: v_mov_b32_e32 v1, s6
904 ; GFX900-NEXT: .LBB5_2: ; %atomicrmw.start
905 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
906 ; GFX900-NEXT: v_mov_b32_e32 v5, v1
907 ; GFX900-NEXT: v_add_f32_e32 v4, v5, v2
908 ; GFX900-NEXT: s_waitcnt vmcnt(0)
909 ; GFX900-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
910 ; GFX900-NEXT: s_waitcnt vmcnt(0)
911 ; GFX900-NEXT: buffer_wbinvl1_vol
912 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
913 ; GFX900-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
914 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[4:5]
915 ; GFX900-NEXT: s_cbranch_execnz .LBB5_2
916 ; GFX900-NEXT: ; %bb.3: ; %Flow
917 ; GFX900-NEXT: s_or_b64 exec, exec, s[4:5]
918 ; GFX900-NEXT: .LBB5_4: ; %Flow1
919 ; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
920 ; GFX900-NEXT: v_readfirstlane_b32 s0, v1
921 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
922 ; GFX900-NEXT: v_mad_f32 v0, v0, 4.0, s0
923 ; GFX900-NEXT: global_store_dword v[0:1], v0, off
924 ; GFX900-NEXT: s_endpgm
926 ; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
928 ; GFX908-NEXT: s_mov_b64 s[4:5], exec
929 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
930 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
931 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
932 ; GFX908-NEXT: ; implicit-def: $vgpr1
933 ; GFX908-NEXT: s_and_saveexec_b64 s[2:3], vcc
934 ; GFX908-NEXT: s_cbranch_execz .LBB5_4
935 ; GFX908-NEXT: ; %bb.1:
936 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
937 ; GFX908-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
938 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
939 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
940 ; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v1
941 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
942 ; GFX908-NEXT: s_load_dword s6, s[0:1], 0x0
943 ; GFX908-NEXT: v_mov_b32_e32 v3, 0
944 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
945 ; GFX908-NEXT: v_mov_b32_e32 v1, s6
946 ; GFX908-NEXT: .LBB5_2: ; %atomicrmw.start
947 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
948 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
949 ; GFX908-NEXT: v_add_f32_e32 v4, v5, v2
950 ; GFX908-NEXT: s_waitcnt vmcnt(0)
951 ; GFX908-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
952 ; GFX908-NEXT: s_waitcnt vmcnt(0)
953 ; GFX908-NEXT: buffer_wbinvl1_vol
954 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
955 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
956 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
957 ; GFX908-NEXT: s_cbranch_execnz .LBB5_2
958 ; GFX908-NEXT: ; %bb.3: ; %Flow
959 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
960 ; GFX908-NEXT: .LBB5_4: ; %Flow1
961 ; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
962 ; GFX908-NEXT: v_readfirstlane_b32 s0, v1
963 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
964 ; GFX908-NEXT: v_mad_f32 v0, v0, 4.0, s0
965 ; GFX908-NEXT: global_store_dword v[0:1], v0, off
966 ; GFX908-NEXT: s_endpgm
968 ; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
970 ; GFX90A-NEXT: s_mov_b64 s[4:5], exec
971 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
972 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
973 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
974 ; GFX90A-NEXT: ; implicit-def: $vgpr1
975 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
976 ; GFX90A-NEXT: s_cbranch_execz .LBB5_4
977 ; GFX90A-NEXT: ; %bb.1:
978 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
979 ; GFX90A-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
980 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
981 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
982 ; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v1
983 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
984 ; GFX90A-NEXT: s_load_dword s6, s[0:1], 0x0
985 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0
986 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
987 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6
988 ; GFX90A-NEXT: .LBB5_2: ; %atomicrmw.start
989 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
990 ; GFX90A-NEXT: v_mov_b32_e32 v5, v1
991 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
992 ; GFX90A-NEXT: buffer_wbl2
993 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
994 ; GFX90A-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
995 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
996 ; GFX90A-NEXT: buffer_invl2
997 ; GFX90A-NEXT: buffer_wbinvl1_vol
998 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
999 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1000 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
1001 ; GFX90A-NEXT: s_cbranch_execnz .LBB5_2
1002 ; GFX90A-NEXT: ; %bb.3: ; %Flow
1003 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1004 ; GFX90A-NEXT: .LBB5_4: ; %Flow1
1005 ; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
1006 ; GFX90A-NEXT: v_readfirstlane_b32 s0, v1
1007 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1008 ; GFX90A-NEXT: v_mad_f32 v0, v0, 4.0, s0
1009 ; GFX90A-NEXT: global_store_dword v[0:1], v0, off
1010 ; GFX90A-NEXT: s_endpgm
1012 ; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
1014 ; GFX10-NEXT: s_mov_b32 s4, exec_lo
1015 ; GFX10-NEXT: s_mov_b32 s3, 0
1016 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
1017 ; GFX10-NEXT: ; implicit-def: $vgpr1
1018 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1019 ; GFX10-NEXT: s_and_saveexec_b32 s2, vcc_lo
1020 ; GFX10-NEXT: s_cbranch_execz .LBB5_4
1021 ; GFX10-NEXT: ; %bb.1:
1022 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1023 ; GFX10-NEXT: s_bcnt1_i32_b32 s4, s4
1024 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
1025 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
1026 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v1
1027 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1028 ; GFX10-NEXT: s_load_dword s5, s[0:1], 0x0
1029 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1030 ; GFX10-NEXT: v_mov_b32_e32 v1, s5
1031 ; GFX10-NEXT: .LBB5_2: ; %atomicrmw.start
1032 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1033 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
1034 ; GFX10-NEXT: v_add_f32_e32 v4, v5, v2
1035 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1036 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1037 ; GFX10-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
1038 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1039 ; GFX10-NEXT: buffer_gl0_inv
1040 ; GFX10-NEXT: buffer_gl1_inv
1041 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
1042 ; GFX10-NEXT: s_or_b32 s3, vcc_lo, s3
1043 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
1044 ; GFX10-NEXT: s_cbranch_execnz .LBB5_2
1045 ; GFX10-NEXT: ; %bb.3: ; %Flow
1046 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s3
1047 ; GFX10-NEXT: .LBB5_4: ; %Flow1
1048 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s2
1049 ; GFX10-NEXT: v_readfirstlane_b32 s0, v1
1050 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1051 ; GFX10-NEXT: v_mad_f32 v0, v0, 4.0, s0
1052 ; GFX10-NEXT: global_store_dword v[0:1], v0, off
1053 ; GFX10-NEXT: s_endpgm
1055 ; GFX11-LABEL: global_atomic_fadd_ret_f32_system:
1057 ; GFX11-NEXT: s_mov_b32 s4, exec_lo
1058 ; GFX11-NEXT: s_mov_b32 s3, 0
1059 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
1060 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
1061 ; GFX11-NEXT: ; implicit-def: $vgpr1
1062 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
1063 ; GFX11-NEXT: s_cbranch_execz .LBB5_4
1064 ; GFX11-NEXT: ; %bb.1:
1065 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1066 ; GFX11-NEXT: s_bcnt1_i32_b32 s4, s4
1067 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
1068 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s4
1069 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1070 ; GFX11-NEXT: s_load_b32 s5, s[0:1], 0x0
1071 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1072 ; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
1073 ; GFX11-NEXT: .LBB5_2: ; %atomicrmw.start
1074 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1075 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
1076 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
1077 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1078 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1079 ; GFX11-NEXT: global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] glc
1080 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1081 ; GFX11-NEXT: buffer_gl0_inv
1082 ; GFX11-NEXT: buffer_gl1_inv
1083 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
1084 ; GFX11-NEXT: s_or_b32 s3, vcc_lo, s3
1085 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
1086 ; GFX11-NEXT: s_cbranch_execnz .LBB5_2
1087 ; GFX11-NEXT: ; %bb.3: ; %Flow
1088 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s3
1089 ; GFX11-NEXT: .LBB5_4: ; %Flow1
1090 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2
1091 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1092 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
1093 ; GFX11-NEXT: v_mul_f32_e32 v0, 4.0, v0
1094 ; GFX11-NEXT: v_add_f32_e32 v0, s0, v0
1095 ; GFX11-NEXT: global_store_b32 v[0:1], v0, off
1096 ; GFX11-NEXT: s_nop 0
1097 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1098 ; GFX11-NEXT: s_endpgm
; IR under test: the atomic's result feeds a store, so the returning form is required.
1099 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") seq_cst
1100 store float %result, ptr addrspace(1) undef
; Returning f32 `atomicrmw fadd` (agent scope, seq_cst) in a function that uses
; attribute group #1, which pins "target-cpu"="gfx803" and enables only
; "+atomic-fadd-no-rtn-insts". The result is stored, so the returning form is
; needed. Both check prefixes (GCN and GFX11) expect the same code: a
; global_atomic_cmpswap retry loop using the 64-bit exec mask.
; NOTE(review): the expectations are identical for every RUN line, presumably
; because the per-function target-cpu attribute takes precedence over -mcpu;
; confirm.
1104 define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
1105 ; GCN-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
1107 ; GCN-NEXT: s_mov_b64 s[4:5], exec
1108 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
1109 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
1110 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1111 ; GCN-NEXT: ; implicit-def: $vgpr1
1112 ; GCN-NEXT: s_and_saveexec_b64 s[2:3], vcc
1113 ; GCN-NEXT: s_cbranch_execz .LBB6_4
1114 ; GCN-NEXT: ; %bb.1:
1115 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1116 ; GCN-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
1117 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
1118 ; GCN-NEXT: s_mov_b64 s[4:5], 0
1119 ; GCN-NEXT: v_mul_f32_e32 v2, 4.0, v1
1120 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1121 ; GCN-NEXT: s_load_dword s6, s[0:1], 0x0
1122 ; GCN-NEXT: v_mov_b32_e32 v3, 0
1123 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
1124 ; GCN-NEXT: v_mov_b32_e32 v1, s6
1125 ; GCN-NEXT: .LBB6_2: ; %atomicrmw.start
1126 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
1127 ; GCN-NEXT: v_mov_b32_e32 v5, v1
1128 ; GCN-NEXT: v_add_f32_e32 v4, v5, v2
1129 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1130 ; GCN-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
1131 ; GCN-NEXT: s_waitcnt vmcnt(0)
1132 ; GCN-NEXT: buffer_wbinvl1_vol
1133 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
1134 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1135 ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5]
1136 ; GCN-NEXT: s_cbranch_execnz .LBB6_2
1137 ; GCN-NEXT: ; %bb.3: ; %Flow
1138 ; GCN-NEXT: s_or_b64 exec, exec, s[4:5]
1139 ; GCN-NEXT: .LBB6_4: ; %Flow1
1140 ; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
1141 ; GCN-NEXT: v_readfirstlane_b32 s0, v1
1142 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1143 ; GCN-NEXT: v_mad_f32 v0, v0, 4.0, s0
1144 ; GCN-NEXT: global_store_dword v[0:1], v0, off
1145 ; GCN-NEXT: s_endpgm
1147 ; GFX11-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
1149 ; GFX11-NEXT: s_mov_b64 s[4:5], exec
1150 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
1151 ; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
1152 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1153 ; GFX11-NEXT: ; implicit-def: $vgpr1
1154 ; GFX11-NEXT: s_and_saveexec_b64 s[2:3], vcc
1155 ; GFX11-NEXT: s_cbranch_execz .LBB6_4
1156 ; GFX11-NEXT: ; %bb.1:
1157 ; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1158 ; GFX11-NEXT: s_bcnt1_i32_b64 s7, s[4:5]
1159 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s7
1160 ; GFX11-NEXT: s_mov_b64 s[4:5], 0
1161 ; GFX11-NEXT: v_mul_f32_e32 v2, 4.0, v1
1162 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1163 ; GFX11-NEXT: s_load_dword s6, s[0:1], 0x0
1164 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
1165 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1166 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
1167 ; GFX11-NEXT: .LBB6_2: ; %atomicrmw.start
1168 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1169 ; GFX11-NEXT: v_mov_b32_e32 v5, v1
1170 ; GFX11-NEXT: v_add_f32_e32 v4, v5, v2
1171 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1172 ; GFX11-NEXT: global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
1173 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1174 ; GFX11-NEXT: buffer_wbinvl1_vol
1175 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
1176 ; GFX11-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1177 ; GFX11-NEXT: s_andn2_b64 exec, exec, s[4:5]
1178 ; GFX11-NEXT: s_cbranch_execnz .LBB6_2
1179 ; GFX11-NEXT: ; %bb.3: ; %Flow
1180 ; GFX11-NEXT: s_or_b64 exec, exec, s[4:5]
1181 ; GFX11-NEXT: .LBB6_4: ; %Flow1
1182 ; GFX11-NEXT: s_or_b64 exec, exec, s[2:3]
1183 ; GFX11-NEXT: v_readfirstlane_b32 s0, v1
1184 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
1185 ; GFX11-NEXT: v_mad_f32 v0, v0, 4.0, s0
1186 ; GFX11-NEXT: global_store_dword v[0:1], v0, off
1187 ; GFX11-NEXT: s_endpgm
1188 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
1189 store float %result, ptr addrspace(1) undef
; Non-returning counterpart of the wrong-subtarget test: same attribute group
; #1 (gfx803 plus "+atomic-fadd-no-rtn-insts"), same agent-scope seq_cst fadd
; of 4.0, but %result is never used. Both prefixes expect the no-return
; global_atomic_add_f32 (no glc bit) issued by a single lane with the value
; pre-scaled by the active-lane count, and no cmpswap loop.
1193 define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addrspace(1) %ptr) #1 {
1194 ; GCN-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
1196 ; GCN-NEXT: s_mov_b64 s[2:3], exec
1197 ; GCN-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1198 ; GCN-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1199 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1200 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc
1201 ; GCN-NEXT: s_cbranch_execz .LBB7_2
1202 ; GCN-NEXT: ; %bb.1:
1203 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1204 ; GCN-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1205 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
1206 ; GCN-NEXT: v_mov_b32_e32 v0, 0
1207 ; GCN-NEXT: v_mul_f32_e32 v1, 4.0, v1
1208 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1209 ; GCN-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
1210 ; GCN-NEXT: s_waitcnt vmcnt(0)
1211 ; GCN-NEXT: buffer_wbinvl1_vol
1212 ; GCN-NEXT: .LBB7_2:
1213 ; GCN-NEXT: s_endpgm
1215 ; GFX11-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
1217 ; GFX11-NEXT: s_mov_b64 s[2:3], exec
1218 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1219 ; GFX11-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1220 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1221 ; GFX11-NEXT: s_and_saveexec_b64 s[4:5], vcc
1222 ; GFX11-NEXT: s_cbranch_execz .LBB7_2
1223 ; GFX11-NEXT: ; %bb.1:
1224 ; GFX11-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1225 ; GFX11-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1226 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
1227 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1228 ; GFX11-NEXT: v_mul_f32_e32 v1, 4.0, v1
1229 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1230 ; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
1231 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1232 ; GFX11-NEXT: buffer_wbinvl1_vol
1233 ; GFX11-NEXT: .LBB7_2:
1234 ; GFX11-NEXT: s_endpgm
; %result is intentionally left unused so the no-return form can be selected.
1235 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
; Non-returning agent-scope seq_cst f32 fadd of 4.0 on a global pointer in a
; function that carries no attribute group, so "amdgpu-unsafe-fp-atomics" is
; not set on it. Every prefix (GFX900, GFX908, GFX90A, GFX10, GFX11) expects a
; global_atomic_cmpswap retry loop; none expects global_atomic_add_f32.
; Because the result is unused, the expected loop has no trailing
; v_readfirstlane / store sequence and falls straight through to s_endpgm.
1239 define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %ptr) {
1240 ; GFX900-LABEL: global_atomic_fadd_noret_f32_safe:
1242 ; GFX900-NEXT: s_mov_b64 s[2:3], exec
1243 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1244 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1245 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1246 ; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
1247 ; GFX900-NEXT: s_cbranch_execz .LBB8_3
1248 ; GFX900-NEXT: ; %bb.1:
1249 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1250 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
1251 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
1252 ; GFX900-NEXT: s_mov_b64 s[2:3], 0
1253 ; GFX900-NEXT: v_mul_f32_e32 v2, 4.0, v0
1254 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1255 ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
1256 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
1257 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1258 ; GFX900-NEXT: v_mov_b32_e32 v1, s4
1259 ; GFX900-NEXT: .LBB8_2: ; %atomicrmw.start
1260 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
1261 ; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
1262 ; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1263 ; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1264 ; GFX900-NEXT: s_waitcnt vmcnt(0)
1265 ; GFX900-NEXT: buffer_wbinvl1_vol
1266 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1267 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1268 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
1269 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
1270 ; GFX900-NEXT: s_cbranch_execnz .LBB8_2
1271 ; GFX900-NEXT: .LBB8_3:
1272 ; GFX900-NEXT: s_endpgm
1274 ; GFX908-LABEL: global_atomic_fadd_noret_f32_safe:
1276 ; GFX908-NEXT: s_mov_b64 s[2:3], exec
1277 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1278 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1279 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1280 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
1281 ; GFX908-NEXT: s_cbranch_execz .LBB8_3
1282 ; GFX908-NEXT: ; %bb.1:
1283 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1284 ; GFX908-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
1285 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
1286 ; GFX908-NEXT: s_mov_b64 s[2:3], 0
1287 ; GFX908-NEXT: v_mul_f32_e32 v2, 4.0, v0
1288 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
1289 ; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
1290 ; GFX908-NEXT: v_mov_b32_e32 v3, 0
1291 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
1292 ; GFX908-NEXT: v_mov_b32_e32 v1, s4
1293 ; GFX908-NEXT: .LBB8_2: ; %atomicrmw.start
1294 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
1295 ; GFX908-NEXT: v_add_f32_e32 v0, v1, v2
1296 ; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1297 ; GFX908-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1298 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1299 ; GFX908-NEXT: buffer_wbinvl1_vol
1300 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1301 ; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1302 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
1303 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
1304 ; GFX908-NEXT: s_cbranch_execnz .LBB8_2
1305 ; GFX908-NEXT: .LBB8_3:
1306 ; GFX908-NEXT: s_endpgm
1308 ; GFX90A-LABEL: global_atomic_fadd_noret_f32_safe:
1310 ; GFX90A-NEXT: s_mov_b64 s[2:3], exec
1311 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1312 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1313 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1314 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
1315 ; GFX90A-NEXT: s_cbranch_execz .LBB8_3
1316 ; GFX90A-NEXT: ; %bb.1:
1317 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1318 ; GFX90A-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
1319 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
1320 ; GFX90A-NEXT: s_mov_b64 s[2:3], 0
1321 ; GFX90A-NEXT: v_mul_f32_e32 v2, 4.0, v0
1322 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1323 ; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
1324 ; GFX90A-NEXT: v_mov_b32_e32 v3, 0
1325 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1326 ; GFX90A-NEXT: v_mov_b32_e32 v1, s4
1327 ; GFX90A-NEXT: .LBB8_2: ; %atomicrmw.start
1328 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1329 ; GFX90A-NEXT: v_add_f32_e32 v0, v1, v2
1330 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1331 ; GFX90A-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1332 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1333 ; GFX90A-NEXT: buffer_wbinvl1_vol
1334 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1335 ; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1336 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
1337 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
1338 ; GFX90A-NEXT: s_cbranch_execnz .LBB8_2
1339 ; GFX90A-NEXT: .LBB8_3:
1340 ; GFX90A-NEXT: s_endpgm
1342 ; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
1344 ; GFX10-NEXT: s_mov_b32 s3, exec_lo
1345 ; GFX10-NEXT: s_mov_b32 s2, 0
1346 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
1347 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1348 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
1349 ; GFX10-NEXT: s_cbranch_execz .LBB8_3
1350 ; GFX10-NEXT: ; %bb.1:
1351 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1352 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
1353 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
1354 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
1355 ; GFX10-NEXT: v_mul_f32_e32 v2, 4.0, v0
1356 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1357 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
1358 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1359 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
1360 ; GFX10-NEXT: .LBB8_2: ; %atomicrmw.start
1361 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1362 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
1363 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1364 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1365 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1366 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1367 ; GFX10-NEXT: buffer_gl0_inv
1368 ; GFX10-NEXT: buffer_gl1_inv
1369 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1370 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1371 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
1372 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1373 ; GFX10-NEXT: s_cbranch_execnz .LBB8_2
1374 ; GFX10-NEXT: .LBB8_3:
1375 ; GFX10-NEXT: s_endpgm
1377 ; GFX11-LABEL: global_atomic_fadd_noret_f32_safe:
1379 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
1380 ; GFX11-NEXT: s_mov_b32 s2, 0
1381 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
1382 ; GFX11-NEXT: s_mov_b32 s4, exec_lo
1383 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
1384 ; GFX11-NEXT: s_cbranch_execz .LBB8_3
1385 ; GFX11-NEXT: ; %bb.1:
1386 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1387 ; GFX11-NEXT: s_bcnt1_i32_b32 s3, s3
1388 ; GFX11-NEXT: v_mov_b32_e32 v3, 0
1389 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
1390 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1391 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0
1392 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1393 ; GFX11-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
1394 ; GFX11-NEXT: .LBB8_2: ; %atomicrmw.start
1395 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1396 ; GFX11-NEXT: v_add_f32_e32 v0, v1, v2
1397 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1398 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1399 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1400 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1401 ; GFX11-NEXT: buffer_gl0_inv
1402 ; GFX11-NEXT: buffer_gl1_inv
1403 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1404 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
1405 ; GFX11-NEXT: s_or_b32 s2, vcc_lo, s2
1406 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
1407 ; GFX11-NEXT: s_cbranch_execnz .LBB8_2
1408 ; GFX11-NEXT: .LBB8_3:
1409 ; GFX11-NEXT: s_endpgm
1410 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
; The kernel argument is a constant-address-space (addrspace(4)) pointer from
; which a generic `ptr` is loaded; the f32 fadd of 1.0 is then performed on
; that generic pointer with syncscope("agent-one-as") and monotonic ordering.
; Every prefix expects a global_* instruction for the atomic even though the
; IR pointer carries no address space: a global_atomic_cmpswap loop on GFX900
; and GFX10, and a no-return global_atomic_add_f32 on GFX908, GFX90A and GFX11.
; None of the expected sequences contains a cache invalidate after the atomic.
; NOTE(review): the test name suggests this verifies that address-space
; inference runs before atomic expansion - confirm against the pass pipeline.
1414 define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
1415 ; GFX900-LABEL: infer_as_before_atomic:
1417 ; GFX900-NEXT: s_mov_b64 s[2:3], exec
1418 ; GFX900-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1419 ; GFX900-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1420 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1421 ; GFX900-NEXT: s_and_saveexec_b64 s[4:5], vcc
1422 ; GFX900-NEXT: s_cbranch_execz .LBB9_3
1423 ; GFX900-NEXT: ; %bb.1:
1424 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1425 ; GFX900-NEXT: s_bcnt1_i32_b64 s5, s[2:3]
1426 ; GFX900-NEXT: s_mov_b64 s[2:3], 0
1427 ; GFX900-NEXT: v_cvt_f32_ubyte0_e32 v2, s5
1428 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
1429 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1430 ; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1431 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1432 ; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
1433 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
1434 ; GFX900-NEXT: v_mov_b32_e32 v1, s4
1435 ; GFX900-NEXT: .LBB9_2: ; %atomicrmw.start
1436 ; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
1437 ; GFX900-NEXT: v_add_f32_e32 v0, v1, v2
1438 ; GFX900-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1439 ; GFX900-NEXT: s_waitcnt vmcnt(0)
1440 ; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1441 ; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1442 ; GFX900-NEXT: v_mov_b32_e32 v1, v0
1443 ; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
1444 ; GFX900-NEXT: s_cbranch_execnz .LBB9_2
1445 ; GFX900-NEXT: .LBB9_3:
1446 ; GFX900-NEXT: s_endpgm
1448 ; GFX908-LABEL: infer_as_before_atomic:
1450 ; GFX908-NEXT: s_mov_b64 s[2:3], exec
1451 ; GFX908-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1452 ; GFX908-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1453 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1454 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], vcc
1455 ; GFX908-NEXT: s_cbranch_execz .LBB9_2
1456 ; GFX908-NEXT: ; %bb.1:
1457 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1458 ; GFX908-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1459 ; GFX908-NEXT: v_mov_b32_e32 v0, 0
1460 ; GFX908-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
1461 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
1462 ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1463 ; GFX908-NEXT: s_waitcnt lgkmcnt(0)
1464 ; GFX908-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
1465 ; GFX908-NEXT: .LBB9_2:
1466 ; GFX908-NEXT: s_endpgm
1468 ; GFX90A-LABEL: infer_as_before_atomic:
1470 ; GFX90A-NEXT: s_mov_b64 s[2:3], exec
1471 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1472 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1473 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1474 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
1475 ; GFX90A-NEXT: s_cbranch_execz .LBB9_2
1476 ; GFX90A-NEXT: ; %bb.1:
1477 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1478 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1479 ; GFX90A-NEXT: v_mov_b32_e32 v0, 0
1480 ; GFX90A-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
1481 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1482 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1483 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1484 ; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
1485 ; GFX90A-NEXT: .LBB9_2:
1486 ; GFX90A-NEXT: s_endpgm
1488 ; GFX10-LABEL: infer_as_before_atomic:
1490 ; GFX10-NEXT: s_mov_b32 s3, exec_lo
1491 ; GFX10-NEXT: s_mov_b32 s2, 0
1492 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
1493 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1494 ; GFX10-NEXT: s_and_saveexec_b32 s4, vcc_lo
1495 ; GFX10-NEXT: s_cbranch_execz .LBB9_3
1496 ; GFX10-NEXT: ; %bb.1:
1497 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1498 ; GFX10-NEXT: s_bcnt1_i32_b32 s3, s3
1499 ; GFX10-NEXT: v_mov_b32_e32 v3, 0
1500 ; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v2, s3
1501 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1502 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
1503 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1504 ; GFX10-NEXT: s_load_dword s4, s[0:1], 0x0
1505 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1506 ; GFX10-NEXT: v_mov_b32_e32 v1, s4
1507 ; GFX10-NEXT: .LBB9_2: ; %atomicrmw.start
1508 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1509 ; GFX10-NEXT: v_add_f32_e32 v0, v1, v2
1510 ; GFX10-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1511 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1512 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1513 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
1514 ; GFX10-NEXT: s_or_b32 s2, vcc_lo, s2
1515 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1516 ; GFX10-NEXT: s_cbranch_execnz .LBB9_2
1517 ; GFX10-NEXT: .LBB9_3:
1518 ; GFX10-NEXT: s_endpgm
1520 ; GFX11-LABEL: infer_as_before_atomic:
1522 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
1523 ; GFX11-NEXT: s_mov_b32 s3, exec_lo
1524 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1525 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
1526 ; GFX11-NEXT: s_cbranch_execz .LBB9_2
1527 ; GFX11-NEXT: ; %bb.1:
1528 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1529 ; GFX11-NEXT: s_bcnt1_i32_b32 s2, s2
1530 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
1531 ; GFX11-NEXT: v_cvt_f32_ubyte0_e32 v1, s2
1532 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1533 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
1534 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1535 ; GFX11-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
1536 ; GFX11-NEXT: .LBB9_2:
1537 ; GFX11-NEXT: s_nop 0
1538 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
1539 ; GFX11-NEXT: s_endpgm
; The pointer operand of the atomic is the generic `ptr` loaded from %arg; %v is unused.
1540 %load = load ptr, ptr addrspace(4) %arg
1541 %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
; Attribute groups referenced by the kernels in this file.
; #0: f32 denormal mode "preserve-sign,preserve-sign" plus
;     "amdgpu-unsafe-fp-atomics"="true".
; #1: same as #0, and additionally pins "target-cpu"="gfx803" with only
;     "+atomic-fadd-no-rtn-insts" enabled (used by the *_wrong_subtarget tests).
; #2: only "amdgpu-unsafe-fp-atomics"="true".
;     NOTE(review): no define in this part of the file references #2 - check
;     whether it is used elsewhere or is left over.
1545 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
1546 attributes #1 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "target-cpu"="gfx803" "target-features"="+atomic-fadd-no-rtn-insts" "amdgpu-unsafe-fp-atomics"="true" }
1547 attributes #2 = { "amdgpu-unsafe-fp-atomics"="true" }