1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
3 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
4 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
5 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
6 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx90a < %s | FileCheck -check-prefix=GFX90A %s
7 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX908 %s
8 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
9 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=hawaii < %s | FileCheck -check-prefix=GFX7 %s
10 ; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=tahiti < %s | FileCheck -check-prefix=GFX6 %s
12 ; --------------------------------------------------------------------
; float (f32): agent-scope atomicrmw fmax through buffer fat pointers
14 ; --------------------------------------------------------------------
; Agent-scope, seq_cst `atomicrmw fmax` on an f32 addressed through an inreg
; buffer fat pointer (addrspace 7). The function's return type is float and
; the atomic's result is bound to %result.
; The GEP advances 256 floats; the assertions below show this as a 1024-byte
; immediate (offset:1024) or as 0x400 added to the offset register.
; Lowering recorded by the assertions, keyed by the -mcpu of each RUN line:
;   gfx1200, gfx1100, gfx1010, hawaii, tahiti
;       one buffer atomic f32 max carrying the return modifier
;       (th:TH_ATOMIC_RETURN on gfx1200, glc on the others).
;   gfx940, gfx90a, gfx908, tonga
;       a buffer_load_dword followed by a buffer_atomic_cmpswap retry loop
;       (the %atomicrmw.start / %atomicrmw.end blocks).
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
16 define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
17 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
19 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
20 ; GFX12-NEXT: s_wait_expcnt 0x0
21 ; GFX12-NEXT: s_wait_samplecnt 0x0
22 ; GFX12-NEXT: s_wait_bvhcnt 0x0
23 ; GFX12-NEXT: s_wait_kmcnt 0x0
24 ; GFX12-NEXT: v_mov_b32_e32 v1, s6
25 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
26 ; GFX12-NEXT: s_wait_storecnt 0x0
27 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
28 ; GFX12-NEXT: s_wait_loadcnt 0x0
29 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
30 ; GFX12-NEXT: s_setpc_b64 s[30:31]
32 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
34 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35 ; GFX940-NEXT: v_mov_b32_e32 v1, v0
36 ; GFX940-NEXT: v_mov_b32_e32 v0, s6
37 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
38 ; GFX940-NEXT: s_addk_i32 s6, 0x400
39 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
40 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
41 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
42 ; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
43 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
44 ; GFX940-NEXT: s_waitcnt vmcnt(0)
45 ; GFX940-NEXT: v_mov_b32_e32 v5, v0
46 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
47 ; GFX940-NEXT: v_max_f32_e32 v4, v0, v2
48 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
49 ; GFX940-NEXT: buffer_wbl2 sc1
50 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
51 ; GFX940-NEXT: s_waitcnt vmcnt(0)
52 ; GFX940-NEXT: buffer_inv sc1
53 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
54 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
55 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
56 ; GFX940-NEXT: s_cbranch_execnz .LBB0_1
57 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
58 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
59 ; GFX940-NEXT: s_setpc_b64 s[30:31]
61 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
63 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
64 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
65 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
66 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024 glc
67 ; GFX11-NEXT: s_waitcnt vmcnt(0)
68 ; GFX11-NEXT: buffer_gl1_inv
69 ; GFX11-NEXT: buffer_gl0_inv
70 ; GFX11-NEXT: s_setpc_b64 s[30:31]
72 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
74 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
75 ; GFX10-NEXT: v_mov_b32_e32 v1, s18
76 ; GFX10-NEXT: s_mov_b32 s11, s17
77 ; GFX10-NEXT: s_mov_b32 s10, s16
78 ; GFX10-NEXT: s_mov_b32 s9, s7
79 ; GFX10-NEXT: s_mov_b32 s8, s6
80 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
81 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
82 ; GFX10-NEXT: s_waitcnt vmcnt(0)
83 ; GFX10-NEXT: buffer_gl1_inv
84 ; GFX10-NEXT: buffer_gl0_inv
85 ; GFX10-NEXT: s_setpc_b64 s[30:31]
87 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
89 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
90 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
91 ; GFX90A-NEXT: s_mov_b32 s11, s17
92 ; GFX90A-NEXT: s_mov_b32 s10, s16
93 ; GFX90A-NEXT: s_mov_b32 s9, s7
94 ; GFX90A-NEXT: s_mov_b32 s8, s6
95 ; GFX90A-NEXT: v_mov_b32_e32 v0, s18
96 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
97 ; GFX90A-NEXT: s_add_i32 s6, s18, 0x400
98 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
99 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
100 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6
101 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
102 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
103 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
104 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
105 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
106 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
107 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
108 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
109 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
110 ; GFX90A-NEXT: buffer_wbinvl1
111 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
112 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
113 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
114 ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
115 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
116 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
117 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
119 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
121 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
122 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
123 ; GFX908-NEXT: s_mov_b32 s11, s17
124 ; GFX908-NEXT: s_mov_b32 s10, s16
125 ; GFX908-NEXT: s_mov_b32 s9, s7
126 ; GFX908-NEXT: s_mov_b32 s8, s6
127 ; GFX908-NEXT: v_mov_b32_e32 v0, s18
128 ; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
129 ; GFX908-NEXT: s_add_i32 s6, s18, 0x400
130 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
131 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
132 ; GFX908-NEXT: v_mov_b32_e32 v3, s6
133 ; GFX908-NEXT: .LBB0_1: ; %atomicrmw.start
134 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
135 ; GFX908-NEXT: s_waitcnt vmcnt(0)
136 ; GFX908-NEXT: v_mov_b32_e32 v5, v0
137 ; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
138 ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
139 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
140 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
141 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
142 ; GFX908-NEXT: s_waitcnt vmcnt(0)
143 ; GFX908-NEXT: buffer_wbinvl1
144 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
145 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
146 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
147 ; GFX908-NEXT: s_cbranch_execnz .LBB0_1
148 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
149 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
150 ; GFX908-NEXT: s_setpc_b64 s[30:31]
152 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
154 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
155 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
156 ; GFX8-NEXT: s_mov_b32 s11, s17
157 ; GFX8-NEXT: s_mov_b32 s10, s16
158 ; GFX8-NEXT: s_mov_b32 s9, s7
159 ; GFX8-NEXT: s_mov_b32 s8, s6
160 ; GFX8-NEXT: v_mov_b32_e32 v0, s18
161 ; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
162 ; GFX8-NEXT: s_add_i32 s6, s18, 0x400
163 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
164 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
165 ; GFX8-NEXT: v_mov_b32_e32 v3, s6
166 ; GFX8-NEXT: .LBB0_1: ; %atomicrmw.start
167 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
168 ; GFX8-NEXT: s_waitcnt vmcnt(0)
169 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
170 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
171 ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
172 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
173 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
174 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
175 ; GFX8-NEXT: s_waitcnt vmcnt(0)
176 ; GFX8-NEXT: buffer_wbinvl1
177 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
178 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
179 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
180 ; GFX8-NEXT: s_cbranch_execnz .LBB0_1
181 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
182 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
183 ; GFX8-NEXT: s_setpc_b64 s[30:31]
185 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
187 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
188 ; GFX7-NEXT: s_mov_b32 s11, s17
189 ; GFX7-NEXT: s_mov_b32 s10, s16
190 ; GFX7-NEXT: s_mov_b32 s9, s7
191 ; GFX7-NEXT: s_mov_b32 s8, s6
192 ; GFX7-NEXT: v_mov_b32_e32 v1, s18
193 ; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
194 ; GFX7-NEXT: s_waitcnt vmcnt(0)
195 ; GFX7-NEXT: buffer_wbinvl1
196 ; GFX7-NEXT: s_setpc_b64 s[30:31]
198 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset:
200 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
201 ; GFX6-NEXT: s_mov_b32 s11, s17
202 ; GFX6-NEXT: s_mov_b32 s10, s16
203 ; GFX6-NEXT: s_mov_b32 s9, s7
204 ; GFX6-NEXT: s_mov_b32 s8, s6
205 ; GFX6-NEXT: v_mov_b32_e32 v1, s18
206 ; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024 glc
207 ; GFX6-NEXT: s_waitcnt vmcnt(0)
208 ; GFX6-NEXT: buffer_wbinvl1
209 ; GFX6-NEXT: s_waitcnt expcnt(0)
210 ; GFX6-NEXT: s_setpc_b64 s[30:31]
211 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
212 %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
; Agent-scope, seq_cst `atomicrmw fmax` on an f32 addressed through an inreg
; buffer fat pointer (addrspace 7), in a void function: the atomic's result
; is bound to %unused.
; The GEP advances 256 floats; the assertions below show this as a 1024-byte
; immediate (offset:1024) or as 0x400 added to the offset register.
; Lowering recorded by the assertions, keyed by the -mcpu of each RUN line:
;   gfx1200, gfx1100, gfx1010, hawaii, tahiti
;       one buffer atomic f32 max with no return modifier
;       (neither glc nor th:TH_ATOMIC_RETURN appears on the instruction).
;   gfx940, gfx90a, gfx908, tonga
;       a buffer_load_dword followed by a buffer_atomic_cmpswap retry loop
;       (the %atomicrmw.start / %atomicrmw.end blocks).
; The check lines are autogenerated (see the NOTE on the first line of this
; file); regenerate them with that script rather than editing them by hand.
216 define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
217 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
219 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
220 ; GFX12-NEXT: s_wait_expcnt 0x0
221 ; GFX12-NEXT: s_wait_samplecnt 0x0
222 ; GFX12-NEXT: s_wait_bvhcnt 0x0
223 ; GFX12-NEXT: s_wait_kmcnt 0x0
224 ; GFX12-NEXT: v_mov_b32_e32 v1, s6
225 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
226 ; GFX12-NEXT: s_wait_storecnt 0x0
227 ; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen offset:1024
228 ; GFX12-NEXT: s_wait_storecnt 0x0
229 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
230 ; GFX12-NEXT: s_setpc_b64 s[30:31]
232 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
234 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
235 ; GFX940-NEXT: v_mov_b32_e32 v1, s6
236 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
237 ; GFX940-NEXT: s_addk_i32 s6, 0x400
238 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
239 ; GFX940-NEXT: v_max_f32_e32 v2, v0, v0
240 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
241 ; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
242 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
243 ; GFX940-NEXT: s_waitcnt vmcnt(0)
244 ; GFX940-NEXT: v_max_f32_e32 v0, v1, v1
245 ; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
246 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
247 ; GFX940-NEXT: buffer_wbl2 sc1
248 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
249 ; GFX940-NEXT: s_waitcnt vmcnt(0)
250 ; GFX940-NEXT: buffer_inv sc1
251 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
252 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
253 ; GFX940-NEXT: v_mov_b32_e32 v1, v4
254 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
255 ; GFX940-NEXT: s_cbranch_execnz .LBB1_1
256 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
257 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
258 ; GFX940-NEXT: s_setpc_b64 s[30:31]
260 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
262 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
264 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
265 ; GFX11-NEXT: buffer_atomic_max_f32 v0, v1, s[0:3], 0 offen offset:1024
266 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
267 ; GFX11-NEXT: buffer_gl1_inv
268 ; GFX11-NEXT: buffer_gl0_inv
269 ; GFX11-NEXT: s_setpc_b64 s[30:31]
271 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
273 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
274 ; GFX10-NEXT: v_mov_b32_e32 v1, s18
275 ; GFX10-NEXT: s_mov_b32 s11, s17
276 ; GFX10-NEXT: s_mov_b32 s10, s16
277 ; GFX10-NEXT: s_mov_b32 s9, s7
278 ; GFX10-NEXT: s_mov_b32 s8, s6
279 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
280 ; GFX10-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024
281 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
282 ; GFX10-NEXT: buffer_gl1_inv
283 ; GFX10-NEXT: buffer_gl0_inv
284 ; GFX10-NEXT: s_setpc_b64 s[30:31]
286 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
288 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
289 ; GFX90A-NEXT: s_mov_b32 s11, s17
290 ; GFX90A-NEXT: s_mov_b32 s10, s16
291 ; GFX90A-NEXT: s_mov_b32 s9, s7
292 ; GFX90A-NEXT: s_mov_b32 s8, s6
293 ; GFX90A-NEXT: v_mov_b32_e32 v1, s18
294 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
295 ; GFX90A-NEXT: s_add_i32 s6, s18, 0x400
296 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
297 ; GFX90A-NEXT: v_max_f32_e32 v2, v0, v0
298 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6
299 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
300 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
301 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
302 ; GFX90A-NEXT: v_max_f32_e32 v0, v1, v1
303 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
304 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
305 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc
306 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
307 ; GFX90A-NEXT: buffer_wbinvl1
308 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
309 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
310 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4
311 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
312 ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
313 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
314 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
315 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
317 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
319 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
320 ; GFX908-NEXT: s_mov_b32 s11, s17
321 ; GFX908-NEXT: s_mov_b32 s10, s16
322 ; GFX908-NEXT: s_mov_b32 s9, s7
323 ; GFX908-NEXT: s_mov_b32 s8, s6
324 ; GFX908-NEXT: v_mov_b32_e32 v1, s18
325 ; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
326 ; GFX908-NEXT: s_add_i32 s6, s18, 0x400
327 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
328 ; GFX908-NEXT: v_max_f32_e32 v2, v0, v0
329 ; GFX908-NEXT: v_mov_b32_e32 v3, s6
330 ; GFX908-NEXT: .LBB1_1: ; %atomicrmw.start
331 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
332 ; GFX908-NEXT: s_waitcnt vmcnt(0)
333 ; GFX908-NEXT: v_max_f32_e32 v0, v1, v1
334 ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
335 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
336 ; GFX908-NEXT: v_mov_b32_e32 v4, v0
337 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc
338 ; GFX908-NEXT: s_waitcnt vmcnt(0)
339 ; GFX908-NEXT: buffer_wbinvl1
340 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
341 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
342 ; GFX908-NEXT: v_mov_b32_e32 v1, v4
343 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
344 ; GFX908-NEXT: s_cbranch_execnz .LBB1_1
345 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
346 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
347 ; GFX908-NEXT: s_setpc_b64 s[30:31]
349 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
351 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
352 ; GFX8-NEXT: s_mov_b32 s11, s17
353 ; GFX8-NEXT: s_mov_b32 s10, s16
354 ; GFX8-NEXT: s_mov_b32 s9, s7
355 ; GFX8-NEXT: s_mov_b32 s8, s6
356 ; GFX8-NEXT: v_mov_b32_e32 v1, s18
357 ; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
358 ; GFX8-NEXT: s_add_i32 s6, s18, 0x400
359 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
360 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v0
361 ; GFX8-NEXT: v_mov_b32_e32 v3, s6
362 ; GFX8-NEXT: .LBB1_1: ; %atomicrmw.start
363 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
364 ; GFX8-NEXT: s_waitcnt vmcnt(0)
365 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v1
366 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
367 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
368 ; GFX8-NEXT: v_mov_b32_e32 v4, v0
369 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc
370 ; GFX8-NEXT: s_waitcnt vmcnt(0)
371 ; GFX8-NEXT: buffer_wbinvl1
372 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
373 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
374 ; GFX8-NEXT: v_mov_b32_e32 v1, v4
375 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
376 ; GFX8-NEXT: s_cbranch_execnz .LBB1_1
377 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
378 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
379 ; GFX8-NEXT: s_setpc_b64 s[30:31]
381 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
383 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
384 ; GFX7-NEXT: s_mov_b32 s11, s17
385 ; GFX7-NEXT: s_mov_b32 s10, s16
386 ; GFX7-NEXT: s_mov_b32 s9, s7
387 ; GFX7-NEXT: s_mov_b32 s8, s6
388 ; GFX7-NEXT: v_mov_b32_e32 v1, s18
389 ; GFX7-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024
390 ; GFX7-NEXT: s_waitcnt vmcnt(0)
391 ; GFX7-NEXT: buffer_wbinvl1
392 ; GFX7-NEXT: s_setpc_b64 s[30:31]
394 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__offset:
396 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
397 ; GFX6-NEXT: s_mov_b32 s11, s17
398 ; GFX6-NEXT: s_mov_b32 s10, s16
399 ; GFX6-NEXT: s_mov_b32 s9, s7
400 ; GFX6-NEXT: s_mov_b32 s8, s6
401 ; GFX6-NEXT: v_mov_b32_e32 v1, s18
402 ; GFX6-NEXT: buffer_atomic_fmax v0, v1, s[8:11], 0 offen offset:1024
403 ; GFX6-NEXT: s_waitcnt vmcnt(0)
404 ; GFX6-NEXT: buffer_wbinvl1
405 ; GFX6-NEXT: s_waitcnt expcnt(0)
406 ; GFX6-NEXT: s_setpc_b64 s[30:31]
407 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
408 %unused = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
412 define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall(ptr addrspace(7) %ptr, float %val) #0 {
413 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
415 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
416 ; GFX12-NEXT: s_wait_expcnt 0x0
417 ; GFX12-NEXT: s_wait_samplecnt 0x0
418 ; GFX12-NEXT: s_wait_bvhcnt 0x0
419 ; GFX12-NEXT: s_wait_kmcnt 0x0
420 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
421 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
422 ; GFX12-NEXT: s_wait_storecnt 0x0
423 ; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
424 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
425 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
426 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
427 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
428 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
429 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
430 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
431 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
432 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
433 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
434 ; GFX12-NEXT: s_wait_loadcnt 0x0
435 ; GFX12-NEXT: buffer_atomic_max_num_f32 v5, v4, s[4:7], null offen offset:1024 th:TH_ATOMIC_RETURN
436 ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
437 ; GFX12-NEXT: ; implicit-def: $vgpr4
438 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
439 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1
440 ; GFX12-NEXT: ; %bb.2:
441 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
442 ; GFX12-NEXT: s_wait_loadcnt 0x0
443 ; GFX12-NEXT: v_mov_b32_e32 v0, v5
444 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
445 ; GFX12-NEXT: s_setpc_b64 s[30:31]
447 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
449 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
450 ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
451 ; GFX940-NEXT: s_mov_b64 s[2:3], exec
452 ; GFX940-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
453 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
454 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
455 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
456 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
457 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
458 ; GFX940-NEXT: s_nop 0
459 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
460 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
461 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
462 ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
463 ; GFX940-NEXT: ; implicit-def: $vgpr4
464 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
465 ; GFX940-NEXT: s_cbranch_execnz .LBB2_1
466 ; GFX940-NEXT: ; %bb.2:
467 ; GFX940-NEXT: s_mov_b64 exec, s[2:3]
468 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
469 ; GFX940-NEXT: v_max_f32_e32 v9, v5, v5
470 ; GFX940-NEXT: .LBB2_3: ; %atomicrmw.start
471 ; GFX940-NEXT: ; =>This Loop Header: Depth=1
472 ; GFX940-NEXT: ; Child Loop BB2_4 Depth 2
473 ; GFX940-NEXT: s_waitcnt vmcnt(0)
474 ; GFX940-NEXT: v_max_f32_e32 v4, v7, v7
475 ; GFX940-NEXT: v_max_f32_e32 v6, v4, v9
476 ; GFX940-NEXT: s_mov_b64 s[8:9], exec
477 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
478 ; GFX940-NEXT: buffer_wbl2 sc1
479 ; GFX940-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
480 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
481 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
482 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
483 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
484 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
485 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
486 ; GFX940-NEXT: s_nop 0
487 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
488 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
489 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
490 ; GFX940-NEXT: s_waitcnt vmcnt(0)
491 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
492 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
493 ; GFX940-NEXT: s_cbranch_execnz .LBB2_4
494 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
495 ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
496 ; GFX940-NEXT: s_waitcnt vmcnt(0)
497 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
498 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
499 ; GFX940-NEXT: v_mov_b32_e32 v7, v4
500 ; GFX940-NEXT: buffer_inv sc1
501 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
502 ; GFX940-NEXT: s_cbranch_execnz .LBB2_3
503 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
504 ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
505 ; GFX940-NEXT: v_mov_b32_e32 v0, v4
506 ; GFX940-NEXT: s_setpc_b64 s[30:31]
508 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
510 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
511 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
512 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
513 ; GFX11-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
514 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
515 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
516 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
517 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
518 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
519 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
520 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
521 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
522 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
523 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
524 ; GFX11-NEXT: s_waitcnt vmcnt(0)
525 ; GFX11-NEXT: buffer_atomic_max_f32 v5, v4, s[4:7], 0 offen offset:1024 glc
526 ; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
527 ; GFX11-NEXT: ; implicit-def: $vgpr4
528 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
529 ; GFX11-NEXT: s_cbranch_execnz .LBB2_1
530 ; GFX11-NEXT: ; %bb.2:
531 ; GFX11-NEXT: s_mov_b32 exec_lo, s1
532 ; GFX11-NEXT: s_waitcnt vmcnt(0)
533 ; GFX11-NEXT: v_mov_b32_e32 v0, v5
534 ; GFX11-NEXT: buffer_gl1_inv
535 ; GFX11-NEXT: buffer_gl0_inv
536 ; GFX11-NEXT: s_setpc_b64 s[30:31]
538 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
540 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
541 ; GFX10-NEXT: s_mov_b32 s5, exec_lo
542 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
543 ; GFX10-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
544 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
545 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
546 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
547 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
548 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
549 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
550 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
551 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
552 ; GFX10-NEXT: s_waitcnt vmcnt(0)
553 ; GFX10-NEXT: buffer_atomic_fmax v5, v4, s[8:11], 0 offen offset:1024 glc
554 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
555 ; GFX10-NEXT: ; implicit-def: $vgpr4
556 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
557 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
558 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1
559 ; GFX10-NEXT: ; %bb.2:
560 ; GFX10-NEXT: s_mov_b32 exec_lo, s5
561 ; GFX10-NEXT: s_waitcnt vmcnt(0)
562 ; GFX10-NEXT: v_mov_b32_e32 v0, v5
563 ; GFX10-NEXT: buffer_gl1_inv
564 ; GFX10-NEXT: buffer_gl0_inv
565 ; GFX10-NEXT: s_setpc_b64 s[30:31]
567 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
569 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
570 ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
571 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
572 ; GFX90A-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
573 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
574 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
575 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
576 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
577 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
578 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
579 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
580 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
581 ; GFX90A-NEXT: s_nop 0
582 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
583 ; GFX90A-NEXT: ; implicit-def: $vgpr4
584 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
585 ; GFX90A-NEXT: s_cbranch_execnz .LBB2_1
586 ; GFX90A-NEXT: ; %bb.2:
587 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
588 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
589 ; GFX90A-NEXT: v_max_f32_e32 v9, v5, v5
590 ; GFX90A-NEXT: .LBB2_3: ; %atomicrmw.start
591 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
592 ; GFX90A-NEXT: ; Child Loop BB2_4 Depth 2
593 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
594 ; GFX90A-NEXT: v_max_f32_e32 v4, v7, v7
595 ; GFX90A-NEXT: v_max_f32_e32 v6, v4, v9
596 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
597 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
598 ; GFX90A-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
599 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
600 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
601 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
602 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
603 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
604 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
605 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
606 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
607 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
608 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
609 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
610 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
611 ; GFX90A-NEXT: s_cbranch_execnz .LBB2_4
612 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
613 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
614 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
615 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
616 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
617 ; GFX90A-NEXT: v_mov_b32_e32 v7, v4
618 ; GFX90A-NEXT: buffer_wbinvl1
619 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
620 ; GFX90A-NEXT: s_cbranch_execnz .LBB2_3
621 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
622 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
623 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4
624 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
626 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
628 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
629 ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
630 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
631 ; GFX908-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
632 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
633 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
634 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
635 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
636 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
637 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
638 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
639 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
640 ; GFX908-NEXT: s_nop 0
641 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
642 ; GFX908-NEXT: ; implicit-def: $vgpr4
643 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
644 ; GFX908-NEXT: s_cbranch_execnz .LBB2_1
645 ; GFX908-NEXT: ; %bb.2:
646 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
647 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
648 ; GFX908-NEXT: v_max_f32_e32 v8, v5, v5
649 ; GFX908-NEXT: .LBB2_3: ; %atomicrmw.start
650 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
651 ; GFX908-NEXT: ; Child Loop BB2_4 Depth 2
652 ; GFX908-NEXT: s_waitcnt vmcnt(0)
653 ; GFX908-NEXT: v_max_f32_e32 v4, v6, v6
654 ; GFX908-NEXT: v_max_f32_e32 v5, v4, v8
655 ; GFX908-NEXT: v_mov_b32_e32 v4, v5
656 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
657 ; GFX908-NEXT: v_mov_b32_e32 v5, v6
658 ; GFX908-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
659 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
660 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
661 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
662 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
663 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
664 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
665 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
666 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
667 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
668 ; GFX908-NEXT: s_waitcnt vmcnt(0)
669 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
670 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
671 ; GFX908-NEXT: s_cbranch_execnz .LBB2_4
672 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
673 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
674 ; GFX908-NEXT: s_waitcnt vmcnt(0)
675 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
676 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
677 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
678 ; GFX908-NEXT: buffer_wbinvl1
679 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
680 ; GFX908-NEXT: s_cbranch_execnz .LBB2_3
681 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
682 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
683 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
684 ; GFX908-NEXT: s_setpc_b64 s[30:31]
686 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
688 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
689 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
690 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
691 ; GFX8-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
692 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
693 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
694 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
695 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
696 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
697 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
698 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
699 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
701 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
702 ; GFX8-NEXT: ; implicit-def: $vgpr4
703 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
704 ; GFX8-NEXT: s_cbranch_execnz .LBB2_1
705 ; GFX8-NEXT: ; %bb.2:
706 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
707 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
708 ; GFX8-NEXT: v_mul_f32_e32 v8, 1.0, v5
709 ; GFX8-NEXT: .LBB2_3: ; %atomicrmw.start
710 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
711 ; GFX8-NEXT: ; Child Loop BB2_4 Depth 2
712 ; GFX8-NEXT: s_waitcnt vmcnt(0)
713 ; GFX8-NEXT: v_mul_f32_e32 v4, 1.0, v6
714 ; GFX8-NEXT: v_max_f32_e32 v5, v4, v8
715 ; GFX8-NEXT: v_mov_b32_e32 v4, v5
716 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
717 ; GFX8-NEXT: v_mov_b32_e32 v5, v6
718 ; GFX8-NEXT: .LBB2_4: ; Parent Loop BB2_3 Depth=1
719 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
720 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
721 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
722 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
723 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
724 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
725 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
726 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
727 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
728 ; GFX8-NEXT: s_waitcnt vmcnt(0)
729 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
730 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
731 ; GFX8-NEXT: s_cbranch_execnz .LBB2_4
732 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB2_3 Depth=1
733 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
734 ; GFX8-NEXT: s_waitcnt vmcnt(0)
735 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
736 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
737 ; GFX8-NEXT: v_mov_b32_e32 v6, v4
738 ; GFX8-NEXT: buffer_wbinvl1
739 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
740 ; GFX8-NEXT: s_cbranch_execnz .LBB2_3
741 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
742 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
743 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
744 ; GFX8-NEXT: s_setpc_b64 s[30:31]
746 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
748 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
749 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
750 ; GFX7-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
751 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
752 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
753 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
754 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
755 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
756 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
757 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
758 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
759 ; GFX7-NEXT: s_waitcnt vmcnt(0)
760 ; GFX7-NEXT: buffer_atomic_fmax v5, v4, s[8:11], 0 offen offset:1024 glc
761 ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
762 ; GFX7-NEXT: ; implicit-def: $vgpr4
763 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
764 ; GFX7-NEXT: s_cbranch_execnz .LBB2_1
765 ; GFX7-NEXT: ; %bb.2:
766 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
767 ; GFX7-NEXT: s_waitcnt vmcnt(0)
768 ; GFX7-NEXT: v_mov_b32_e32 v0, v5
769 ; GFX7-NEXT: buffer_wbinvl1
770 ; GFX7-NEXT: s_setpc_b64 s[30:31]
772 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset__waterfall:
774 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
775 ; GFX6-NEXT: s_mov_b64 s[6:7], exec
776 ; GFX6-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
777 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
778 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
779 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
780 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
781 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
782 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
783 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
784 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
785 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
786 ; GFX6-NEXT: buffer_atomic_fmax v5, v4, s[8:11], 0 offen offset:1024 glc
787 ; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
788 ; GFX6-NEXT: ; implicit-def: $vgpr4
789 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
790 ; GFX6-NEXT: s_cbranch_execnz .LBB2_1
791 ; GFX6-NEXT: ; %bb.2:
792 ; GFX6-NEXT: s_mov_b64 exec, s[6:7]
793 ; GFX6-NEXT: s_waitcnt vmcnt(0)
794 ; GFX6-NEXT: v_mov_b32_e32 v0, v5
795 ; GFX6-NEXT: buffer_wbinvl1
796 ; GFX6-NEXT: s_waitcnt expcnt(0)
797 ; GFX6-NEXT: s_setpc_b64 s[30:31]
798 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
799 %result = atomicrmw fmax ptr addrspace(7) %gep, float %val syncscope("agent") seq_cst
803 ; --------------------------------------------------------------------
805 ; --------------------------------------------------------------------
; Value-returning f64 case: atomicrmw fmax with syncscope("agent") and seq_cst
; ordering through a buffer fat pointer (addrspace(7)) that is passed inreg.
; The address is 256 doubles past %ptr, which the expected code below shows as
; byte offset 2048 (0x800). The previous memory value is captured in %result.
;
; Summary of the expected code in the check blocks that follow:
;   * gfx940 and gfx90a runs - one buffer_atomic_max_f64 in its returning
;     form (sc0 on gfx940, glc on gfx90a).
;   * gfx1010, hawaii and tahiti runs - one buffer_atomic_fmax_x2 with glc.
;   * gfx1200, gfx1100, gfx908 and tonga runs - an initial 64-bit buffer load
;     followed by an %atomicrmw.start loop built around a 64-bit
;     buffer_atomic_cmpswap; both the loaded value and the incoming operand
;     are passed through a max-with-self before the real max is computed.
807 define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
808 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
810 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
811 ; GFX12-NEXT: s_wait_expcnt 0x0
812 ; GFX12-NEXT: s_wait_samplecnt 0x0
813 ; GFX12-NEXT: s_wait_bvhcnt 0x0
814 ; GFX12-NEXT: s_wait_kmcnt 0x0
815 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
816 ; GFX12-NEXT: v_mov_b32_e32 v0, s6
817 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
818 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
819 ; GFX12-NEXT: v_mov_b32_e32 v6, s4
820 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
821 ; GFX12-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], null offen offset:2048
822 ; GFX12-NEXT: s_mov_b32 s4, 0
823 ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
824 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
825 ; GFX12-NEXT: s_wait_loadcnt 0x0
826 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
827 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
828 ; GFX12-NEXT: s_wait_storecnt 0x0
829 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
830 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
831 ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
832 ; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
833 ; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
834 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
835 ; GFX12-NEXT: s_wait_loadcnt 0x0
836 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
837 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
838 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
839 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
840 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
841 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1
842 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
843 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
844 ; GFX12-NEXT: s_setpc_b64 s[30:31]
846 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
848 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
849 ; GFX940-NEXT: v_mov_b32_e32 v2, s6
850 ; GFX940-NEXT: buffer_wbl2 sc1
851 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048 sc0
852 ; GFX940-NEXT: s_waitcnt vmcnt(0)
853 ; GFX940-NEXT: buffer_inv sc1
854 ; GFX940-NEXT: s_setpc_b64 s[30:31]
856 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
858 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
859 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
860 ; GFX11-NEXT: v_mov_b32_e32 v0, s6
861 ; GFX11-NEXT: s_add_i32 s4, s6, 0x800
862 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
863 ; GFX11-NEXT: v_mov_b32_e32 v6, s4
864 ; GFX11-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
865 ; GFX11-NEXT: buffer_load_b64 v[0:1], v0, s[0:3], 0 offen offset:2048
866 ; GFX11-NEXT: s_mov_b32 s4, 0
867 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
868 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
869 ; GFX11-NEXT: s_waitcnt vmcnt(0)
870 ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
871 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
872 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
873 ; GFX11-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
874 ; GFX11-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
875 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
876 ; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
877 ; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
878 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], 0 offen glc
879 ; GFX11-NEXT: s_waitcnt vmcnt(0)
880 ; GFX11-NEXT: buffer_gl1_inv
881 ; GFX11-NEXT: buffer_gl0_inv
882 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[9:10]
883 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
884 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
885 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
886 ; GFX11-NEXT: s_cbranch_execnz .LBB3_1
887 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
888 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
889 ; GFX11-NEXT: s_setpc_b64 s[30:31]
891 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
893 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
894 ; GFX10-NEXT: v_mov_b32_e32 v2, s18
895 ; GFX10-NEXT: s_mov_b32 s11, s17
896 ; GFX10-NEXT: s_mov_b32 s10, s16
897 ; GFX10-NEXT: s_mov_b32 s9, s7
898 ; GFX10-NEXT: s_mov_b32 s8, s6
899 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
900 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
901 ; GFX10-NEXT: s_waitcnt vmcnt(0)
902 ; GFX10-NEXT: buffer_gl1_inv
903 ; GFX10-NEXT: buffer_gl0_inv
904 ; GFX10-NEXT: s_setpc_b64 s[30:31]
906 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
908 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909 ; GFX90A-NEXT: s_mov_b32 s11, s17
910 ; GFX90A-NEXT: s_mov_b32 s10, s16
911 ; GFX90A-NEXT: s_mov_b32 s9, s7
912 ; GFX90A-NEXT: s_mov_b32 s8, s6
913 ; GFX90A-NEXT: v_mov_b32_e32 v2, s18
914 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
915 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
916 ; GFX90A-NEXT: buffer_wbinvl1
917 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
919 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
921 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
922 ; GFX908-NEXT: s_mov_b32 s11, s17
923 ; GFX908-NEXT: s_mov_b32 s10, s16
924 ; GFX908-NEXT: s_mov_b32 s9, s7
925 ; GFX908-NEXT: s_mov_b32 s8, s6
926 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
927 ; GFX908-NEXT: v_mov_b32_e32 v0, s18
928 ; GFX908-NEXT: v_mov_b32_e32 v3, v1
929 ; GFX908-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
930 ; GFX908-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
931 ; GFX908-NEXT: s_add_i32 s6, s18, 0x800
932 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
933 ; GFX908-NEXT: v_mov_b32_e32 v6, s6
934 ; GFX908-NEXT: .LBB3_1: ; %atomicrmw.start
935 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
936 ; GFX908-NEXT: s_waitcnt vmcnt(0)
937 ; GFX908-NEXT: v_mov_b32_e32 v10, v1
938 ; GFX908-NEXT: v_mov_b32_e32 v9, v0
939 ; GFX908-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
940 ; GFX908-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
941 ; GFX908-NEXT: v_mov_b32_e32 v0, v7
942 ; GFX908-NEXT: v_mov_b32_e32 v1, v8
943 ; GFX908-NEXT: v_mov_b32_e32 v2, v9
944 ; GFX908-NEXT: v_mov_b32_e32 v3, v10
945 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
946 ; GFX908-NEXT: s_waitcnt vmcnt(0)
947 ; GFX908-NEXT: buffer_wbinvl1
948 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
949 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
950 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
951 ; GFX908-NEXT: s_cbranch_execnz .LBB3_1
952 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
953 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
954 ; GFX908-NEXT: s_setpc_b64 s[30:31]
956 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
958 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
959 ; GFX8-NEXT: s_mov_b32 s11, s17
960 ; GFX8-NEXT: s_mov_b32 s10, s16
961 ; GFX8-NEXT: s_mov_b32 s9, s7
962 ; GFX8-NEXT: s_mov_b32 s8, s6
963 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
964 ; GFX8-NEXT: v_mov_b32_e32 v0, s18
965 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
966 ; GFX8-NEXT: buffer_load_dwordx2 v[0:1], v0, s[8:11], 0 offen offset:2048
967 ; GFX8-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
968 ; GFX8-NEXT: s_add_i32 s6, s18, 0x800
969 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
970 ; GFX8-NEXT: v_mov_b32_e32 v6, s6
971 ; GFX8-NEXT: .LBB3_1: ; %atomicrmw.start
972 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
973 ; GFX8-NEXT: s_waitcnt vmcnt(0)
974 ; GFX8-NEXT: v_mov_b32_e32 v10, v1
975 ; GFX8-NEXT: v_mov_b32_e32 v9, v0
976 ; GFX8-NEXT: v_max_f64 v[0:1], v[9:10], v[9:10]
977 ; GFX8-NEXT: v_max_f64 v[7:8], v[0:1], v[4:5]
978 ; GFX8-NEXT: v_mov_b32_e32 v0, v7
979 ; GFX8-NEXT: v_mov_b32_e32 v1, v8
980 ; GFX8-NEXT: v_mov_b32_e32 v2, v9
981 ; GFX8-NEXT: v_mov_b32_e32 v3, v10
982 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v6, s[8:11], 0 offen glc
983 ; GFX8-NEXT: s_waitcnt vmcnt(0)
984 ; GFX8-NEXT: buffer_wbinvl1
985 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[9:10]
986 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
987 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
988 ; GFX8-NEXT: s_cbranch_execnz .LBB3_1
989 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
990 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
991 ; GFX8-NEXT: s_setpc_b64 s[30:31]
993 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
995 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
996 ; GFX7-NEXT: s_mov_b32 s11, s17
997 ; GFX7-NEXT: s_mov_b32 s10, s16
998 ; GFX7-NEXT: s_mov_b32 s9, s7
999 ; GFX7-NEXT: s_mov_b32 s8, s6
1000 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
1001 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
1002 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1003 ; GFX7-NEXT: buffer_wbinvl1
1004 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1006 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset:
1008 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1009 ; GFX6-NEXT: s_mov_b32 s11, s17
1010 ; GFX6-NEXT: s_mov_b32 s10, s16
1011 ; GFX6-NEXT: s_mov_b32 s9, s7
1012 ; GFX6-NEXT: s_mov_b32 s8, s6
1013 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
1014 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048 glc
1015 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1016 ; GFX6-NEXT: buffer_wbinvl1
1017 ; GFX6-NEXT: s_waitcnt expcnt(0)
1018 ; GFX6-NEXT: s_setpc_b64 s[30:31]
; IR under test: element index 256 of a double array, i.e. 256 * 8 bytes.
1019 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
1020 %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
; No-return f64 case: the same atomicrmw fmax (syncscope("agent"), seq_cst,
; buffer fat pointer passed inreg, 256 doubles past %ptr) but the function is
; void and the atomicrmw result is bound to %unused.
;
; Summary of the expected code in the check blocks that follow:
;   * gfx940, gfx90a, gfx1010, hawaii and tahiti runs - a single native f64
;     max atomic at byte offset 2048 without the glc / sc0 return bit. The
;     gfx1010 run waits on vscnt after the atomic, the others on vmcnt.
;   * gfx1200, gfx1100, gfx908 and tonga runs - still a load plus an
;     %atomicrmw.start loop around a returning 64-bit buffer_atomic_cmpswap;
;     the returned value is compared with the expected one to decide loop
;     exit and then copied back as the next expected value.
1024 define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset(ptr addrspace(7) inreg %ptr, double %val) #0 {
1025 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1027 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1028 ; GFX12-NEXT: s_wait_expcnt 0x0
1029 ; GFX12-NEXT: s_wait_samplecnt 0x0
1030 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1031 ; GFX12-NEXT: s_wait_kmcnt 0x0
1032 ; GFX12-NEXT: v_mov_b32_e32 v2, s6
1033 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
1034 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x800
1035 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1036 ; GFX12-NEXT: v_mov_b32_e32 v6, s4
1037 ; GFX12-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], null offen offset:2048
1038 ; GFX12-NEXT: s_mov_b32 s4, 0
1039 ; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
1040 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1041 ; GFX12-NEXT: s_wait_loadcnt 0x0
1042 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1043 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1044 ; GFX12-NEXT: s_wait_storecnt 0x0
1045 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1046 ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1047 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1048 ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1049 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1050 ; GFX12-NEXT: s_wait_loadcnt 0x0
1051 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1052 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1053 ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1054 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
1055 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1056 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
1057 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1
1058 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1059 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
1060 ; GFX12-NEXT: s_setpc_b64 s[30:31]
1062 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1064 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1065 ; GFX940-NEXT: v_mov_b32_e32 v2, s6
1066 ; GFX940-NEXT: buffer_wbl2 sc1
1067 ; GFX940-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[0:3], 0 offen offset:2048
1068 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1069 ; GFX940-NEXT: buffer_inv sc1
1070 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1072 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1074 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1075 ; GFX11-NEXT: v_mov_b32_e32 v2, s6
1076 ; GFX11-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
1077 ; GFX11-NEXT: s_add_i32 s4, s6, 0x800
1078 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1079 ; GFX11-NEXT: v_mov_b32_e32 v6, s4
1080 ; GFX11-NEXT: buffer_load_b64 v[2:3], v2, s[0:3], 0 offen offset:2048
1081 ; GFX11-NEXT: s_mov_b32 s4, 0
1082 ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
1083 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1084 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1085 ; GFX11-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
1086 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1087 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1088 ; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
1089 ; GFX11-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1090 ; GFX11-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1091 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], 0 offen glc
1092 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1093 ; GFX11-NEXT: buffer_gl1_inv
1094 ; GFX11-NEXT: buffer_gl0_inv
1095 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1096 ; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1097 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
1098 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1099 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
1100 ; GFX11-NEXT: s_cbranch_execnz .LBB4_1
1101 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1102 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
1103 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1105 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1107 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1108 ; GFX10-NEXT: v_mov_b32_e32 v2, s18
1109 ; GFX10-NEXT: s_mov_b32 s11, s17
1110 ; GFX10-NEXT: s_mov_b32 s10, s16
1111 ; GFX10-NEXT: s_mov_b32 s9, s7
1112 ; GFX10-NEXT: s_mov_b32 s8, s6
1113 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1114 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
1115 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1116 ; GFX10-NEXT: buffer_gl1_inv
1117 ; GFX10-NEXT: buffer_gl0_inv
1118 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1120 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1122 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1123 ; GFX90A-NEXT: s_mov_b32 s11, s17
1124 ; GFX90A-NEXT: s_mov_b32 s10, s16
1125 ; GFX90A-NEXT: s_mov_b32 s9, s7
1126 ; GFX90A-NEXT: s_mov_b32 s8, s6
1127 ; GFX90A-NEXT: v_mov_b32_e32 v2, s18
1128 ; GFX90A-NEXT: buffer_atomic_max_f64 v[0:1], v2, s[8:11], 0 offen offset:2048
1129 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1130 ; GFX90A-NEXT: buffer_wbinvl1
1131 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1133 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1135 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136 ; GFX908-NEXT: s_mov_b32 s11, s17
1137 ; GFX908-NEXT: s_mov_b32 s10, s16
1138 ; GFX908-NEXT: s_mov_b32 s9, s7
1139 ; GFX908-NEXT: s_mov_b32 s8, s6
1140 ; GFX908-NEXT: v_mov_b32_e32 v2, s18
1141 ; GFX908-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
1142 ; GFX908-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
1143 ; GFX908-NEXT: s_add_i32 s6, s18, 0x800
1144 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
1145 ; GFX908-NEXT: v_mov_b32_e32 v6, s6
1146 ; GFX908-NEXT: .LBB4_1: ; %atomicrmw.start
1147 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
1148 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1149 ; GFX908-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
1150 ; GFX908-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
1151 ; GFX908-NEXT: v_mov_b32_e32 v10, v3
1152 ; GFX908-NEXT: v_mov_b32_e32 v9, v2
1153 ; GFX908-NEXT: v_mov_b32_e32 v8, v1
1154 ; GFX908-NEXT: v_mov_b32_e32 v7, v0
1155 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
1156 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1157 ; GFX908-NEXT: buffer_wbinvl1
1158 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
1159 ; GFX908-NEXT: v_mov_b32_e32 v2, v7
1160 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1161 ; GFX908-NEXT: v_mov_b32_e32 v3, v8
1162 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
1163 ; GFX908-NEXT: s_cbranch_execnz .LBB4_1
1164 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
1165 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1166 ; GFX908-NEXT: s_setpc_b64 s[30:31]
1168 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1170 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1171 ; GFX8-NEXT: s_mov_b32 s11, s17
1172 ; GFX8-NEXT: s_mov_b32 s10, s16
1173 ; GFX8-NEXT: s_mov_b32 s9, s7
1174 ; GFX8-NEXT: s_mov_b32 s8, s6
1175 ; GFX8-NEXT: v_mov_b32_e32 v2, s18
1176 ; GFX8-NEXT: buffer_load_dwordx2 v[2:3], v2, s[8:11], 0 offen offset:2048
1177 ; GFX8-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
1178 ; GFX8-NEXT: s_add_i32 s6, s18, 0x800
1179 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
1180 ; GFX8-NEXT: v_mov_b32_e32 v6, s6
1181 ; GFX8-NEXT: .LBB4_1: ; %atomicrmw.start
1182 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
1183 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1184 ; GFX8-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
1185 ; GFX8-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
1186 ; GFX8-NEXT: v_mov_b32_e32 v10, v3
1187 ; GFX8-NEXT: v_mov_b32_e32 v9, v2
1188 ; GFX8-NEXT: v_mov_b32_e32 v8, v1
1189 ; GFX8-NEXT: v_mov_b32_e32 v7, v0
1190 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[7:10], v6, s[8:11], 0 offen glc
1191 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1192 ; GFX8-NEXT: buffer_wbinvl1
1193 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[2:3]
1194 ; GFX8-NEXT: v_mov_b32_e32 v2, v7
1195 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1196 ; GFX8-NEXT: v_mov_b32_e32 v3, v8
1197 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
1198 ; GFX8-NEXT: s_cbranch_execnz .LBB4_1
1199 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
1200 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
1201 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1203 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1205 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1206 ; GFX7-NEXT: s_mov_b32 s11, s17
1207 ; GFX7-NEXT: s_mov_b32 s10, s16
1208 ; GFX7-NEXT: s_mov_b32 s9, s7
1209 ; GFX7-NEXT: s_mov_b32 s8, s6
1210 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
1211 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
1212 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1213 ; GFX7-NEXT: buffer_wbinvl1
1214 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1216 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__offset:
1218 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1219 ; GFX6-NEXT: s_mov_b32 s11, s17
1220 ; GFX6-NEXT: s_mov_b32 s10, s16
1221 ; GFX6-NEXT: s_mov_b32 s9, s7
1222 ; GFX6-NEXT: s_mov_b32 s8, s6
1223 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
1224 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[0:1], v2, s[8:11], 0 offen offset:2048
1225 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1226 ; GFX6-NEXT: buffer_wbinvl1
1227 ; GFX6-NEXT: s_waitcnt expcnt(0)
1228 ; GFX6-NEXT: s_setpc_b64 s[30:31]
; IR under test: same address computation as the returning variant; the
; atomicrmw result is named %unused.
1229 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
1230 %unused = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
1234 define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall(ptr addrspace(7) %ptr, double %val) #0 {
1235 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1237 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1238 ; GFX12-NEXT: s_wait_expcnt 0x0
1239 ; GFX12-NEXT: s_wait_samplecnt 0x0
1240 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1241 ; GFX12-NEXT: s_wait_kmcnt 0x0
1242 ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1243 ; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1244 ; GFX12-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
1245 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
1246 ; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1247 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
1248 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9
1249 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10
1250 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7
1251 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8
1252 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1253 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1254 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1255 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1256 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
1257 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
1258 ; GFX12-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], null offen offset:2048
1259 ; GFX12-NEXT: ; implicit-def: $vgpr4
1260 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1261 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1
1262 ; GFX12-NEXT: ; %bb.2:
1263 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
1264 ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[5:6], v[5:6]
1265 ; GFX12-NEXT: s_mov_b32 s1, 0
1266 ; GFX12-NEXT: .LBB5_3: ; %atomicrmw.start
1267 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
1268 ; GFX12-NEXT: ; Child Loop BB5_4 Depth 2
1269 ; GFX12-NEXT: s_wait_loadcnt 0x0
1270 ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[13:14], v[13:14]
1271 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
1272 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1273 ; GFX12-NEXT: s_wait_storecnt 0x0
1274 ; GFX12-NEXT: v_max_num_f64_e32 v[11:12], v[0:1], v[4:5]
1275 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1276 ; GFX12-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
1277 ; GFX12-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
1278 ; GFX12-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
1279 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
1280 ; GFX12-NEXT: v_readfirstlane_b32 s4, v9
1281 ; GFX12-NEXT: v_readfirstlane_b32 s5, v10
1282 ; GFX12-NEXT: v_readfirstlane_b32 s6, v7
1283 ; GFX12-NEXT: v_readfirstlane_b32 s7, v8
1284 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1285 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1286 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1287 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1288 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
1289 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
1290 ; GFX12-NEXT: s_wait_loadcnt 0x0
1291 ; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], null offen th:TH_ATOMIC_RETURN
1292 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1293 ; GFX12-NEXT: s_cbranch_execnz .LBB5_4
1294 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
1295 ; GFX12-NEXT: s_mov_b32 exec_lo, s2
1296 ; GFX12-NEXT: s_wait_loadcnt 0x0
1297 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
1298 ; GFX12-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
1299 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1300 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
1301 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1302 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
1303 ; GFX12-NEXT: s_cbranch_execnz .LBB5_3
1304 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
1305 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
1306 ; GFX12-NEXT: s_setpc_b64 s[30:31]
1308 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1310 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1311 ; GFX940-NEXT: v_mov_b32_e32 v7, v6
1312 ; GFX940-NEXT: v_mov_b32_e32 v6, v5
1313 ; GFX940-NEXT: s_mov_b64 s[2:3], exec
1314 ; GFX940-NEXT: buffer_wbl2 sc1
1315 ; GFX940-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1316 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
1317 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
1318 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
1319 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
1320 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
1321 ; GFX940-NEXT: s_nop 0
1322 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
1323 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
1324 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
1325 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1326 ; GFX940-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[4:7], 0 offen offset:2048 sc0
1327 ; GFX940-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1328 ; GFX940-NEXT: ; implicit-def: $vgpr4
1329 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
1330 ; GFX940-NEXT: s_cbranch_execnz .LBB5_1
1331 ; GFX940-NEXT: ; %bb.2:
1332 ; GFX940-NEXT: s_mov_b64 exec, s[2:3]
1333 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1334 ; GFX940-NEXT: v_mov_b32_e32 v0, v6
1335 ; GFX940-NEXT: v_mov_b32_e32 v1, v7
1336 ; GFX940-NEXT: buffer_inv sc1
1337 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1339 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1341 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1342 ; GFX11-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1343 ; GFX11-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1344 ; GFX11-NEXT: v_add_nc_u32_e32 v15, 0x800, v4
1345 ; GFX11-NEXT: s_mov_b32 s1, 0
1346 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
1347 ; GFX11-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1348 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
1349 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9
1350 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10
1351 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7
1352 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8
1353 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1354 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1355 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1356 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
1357 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1358 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
1359 ; GFX11-NEXT: buffer_load_b64 v[13:14], v4, s[4:7], 0 offen offset:2048
1360 ; GFX11-NEXT: ; implicit-def: $vgpr4
1361 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1362 ; GFX11-NEXT: s_cbranch_execnz .LBB5_1
1363 ; GFX11-NEXT: ; %bb.2:
1364 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
1365 ; GFX11-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
1366 ; GFX11-NEXT: .p2align 6
1367 ; GFX11-NEXT: .LBB5_3: ; %atomicrmw.start
1368 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
1369 ; GFX11-NEXT: ; Child Loop BB5_4 Depth 2
1370 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1371 ; GFX11-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
1372 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
1373 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1374 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1375 ; GFX11-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
1376 ; GFX11-NEXT: v_dual_mov_b32 v0, v11 :: v_dual_mov_b32 v1, v12
1377 ; GFX11-NEXT: v_dual_mov_b32 v2, v13 :: v_dual_mov_b32 v3, v14
1378 ; GFX11-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
1379 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
1380 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9
1381 ; GFX11-NEXT: v_readfirstlane_b32 s5, v10
1382 ; GFX11-NEXT: v_readfirstlane_b32 s6, v7
1383 ; GFX11-NEXT: v_readfirstlane_b32 s7, v8
1384 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
1385 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[9:10]
1386 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[7:8]
1387 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1388 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
1389 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
1390 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1391 ; GFX11-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v15, s[4:7], 0 offen glc
1392 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
1393 ; GFX11-NEXT: s_cbranch_execnz .LBB5_4
1394 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
1395 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
1396 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1397 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[13:14]
1398 ; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v13, v0
1399 ; GFX11-NEXT: buffer_gl1_inv
1400 ; GFX11-NEXT: buffer_gl0_inv
1401 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
1402 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1403 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
1404 ; GFX11-NEXT: s_cbranch_execnz .LBB5_3
1405 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
1406 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
1407 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1409 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1411 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1412 ; GFX10-NEXT: s_mov_b32 s5, exec_lo
1413 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1414 ; GFX10-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1415 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
1416 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
1417 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
1418 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
1419 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
1420 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
1421 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
1422 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
1423 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1424 ; GFX10-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
1425 ; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1426 ; GFX10-NEXT: ; implicit-def: $vgpr4
1427 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1428 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
1429 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1
1430 ; GFX10-NEXT: ; %bb.2:
1431 ; GFX10-NEXT: s_mov_b32 exec_lo, s5
1432 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1433 ; GFX10-NEXT: v_mov_b32_e32 v0, v5
1434 ; GFX10-NEXT: v_mov_b32_e32 v1, v6
1435 ; GFX10-NEXT: buffer_gl1_inv
1436 ; GFX10-NEXT: buffer_gl0_inv
1437 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1439 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1441 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1442 ; GFX90A-NEXT: v_mov_b32_e32 v7, v6
1443 ; GFX90A-NEXT: v_mov_b32_e32 v6, v5
1444 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
1445 ; GFX90A-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1446 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
1447 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
1448 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
1449 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
1450 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1451 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
1452 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1453 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1454 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1455 ; GFX90A-NEXT: buffer_atomic_max_f64 v[6:7], v4, s[8:11], 0 offen offset:2048 glc
1456 ; GFX90A-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1457 ; GFX90A-NEXT: ; implicit-def: $vgpr4
1458 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
1459 ; GFX90A-NEXT: s_cbranch_execnz .LBB5_1
1460 ; GFX90A-NEXT: ; %bb.2:
1461 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
1462 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1463 ; GFX90A-NEXT: v_mov_b32_e32 v0, v6
1464 ; GFX90A-NEXT: v_mov_b32_e32 v1, v7
1465 ; GFX90A-NEXT: buffer_wbinvl1
1466 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1468 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1470 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1471 ; GFX908-NEXT: v_mov_b32_e32 v8, v3
1472 ; GFX908-NEXT: v_mov_b32_e32 v7, v2
1473 ; GFX908-NEXT: v_mov_b32_e32 v10, v1
1474 ; GFX908-NEXT: v_mov_b32_e32 v9, v0
1475 ; GFX908-NEXT: v_add_u32_e32 v15, 0x800, v4
1476 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
1477 ; GFX908-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1478 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9
1479 ; GFX908-NEXT: v_readfirstlane_b32 s9, v10
1480 ; GFX908-NEXT: v_readfirstlane_b32 s10, v7
1481 ; GFX908-NEXT: v_readfirstlane_b32 s11, v8
1482 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1483 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1484 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1485 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1486 ; GFX908-NEXT: s_nop 0
1487 ; GFX908-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
1488 ; GFX908-NEXT: ; implicit-def: $vgpr4
1489 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
1490 ; GFX908-NEXT: s_cbranch_execnz .LBB5_1
1491 ; GFX908-NEXT: ; %bb.2:
1492 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
1493 ; GFX908-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
1494 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
1495 ; GFX908-NEXT: .LBB5_3: ; %atomicrmw.start
1496 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
1497 ; GFX908-NEXT: ; Child Loop BB5_4 Depth 2
1498 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1499 ; GFX908-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
1500 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
1501 ; GFX908-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
1502 ; GFX908-NEXT: v_mov_b32_e32 v0, v11
1503 ; GFX908-NEXT: v_mov_b32_e32 v1, v12
1504 ; GFX908-NEXT: v_mov_b32_e32 v2, v13
1505 ; GFX908-NEXT: v_mov_b32_e32 v3, v14
1506 ; GFX908-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
1507 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
1508 ; GFX908-NEXT: v_readfirstlane_b32 s8, v9
1509 ; GFX908-NEXT: v_readfirstlane_b32 s9, v10
1510 ; GFX908-NEXT: v_readfirstlane_b32 s10, v7
1511 ; GFX908-NEXT: v_readfirstlane_b32 s11, v8
1512 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1513 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1514 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1515 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1516 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1517 ; GFX908-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
1518 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
1519 ; GFX908-NEXT: s_cbranch_execnz .LBB5_4
1520 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
1521 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
1522 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1523 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
1524 ; GFX908-NEXT: v_mov_b32_e32 v14, v1
1525 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
1526 ; GFX908-NEXT: v_mov_b32_e32 v13, v0
1527 ; GFX908-NEXT: buffer_wbinvl1
1528 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
1529 ; GFX908-NEXT: s_cbranch_execnz .LBB5_3
1530 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
1531 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
1532 ; GFX908-NEXT: s_setpc_b64 s[30:31]
1534 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1536 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1537 ; GFX8-NEXT: v_mov_b32_e32 v8, v3
1538 ; GFX8-NEXT: v_mov_b32_e32 v7, v2
1539 ; GFX8-NEXT: v_mov_b32_e32 v10, v1
1540 ; GFX8-NEXT: v_mov_b32_e32 v9, v0
1541 ; GFX8-NEXT: v_add_u32_e32 v15, vcc, 0x800, v4
1542 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
1543 ; GFX8-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1544 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9
1545 ; GFX8-NEXT: v_readfirstlane_b32 s9, v10
1546 ; GFX8-NEXT: v_readfirstlane_b32 s10, v7
1547 ; GFX8-NEXT: v_readfirstlane_b32 s11, v8
1548 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1549 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1550 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1551 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1552 ; GFX8-NEXT: s_nop 0
1553 ; GFX8-NEXT: buffer_load_dwordx2 v[13:14], v4, s[8:11], 0 offen offset:2048
1554 ; GFX8-NEXT: ; implicit-def: $vgpr4
1555 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
1556 ; GFX8-NEXT: s_cbranch_execnz .LBB5_1
1557 ; GFX8-NEXT: ; %bb.2:
1558 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
1559 ; GFX8-NEXT: v_max_f64 v[4:5], v[5:6], v[5:6]
1560 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
1561 ; GFX8-NEXT: .LBB5_3: ; %atomicrmw.start
1562 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
1563 ; GFX8-NEXT: ; Child Loop BB5_4 Depth 2
1564 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1565 ; GFX8-NEXT: v_max_f64 v[0:1], v[13:14], v[13:14]
1566 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
1567 ; GFX8-NEXT: v_max_f64 v[11:12], v[0:1], v[4:5]
1568 ; GFX8-NEXT: v_mov_b32_e32 v0, v11
1569 ; GFX8-NEXT: v_mov_b32_e32 v1, v12
1570 ; GFX8-NEXT: v_mov_b32_e32 v2, v13
1571 ; GFX8-NEXT: v_mov_b32_e32 v3, v14
1572 ; GFX8-NEXT: .LBB5_4: ; Parent Loop BB5_3 Depth=1
1573 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
1574 ; GFX8-NEXT: v_readfirstlane_b32 s8, v9
1575 ; GFX8-NEXT: v_readfirstlane_b32 s9, v10
1576 ; GFX8-NEXT: v_readfirstlane_b32 s10, v7
1577 ; GFX8-NEXT: v_readfirstlane_b32 s11, v8
1578 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[9:10]
1579 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[7:8]
1580 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1581 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1582 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1583 ; GFX8-NEXT: buffer_atomic_cmpswap_x2 v[0:3], v15, s[8:11], 0 offen glc
1584 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
1585 ; GFX8-NEXT: s_cbranch_execnz .LBB5_4
1586 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB5_3 Depth=1
1587 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
1588 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1589 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[13:14]
1590 ; GFX8-NEXT: v_mov_b32_e32 v14, v1
1591 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
1592 ; GFX8-NEXT: v_mov_b32_e32 v13, v0
1593 ; GFX8-NEXT: buffer_wbinvl1
1594 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
1595 ; GFX8-NEXT: s_cbranch_execnz .LBB5_3
1596 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
1597 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
1598 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1600 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1602 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1603 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
1604 ; GFX7-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1605 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
1606 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
1607 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
1608 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
1609 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1610 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
1611 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1612 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1613 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1614 ; GFX7-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
1615 ; GFX7-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1616 ; GFX7-NEXT: ; implicit-def: $vgpr4
1617 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
1618 ; GFX7-NEXT: s_cbranch_execnz .LBB5_1
1619 ; GFX7-NEXT: ; %bb.2:
1620 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
1621 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1622 ; GFX7-NEXT: v_mov_b32_e32 v0, v5
1623 ; GFX7-NEXT: v_mov_b32_e32 v1, v6
1624 ; GFX7-NEXT: buffer_wbinvl1
1625 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1627 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__offset__waterfall:
1629 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1630 ; GFX6-NEXT: s_mov_b64 s[6:7], exec
1631 ; GFX6-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
1632 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
1633 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
1634 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
1635 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
1636 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
1637 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
1638 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
1639 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
1640 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
1641 ; GFX6-NEXT: buffer_atomic_fmax_x2 v[5:6], v4, s[8:11], 0 offen offset:2048 glc
1642 ; GFX6-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
1643 ; GFX6-NEXT: ; implicit-def: $vgpr4
1644 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
1645 ; GFX6-NEXT: s_cbranch_execnz .LBB5_1
1646 ; GFX6-NEXT: ; %bb.2:
1647 ; GFX6-NEXT: s_mov_b64 exec, s[6:7]
1648 ; GFX6-NEXT: s_waitcnt vmcnt(0)
1649 ; GFX6-NEXT: v_mov_b32_e32 v0, v5
1650 ; GFX6-NEXT: v_mov_b32_e32 v1, v6
1651 ; GFX6-NEXT: buffer_wbinvl1
1652 ; GFX6-NEXT: s_waitcnt expcnt(0)
1653 ; GFX6-NEXT: s_setpc_b64 s[30:31]
1654 %gep = getelementptr double, ptr addrspace(7) %ptr, i32 256
1655 %result = atomicrmw fmax ptr addrspace(7) %gep, double %val syncscope("agent") seq_cst
1659 ; --------------------------------------------------------------------
1661 ; --------------------------------------------------------------------
; Test: `atomicrmw fmax` on a half value, result used (function is declared to
; return half). The address is the `inreg` buffer fat pointer
; (ptr addrspace(7)) advanced by 256 half elements; the operation uses
; syncscope("agent") with seq_cst ordering.
;
; In every check block below the expected code has the same shape: load the
; dword that contains the half (address masked with -4), then loop at
; %atomicrmw.start on a buffer cmpswap of that dword with the 16-bit lane
; shifted and masked into place, and finally shift the returned dword right to
; extract the previous half value.
;
; The assertions are autogenerated (see the NOTE on the first line of this
; file); regenerate them with utils/update_llc_test_checks.py rather than
; editing them by hand.
1663 define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
1664 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1666 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1667 ; GFX12-NEXT: s_wait_expcnt 0x0
1668 ; GFX12-NEXT: s_wait_samplecnt 0x0
1669 ; GFX12-NEXT: s_wait_bvhcnt 0x0
1670 ; GFX12-NEXT: s_wait_kmcnt 0x0
1671 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200
1672 ; GFX12-NEXT: v_max_num_f16_e32 v5, v0, v0
1673 ; GFX12-NEXT: s_and_b32 s4, s6, -4
1674 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1675 ; GFX12-NEXT: v_mov_b32_e32 v4, s4
1676 ; GFX12-NEXT: s_and_b32 s4, s6, 3
1677 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3
1678 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1679 ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
1680 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
1681 ; GFX12-NEXT: s_not_b32 s6, s5
1682 ; GFX12-NEXT: s_mov_b32 s5, 0
1683 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
1684 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1685 ; GFX12-NEXT: s_wait_loadcnt 0x0
1686 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
1687 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1688 ; GFX12-NEXT: s_wait_storecnt 0x0
1689 ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
1690 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1691 ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v5
1692 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
1693 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1694 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
1695 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
1696 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1697 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
1698 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
1699 ; GFX12-NEXT: s_wait_loadcnt 0x0
1700 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
1701 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
1702 ; GFX12-NEXT: v_mov_b32_e32 v1, v2
1703 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
1704 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1705 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
1706 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1
1707 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1708 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
1709 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
1710 ; GFX12-NEXT: s_setpc_b64 s[30:31]
1712 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1714 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1715 ; GFX940-NEXT: s_addk_i32 s6, 0x200
1716 ; GFX940-NEXT: s_and_b32 s4, s6, -4
1717 ; GFX940-NEXT: v_mov_b32_e32 v4, s4
1718 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
1719 ; GFX940-NEXT: s_and_b32 s4, s6, 3
1720 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3
1721 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
1722 ; GFX940-NEXT: s_not_b32 s7, s4
1723 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
1724 ; GFX940-NEXT: v_max_f16_e32 v5, v0, v0
1725 ; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start
1726 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
1727 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1728 ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
1729 ; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
1730 ; GFX940-NEXT: v_max_f16_e32 v0, v0, v5
1731 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
1732 ; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
1733 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
1734 ; GFX940-NEXT: buffer_wbl2 sc1
1735 ; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
1736 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1737 ; GFX940-NEXT: buffer_inv sc1
1738 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
1739 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1740 ; GFX940-NEXT: v_mov_b32_e32 v1, v2
1741 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
1742 ; GFX940-NEXT: s_cbranch_execnz .LBB6_1
1743 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
1744 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
1745 ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
1746 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1748 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1750 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1751 ; GFX11-NEXT: s_addk_i32 s6, 0x200
1752 ; GFX11-NEXT: v_max_f16_e32 v5, v0, v0
1753 ; GFX11-NEXT: s_and_b32 s4, s6, -4
1754 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1755 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
1756 ; GFX11-NEXT: s_and_b32 s4, s6, 3
1757 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3
1758 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1759 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
1760 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
1761 ; GFX11-NEXT: s_not_b32 s6, s5
1762 ; GFX11-NEXT: s_mov_b32 s5, 0
1763 ; GFX11-NEXT: .p2align 6
1764 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
1765 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1766 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1767 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
1768 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1769 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1770 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
1771 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v5
1772 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1773 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
1774 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
1775 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1776 ; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
1777 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
1778 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
1779 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1780 ; GFX11-NEXT: buffer_gl1_inv
1781 ; GFX11-NEXT: buffer_gl0_inv
1782 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
1783 ; GFX11-NEXT: v_mov_b32_e32 v1, v2
1784 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
1785 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1786 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
1787 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1
1788 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1789 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
1790 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
1791 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1793 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1795 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1796 ; GFX10-NEXT: s_addk_i32 s18, 0x200
1797 ; GFX10-NEXT: s_mov_b32 s11, s17
1798 ; GFX10-NEXT: s_and_b32 s4, s18, -4
1799 ; GFX10-NEXT: s_mov_b32 s10, s16
1800 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
1801 ; GFX10-NEXT: s_mov_b32 s9, s7
1802 ; GFX10-NEXT: s_mov_b32 s8, s6
1803 ; GFX10-NEXT: s_and_b32 s4, s18, 3
1804 ; GFX10-NEXT: v_max_f16_e32 v5, v0, v0
1805 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
1806 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3
1807 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
1808 ; GFX10-NEXT: s_not_b32 s6, s5
1809 ; GFX10-NEXT: s_mov_b32 s5, 0
1810 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
1811 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1812 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1813 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
1814 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1815 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
1816 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v5
1817 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
1818 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
1819 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
1820 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1821 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
1822 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1823 ; GFX10-NEXT: buffer_gl1_inv
1824 ; GFX10-NEXT: buffer_gl0_inv
1825 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
1826 ; GFX10-NEXT: v_mov_b32_e32 v1, v2
1827 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
1828 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
1829 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1
1830 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1831 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
1832 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
1833 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1835 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1837 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1838 ; GFX90A-NEXT: s_addk_i32 s18, 0x200
1839 ; GFX90A-NEXT: s_and_b32 s4, s18, -4
1840 ; GFX90A-NEXT: s_mov_b32 s11, s17
1841 ; GFX90A-NEXT: s_mov_b32 s10, s16
1842 ; GFX90A-NEXT: s_mov_b32 s9, s7
1843 ; GFX90A-NEXT: s_mov_b32 s8, s6
1844 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4
1845 ; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
1846 ; GFX90A-NEXT: s_and_b32 s4, s18, 3
1847 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
1848 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
1849 ; GFX90A-NEXT: s_not_b32 s7, s4
1850 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
1851 ; GFX90A-NEXT: v_max_f16_e32 v5, v0, v0
1852 ; GFX90A-NEXT: .LBB6_1: ; %atomicrmw.start
1853 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
1854 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1855 ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
1856 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
1857 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v5
1858 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
1859 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
1860 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
1861 ; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
1862 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1863 ; GFX90A-NEXT: buffer_wbinvl1
1864 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
1865 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1866 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2
1867 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
1868 ; GFX90A-NEXT: s_cbranch_execnz .LBB6_1
1869 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
1870 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
1871 ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
1872 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
1874 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1876 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1877 ; GFX908-NEXT: s_addk_i32 s18, 0x200
1878 ; GFX908-NEXT: s_and_b32 s4, s18, -4
1879 ; GFX908-NEXT: s_mov_b32 s11, s17
1880 ; GFX908-NEXT: s_mov_b32 s10, s16
1881 ; GFX908-NEXT: s_mov_b32 s9, s7
1882 ; GFX908-NEXT: s_mov_b32 s8, s6
1883 ; GFX908-NEXT: v_mov_b32_e32 v4, s4
1884 ; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
1885 ; GFX908-NEXT: s_and_b32 s4, s18, 3
1886 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3
1887 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
1888 ; GFX908-NEXT: s_not_b32 s7, s4
1889 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
1890 ; GFX908-NEXT: v_max_f16_e32 v5, v0, v0
1891 ; GFX908-NEXT: .LBB6_1: ; %atomicrmw.start
1892 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
1893 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1894 ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
1895 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
1896 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v5
1897 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
1898 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
1899 ; GFX908-NEXT: v_mov_b32_e32 v3, v1
1900 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
1901 ; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
1902 ; GFX908-NEXT: s_waitcnt vmcnt(0)
1903 ; GFX908-NEXT: buffer_wbinvl1
1904 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
1905 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1906 ; GFX908-NEXT: v_mov_b32_e32 v1, v2
1907 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
1908 ; GFX908-NEXT: s_cbranch_execnz .LBB6_1
1909 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
1910 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
1911 ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
1912 ; GFX908-NEXT: s_setpc_b64 s[30:31]
1914 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1916 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1917 ; GFX8-NEXT: s_addk_i32 s18, 0x200
1918 ; GFX8-NEXT: s_and_b32 s4, s18, -4
1919 ; GFX8-NEXT: s_mov_b32 s11, s17
1920 ; GFX8-NEXT: s_mov_b32 s10, s16
1921 ; GFX8-NEXT: s_mov_b32 s9, s7
1922 ; GFX8-NEXT: s_mov_b32 s8, s6
1923 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
1924 ; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
1925 ; GFX8-NEXT: s_and_b32 s4, s18, 3
1926 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3
1927 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
1928 ; GFX8-NEXT: s_not_b32 s7, s4
1929 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
1930 ; GFX8-NEXT: v_max_f16_e32 v5, v0, v0
1931 ; GFX8-NEXT: .LBB6_1: ; %atomicrmw.start
1932 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
1933 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1934 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
1935 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
1936 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v5
1937 ; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
1938 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
1939 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
1940 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
1941 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
1942 ; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
1943 ; GFX8-NEXT: s_waitcnt vmcnt(0)
1944 ; GFX8-NEXT: buffer_wbinvl1
1945 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
1946 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1947 ; GFX8-NEXT: v_mov_b32_e32 v1, v2
1948 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
1949 ; GFX8-NEXT: s_cbranch_execnz .LBB6_1
1950 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
1951 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
1952 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
1953 ; GFX8-NEXT: s_setpc_b64 s[30:31]
1955 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
1957 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1958 ; GFX7-NEXT: s_addk_i32 s18, 0x200
1959 ; GFX7-NEXT: s_and_b32 s4, s18, -4
1960 ; GFX7-NEXT: s_mov_b32 s11, s17
1961 ; GFX7-NEXT: s_mov_b32 s10, s16
1962 ; GFX7-NEXT: s_mov_b32 s9, s7
1963 ; GFX7-NEXT: s_mov_b32 s8, s6
1964 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
1965 ; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
1966 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
1967 ; GFX7-NEXT: s_and_b32 s4, s18, 3
1968 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3
1969 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
1970 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
1971 ; GFX7-NEXT: s_not_b32 s7, s4
1972 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
1973 ; GFX7-NEXT: .LBB6_1: ; %atomicrmw.start
1974 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
1975 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1976 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
1977 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
1978 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
1979 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
1980 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
1981 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
1982 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
1983 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
1984 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
1985 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
1986 ; GFX7-NEXT: s_waitcnt vmcnt(0)
1987 ; GFX7-NEXT: buffer_wbinvl1
1988 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
1989 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1990 ; GFX7-NEXT: v_mov_b32_e32 v1, v2
1991 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
1992 ; GFX7-NEXT: s_cbranch_execnz .LBB6_1
1993 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
1994 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
1995 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
1996 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
1997 ; GFX7-NEXT: s_setpc_b64 s[30:31]
1999 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset:
2001 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2002 ; GFX6-NEXT: s_addk_i32 s18, 0x200
2003 ; GFX6-NEXT: s_and_b32 s4, s18, -4
2004 ; GFX6-NEXT: s_mov_b32 s11, s17
2005 ; GFX6-NEXT: s_mov_b32 s10, s16
2006 ; GFX6-NEXT: s_mov_b32 s9, s7
2007 ; GFX6-NEXT: s_mov_b32 s8, s6
2008 ; GFX6-NEXT: v_mov_b32_e32 v4, s4
2009 ; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
2010 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
2011 ; GFX6-NEXT: s_and_b32 s4, s18, 3
2012 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3
2013 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
2014 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
2015 ; GFX6-NEXT: s_not_b32 s7, s4
2016 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
2017 ; GFX6-NEXT: .LBB6_1: ; %atomicrmw.start
2018 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2019 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2020 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2021 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
2022 ; GFX6-NEXT: s_waitcnt expcnt(0)
2023 ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
2024 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
2025 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
2026 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2027 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
2028 ; GFX6-NEXT: v_mov_b32_e32 v3, v1
2029 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
2030 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
2031 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2032 ; GFX6-NEXT: buffer_wbinvl1
2033 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
2034 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2035 ; GFX6-NEXT: v_mov_b32_e32 v1, v2
2036 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
2037 ; GFX6-NEXT: s_cbranch_execnz .LBB6_1
2038 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
2039 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
2040 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
2041 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
2042 ; GFX6-NEXT: s_waitcnt expcnt(0)
2043 ; GFX6-NEXT: s_setpc_b64 s[30:31]
; 256 half elements is a 512-byte offset, which matches the 0x200 that the
; expected code adds (s_addk) before masking the address down to a dword.
2044 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
2045 %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
2049 define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset(ptr addrspace(7) inreg %ptr, half %val) #0 {
2050 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2052 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
2053 ; GFX12-NEXT: s_wait_expcnt 0x0
2054 ; GFX12-NEXT: s_wait_samplecnt 0x0
2055 ; GFX12-NEXT: s_wait_bvhcnt 0x0
2056 ; GFX12-NEXT: s_wait_kmcnt 0x0
2057 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200
2058 ; GFX12-NEXT: v_max_num_f16_e32 v3, v0, v0
2059 ; GFX12-NEXT: s_and_b32 s4, s6, -4
2060 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2061 ; GFX12-NEXT: v_mov_b32_e32 v2, s4
2062 ; GFX12-NEXT: s_and_b32 s4, s6, 3
2063 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3
2064 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2065 ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
2066 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
2067 ; GFX12-NEXT: s_not_b32 s6, s5
2068 ; GFX12-NEXT: s_mov_b32 s5, 0
2069 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
2070 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2071 ; GFX12-NEXT: s_wait_loadcnt 0x0
2072 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
2073 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
2074 ; GFX12-NEXT: s_wait_storecnt 0x0
2075 ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v0
2076 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2077 ; GFX12-NEXT: v_max_num_f16_e32 v0, v0, v3
2078 ; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0
2079 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2080 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
2081 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
2082 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2083 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2084 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
2085 ; GFX12-NEXT: s_wait_loadcnt 0x0
2086 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
2087 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
2088 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
2089 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
2090 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2091 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
2092 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1
2093 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2094 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
2095 ; GFX12-NEXT: s_setpc_b64 s[30:31]
2097 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2099 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2100 ; GFX940-NEXT: s_addk_i32 s6, 0x200
2101 ; GFX940-NEXT: s_and_b32 s4, s6, -4
2102 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
2103 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
2104 ; GFX940-NEXT: s_and_b32 s4, s6, 3
2105 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3
2106 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
2107 ; GFX940-NEXT: s_not_b32 s7, s4
2108 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
2109 ; GFX940-NEXT: v_max_f16_e32 v3, v0, v0
2110 ; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start
2111 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
2112 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2113 ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2114 ; GFX940-NEXT: v_max_f16_e32 v0, v0, v0
2115 ; GFX940-NEXT: v_max_f16_e32 v0, v0, v3
2116 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2117 ; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
2118 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
2119 ; GFX940-NEXT: buffer_wbl2 sc1
2120 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
2121 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2122 ; GFX940-NEXT: buffer_inv sc1
2123 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
2124 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2125 ; GFX940-NEXT: v_mov_b32_e32 v1, v4
2126 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
2127 ; GFX940-NEXT: s_cbranch_execnz .LBB7_1
2128 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
2129 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
2130 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2132 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2134 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2135 ; GFX11-NEXT: s_addk_i32 s6, 0x200
2136 ; GFX11-NEXT: v_max_f16_e32 v3, v0, v0
2137 ; GFX11-NEXT: s_and_b32 s4, s6, -4
2138 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2139 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
2140 ; GFX11-NEXT: s_and_b32 s4, s6, 3
2141 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3
2142 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2143 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
2144 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
2145 ; GFX11-NEXT: s_not_b32 s6, s5
2146 ; GFX11-NEXT: s_mov_b32 s5, 0
2147 ; GFX11-NEXT: .p2align 6
2148 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
2149 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2150 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2151 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
2152 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2153 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2154 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
2155 ; GFX11-NEXT: v_max_f16_e32 v0, v0, v3
2156 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2157 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0
2158 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
2159 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2160 ; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
2161 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2162 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
2163 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2164 ; GFX11-NEXT: buffer_gl1_inv
2165 ; GFX11-NEXT: buffer_gl0_inv
2166 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
2167 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
2168 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
2169 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2170 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
2171 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1
2172 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2173 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
2174 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2176 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2178 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2179 ; GFX10-NEXT: s_addk_i32 s18, 0x200
2180 ; GFX10-NEXT: s_mov_b32 s11, s17
2181 ; GFX10-NEXT: s_and_b32 s4, s18, -4
2182 ; GFX10-NEXT: s_mov_b32 s10, s16
2183 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
2184 ; GFX10-NEXT: s_mov_b32 s9, s7
2185 ; GFX10-NEXT: s_mov_b32 s8, s6
2186 ; GFX10-NEXT: s_and_b32 s4, s18, 3
2187 ; GFX10-NEXT: v_max_f16_e32 v3, v0, v0
2188 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2189 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3
2190 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
2191 ; GFX10-NEXT: s_not_b32 s6, s5
2192 ; GFX10-NEXT: s_mov_b32 s5, 0
2193 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
2194 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2195 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2196 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v1
2197 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2198 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v0
2199 ; GFX10-NEXT: v_max_f16_e32 v0, v0, v3
2200 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2201 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
2202 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
2203 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
2204 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
2205 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2206 ; GFX10-NEXT: buffer_gl1_inv
2207 ; GFX10-NEXT: buffer_gl0_inv
2208 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
2209 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
2210 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
2211 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
2212 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1
2213 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2214 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
2215 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2217 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2219 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2220 ; GFX90A-NEXT: s_addk_i32 s18, 0x200
2221 ; GFX90A-NEXT: s_and_b32 s4, s18, -4
2222 ; GFX90A-NEXT: s_mov_b32 s11, s17
2223 ; GFX90A-NEXT: s_mov_b32 s10, s16
2224 ; GFX90A-NEXT: s_mov_b32 s9, s7
2225 ; GFX90A-NEXT: s_mov_b32 s8, s6
2226 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4
2227 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2228 ; GFX90A-NEXT: s_and_b32 s4, s18, 3
2229 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
2230 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
2231 ; GFX90A-NEXT: s_not_b32 s7, s4
2232 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
2233 ; GFX90A-NEXT: v_max_f16_e32 v3, v0, v0
2234 ; GFX90A-NEXT: .LBB7_1: ; %atomicrmw.start
2235 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
2236 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
2237 ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2238 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v0
2239 ; GFX90A-NEXT: v_max_f16_e32 v0, v0, v3
2240 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2241 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
2242 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
2243 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
2244 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
2245 ; GFX90A-NEXT: buffer_wbinvl1
2246 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
2247 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2248 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4
2249 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
2250 ; GFX90A-NEXT: s_cbranch_execnz .LBB7_1
2251 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
2252 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
2253 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
2255 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2257 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2258 ; GFX908-NEXT: s_addk_i32 s18, 0x200
2259 ; GFX908-NEXT: s_and_b32 s4, s18, -4
2260 ; GFX908-NEXT: s_mov_b32 s11, s17
2261 ; GFX908-NEXT: s_mov_b32 s10, s16
2262 ; GFX908-NEXT: s_mov_b32 s9, s7
2263 ; GFX908-NEXT: s_mov_b32 s8, s6
2264 ; GFX908-NEXT: v_mov_b32_e32 v2, s4
2265 ; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2266 ; GFX908-NEXT: s_and_b32 s4, s18, 3
2267 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3
2268 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
2269 ; GFX908-NEXT: s_not_b32 s7, s4
2270 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
2271 ; GFX908-NEXT: v_max_f16_e32 v3, v0, v0
2272 ; GFX908-NEXT: .LBB7_1: ; %atomicrmw.start
2273 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
2274 ; GFX908-NEXT: s_waitcnt vmcnt(0)
2275 ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2276 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v0
2277 ; GFX908-NEXT: v_max_f16_e32 v0, v0, v3
2278 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2279 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
2280 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
2281 ; GFX908-NEXT: v_mov_b32_e32 v4, v0
2282 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
2283 ; GFX908-NEXT: s_waitcnt vmcnt(0)
2284 ; GFX908-NEXT: buffer_wbinvl1
2285 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
2286 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2287 ; GFX908-NEXT: v_mov_b32_e32 v1, v4
2288 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
2289 ; GFX908-NEXT: s_cbranch_execnz .LBB7_1
2290 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
2291 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
2292 ; GFX908-NEXT: s_setpc_b64 s[30:31]
2294 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2296 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2297 ; GFX8-NEXT: s_addk_i32 s18, 0x200
2298 ; GFX8-NEXT: s_and_b32 s4, s18, -4
2299 ; GFX8-NEXT: s_mov_b32 s11, s17
2300 ; GFX8-NEXT: s_mov_b32 s10, s16
2301 ; GFX8-NEXT: s_mov_b32 s9, s7
2302 ; GFX8-NEXT: s_mov_b32 s8, s6
2303 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
2304 ; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2305 ; GFX8-NEXT: s_and_b32 s4, s18, 3
2306 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3
2307 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
2308 ; GFX8-NEXT: s_not_b32 s7, s4
2309 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
2310 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
2311 ; GFX8-NEXT: .LBB7_1: ; %atomicrmw.start
2312 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
2313 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2314 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2315 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v0
2316 ; GFX8-NEXT: v_max_f16_e32 v0, v0, v3
2317 ; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
2318 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2319 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
2320 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
2321 ; GFX8-NEXT: v_mov_b32_e32 v4, v0
2322 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
2323 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2324 ; GFX8-NEXT: buffer_wbinvl1
2325 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
2326 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2327 ; GFX8-NEXT: v_mov_b32_e32 v1, v4
2328 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
2329 ; GFX8-NEXT: s_cbranch_execnz .LBB7_1
2330 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
2331 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
2332 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2334 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2336 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2337 ; GFX7-NEXT: s_addk_i32 s18, 0x200
2338 ; GFX7-NEXT: s_and_b32 s4, s18, -4
2339 ; GFX7-NEXT: s_mov_b32 s11, s17
2340 ; GFX7-NEXT: s_mov_b32 s10, s16
2341 ; GFX7-NEXT: s_mov_b32 s9, s7
2342 ; GFX7-NEXT: s_mov_b32 s8, s6
2343 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
2344 ; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2345 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
2346 ; GFX7-NEXT: s_and_b32 s4, s18, 3
2347 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3
2348 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
2349 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
2350 ; GFX7-NEXT: s_not_b32 s7, s4
2351 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
2352 ; GFX7-NEXT: .LBB7_1: ; %atomicrmw.start
2353 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
2354 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2355 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2356 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
2357 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
2358 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
2359 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
2360 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2361 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
2362 ; GFX7-NEXT: v_mov_b32_e32 v5, v1
2363 ; GFX7-NEXT: v_mov_b32_e32 v4, v0
2364 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
2365 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2366 ; GFX7-NEXT: buffer_wbinvl1
2367 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
2368 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2369 ; GFX7-NEXT: v_mov_b32_e32 v1, v4
2370 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
2371 ; GFX7-NEXT: s_cbranch_execnz .LBB7_1
2372 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
2373 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
2374 ; GFX7-NEXT: s_setpc_b64 s[30:31]
2376 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset:
2378 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2379 ; GFX6-NEXT: s_addk_i32 s18, 0x200
2380 ; GFX6-NEXT: s_and_b32 s4, s18, -4
2381 ; GFX6-NEXT: s_mov_b32 s11, s17
2382 ; GFX6-NEXT: s_mov_b32 s10, s16
2383 ; GFX6-NEXT: s_mov_b32 s9, s7
2384 ; GFX6-NEXT: s_mov_b32 s8, s6
2385 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
2386 ; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
2387 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
2388 ; GFX6-NEXT: s_and_b32 s4, s18, 3
2389 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3
2390 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
2391 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
2392 ; GFX6-NEXT: s_not_b32 s7, s4
2393 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
2394 ; GFX6-NEXT: .LBB7_1: ; %atomicrmw.start
2395 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2396 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2397 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
2398 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
2399 ; GFX6-NEXT: s_waitcnt expcnt(0)
2400 ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
2401 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
2402 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
2403 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
2404 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
2405 ; GFX6-NEXT: v_mov_b32_e32 v5, v1
2406 ; GFX6-NEXT: v_mov_b32_e32 v4, v0
2407 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
2408 ; GFX6-NEXT: s_waitcnt vmcnt(0)
2409 ; GFX6-NEXT: buffer_wbinvl1
2410 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
2411 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2412 ; GFX6-NEXT: v_mov_b32_e32 v1, v4
2413 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
2414 ; GFX6-NEXT: s_cbranch_execnz .LBB7_1
2415 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
2416 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
2417 ; GFX6-NEXT: s_waitcnt expcnt(0)
2418 ; GFX6-NEXT: s_setpc_b64 s[30:31]
2419 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
2420 %unused = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
2424 define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall(ptr addrspace(7) %ptr, half %val) #0 {
2425 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2427 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
2428 ; GFX12-NEXT: s_wait_expcnt 0x0
2429 ; GFX12-NEXT: s_wait_samplecnt 0x0
2430 ; GFX12-NEXT: s_wait_bvhcnt 0x0
2431 ; GFX12-NEXT: s_wait_kmcnt 0x0
2432 ; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
2433 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
2434 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2435 ; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
2436 ; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
2437 ; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
2438 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2439 ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
2440 ; GFX12-NEXT: v_not_b32_e32 v9, v6
2441 ; GFX12-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2442 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
2443 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
2444 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
2445 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
2446 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2447 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
2448 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
2449 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2450 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
2451 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
2452 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
2453 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
2454 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1
2455 ; GFX12-NEXT: ; %bb.2:
2456 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
2457 ; GFX12-NEXT: v_max_num_f16_e32 v10, v5, v5
2458 ; GFX12-NEXT: s_mov_b32 s1, 0
2459 ; GFX12-NEXT: .LBB8_3: ; %atomicrmw.start
2460 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
2461 ; GFX12-NEXT: ; Child Loop BB8_4 Depth 2
2462 ; GFX12-NEXT: s_wait_loadcnt 0x0
2463 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
2464 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
2465 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
2466 ; GFX12-NEXT: s_wait_storecnt 0x0
2467 ; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v4
2468 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2469 ; GFX12-NEXT: v_max_num_f16_e32 v4, v4, v10
2470 ; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v4
2471 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2472 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
2473 ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
2474 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2475 ; GFX12-NEXT: v_mov_b32_e32 v4, v5
2476 ; GFX12-NEXT: v_mov_b32_e32 v5, v6
2477 ; GFX12-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2478 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
2479 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
2480 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
2481 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
2482 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
2483 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2484 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
2485 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
2486 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2487 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
2488 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
2489 ; GFX12-NEXT: s_wait_loadcnt 0x0
2490 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
2491 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
2492 ; GFX12-NEXT: s_cbranch_execnz .LBB8_4
2493 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2494 ; GFX12-NEXT: s_mov_b32 exec_lo, s2
2495 ; GFX12-NEXT: s_wait_loadcnt 0x0
2496 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
2497 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
2498 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
2499 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
2500 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2501 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
2502 ; GFX12-NEXT: s_cbranch_execnz .LBB8_3
2503 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
2504 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
2505 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
2506 ; GFX12-NEXT: s_setpc_b64 s[30:31]
2508 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2510 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2511 ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
2512 ; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
2513 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
2514 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
2515 ; GFX940-NEXT: s_mov_b32 s0, 0xffff
2516 ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
2517 ; GFX940-NEXT: v_not_b32_e32 v10, v4
2518 ; GFX940-NEXT: s_mov_b64 s[2:3], exec
2519 ; GFX940-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2520 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
2521 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
2522 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
2523 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
2524 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
2525 ; GFX940-NEXT: s_nop 0
2526 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
2527 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
2528 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
2529 ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
2530 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
2531 ; GFX940-NEXT: s_cbranch_execnz .LBB8_1
2532 ; GFX940-NEXT: ; %bb.2:
2533 ; GFX940-NEXT: s_mov_b64 exec, s[2:3]
2534 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
2535 ; GFX940-NEXT: v_max_f16_e32 v11, v5, v5
2536 ; GFX940-NEXT: .LBB8_3: ; %atomicrmw.start
2537 ; GFX940-NEXT: ; =>This Loop Header: Depth=1
2538 ; GFX940-NEXT: ; Child Loop BB8_4 Depth 2
2539 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2540 ; GFX940-NEXT: v_lshrrev_b32_e32 v4, v8, v7
2541 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v4
2542 ; GFX940-NEXT: v_max_f16_e32 v4, v4, v11
2543 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, v8, v4
2544 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
2545 ; GFX940-NEXT: s_mov_b64 s[8:9], exec
2546 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
2547 ; GFX940-NEXT: buffer_wbl2 sc1
2548 ; GFX940-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2549 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
2550 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
2551 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
2552 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
2553 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
2554 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
2555 ; GFX940-NEXT: s_nop 0
2556 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
2557 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
2558 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
2559 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2560 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
2561 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
2562 ; GFX940-NEXT: s_cbranch_execnz .LBB8_4
2563 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2564 ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
2565 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2566 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
2567 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2568 ; GFX940-NEXT: v_mov_b32_e32 v7, v4
2569 ; GFX940-NEXT: buffer_inv sc1
2570 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
2571 ; GFX940-NEXT: s_cbranch_execnz .LBB8_3
2572 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
2573 ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
2574 ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
2575 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2577 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2579 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2580 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
2581 ; GFX11-NEXT: s_mov_b32 s1, 0
2582 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
2583 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2584 ; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
2585 ; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
2586 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
2587 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2588 ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
2589 ; GFX11-NEXT: v_not_b32_e32 v9, v6
2590 ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2591 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
2592 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
2593 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
2594 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
2595 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2596 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
2597 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
2598 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2599 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
2600 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
2601 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
2602 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
2603 ; GFX11-NEXT: s_cbranch_execnz .LBB8_1
2604 ; GFX11-NEXT: ; %bb.2:
2605 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
2606 ; GFX11-NEXT: v_max_f16_e32 v10, v5, v5
2607 ; GFX11-NEXT: .p2align 6
2608 ; GFX11-NEXT: .LBB8_3: ; %atomicrmw.start
2609 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
2610 ; GFX11-NEXT: ; Child Loop BB8_4 Depth 2
2611 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2612 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
2613 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
2614 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2615 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2616 ; GFX11-NEXT: v_max_f16_e32 v4, v4, v4
2617 ; GFX11-NEXT: v_max_f16_e32 v4, v4, v10
2618 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2619 ; GFX11-NEXT: v_and_b32_e32 v4, 0xffff, v4
2620 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
2621 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2622 ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
2623 ; GFX11-NEXT: v_mov_b32_e32 v4, v5
2624 ; GFX11-NEXT: v_mov_b32_e32 v5, v6
2625 ; GFX11-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2626 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
2627 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
2628 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
2629 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
2630 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
2631 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
2632 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
2633 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
2634 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2635 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
2636 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
2637 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2638 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
2639 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
2640 ; GFX11-NEXT: s_cbranch_execnz .LBB8_4
2641 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2642 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
2643 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2644 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
2645 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
2646 ; GFX11-NEXT: buffer_gl1_inv
2647 ; GFX11-NEXT: buffer_gl0_inv
2648 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
2649 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2650 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
2651 ; GFX11-NEXT: s_cbranch_execnz .LBB8_3
2652 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
2653 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
2654 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
2655 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2657 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2659 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2660 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
2661 ; GFX10-NEXT: s_mov_b32 s5, 0
2662 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
2663 ; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
2664 ; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
2665 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
2666 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
2667 ; GFX10-NEXT: v_not_b32_e32 v9, v6
2668 ; GFX10-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2669 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
2670 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
2671 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
2672 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
2673 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
2674 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
2675 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
2676 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
2677 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
2678 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2679 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
2680 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1
2681 ; GFX10-NEXT: ; %bb.2:
2682 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
2683 ; GFX10-NEXT: v_max_f16_e32 v10, v5, v5
2684 ; GFX10-NEXT: .LBB8_3: ; %atomicrmw.start
2685 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
2686 ; GFX10-NEXT: ; Child Loop BB8_4 Depth 2
2687 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2688 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, v7, v6
2689 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
2690 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2691 ; GFX10-NEXT: v_max_f16_e32 v4, v4, v4
2692 ; GFX10-NEXT: v_max_f16_e32 v4, v4, v10
2693 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
2694 ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
2695 ; GFX10-NEXT: v_mov_b32_e32 v4, v5
2696 ; GFX10-NEXT: v_mov_b32_e32 v5, v6
2697 ; GFX10-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2698 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
2699 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
2700 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
2701 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
2702 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
2703 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
2704 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
2705 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
2706 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
2707 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2708 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
2709 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2710 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
2711 ; GFX10-NEXT: s_cbranch_execnz .LBB8_4
2712 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2713 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
2714 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2715 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
2716 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
2717 ; GFX10-NEXT: buffer_gl1_inv
2718 ; GFX10-NEXT: buffer_gl0_inv
2719 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
2720 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
2721 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
2722 ; GFX10-NEXT: s_cbranch_execnz .LBB8_3
2723 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
2724 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
2725 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
2726 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2728 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2730 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2731 ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
2732 ; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
2733 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
2734 ; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
2735 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff
2736 ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
2737 ; GFX90A-NEXT: v_not_b32_e32 v10, v4
2738 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
2739 ; GFX90A-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2740 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
2741 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
2742 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
2743 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
2744 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2745 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2746 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2747 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2748 ; GFX90A-NEXT: s_nop 0
2749 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
2750 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
2751 ; GFX90A-NEXT: s_cbranch_execnz .LBB8_1
2752 ; GFX90A-NEXT: ; %bb.2:
2753 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
2754 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
2755 ; GFX90A-NEXT: v_max_f16_e32 v11, v5, v5
2756 ; GFX90A-NEXT: .LBB8_3: ; %atomicrmw.start
2757 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
2758 ; GFX90A-NEXT: ; Child Loop BB8_4 Depth 2
2759 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
2760 ; GFX90A-NEXT: v_lshrrev_b32_e32 v4, v8, v7
2761 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v4
2762 ; GFX90A-NEXT: v_max_f16_e32 v4, v4, v11
2763 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, v8, v4
2764 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
2765 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
2766 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
2767 ; GFX90A-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2768 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
2769 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
2770 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
2771 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
2772 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
2773 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2774 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2775 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2776 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2777 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
2778 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
2779 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
2780 ; GFX90A-NEXT: s_cbranch_execnz .LBB8_4
2781 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2782 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
2783 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
2784 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
2785 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
2786 ; GFX90A-NEXT: v_mov_b32_e32 v7, v4
2787 ; GFX90A-NEXT: buffer_wbinvl1
2788 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
2789 ; GFX90A-NEXT: s_cbranch_execnz .LBB8_3
2790 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
2791 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
2792 ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
2793 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
2795 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2797 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2798 ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
2799 ; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
2800 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
2801 ; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
2802 ; GFX908-NEXT: s_mov_b32 s4, 0xffff
2803 ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
2804 ; GFX908-NEXT: v_not_b32_e32 v9, v4
2805 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
2806 ; GFX908-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2807 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
2808 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
2809 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
2810 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
2811 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2812 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2813 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2814 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2815 ; GFX908-NEXT: s_nop 0
2816 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
2817 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
2818 ; GFX908-NEXT: s_cbranch_execnz .LBB8_1
2819 ; GFX908-NEXT: ; %bb.2:
2820 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
2821 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
2822 ; GFX908-NEXT: v_max_f16_e32 v10, v5, v5
2823 ; GFX908-NEXT: .LBB8_3: ; %atomicrmw.start
2824 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
2825 ; GFX908-NEXT: ; Child Loop BB8_4 Depth 2
2826 ; GFX908-NEXT: s_waitcnt vmcnt(0)
2827 ; GFX908-NEXT: v_lshrrev_b32_e32 v4, v7, v6
2828 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v4
2829 ; GFX908-NEXT: v_max_f16_e32 v4, v4, v10
2830 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, v7, v4
2831 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
2832 ; GFX908-NEXT: v_mov_b32_e32 v4, v5
2833 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
2834 ; GFX908-NEXT: v_mov_b32_e32 v5, v6
2835 ; GFX908-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2836 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
2837 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
2838 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
2839 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
2840 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
2841 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2842 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2843 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2844 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2845 ; GFX908-NEXT: s_waitcnt vmcnt(0)
2846 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
2847 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
2848 ; GFX908-NEXT: s_cbranch_execnz .LBB8_4
2849 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2850 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
2851 ; GFX908-NEXT: s_waitcnt vmcnt(0)
2852 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
2853 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
2854 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
2855 ; GFX908-NEXT: buffer_wbinvl1
2856 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
2857 ; GFX908-NEXT: s_cbranch_execnz .LBB8_3
2858 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
2859 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
2860 ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
2861 ; GFX908-NEXT: s_setpc_b64 s[30:31]
2863 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2865 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2866 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
2867 ; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
2868 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
2869 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
2870 ; GFX8-NEXT: s_mov_b32 s4, 0xffff
2871 ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
2872 ; GFX8-NEXT: v_not_b32_e32 v9, v4
2873 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
2874 ; GFX8-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2875 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
2876 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
2877 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
2878 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
2879 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2880 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2881 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2882 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2883 ; GFX8-NEXT: s_nop 0
2884 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
2885 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
2886 ; GFX8-NEXT: s_cbranch_execnz .LBB8_1
2887 ; GFX8-NEXT: ; %bb.2:
2888 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
2889 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
2890 ; GFX8-NEXT: v_max_f16_e32 v10, v5, v5
2891 ; GFX8-NEXT: .LBB8_3: ; %atomicrmw.start
2892 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
2893 ; GFX8-NEXT: ; Child Loop BB8_4 Depth 2
2894 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2895 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, v7, v6
2896 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v4
2897 ; GFX8-NEXT: v_max_f16_e32 v4, v4, v10
2898 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, v7, v4
2899 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
2900 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
2901 ; GFX8-NEXT: v_mov_b32_e32 v4, v5
2902 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
2903 ; GFX8-NEXT: v_mov_b32_e32 v5, v6
2904 ; GFX8-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2905 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
2906 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
2907 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
2908 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
2909 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
2910 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2911 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2912 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2913 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2914 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2915 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
2916 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
2917 ; GFX8-NEXT: s_cbranch_execnz .LBB8_4
2918 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2919 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
2920 ; GFX8-NEXT: s_waitcnt vmcnt(0)
2921 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
2922 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
2923 ; GFX8-NEXT: v_mov_b32_e32 v6, v4
2924 ; GFX8-NEXT: buffer_wbinvl1
2925 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
2926 ; GFX8-NEXT: s_cbranch_execnz .LBB8_3
2927 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
2928 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
2929 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
2930 ; GFX8-NEXT: s_setpc_b64 s[30:31]
2932 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
2934 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2935 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
2936 ; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
2937 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
2938 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
2939 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
2940 ; GFX7-NEXT: v_not_b32_e32 v9, v4
2941 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
2942 ; GFX7-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
2943 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
2944 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
2945 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
2946 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
2947 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2948 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2949 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2950 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2951 ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
2952 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
2953 ; GFX7-NEXT: s_cbranch_execnz .LBB8_1
2954 ; GFX7-NEXT: ; %bb.2:
2955 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
2956 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
2957 ; GFX7-NEXT: s_mov_b64 s[6:7], 0
2958 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
2959 ; GFX7-NEXT: .LBB8_3: ; %atomicrmw.start
2960 ; GFX7-NEXT: ; =>This Loop Header: Depth=1
2961 ; GFX7-NEXT: ; Child Loop BB8_4 Depth 2
2962 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2963 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
2964 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
2965 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
2966 ; GFX7-NEXT: s_mov_b64 s[12:13], exec
2967 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
2968 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
2969 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
2970 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
2971 ; GFX7-NEXT: v_mov_b32_e32 v4, v5
2972 ; GFX7-NEXT: v_mov_b32_e32 v5, v6
2973 ; GFX7-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
2974 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
2975 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
2976 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
2977 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
2978 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
2979 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
2980 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
2981 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
2982 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
2983 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2984 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
2985 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
2986 ; GFX7-NEXT: s_cbranch_execnz .LBB8_4
2987 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
2988 ; GFX7-NEXT: s_mov_b64 exec, s[12:13]
2989 ; GFX7-NEXT: s_waitcnt vmcnt(0)
2990 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
2991 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
2992 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
2993 ; GFX7-NEXT: buffer_wbinvl1
2994 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
2995 ; GFX7-NEXT: s_cbranch_execnz .LBB8_3
2996 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
2997 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
2998 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
2999 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
3000 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3002 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall:
3004 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3005 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
3006 ; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
3007 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
3008 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
3009 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
3010 ; GFX6-NEXT: v_not_b32_e32 v9, v4
3011 ; GFX6-NEXT: s_mov_b64 s[6:7], exec
3012 ; GFX6-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1
3013 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
3014 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
3015 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
3016 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
3017 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3018 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3019 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
3020 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
3021 ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
3022 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
3023 ; GFX6-NEXT: s_cbranch_execnz .LBB8_1
3024 ; GFX6-NEXT: ; %bb.2:
3025 ; GFX6-NEXT: s_mov_b64 exec, s[6:7]
3026 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
3027 ; GFX6-NEXT: s_mov_b64 s[6:7], 0
3028 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
3029 ; GFX6-NEXT: .LBB8_3: ; %atomicrmw.start
3030 ; GFX6-NEXT: ; =>This Loop Header: Depth=1
3031 ; GFX6-NEXT: ; Child Loop BB8_4 Depth 2
3032 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3033 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
3034 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
3035 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
3036 ; GFX6-NEXT: s_mov_b64 s[12:13], exec
3037 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v10
3038 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
3039 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
3040 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
3041 ; GFX6-NEXT: v_mov_b32_e32 v4, v5
3042 ; GFX6-NEXT: v_mov_b32_e32 v5, v6
3043 ; GFX6-NEXT: .LBB8_4: ; Parent Loop BB8_3 Depth=1
3044 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
3045 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
3046 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
3047 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
3048 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
3049 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
3050 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
3051 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
3052 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
3053 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
3054 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
3055 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
3056 ; GFX6-NEXT: s_cbranch_execnz .LBB8_4
3057 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB8_3 Depth=1
3058 ; GFX6-NEXT: s_mov_b64 exec, s[12:13]
3059 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3060 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
3061 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
3062 ; GFX6-NEXT: v_mov_b32_e32 v6, v4
3063 ; GFX6-NEXT: buffer_wbinvl1
3064 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
3065 ; GFX6-NEXT: s_cbranch_execnz .LBB8_3
3066 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
3067 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
3068 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
3069 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
3070 ; GFX6-NEXT: s_waitcnt expcnt(0)
3071 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3072 %gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
3073 %result = atomicrmw fmax ptr addrspace(7) %gep, half %val syncscope("agent") seq_cst
3077 ; --------------------------------------------------------------------
3079 ; --------------------------------------------------------------------
; Value-returning bfloat case: an agent-scope, seq_cst `atomicrmw fmax` on a
; bfloat reached through a buffer fat pointer (ptr addrspace(7)) passed inreg.
; The expectations below show every checked target expanding this into a
; compare-and-swap retry loop (%atomicrmw.start / %atomicrmw.end) on the aligned
; dword that contains the 16-bit value:
;   * the byte address is masked with -4 to form the dword address,
;   * (address & 3) << 3 is the bit shift of the bfloat inside that dword,
;   * 0xffff shifted left by that amount and inverted is the mask that keeps
;     the neighbouring 16 bits untouched,
;   * the loop repeats until the value returned by the cmpswap equals the value
;     that was previously loaded,
;   * after the loop the returned dword is shifted right by the same amount to
;     produce the value placed in v0.
; The check lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script rather than editing them by hand.
3081 define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
3082 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3084 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3085 ; GFX12-NEXT: s_wait_expcnt 0x0
3086 ; GFX12-NEXT: s_wait_samplecnt 0x0
3087 ; GFX12-NEXT: s_wait_bvhcnt 0x0
3088 ; GFX12-NEXT: s_wait_kmcnt 0x0
3089 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200
3090 ; GFX12-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3091 ; GFX12-NEXT: s_and_b32 s4, s6, -4
3092 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3093 ; GFX12-NEXT: v_mov_b32_e32 v4, s4
3094 ; GFX12-NEXT: s_and_b32 s4, s6, 3
3095 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3
3096 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3097 ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
3098 ; GFX12-NEXT: buffer_load_b32 v1, v4, s[0:3], null offen
3099 ; GFX12-NEXT: s_not_b32 s6, s5
3100 ; GFX12-NEXT: s_mov_b32 s5, 0
3101 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
3102 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3103 ; GFX12-NEXT: s_wait_loadcnt 0x0
3104 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
3105 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
3106 ; GFX12-NEXT: s_wait_storecnt 0x0
3107 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3108 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3109 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v5
3110 ; GFX12-NEXT: v_bfe_u32 v2, v0, 16, 1
3111 ; GFX12-NEXT: v_or_b32_e32 v3, 0x400000, v0
3112 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3113 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3114 ; GFX12-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
3115 ; GFX12-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
3116 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3117 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3118 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
3119 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3120 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
3121 ; GFX12-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
3122 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
3123 ; GFX12-NEXT: s_wait_loadcnt 0x0
3124 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
3125 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
3126 ; GFX12-NEXT: v_mov_b32_e32 v1, v2
3127 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
3128 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3129 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
3130 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1
3131 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3132 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
3133 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v2
3134 ; GFX12-NEXT: s_setpc_b64 s[30:31]
3136 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3138 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3139 ; GFX940-NEXT: s_addk_i32 s6, 0x200
3140 ; GFX940-NEXT: s_and_b32 s4, s6, -4
3141 ; GFX940-NEXT: v_mov_b32_e32 v4, s4
3142 ; GFX940-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen
3143 ; GFX940-NEXT: s_and_b32 s4, s6, 3
3144 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3
3145 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
3146 ; GFX940-NEXT: s_not_b32 s7, s4
3147 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
3148 ; GFX940-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3149 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff
3150 ; GFX940-NEXT: .LBB9_1: ; %atomicrmw.start
3151 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
3152 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3153 ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3154 ; GFX940-NEXT: buffer_wbl2 sc1
3155 ; GFX940-NEXT: v_max_f32_e32 v0, v0, v5
3156 ; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1
3157 ; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0
3158 ; GFX940-NEXT: v_add3_u32 v2, v2, v0, s8
3159 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
3160 ; GFX940-NEXT: s_nop 1
3161 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
3162 ; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3163 ; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
3164 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
3165 ; GFX940-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[0:3], 0 offen sc0
3166 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3167 ; GFX940-NEXT: buffer_inv sc1
3168 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
3169 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3170 ; GFX940-NEXT: v_mov_b32_e32 v1, v2
3171 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
3172 ; GFX940-NEXT: s_cbranch_execnz .LBB9_1
3173 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
3174 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
3175 ; GFX940-NEXT: v_lshrrev_b32_e32 v0, s6, v2
3176 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3178 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3180 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3181 ; GFX11-NEXT: s_addk_i32 s6, 0x200
3182 ; GFX11-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3183 ; GFX11-NEXT: s_and_b32 s4, s6, -4
3184 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3185 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
3186 ; GFX11-NEXT: s_and_b32 s4, s6, 3
3187 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3
3188 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3189 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
3190 ; GFX11-NEXT: buffer_load_b32 v1, v4, s[0:3], 0 offen
3191 ; GFX11-NEXT: s_not_b32 s6, s5
3192 ; GFX11-NEXT: s_mov_b32 s5, 0
3193 ; GFX11-NEXT: .p2align 6
3194 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
3195 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3196 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3197 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
3198 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3200 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3201 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v5
3202 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3203 ; GFX11-NEXT: v_bfe_u32 v2, v0, 16, 1
3204 ; GFX11-NEXT: v_or_b32_e32 v3, 0x400000, v0
3205 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3206 ; GFX11-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
3207 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3208 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
3209 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3210 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3211 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
3212 ; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
3213 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3214 ; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
3215 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[2:3], v4, s[0:3], 0 offen glc
3216 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3217 ; GFX11-NEXT: buffer_gl1_inv
3218 ; GFX11-NEXT: buffer_gl0_inv
3219 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
3220 ; GFX11-NEXT: v_mov_b32_e32 v1, v2
3221 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
3222 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3223 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
3224 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1
3225 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3226 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
3227 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v2
3228 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3230 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3232 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3233 ; GFX10-NEXT: s_addk_i32 s18, 0x200
3234 ; GFX10-NEXT: s_mov_b32 s11, s17
3235 ; GFX10-NEXT: s_and_b32 s4, s18, -4
3236 ; GFX10-NEXT: s_mov_b32 s10, s16
3237 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
3238 ; GFX10-NEXT: s_mov_b32 s9, s7
3239 ; GFX10-NEXT: s_mov_b32 s8, s6
3240 ; GFX10-NEXT: s_and_b32 s4, s18, 3
3241 ; GFX10-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3242 ; GFX10-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
3243 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3
3244 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
3245 ; GFX10-NEXT: s_not_b32 s6, s5
3246 ; GFX10-NEXT: s_mov_b32 s5, 0
3247 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
3248 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3249 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3250 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3251 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3252 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v5
3253 ; GFX10-NEXT: v_bfe_u32 v2, v0, 16, 1
3254 ; GFX10-NEXT: v_or_b32_e32 v3, 0x400000, v0
3255 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3256 ; GFX10-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
3257 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo
3258 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3259 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
3260 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
3261 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
3262 ; GFX10-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
3263 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3264 ; GFX10-NEXT: buffer_gl1_inv
3265 ; GFX10-NEXT: buffer_gl0_inv
3266 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v1
3267 ; GFX10-NEXT: v_mov_b32_e32 v1, v2
3268 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
3269 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
3270 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1
3271 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3272 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
3273 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, s4, v2
3274 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3276 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3278 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3279 ; GFX90A-NEXT: s_addk_i32 s18, 0x200
3280 ; GFX90A-NEXT: s_and_b32 s4, s18, -4
3281 ; GFX90A-NEXT: s_mov_b32 s11, s17
3282 ; GFX90A-NEXT: s_mov_b32 s10, s16
3283 ; GFX90A-NEXT: s_mov_b32 s9, s7
3284 ; GFX90A-NEXT: s_mov_b32 s8, s6
3285 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4
3286 ; GFX90A-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
3287 ; GFX90A-NEXT: s_and_b32 s4, s18, 3
3288 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
3289 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
3290 ; GFX90A-NEXT: s_not_b32 s7, s4
3291 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
3292 ; GFX90A-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3293 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
3294 ; GFX90A-NEXT: .LBB9_1: ; %atomicrmw.start
3295 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
3296 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
3297 ; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3298 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v5
3299 ; GFX90A-NEXT: v_bfe_u32 v2, v0, 16, 1
3300 ; GFX90A-NEXT: v_or_b32_e32 v3, 0x400000, v0
3301 ; GFX90A-NEXT: v_add3_u32 v2, v2, v0, s12
3302 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
3303 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
3304 ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3305 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
3306 ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
3307 ; GFX90A-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
3308 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
3309 ; GFX90A-NEXT: buffer_wbinvl1
3310 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
3311 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3312 ; GFX90A-NEXT: v_mov_b32_e32 v1, v2
3313 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
3314 ; GFX90A-NEXT: s_cbranch_execnz .LBB9_1
3315 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
3316 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
3317 ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, s6, v2
3318 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
3320 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3322 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3323 ; GFX908-NEXT: s_addk_i32 s18, 0x200
3324 ; GFX908-NEXT: s_and_b32 s4, s18, -4
3325 ; GFX908-NEXT: s_mov_b32 s11, s17
3326 ; GFX908-NEXT: s_mov_b32 s10, s16
3327 ; GFX908-NEXT: s_mov_b32 s9, s7
3328 ; GFX908-NEXT: s_mov_b32 s8, s6
3329 ; GFX908-NEXT: v_mov_b32_e32 v4, s4
3330 ; GFX908-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
3331 ; GFX908-NEXT: s_and_b32 s4, s18, 3
3332 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3
3333 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
3334 ; GFX908-NEXT: s_not_b32 s7, s4
3335 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
3336 ; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3337 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff
3338 ; GFX908-NEXT: .LBB9_1: ; %atomicrmw.start
3339 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
3340 ; GFX908-NEXT: s_waitcnt vmcnt(0)
3341 ; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3342 ; GFX908-NEXT: v_max_f32_e32 v0, v0, v5
3343 ; GFX908-NEXT: v_bfe_u32 v2, v0, 16, 1
3344 ; GFX908-NEXT: v_or_b32_e32 v3, 0x400000, v0
3345 ; GFX908-NEXT: v_add3_u32 v2, v2, v0, s12
3346 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
3347 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
3348 ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3349 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
3350 ; GFX908-NEXT: v_mov_b32_e32 v3, v1
3351 ; GFX908-NEXT: v_mov_b32_e32 v2, v0
3352 ; GFX908-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
3353 ; GFX908-NEXT: s_waitcnt vmcnt(0)
3354 ; GFX908-NEXT: buffer_wbinvl1
3355 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
3356 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3357 ; GFX908-NEXT: v_mov_b32_e32 v1, v2
3358 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
3359 ; GFX908-NEXT: s_cbranch_execnz .LBB9_1
3360 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
3361 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
3362 ; GFX908-NEXT: v_lshrrev_b32_e32 v0, s6, v2
3363 ; GFX908-NEXT: s_setpc_b64 s[30:31]
3365 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3367 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3368 ; GFX8-NEXT: s_addk_i32 s18, 0x200
3369 ; GFX8-NEXT: s_and_b32 s4, s18, -4
3370 ; GFX8-NEXT: s_mov_b32 s11, s17
3371 ; GFX8-NEXT: s_mov_b32 s10, s16
3372 ; GFX8-NEXT: s_mov_b32 s9, s7
3373 ; GFX8-NEXT: s_mov_b32 s8, s6
3374 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
3375 ; GFX8-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
3376 ; GFX8-NEXT: s_and_b32 s4, s18, 3
3377 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3
3378 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
3379 ; GFX8-NEXT: s_not_b32 s7, s4
3380 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
3381 ; GFX8-NEXT: v_lshlrev_b32_e32 v5, 16, v0
3382 ; GFX8-NEXT: .LBB9_1: ; %atomicrmw.start
3383 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
3384 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
3385 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3386 ; GFX8-NEXT: v_lshrrev_b32_sdwa v3, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3387 ; GFX8-NEXT: v_max_f32_e32 v3, v3, v5
3388 ; GFX8-NEXT: v_bfe_u32 v6, v3, 16, 1
3389 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v3
3390 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
3391 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v3
3392 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
3393 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v6, v7, vcc
3394 ; GFX8-NEXT: v_and_b32_e32 v2, s7, v1
3395 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3396 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v0
3397 ; GFX8-NEXT: v_mov_b32_e32 v3, v1
3398 ; GFX8-NEXT: v_mov_b32_e32 v2, v0
3399 ; GFX8-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
3400 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3401 ; GFX8-NEXT: buffer_wbinvl1
3402 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
3403 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3404 ; GFX8-NEXT: v_mov_b32_e32 v1, v2
3405 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
3406 ; GFX8-NEXT: s_cbranch_execnz .LBB9_1
3407 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
3408 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
3409 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, s6, v2
3410 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3412 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3414 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3415 ; GFX7-NEXT: s_addk_i32 s18, 0x200
3416 ; GFX7-NEXT: s_and_b32 s4, s18, -4
3417 ; GFX7-NEXT: s_mov_b32 s11, s17
3418 ; GFX7-NEXT: s_mov_b32 s10, s16
3419 ; GFX7-NEXT: s_mov_b32 s9, s7
3420 ; GFX7-NEXT: s_mov_b32 s8, s6
3421 ; GFX7-NEXT: v_mov_b32_e32 v4, s4
3422 ; GFX7-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
3423 ; GFX7-NEXT: s_and_b32 s4, s18, 3
3424 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3
3425 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
3426 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3427 ; GFX7-NEXT: s_not_b32 s7, s4
3428 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
3429 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
3430 ; GFX7-NEXT: .LBB9_1: ; %atomicrmw.start
3431 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
3432 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3433 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
3434 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3435 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3436 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v5
3437 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3438 ; GFX7-NEXT: v_and_b32_e32 v2, s7, v1
3439 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
3440 ; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
3441 ; GFX7-NEXT: v_mov_b32_e32 v3, v1
3442 ; GFX7-NEXT: v_mov_b32_e32 v2, v0
3443 ; GFX7-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
3444 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3445 ; GFX7-NEXT: buffer_wbinvl1
3446 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
3447 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3448 ; GFX7-NEXT: v_mov_b32_e32 v1, v2
3449 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
3450 ; GFX7-NEXT: s_cbranch_execnz .LBB9_1
3451 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
3452 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
3453 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
3454 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3455 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3457 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset:
3459 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3460 ; GFX6-NEXT: s_addk_i32 s18, 0x200
3461 ; GFX6-NEXT: s_and_b32 s4, s18, -4
3462 ; GFX6-NEXT: s_mov_b32 s11, s17
3463 ; GFX6-NEXT: s_mov_b32 s10, s16
3464 ; GFX6-NEXT: s_mov_b32 s9, s7
3465 ; GFX6-NEXT: s_mov_b32 s8, s6
3466 ; GFX6-NEXT: v_mov_b32_e32 v4, s4
3467 ; GFX6-NEXT: buffer_load_dword v1, v4, s[8:11], 0 offen
3468 ; GFX6-NEXT: s_and_b32 s4, s18, 3
3469 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3
3470 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
3471 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
3472 ; GFX6-NEXT: s_not_b32 s7, s4
3473 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
3474 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v0
3475 ; GFX6-NEXT: .LBB9_1: ; %atomicrmw.start
3476 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
3477 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3478 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
3479 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3480 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
3481 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v5
3482 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3483 ; GFX6-NEXT: s_waitcnt expcnt(0)
3484 ; GFX6-NEXT: v_and_b32_e32 v2, s7, v1
3485 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
3486 ; GFX6-NEXT: v_or_b32_e32 v0, v2, v0
3487 ; GFX6-NEXT: v_mov_b32_e32 v3, v1
3488 ; GFX6-NEXT: v_mov_b32_e32 v2, v0
3489 ; GFX6-NEXT: buffer_atomic_cmpswap v[2:3], v4, s[8:11], 0 offen glc
3490 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3491 ; GFX6-NEXT: buffer_wbinvl1
3492 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
3493 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3494 ; GFX6-NEXT: v_mov_b32_e32 v1, v2
3495 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
3496 ; GFX6-NEXT: s_cbranch_execnz .LBB9_1
3497 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
3498 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
3499 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
3500 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3501 ; GFX6-NEXT: s_waitcnt expcnt(0)
3502 ; GFX6-NEXT: s_setpc_b64 s[30:31]
; Element index 256 on a bfloat pointer corresponds to the 0x200 constant that
; the expected code above adds to the offset register before masking.
3503 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
3504 %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
; Agent-scope, seq_cst `atomicrmw fmax` of a bfloat through a buffer fat
; pointer (addrspace(7)) passed `inreg`; the result (%unused) is never read.
; The address is %ptr advanced by 256 bfloat elements, which the checks below
; show as 0x200 being added to the offset register.
; For every RUN prefix the expected code is a compare-and-swap retry loop
; (%atomicrmw.start) on the 4-byte-aligned dword that contains the value
; (address masked with -4), using a shift amount and a 0xffff mask derived
; from the low two address bits.
; The assertions are autogenerated (see the first line of this file);
; regenerate them with utils/update_llc_test_checks.py rather than hand-editing.
3508 define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset(ptr addrspace(7) inreg %ptr, bfloat %val) #0 {
3509 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3511 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3512 ; GFX12-NEXT: s_wait_expcnt 0x0
3513 ; GFX12-NEXT: s_wait_samplecnt 0x0
3514 ; GFX12-NEXT: s_wait_bvhcnt 0x0
3515 ; GFX12-NEXT: s_wait_kmcnt 0x0
3516 ; GFX12-NEXT: s_addk_co_i32 s6, 0x200
3517 ; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3518 ; GFX12-NEXT: s_and_b32 s4, s6, -4
3519 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3520 ; GFX12-NEXT: v_mov_b32_e32 v2, s4
3521 ; GFX12-NEXT: s_and_b32 s4, s6, 3
3522 ; GFX12-NEXT: s_lshl_b32 s4, s4, 3
3523 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3524 ; GFX12-NEXT: s_lshl_b32 s5, 0xffff, s4
3525 ; GFX12-NEXT: buffer_load_b32 v1, v2, s[0:3], null offen
3526 ; GFX12-NEXT: s_not_b32 s6, s5
3527 ; GFX12-NEXT: s_mov_b32 s5, 0
3528 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
3529 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3530 ; GFX12-NEXT: s_wait_loadcnt 0x0
3531 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, s4, v1
3532 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
3533 ; GFX12-NEXT: s_wait_storecnt 0x0
3534 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3535 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3536 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3
3537 ; GFX12-NEXT: v_bfe_u32 v4, v0, 16, 1
3538 ; GFX12-NEXT: v_or_b32_e32 v5, 0x400000, v0
3539 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3540 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3541 ; GFX12-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
3542 ; GFX12-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
3543 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3544 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3545 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, s4, v0
3546 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3547 ; GFX12-NEXT: v_and_or_b32 v0, v1, s6, v0
3548 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
3549 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
3550 ; GFX12-NEXT: s_wait_loadcnt 0x0
3551 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
3552 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
3553 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
3554 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
3555 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3556 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
3557 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1
3558 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3559 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
3560 ; GFX12-NEXT: s_setpc_b64 s[30:31]
3562 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3564 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3565 ; GFX940-NEXT: s_addk_i32 s6, 0x200
3566 ; GFX940-NEXT: s_and_b32 s4, s6, -4
3567 ; GFX940-NEXT: v_mov_b32_e32 v2, s4
3568 ; GFX940-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen
3569 ; GFX940-NEXT: s_and_b32 s4, s6, 3
3570 ; GFX940-NEXT: s_lshl_b32 s6, s4, 3
3571 ; GFX940-NEXT: s_lshl_b32 s4, 0xffff, s6
3572 ; GFX940-NEXT: s_not_b32 s7, s4
3573 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
3574 ; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3575 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff
3576 ; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start
3577 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
3578 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3579 ; GFX940-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3580 ; GFX940-NEXT: buffer_wbl2 sc1
3581 ; GFX940-NEXT: v_max_f32_e32 v0, v0, v3
3582 ; GFX940-NEXT: v_bfe_u32 v4, v0, 16, 1
3583 ; GFX940-NEXT: v_or_b32_e32 v5, 0x400000, v0
3584 ; GFX940-NEXT: v_add3_u32 v4, v4, v0, s8
3585 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
3586 ; GFX940-NEXT: s_nop 1
3587 ; GFX940-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
3588 ; GFX940-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3589 ; GFX940-NEXT: v_and_or_b32 v0, v1, s7, v0
3590 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
3591 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[0:3], 0 offen sc0
3592 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3593 ; GFX940-NEXT: buffer_inv sc1
3594 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
3595 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3596 ; GFX940-NEXT: v_mov_b32_e32 v1, v4
3597 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
3598 ; GFX940-NEXT: s_cbranch_execnz .LBB10_1
3599 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
3600 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
3601 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3603 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3605 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3606 ; GFX11-NEXT: s_addk_i32 s6, 0x200
3607 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3608 ; GFX11-NEXT: s_and_b32 s4, s6, -4
3609 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3610 ; GFX11-NEXT: v_mov_b32_e32 v2, s4
3611 ; GFX11-NEXT: s_and_b32 s4, s6, 3
3612 ; GFX11-NEXT: s_lshl_b32 s4, s4, 3
3613 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3614 ; GFX11-NEXT: s_lshl_b32 s5, 0xffff, s4
3615 ; GFX11-NEXT: buffer_load_b32 v1, v2, s[0:3], 0 offen
3616 ; GFX11-NEXT: s_not_b32 s6, s5
3617 ; GFX11-NEXT: s_mov_b32 s5, 0
3618 ; GFX11-NEXT: .p2align 6
3619 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
3620 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3621 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3622 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, s4, v1
3623 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3624 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3625 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3626 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v3
3627 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
3628 ; GFX11-NEXT: v_bfe_u32 v4, v0, 16, 1
3629 ; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0
3630 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3631 ; GFX11-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
3632 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3633 ; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
3634 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3635 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3636 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, s4, v0
3637 ; GFX11-NEXT: v_and_or_b32 v0, v1, s6, v0
3638 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3639 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
3640 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], 0 offen glc
3641 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3642 ; GFX11-NEXT: buffer_gl1_inv
3643 ; GFX11-NEXT: buffer_gl0_inv
3644 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
3645 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
3646 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
3647 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3648 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
3649 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1
3650 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3651 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
3652 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3654 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3656 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3657 ; GFX10-NEXT: s_addk_i32 s18, 0x200
3658 ; GFX10-NEXT: s_mov_b32 s11, s17
3659 ; GFX10-NEXT: s_and_b32 s4, s18, -4
3660 ; GFX10-NEXT: s_mov_b32 s10, s16
3661 ; GFX10-NEXT: v_mov_b32_e32 v2, s4
3662 ; GFX10-NEXT: s_mov_b32 s9, s7
3663 ; GFX10-NEXT: s_mov_b32 s8, s6
3664 ; GFX10-NEXT: s_and_b32 s4, s18, 3
3665 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3666 ; GFX10-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
3667 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3
3668 ; GFX10-NEXT: s_lshl_b32 s5, 0xffff, s4
3669 ; GFX10-NEXT: s_not_b32 s6, s5
3670 ; GFX10-NEXT: s_mov_b32 s5, 0
3671 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
3672 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3673 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3674 ; GFX10-NEXT: v_lshrrev_b32_sdwa v0, s4, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3675 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3676 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v3
3677 ; GFX10-NEXT: v_bfe_u32 v4, v0, 16, 1
3678 ; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0
3679 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3680 ; GFX10-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
3681 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc_lo
3682 ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, s4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3683 ; GFX10-NEXT: v_and_or_b32 v0, v1, s6, v0
3684 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
3685 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
3686 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
3687 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3688 ; GFX10-NEXT: buffer_gl1_inv
3689 ; GFX10-NEXT: buffer_gl0_inv
3690 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
3691 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
3692 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
3693 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
3694 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1
3695 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3696 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
3697 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3699 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3701 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3702 ; GFX90A-NEXT: s_addk_i32 s18, 0x200
3703 ; GFX90A-NEXT: s_and_b32 s4, s18, -4
3704 ; GFX90A-NEXT: s_mov_b32 s11, s17
3705 ; GFX90A-NEXT: s_mov_b32 s10, s16
3706 ; GFX90A-NEXT: s_mov_b32 s9, s7
3707 ; GFX90A-NEXT: s_mov_b32 s8, s6
3708 ; GFX90A-NEXT: v_mov_b32_e32 v2, s4
3709 ; GFX90A-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
3710 ; GFX90A-NEXT: s_and_b32 s4, s18, 3
3711 ; GFX90A-NEXT: s_lshl_b32 s6, s4, 3
3712 ; GFX90A-NEXT: s_lshl_b32 s4, 0xffff, s6
3713 ; GFX90A-NEXT: s_not_b32 s7, s4
3714 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
3715 ; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3716 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
3717 ; GFX90A-NEXT: .LBB10_1: ; %atomicrmw.start
3718 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
3719 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
3720 ; GFX90A-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3721 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v3
3722 ; GFX90A-NEXT: v_bfe_u32 v4, v0, 16, 1
3723 ; GFX90A-NEXT: v_or_b32_e32 v5, 0x400000, v0
3724 ; GFX90A-NEXT: v_add3_u32 v4, v4, v0, s12
3725 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
3726 ; GFX90A-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
3727 ; GFX90A-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3728 ; GFX90A-NEXT: v_and_or_b32 v0, v1, s7, v0
3729 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
3730 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
3731 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
3732 ; GFX90A-NEXT: buffer_wbinvl1
3733 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
3734 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3735 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4
3736 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
3737 ; GFX90A-NEXT: s_cbranch_execnz .LBB10_1
3738 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
3739 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
3740 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
3742 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3744 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3745 ; GFX908-NEXT: s_addk_i32 s18, 0x200
3746 ; GFX908-NEXT: s_and_b32 s4, s18, -4
3747 ; GFX908-NEXT: s_mov_b32 s11, s17
3748 ; GFX908-NEXT: s_mov_b32 s10, s16
3749 ; GFX908-NEXT: s_mov_b32 s9, s7
3750 ; GFX908-NEXT: s_mov_b32 s8, s6
3751 ; GFX908-NEXT: v_mov_b32_e32 v2, s4
3752 ; GFX908-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
3753 ; GFX908-NEXT: s_and_b32 s4, s18, 3
3754 ; GFX908-NEXT: s_lshl_b32 s6, s4, 3
3755 ; GFX908-NEXT: s_lshl_b32 s4, 0xffff, s6
3756 ; GFX908-NEXT: s_not_b32 s7, s4
3757 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
3758 ; GFX908-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3759 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff
3760 ; GFX908-NEXT: .LBB10_1: ; %atomicrmw.start
3761 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
3762 ; GFX908-NEXT: s_waitcnt vmcnt(0)
3763 ; GFX908-NEXT: v_lshrrev_b32_sdwa v0, s6, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3764 ; GFX908-NEXT: v_max_f32_e32 v0, v0, v3
3765 ; GFX908-NEXT: v_bfe_u32 v4, v0, 16, 1
3766 ; GFX908-NEXT: v_or_b32_e32 v5, 0x400000, v0
3767 ; GFX908-NEXT: v_add3_u32 v4, v4, v0, s12
3768 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
3769 ; GFX908-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
3770 ; GFX908-NEXT: v_lshlrev_b32_sdwa v0, s6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3771 ; GFX908-NEXT: v_and_or_b32 v0, v1, s7, v0
3772 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
3773 ; GFX908-NEXT: v_mov_b32_e32 v4, v0
3774 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
3775 ; GFX908-NEXT: s_waitcnt vmcnt(0)
3776 ; GFX908-NEXT: buffer_wbinvl1
3777 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
3778 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3779 ; GFX908-NEXT: v_mov_b32_e32 v1, v4
3780 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
3781 ; GFX908-NEXT: s_cbranch_execnz .LBB10_1
3782 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
3783 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
3784 ; GFX908-NEXT: s_setpc_b64 s[30:31]
3786 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3788 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3789 ; GFX8-NEXT: s_addk_i32 s18, 0x200
3790 ; GFX8-NEXT: s_and_b32 s4, s18, -4
3791 ; GFX8-NEXT: s_mov_b32 s11, s17
3792 ; GFX8-NEXT: s_mov_b32 s10, s16
3793 ; GFX8-NEXT: s_mov_b32 s9, s7
3794 ; GFX8-NEXT: s_mov_b32 s8, s6
3795 ; GFX8-NEXT: v_mov_b32_e32 v2, s4
3796 ; GFX8-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
3797 ; GFX8-NEXT: s_and_b32 s4, s18, 3
3798 ; GFX8-NEXT: s_lshl_b32 s6, s4, 3
3799 ; GFX8-NEXT: s_lshl_b32 s4, 0xffff, s6
3800 ; GFX8-NEXT: s_not_b32 s7, s4
3801 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
3802 ; GFX8-NEXT: v_lshlrev_b32_e32 v3, 16, v0
3803 ; GFX8-NEXT: .LBB10_1: ; %atomicrmw.start
3804 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
3805 ; GFX8-NEXT: v_mov_b32_e32 v0, s6
3806 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3807 ; GFX8-NEXT: v_lshrrev_b32_sdwa v5, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
3808 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
3809 ; GFX8-NEXT: v_bfe_u32 v6, v5, 16, 1
3810 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v5
3811 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
3812 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v5
3813 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
3814 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v6, v7, vcc
3815 ; GFX8-NEXT: v_and_b32_e32 v4, s7, v1
3816 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
3817 ; GFX8-NEXT: v_or_b32_e32 v0, v4, v0
3818 ; GFX8-NEXT: v_mov_b32_e32 v5, v1
3819 ; GFX8-NEXT: v_mov_b32_e32 v4, v0
3820 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
3821 ; GFX8-NEXT: s_waitcnt vmcnt(0)
3822 ; GFX8-NEXT: buffer_wbinvl1
3823 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
3824 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3825 ; GFX8-NEXT: v_mov_b32_e32 v1, v4
3826 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
3827 ; GFX8-NEXT: s_cbranch_execnz .LBB10_1
3828 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
3829 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
3830 ; GFX8-NEXT: s_setpc_b64 s[30:31]
3832 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3834 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3835 ; GFX7-NEXT: s_addk_i32 s18, 0x200
3836 ; GFX7-NEXT: s_and_b32 s4, s18, -4
3837 ; GFX7-NEXT: s_mov_b32 s11, s17
3838 ; GFX7-NEXT: s_mov_b32 s10, s16
3839 ; GFX7-NEXT: s_mov_b32 s9, s7
3840 ; GFX7-NEXT: s_mov_b32 s8, s6
3841 ; GFX7-NEXT: v_mov_b32_e32 v2, s4
3842 ; GFX7-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
3843 ; GFX7-NEXT: s_and_b32 s4, s18, 3
3844 ; GFX7-NEXT: s_lshl_b32 s6, s4, 3
3845 ; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
3846 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3847 ; GFX7-NEXT: s_not_b32 s7, s4
3848 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
3849 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
3850 ; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
3851 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
3852 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3853 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v1
3854 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3855 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
3856 ; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
3857 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3858 ; GFX7-NEXT: v_and_b32_e32 v4, s7, v1
3859 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, s6, v0
3860 ; GFX7-NEXT: v_or_b32_e32 v0, v4, v0
3861 ; GFX7-NEXT: v_mov_b32_e32 v5, v1
3862 ; GFX7-NEXT: v_mov_b32_e32 v4, v0
3863 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
3864 ; GFX7-NEXT: s_waitcnt vmcnt(0)
3865 ; GFX7-NEXT: buffer_wbinvl1
3866 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
3867 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3868 ; GFX7-NEXT: v_mov_b32_e32 v1, v4
3869 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
3870 ; GFX7-NEXT: s_cbranch_execnz .LBB10_1
3871 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
3872 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
3873 ; GFX7-NEXT: s_setpc_b64 s[30:31]
3875 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset:
3877 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3878 ; GFX6-NEXT: s_addk_i32 s18, 0x200
3879 ; GFX6-NEXT: s_and_b32 s4, s18, -4
3880 ; GFX6-NEXT: s_mov_b32 s11, s17
3881 ; GFX6-NEXT: s_mov_b32 s10, s16
3882 ; GFX6-NEXT: s_mov_b32 s9, s7
3883 ; GFX6-NEXT: s_mov_b32 s8, s6
3884 ; GFX6-NEXT: v_mov_b32_e32 v2, s4
3885 ; GFX6-NEXT: buffer_load_dword v1, v2, s[8:11], 0 offen
3886 ; GFX6-NEXT: s_and_b32 s4, s18, 3
3887 ; GFX6-NEXT: s_lshl_b32 s6, s4, 3
3888 ; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
3889 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
3890 ; GFX6-NEXT: s_not_b32 s7, s4
3891 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
3892 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
3893 ; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
3894 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
3895 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3896 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v1
3897 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
3898 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
3899 ; GFX6-NEXT: v_max_f32_e32 v0, v0, v3
3900 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
3901 ; GFX6-NEXT: s_waitcnt expcnt(0)
3902 ; GFX6-NEXT: v_and_b32_e32 v4, s7, v1
3903 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, s6, v0
3904 ; GFX6-NEXT: v_or_b32_e32 v0, v4, v0
3905 ; GFX6-NEXT: v_mov_b32_e32 v5, v1
3906 ; GFX6-NEXT: v_mov_b32_e32 v4, v0
3907 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v2, s[8:11], 0 offen glc
3908 ; GFX6-NEXT: s_waitcnt vmcnt(0)
3909 ; GFX6-NEXT: buffer_wbinvl1
3910 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
3911 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3912 ; GFX6-NEXT: v_mov_b32_e32 v1, v4
3913 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
3914 ; GFX6-NEXT: s_cbranch_execnz .LBB10_1
3915 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
3916 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
3917 ; GFX6-NEXT: s_waitcnt expcnt(0)
3918 ; GFX6-NEXT: s_setpc_b64 s[30:31]
3919 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
3920 %unused = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
3924 define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall(ptr addrspace(7) %ptr, bfloat %val) #0 {
3925 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
3927 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
3928 ; GFX12-NEXT: s_wait_expcnt 0x0
3929 ; GFX12-NEXT: s_wait_samplecnt 0x0
3930 ; GFX12-NEXT: s_wait_bvhcnt 0x0
3931 ; GFX12-NEXT: s_wait_kmcnt 0x0
3932 ; GFX12-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
3933 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
3934 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3935 ; GFX12-NEXT: v_and_b32_e32 v6, 3, v4
3936 ; GFX12-NEXT: v_and_b32_e32 v8, -4, v4
3937 ; GFX12-NEXT: v_lshlrev_b32_e32 v7, 3, v6
3938 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3939 ; GFX12-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
3940 ; GFX12-NEXT: v_not_b32_e32 v9, v6
3941 ; GFX12-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
3942 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
3943 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
3944 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
3945 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
3946 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3947 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
3948 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
3949 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3950 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
3951 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
3952 ; GFX12-NEXT: buffer_load_b32 v6, v8, s[4:7], null offen
3953 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
3954 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1
3955 ; GFX12-NEXT: ; %bb.2:
3956 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
3957 ; GFX12-NEXT: v_lshlrev_b32_e32 v10, 16, v5
3958 ; GFX12-NEXT: s_mov_b32 s1, 0
3959 ; GFX12-NEXT: .LBB11_3: ; %atomicrmw.start
3960 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
3961 ; GFX12-NEXT: ; Child Loop BB11_4 Depth 2
3962 ; GFX12-NEXT: s_wait_loadcnt 0x0
3963 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, v7, v6
3964 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
3965 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
3966 ; GFX12-NEXT: s_wait_storecnt 0x0
3967 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v4
3968 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3969 ; GFX12-NEXT: v_max_num_f32_e32 v4, v4, v10
3970 ; GFX12-NEXT: v_bfe_u32 v5, v4, 16, 1
3971 ; GFX12-NEXT: v_or_b32_e32 v11, 0x400000, v4
3972 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
3973 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3974 ; GFX12-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
3975 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
3976 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3977 ; GFX12-NEXT: v_lshrrev_b32_e32 v4, 16, v4
3978 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, v7, v4
3979 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3980 ; GFX12-NEXT: v_and_or_b32 v5, v6, v9, v4
3981 ; GFX12-NEXT: v_mov_b32_e32 v4, v5
3982 ; GFX12-NEXT: v_mov_b32_e32 v5, v6
3983 ; GFX12-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
3984 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
3985 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
3986 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
3987 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
3988 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
3989 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
3990 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
3991 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
3992 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3993 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
3994 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
3995 ; GFX12-NEXT: s_wait_loadcnt 0x0
3996 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], null offen th:TH_ATOMIC_RETURN
3997 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
3998 ; GFX12-NEXT: s_cbranch_execnz .LBB11_4
3999 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4000 ; GFX12-NEXT: s_mov_b32 exec_lo, s2
4001 ; GFX12-NEXT: s_wait_loadcnt 0x0
4002 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4003 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
4004 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
4005 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
4006 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4007 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
4008 ; GFX12-NEXT: s_cbranch_execnz .LBB11_3
4009 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
4010 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
4011 ; GFX12-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4012 ; GFX12-NEXT: s_setpc_b64 s[30:31]
4014 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4016 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4017 ; GFX940-NEXT: v_add_u32_e32 v4, 0x200, v4
4018 ; GFX940-NEXT: v_and_b32_e32 v9, -4, v4
4019 ; GFX940-NEXT: v_and_b32_e32 v4, 3, v4
4020 ; GFX940-NEXT: v_lshlrev_b32_e32 v8, 3, v4
4021 ; GFX940-NEXT: s_mov_b32 s0, 0xffff
4022 ; GFX940-NEXT: v_lshlrev_b32_e64 v4, v8, s0
4023 ; GFX940-NEXT: v_not_b32_e32 v10, v4
4024 ; GFX940-NEXT: s_mov_b64 s[2:3], exec
4025 ; GFX940-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4026 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
4027 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
4028 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
4029 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
4030 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
4031 ; GFX940-NEXT: s_nop 0
4032 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
4033 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
4034 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
4035 ; GFX940-NEXT: buffer_load_dword v7, v9, s[4:7], 0 offen
4036 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
4037 ; GFX940-NEXT: s_cbranch_execnz .LBB11_1
4038 ; GFX940-NEXT: ; %bb.2:
4039 ; GFX940-NEXT: s_mov_b64 exec, s[2:3]
4040 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
4041 ; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v5
4042 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff
4043 ; GFX940-NEXT: .LBB11_3: ; %atomicrmw.start
4044 ; GFX940-NEXT: ; =>This Loop Header: Depth=1
4045 ; GFX940-NEXT: ; Child Loop BB11_4 Depth 2
4046 ; GFX940-NEXT: s_waitcnt vmcnt(0)
4047 ; GFX940-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4048 ; GFX940-NEXT: s_mov_b64 s[8:9], exec
4049 ; GFX940-NEXT: v_max_f32_e32 v4, v4, v11
4050 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
4051 ; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
4052 ; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
4053 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
4054 ; GFX940-NEXT: buffer_wbl2 sc1
4055 ; GFX940-NEXT: s_nop 0
4056 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
4057 ; GFX940-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4058 ; GFX940-NEXT: v_and_or_b32 v6, v7, v10, v4
4059 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
4060 ; GFX940-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4061 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
4062 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
4063 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
4064 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
4065 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
4066 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
4067 ; GFX940-NEXT: s_nop 0
4068 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
4069 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
4070 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
4071 ; GFX940-NEXT: s_waitcnt vmcnt(0)
4072 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[4:7], 0 offen sc0
4073 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
4074 ; GFX940-NEXT: s_cbranch_execnz .LBB11_4
4075 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4076 ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
4077 ; GFX940-NEXT: s_waitcnt vmcnt(0)
4078 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
4079 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4080 ; GFX940-NEXT: v_mov_b32_e32 v7, v4
4081 ; GFX940-NEXT: buffer_inv sc1
4082 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
4083 ; GFX940-NEXT: s_cbranch_execnz .LBB11_3
4084 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
4085 ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
4086 ; GFX940-NEXT: v_lshrrev_b32_e32 v0, v8, v4
4087 ; GFX940-NEXT: s_setpc_b64 s[30:31]
4089 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4091 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4092 ; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
4093 ; GFX11-NEXT: s_mov_b32 s1, 0
4094 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
4095 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4096 ; GFX11-NEXT: v_and_b32_e32 v6, 3, v4
4097 ; GFX11-NEXT: v_and_b32_e32 v8, -4, v4
4098 ; GFX11-NEXT: v_lshlrev_b32_e32 v7, 3, v6
4099 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4100 ; GFX11-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
4101 ; GFX11-NEXT: v_not_b32_e32 v9, v6
4102 ; GFX11-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4103 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
4104 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
4105 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
4106 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
4107 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4108 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4109 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4110 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4111 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
4112 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
4113 ; GFX11-NEXT: buffer_load_b32 v6, v8, s[4:7], 0 offen
4114 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4115 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1
4116 ; GFX11-NEXT: ; %bb.2:
4117 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
4118 ; GFX11-NEXT: v_lshlrev_b32_e32 v10, 16, v5
4119 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
4120 ; GFX11-NEXT: .p2align 6
4121 ; GFX11-NEXT: .LBB11_3: ; %atomicrmw.start
4122 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
4123 ; GFX11-NEXT: ; Child Loop BB11_4 Depth 2
4124 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4125 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, v7, v6
4126 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
4127 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4128 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4129 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4130 ; GFX11-NEXT: v_max_f32_e32 v4, v4, v10
4131 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
4132 ; GFX11-NEXT: v_bfe_u32 v5, v4, 16, 1
4133 ; GFX11-NEXT: v_or_b32_e32 v11, 0x400000, v4
4134 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
4135 ; GFX11-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
4136 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4137 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
4138 ; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4139 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4140 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, v7, v4
4141 ; GFX11-NEXT: v_and_or_b32 v5, v6, v9, v4
4142 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4143 ; GFX11-NEXT: v_mov_b32_e32 v4, v5
4144 ; GFX11-NEXT: v_mov_b32_e32 v5, v6
4145 ; GFX11-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4146 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
4147 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
4148 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
4149 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
4150 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
4151 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
4152 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
4153 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
4154 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4155 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
4156 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
4157 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4158 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v8, s[4:7], 0 offen glc
4159 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
4160 ; GFX11-NEXT: s_cbranch_execnz .LBB11_4
4161 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4162 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
4163 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4164 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4165 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
4166 ; GFX11-NEXT: buffer_gl1_inv
4167 ; GFX11-NEXT: buffer_gl0_inv
4168 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
4169 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4170 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
4171 ; GFX11-NEXT: s_cbranch_execnz .LBB11_3
4172 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
4173 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
4174 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
4175 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4176 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4178 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4180 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4181 ; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x200, v4
4182 ; GFX10-NEXT: s_mov_b32 s5, 0
4183 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
4184 ; GFX10-NEXT: v_and_b32_e32 v6, 3, v4
4185 ; GFX10-NEXT: v_and_b32_e32 v8, -4, v4
4186 ; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v6
4187 ; GFX10-NEXT: v_lshlrev_b32_e64 v6, v7, 0xffff
4188 ; GFX10-NEXT: v_not_b32_e32 v9, v6
4189 ; GFX10-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4190 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
4191 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
4192 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
4193 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
4194 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
4195 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
4196 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
4197 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
4198 ; GFX10-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
4199 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4200 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
4201 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1
4202 ; GFX10-NEXT: ; %bb.2:
4203 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
4204 ; GFX10-NEXT: v_lshlrev_b32_e32 v10, 16, v5
4205 ; GFX10-NEXT: .LBB11_3: ; %atomicrmw.start
4206 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
4207 ; GFX10-NEXT: ; Child Loop BB11_4 Depth 2
4208 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4209 ; GFX10-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4210 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
4211 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4212 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v10
4213 ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1
4214 ; GFX10-NEXT: v_or_b32_e32 v11, 0x400000, v4
4215 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
4216 ; GFX10-NEXT: v_add3_u32 v5, v5, v4, 0x7fff
4217 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc_lo
4218 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4219 ; GFX10-NEXT: v_and_or_b32 v5, v6, v9, v4
4220 ; GFX10-NEXT: v_mov_b32_e32 v4, v5
4221 ; GFX10-NEXT: v_mov_b32_e32 v5, v6
4222 ; GFX10-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4223 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
4224 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
4225 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
4226 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
4227 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
4228 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
4229 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
4230 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
4231 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
4232 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4233 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4234 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4235 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
4236 ; GFX10-NEXT: s_cbranch_execnz .LBB11_4
4237 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4238 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
4239 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4240 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
4241 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
4242 ; GFX10-NEXT: buffer_gl1_inv
4243 ; GFX10-NEXT: buffer_gl0_inv
4244 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
4245 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
4246 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
4247 ; GFX10-NEXT: s_cbranch_execnz .LBB11_3
4248 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
4249 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
4250 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4251 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4253 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4255 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4256 ; GFX90A-NEXT: v_add_u32_e32 v4, 0x200, v4
4257 ; GFX90A-NEXT: v_and_b32_e32 v9, -4, v4
4258 ; GFX90A-NEXT: v_and_b32_e32 v4, 3, v4
4259 ; GFX90A-NEXT: v_lshlrev_b32_e32 v8, 3, v4
4260 ; GFX90A-NEXT: s_mov_b32 s4, 0xffff
4261 ; GFX90A-NEXT: v_lshlrev_b32_e64 v4, v8, s4
4262 ; GFX90A-NEXT: v_not_b32_e32 v10, v4
4263 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
4264 ; GFX90A-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4265 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
4266 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
4267 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
4268 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
4269 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4270 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4271 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4272 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4273 ; GFX90A-NEXT: s_nop 0
4274 ; GFX90A-NEXT: buffer_load_dword v7, v9, s[8:11], 0 offen
4275 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
4276 ; GFX90A-NEXT: s_cbranch_execnz .LBB11_1
4277 ; GFX90A-NEXT: ; %bb.2:
4278 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
4279 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
4280 ; GFX90A-NEXT: v_lshlrev_b32_e32 v11, 16, v5
4281 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
4282 ; GFX90A-NEXT: .LBB11_3: ; %atomicrmw.start
4283 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
4284 ; GFX90A-NEXT: ; Child Loop BB11_4 Depth 2
4285 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4286 ; GFX90A-NEXT: v_lshrrev_b32_sdwa v4, v8, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4287 ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v11
4288 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
4289 ; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
4290 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
4291 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
4292 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
4293 ; GFX90A-NEXT: v_lshlrev_b32_sdwa v4, v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4294 ; GFX90A-NEXT: v_and_or_b32 v6, v7, v10, v4
4295 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
4296 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
4297 ; GFX90A-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4298 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
4299 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
4300 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
4301 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
4302 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
4303 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4304 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4305 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4306 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4307 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4308 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v9, s[8:11], 0 offen glc
4309 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
4310 ; GFX90A-NEXT: s_cbranch_execnz .LBB11_4
4311 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4312 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
4313 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4314 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
4315 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4316 ; GFX90A-NEXT: v_mov_b32_e32 v7, v4
4317 ; GFX90A-NEXT: buffer_wbinvl1
4318 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
4319 ; GFX90A-NEXT: s_cbranch_execnz .LBB11_3
4320 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
4321 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
4322 ; GFX90A-NEXT: v_lshrrev_b32_e32 v0, v8, v4
4323 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
4325 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4327 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4328 ; GFX908-NEXT: v_add_u32_e32 v4, 0x200, v4
4329 ; GFX908-NEXT: v_and_b32_e32 v8, -4, v4
4330 ; GFX908-NEXT: v_and_b32_e32 v4, 3, v4
4331 ; GFX908-NEXT: v_lshlrev_b32_e32 v7, 3, v4
4332 ; GFX908-NEXT: s_mov_b32 s4, 0xffff
4333 ; GFX908-NEXT: v_lshlrev_b32_e64 v4, v7, s4
4334 ; GFX908-NEXT: v_not_b32_e32 v9, v4
4335 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
4336 ; GFX908-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4337 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
4338 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
4339 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
4340 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
4341 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4342 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4343 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4344 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4345 ; GFX908-NEXT: s_nop 0
4346 ; GFX908-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
4347 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
4348 ; GFX908-NEXT: s_cbranch_execnz .LBB11_1
4349 ; GFX908-NEXT: ; %bb.2:
4350 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
4351 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
4352 ; GFX908-NEXT: v_lshlrev_b32_e32 v10, 16, v5
4353 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff
4354 ; GFX908-NEXT: .LBB11_3: ; %atomicrmw.start
4355 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
4356 ; GFX908-NEXT: ; Child Loop BB11_4 Depth 2
4357 ; GFX908-NEXT: s_waitcnt vmcnt(0)
4358 ; GFX908-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4359 ; GFX908-NEXT: v_max_f32_e32 v4, v4, v10
4360 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
4361 ; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
4362 ; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v4
4363 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
4364 ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
4365 ; GFX908-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4366 ; GFX908-NEXT: v_and_or_b32 v5, v6, v9, v4
4367 ; GFX908-NEXT: v_mov_b32_e32 v4, v5
4368 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
4369 ; GFX908-NEXT: v_mov_b32_e32 v5, v6
4370 ; GFX908-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4371 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
4372 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
4373 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
4374 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
4375 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
4376 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4377 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4378 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4379 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4380 ; GFX908-NEXT: s_waitcnt vmcnt(0)
4381 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4382 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
4383 ; GFX908-NEXT: s_cbranch_execnz .LBB11_4
4384 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4385 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
4386 ; GFX908-NEXT: s_waitcnt vmcnt(0)
4387 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
4388 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4389 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
4390 ; GFX908-NEXT: buffer_wbinvl1
4391 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
4392 ; GFX908-NEXT: s_cbranch_execnz .LBB11_3
4393 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
4394 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
4395 ; GFX908-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4396 ; GFX908-NEXT: s_setpc_b64 s[30:31]
4398 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4400 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4401 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0x200, v4
4402 ; GFX8-NEXT: v_and_b32_e32 v8, -4, v4
4403 ; GFX8-NEXT: v_and_b32_e32 v4, 3, v4
4404 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 3, v4
4405 ; GFX8-NEXT: s_mov_b32 s4, 0xffff
4406 ; GFX8-NEXT: v_lshlrev_b32_e64 v4, v7, s4
4407 ; GFX8-NEXT: v_not_b32_e32 v9, v4
4408 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
4409 ; GFX8-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4410 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
4411 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
4412 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
4413 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
4414 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4415 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4416 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4417 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4418 ; GFX8-NEXT: s_nop 0
4419 ; GFX8-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
4420 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
4421 ; GFX8-NEXT: s_cbranch_execnz .LBB11_1
4422 ; GFX8-NEXT: ; %bb.2:
4423 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
4424 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
4425 ; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v5
4426 ; GFX8-NEXT: .LBB11_3: ; %atomicrmw.start
4427 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
4428 ; GFX8-NEXT: ; Child Loop BB11_4 Depth 2
4429 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4430 ; GFX8-NEXT: v_lshrrev_b32_sdwa v4, v7, v6 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4431 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v10
4432 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
4433 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
4434 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
4435 ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v4
4436 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
4437 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v11, vcc
4438 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v7, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
4439 ; GFX8-NEXT: v_and_b32_e32 v5, v6, v9
4440 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
4441 ; GFX8-NEXT: v_mov_b32_e32 v4, v5
4442 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
4443 ; GFX8-NEXT: v_mov_b32_e32 v5, v6
4444 ; GFX8-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4445 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
4446 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
4447 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
4448 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
4449 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
4450 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4451 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4452 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4453 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4454 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4455 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4456 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
4457 ; GFX8-NEXT: s_cbranch_execnz .LBB11_4
4458 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4459 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
4460 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4461 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
4462 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4463 ; GFX8-NEXT: v_mov_b32_e32 v6, v4
4464 ; GFX8-NEXT: buffer_wbinvl1
4465 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
4466 ; GFX8-NEXT: s_cbranch_execnz .LBB11_3
4467 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
4468 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
4469 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4470 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4472 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4474 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4475 ; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
4476 ; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
4477 ; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
4478 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
4479 ; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
4480 ; GFX7-NEXT: v_not_b32_e32 v9, v4
4481 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
4482 ; GFX7-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4483 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
4484 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
4485 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
4486 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
4487 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4488 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4489 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4490 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4491 ; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
4492 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
4493 ; GFX7-NEXT: s_cbranch_execnz .LBB11_1
4494 ; GFX7-NEXT: ; %bb.2:
4495 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
4496 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
4497 ; GFX7-NEXT: s_mov_b64 s[6:7], 0
4498 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
4499 ; GFX7-NEXT: .LBB11_3: ; %atomicrmw.start
4500 ; GFX7-NEXT: ; =>This Loop Header: Depth=1
4501 ; GFX7-NEXT: ; Child Loop BB11_4 Depth 2
4502 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4503 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
4504 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4505 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
4506 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
4507 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4508 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
4509 ; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
4510 ; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
4511 ; GFX7-NEXT: v_mov_b32_e32 v4, v5
4512 ; GFX7-NEXT: s_mov_b64 s[12:13], exec
4513 ; GFX7-NEXT: v_mov_b32_e32 v5, v6
4514 ; GFX7-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4515 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
4516 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
4517 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
4518 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
4519 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
4520 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4521 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4522 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4523 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4524 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4525 ; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4526 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
4527 ; GFX7-NEXT: s_cbranch_execnz .LBB11_4
4528 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4529 ; GFX7-NEXT: s_mov_b64 exec, s[12:13]
4530 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4531 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
4532 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4533 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
4534 ; GFX7-NEXT: buffer_wbinvl1
4535 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
4536 ; GFX7-NEXT: s_cbranch_execnz .LBB11_3
4537 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
4538 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
4539 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4540 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
4541 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4543 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall:
4545 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4546 ; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
4547 ; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
4548 ; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
4549 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
4550 ; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
4551 ; GFX6-NEXT: v_not_b32_e32 v9, v4
4552 ; GFX6-NEXT: s_mov_b64 s[6:7], exec
4553 ; GFX6-NEXT: .LBB11_1: ; =>This Inner Loop Header: Depth=1
4554 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
4555 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
4556 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
4557 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
4558 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4559 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4560 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4561 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4562 ; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
4563 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
4564 ; GFX6-NEXT: s_cbranch_execnz .LBB11_1
4565 ; GFX6-NEXT: ; %bb.2:
4566 ; GFX6-NEXT: s_mov_b64 exec, s[6:7]
4567 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
4568 ; GFX6-NEXT: s_mov_b64 s[6:7], 0
4569 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
4570 ; GFX6-NEXT: .LBB11_3: ; %atomicrmw.start
4571 ; GFX6-NEXT: ; =>This Loop Header: Depth=1
4572 ; GFX6-NEXT: ; Child Loop BB11_4 Depth 2
4573 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4574 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
4575 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
4576 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
4577 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v10
4578 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
4579 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
4580 ; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
4581 ; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
4582 ; GFX6-NEXT: v_mov_b32_e32 v4, v5
4583 ; GFX6-NEXT: s_mov_b64 s[12:13], exec
4584 ; GFX6-NEXT: v_mov_b32_e32 v5, v6
4585 ; GFX6-NEXT: .LBB11_4: ; Parent Loop BB11_3 Depth=1
4586 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
4587 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
4588 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
4589 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
4590 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
4591 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
4592 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
4593 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
4594 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
4595 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
4596 ; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
4597 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
4598 ; GFX6-NEXT: s_cbranch_execnz .LBB11_4
4599 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB11_3 Depth=1
4600 ; GFX6-NEXT: s_mov_b64 exec, s[12:13]
4601 ; GFX6-NEXT: s_waitcnt vmcnt(0)
4602 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
4603 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
4604 ; GFX6-NEXT: v_mov_b32_e32 v6, v4
4605 ; GFX6-NEXT: buffer_wbinvl1
4606 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
4607 ; GFX6-NEXT: s_cbranch_execnz .LBB11_3
4608 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
4609 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
4610 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
4611 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
4612 ; GFX6-NEXT: s_waitcnt expcnt(0)
4613 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4614 %gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
4615 %result = atomicrmw fmax ptr addrspace(7) %gep, bfloat %val syncscope("agent") seq_cst
4619 ; --------------------------------------------------------------------
4621 ; --------------------------------------------------------------------
4623 define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
4624 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4626 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
4627 ; GFX12-NEXT: s_wait_expcnt 0x0
4628 ; GFX12-NEXT: s_wait_samplecnt 0x0
4629 ; GFX12-NEXT: s_wait_bvhcnt 0x0
4630 ; GFX12-NEXT: s_wait_kmcnt 0x0
4631 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
4632 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
4633 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
4634 ; GFX12-NEXT: v_mov_b32_e32 v3, s4
4635 ; GFX12-NEXT: v_pk_max_num_f16 v2, v1, v1
4636 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
4637 ; GFX12-NEXT: s_mov_b32 s4, 0
4638 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
4639 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4640 ; GFX12-NEXT: s_wait_loadcnt 0x0
4641 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
4642 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
4643 ; GFX12-NEXT: s_wait_storecnt 0x0
4644 ; GFX12-NEXT: v_pk_max_num_f16 v0, v5, v5
4645 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4646 ; GFX12-NEXT: v_pk_max_num_f16 v4, v0, v2
4647 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
4648 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4649 ; GFX12-NEXT: s_wait_loadcnt 0x0
4650 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
4651 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
4652 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4653 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4654 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4655 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1
4656 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4657 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
4658 ; GFX12-NEXT: s_setpc_b64 s[30:31]
4660 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4662 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4663 ; GFX940-NEXT: v_mov_b32_e32 v1, v0
4664 ; GFX940-NEXT: v_mov_b32_e32 v0, s6
4665 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
4666 ; GFX940-NEXT: s_addk_i32 s6, 0x400
4667 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
4668 ; GFX940-NEXT: v_pk_max_f16 v2, v1, v1
4669 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
4670 ; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start
4671 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
4672 ; GFX940-NEXT: s_waitcnt vmcnt(0)
4673 ; GFX940-NEXT: v_mov_b32_e32 v5, v0
4674 ; GFX940-NEXT: v_pk_max_f16 v0, v5, v5
4675 ; GFX940-NEXT: buffer_wbl2 sc1
4676 ; GFX940-NEXT: v_pk_max_f16 v4, v0, v2
4677 ; GFX940-NEXT: s_nop 0
4678 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
4679 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
4680 ; GFX940-NEXT: s_waitcnt vmcnt(0)
4681 ; GFX940-NEXT: buffer_inv sc1
4682 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4683 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4684 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
4685 ; GFX940-NEXT: s_cbranch_execnz .LBB12_1
4686 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
4687 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
4688 ; GFX940-NEXT: s_setpc_b64 s[30:31]
4690 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4692 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4693 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
4694 ; GFX11-NEXT: s_add_i32 s4, s6, 0x400
4695 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
4696 ; GFX11-NEXT: v_mov_b32_e32 v3, s4
4697 ; GFX11-NEXT: v_pk_max_f16 v2, v1, v1
4698 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
4699 ; GFX11-NEXT: s_mov_b32 s4, 0
4700 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
4701 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
4702 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4703 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
4704 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
4705 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4706 ; GFX11-NEXT: v_pk_max_f16 v0, v5, v5
4707 ; GFX11-NEXT: v_pk_max_f16 v4, v0, v2
4708 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
4709 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
4710 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
4711 ; GFX11-NEXT: s_waitcnt vmcnt(0)
4712 ; GFX11-NEXT: buffer_gl1_inv
4713 ; GFX11-NEXT: buffer_gl0_inv
4714 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
4715 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
4716 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4717 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4718 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1
4719 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
4720 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
4721 ; GFX11-NEXT: s_setpc_b64 s[30:31]
4723 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4725 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4726 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
4727 ; GFX10-NEXT: v_mov_b32_e32 v0, s18
4728 ; GFX10-NEXT: s_mov_b32 s11, s17
4729 ; GFX10-NEXT: s_mov_b32 s10, s16
4730 ; GFX10-NEXT: s_mov_b32 s9, s7
4731 ; GFX10-NEXT: s_mov_b32 s8, s6
4732 ; GFX10-NEXT: s_add_i32 s4, s18, 0x400
4733 ; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
4734 ; GFX10-NEXT: v_pk_max_f16 v2, v1, v1
4735 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
4736 ; GFX10-NEXT: s_mov_b32 s4, 0
4737 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
4738 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
4739 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4740 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
4741 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
4742 ; GFX10-NEXT: v_pk_max_f16 v0, v5, v5
4743 ; GFX10-NEXT: v_pk_max_f16 v4, v0, v2
4744 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
4745 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
4746 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
4747 ; GFX10-NEXT: s_waitcnt vmcnt(0)
4748 ; GFX10-NEXT: buffer_gl1_inv
4749 ; GFX10-NEXT: buffer_gl0_inv
4750 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
4751 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
4752 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
4753 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1
4754 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
4755 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
4756 ; GFX10-NEXT: s_setpc_b64 s[30:31]
4758 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4760 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4761 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
4762 ; GFX90A-NEXT: s_mov_b32 s11, s17
4763 ; GFX90A-NEXT: s_mov_b32 s10, s16
4764 ; GFX90A-NEXT: s_mov_b32 s9, s7
4765 ; GFX90A-NEXT: s_mov_b32 s8, s6
4766 ; GFX90A-NEXT: v_mov_b32_e32 v0, s18
4767 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
4768 ; GFX90A-NEXT: s_add_i32 s6, s18, 0x400
4769 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
4770 ; GFX90A-NEXT: v_pk_max_f16 v2, v1, v1
4771 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6
4772 ; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start
4773 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
4774 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4775 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
4776 ; GFX90A-NEXT: v_pk_max_f16 v0, v5, v5
4777 ; GFX90A-NEXT: v_pk_max_f16 v4, v0, v2
4778 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
4779 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
4780 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
4781 ; GFX90A-NEXT: buffer_wbinvl1
4782 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4783 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4784 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
4785 ; GFX90A-NEXT: s_cbranch_execnz .LBB12_1
4786 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
4787 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
4788 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
4790 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4792 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4793 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
4794 ; GFX908-NEXT: s_mov_b32 s11, s17
4795 ; GFX908-NEXT: s_mov_b32 s10, s16
4796 ; GFX908-NEXT: s_mov_b32 s9, s7
4797 ; GFX908-NEXT: s_mov_b32 s8, s6
4798 ; GFX908-NEXT: v_mov_b32_e32 v0, s18
4799 ; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
4800 ; GFX908-NEXT: s_add_i32 s6, s18, 0x400
4801 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
4802 ; GFX908-NEXT: v_pk_max_f16 v2, v1, v1
4803 ; GFX908-NEXT: v_mov_b32_e32 v3, s6
4804 ; GFX908-NEXT: .LBB12_1: ; %atomicrmw.start
4805 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
4806 ; GFX908-NEXT: s_waitcnt vmcnt(0)
4807 ; GFX908-NEXT: v_mov_b32_e32 v5, v0
4808 ; GFX908-NEXT: v_pk_max_f16 v0, v5, v5
4809 ; GFX908-NEXT: v_pk_max_f16 v4, v0, v2
4810 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
4811 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
4812 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
4813 ; GFX908-NEXT: s_waitcnt vmcnt(0)
4814 ; GFX908-NEXT: buffer_wbinvl1
4815 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
4816 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4817 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
4818 ; GFX908-NEXT: s_cbranch_execnz .LBB12_1
4819 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
4820 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
4821 ; GFX908-NEXT: s_setpc_b64 s[30:31]
4823 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4825 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4826 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
4827 ; GFX8-NEXT: s_mov_b32 s11, s17
4828 ; GFX8-NEXT: s_mov_b32 s10, s16
4829 ; GFX8-NEXT: s_mov_b32 s9, s7
4830 ; GFX8-NEXT: s_mov_b32 s8, s6
4831 ; GFX8-NEXT: v_mov_b32_e32 v0, s18
4832 ; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
4833 ; GFX8-NEXT: s_add_i32 s6, s18, 0x400
4834 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
4835 ; GFX8-NEXT: v_max_f16_sdwa v2, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4836 ; GFX8-NEXT: v_max_f16_e32 v3, v1, v1
4837 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
4838 ; GFX8-NEXT: .LBB12_1: ; %atomicrmw.start
4839 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
4840 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4841 ; GFX8-NEXT: v_mov_b32_e32 v6, v0
4842 ; GFX8-NEXT: v_max_f16_sdwa v0, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
4843 ; GFX8-NEXT: v_max_f16_e32 v1, v6, v6
4844 ; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
4845 ; GFX8-NEXT: v_max_f16_e32 v1, v1, v3
4846 ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0
4847 ; GFX8-NEXT: v_mov_b32_e32 v0, v5
4848 ; GFX8-NEXT: v_mov_b32_e32 v1, v6
4849 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc
4850 ; GFX8-NEXT: s_waitcnt vmcnt(0)
4851 ; GFX8-NEXT: buffer_wbinvl1
4852 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
4853 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4854 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
4855 ; GFX8-NEXT: s_cbranch_execnz .LBB12_1
4856 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
4857 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
4858 ; GFX8-NEXT: s_setpc_b64 s[30:31]
4860 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4862 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4863 ; GFX7-NEXT: s_mov_b32 s11, s17
4864 ; GFX7-NEXT: s_mov_b32 s10, s16
4865 ; GFX7-NEXT: s_mov_b32 s9, s7
4866 ; GFX7-NEXT: s_mov_b32 s8, s6
4867 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
4868 ; GFX7-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024
4869 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
4870 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v0
4871 ; GFX7-NEXT: s_add_i32 s6, s18, 0x400
4872 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
4873 ; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v1
4874 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4875 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v3
4876 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
4877 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
4878 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
4879 ; GFX7-NEXT: v_mov_b32_e32 v4, s6
4880 ; GFX7-NEXT: .LBB12_1: ; %atomicrmw.start
4881 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
4882 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
4883 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
4884 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v1
4885 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v0
4886 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4887 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
4888 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v3
4889 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
4890 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
4891 ; GFX7-NEXT: v_or_b32_e32 v6, v0, v1
4892 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
4893 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v0
4894 ; GFX7-NEXT: v_mov_b32_e32 v8, v6
4895 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
4896 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc
4897 ; GFX7-NEXT: s_waitcnt vmcnt(0)
4898 ; GFX7-NEXT: buffer_wbinvl1
4899 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v7
4900 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v7
4901 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
4902 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
4903 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4904 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
4905 ; GFX7-NEXT: s_cbranch_execnz .LBB12_1
4906 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
4907 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
4908 ; GFX7-NEXT: s_setpc_b64 s[30:31]
4910 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset:
4912 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4913 ; GFX6-NEXT: s_mov_b32 s11, s17
4914 ; GFX6-NEXT: s_mov_b32 s10, s16
4915 ; GFX6-NEXT: s_mov_b32 s9, s7
4916 ; GFX6-NEXT: s_mov_b32 s8, s6
4917 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
4918 ; GFX6-NEXT: buffer_load_dword v3, v2, s[8:11], 0 offen offset:1024
4919 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
4920 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v0
4921 ; GFX6-NEXT: s_add_i32 s6, s18, 0x400
4922 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
4923 ; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v1
4924 ; GFX6-NEXT: s_waitcnt vmcnt(0)
4925 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v3
4926 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v3
4927 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
4928 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v4
4929 ; GFX6-NEXT: v_mov_b32_e32 v4, s6
4930 ; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
4931 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
4932 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
4933 ; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
4934 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v1
4935 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v0
4936 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4937 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
4938 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v3
4939 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
4940 ; GFX6-NEXT: s_waitcnt expcnt(0)
4941 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
4942 ; GFX6-NEXT: v_or_b32_e32 v6, v0, v1
4943 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
4944 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v0
4945 ; GFX6-NEXT: v_mov_b32_e32 v8, v6
4946 ; GFX6-NEXT: v_mov_b32_e32 v7, v5
4947 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v4, s[8:11], 0 offen glc
4948 ; GFX6-NEXT: s_waitcnt vmcnt(0)
4949 ; GFX6-NEXT: buffer_wbinvl1
4950 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v7
4951 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v7
4952 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
4953 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
4954 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4955 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
4956 ; GFX6-NEXT: s_cbranch_execnz .LBB12_1
4957 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
4958 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
4959 ; GFX6-NEXT: s_waitcnt expcnt(0)
4960 ; GFX6-NEXT: s_setpc_b64 s[30:31]
4961 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
4962 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
4963 ret <2 x half> %result
; No-return variant: performs an agent-scope, seq_cst `atomicrmw fmax` of a
; <2 x half> value through a buffer fat pointer (addrspace(7)) that is passed
; `inreg`, at a constant element offset, and discards the result.
;
; The assertion lines below are autogenerated (see the NOTE on the first line
; of this file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
4966 define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset(ptr addrspace(7) inreg %ptr, <2 x half> %val) #0 {
4967 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
4969 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
4970 ; GFX12-NEXT: s_wait_expcnt 0x0
4971 ; GFX12-NEXT: s_wait_samplecnt 0x0
4972 ; GFX12-NEXT: s_wait_bvhcnt 0x0
4973 ; GFX12-NEXT: s_wait_kmcnt 0x0
4974 ; GFX12-NEXT: v_mov_b32_e32 v1, s6
4975 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
4976 ; GFX12-NEXT: v_pk_max_num_f16 v2, v0, v0
4977 ; GFX12-NEXT: v_mov_b32_e32 v3, s4
4978 ; GFX12-NEXT: s_mov_b32 s4, 0
4979 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
4980 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
4981 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
4982 ; GFX12-NEXT: s_wait_loadcnt 0x0
4983 ; GFX12-NEXT: v_pk_max_num_f16 v0, v1, v1
4984 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
4985 ; GFX12-NEXT: s_wait_storecnt 0x0
4986 ; GFX12-NEXT: v_pk_max_num_f16 v0, v0, v2
4987 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
4988 ; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
4989 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
4990 ; GFX12-NEXT: s_wait_loadcnt 0x0
4991 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
4992 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
4993 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
4994 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
4995 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4996 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4997 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1
4998 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
4999 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
5000 ; GFX12-NEXT: s_setpc_b64 s[30:31]
5002 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5004 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5005 ; GFX940-NEXT: v_mov_b32_e32 v1, s6
5006 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
5007 ; GFX940-NEXT: s_addk_i32 s6, 0x400
5008 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
5009 ; GFX940-NEXT: v_pk_max_f16 v2, v0, v0
5010 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
5011 ; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start
5012 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
5013 ; GFX940-NEXT: s_waitcnt vmcnt(0)
5014 ; GFX940-NEXT: v_pk_max_f16 v0, v1, v1
5015 ; GFX940-NEXT: buffer_wbl2 sc1
5016 ; GFX940-NEXT: v_pk_max_f16 v0, v0, v2
5017 ; GFX940-NEXT: s_nop 0
5018 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[0:1]
5019 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[0:3], 0 offen sc0
5020 ; GFX940-NEXT: s_waitcnt vmcnt(0)
5021 ; GFX940-NEXT: buffer_inv sc1
5022 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
5023 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5024 ; GFX940-NEXT: v_mov_b32_e32 v1, v4
5025 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
5026 ; GFX940-NEXT: s_cbranch_execnz .LBB13_1
5027 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
5028 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
5029 ; GFX940-NEXT: s_setpc_b64 s[30:31]
5031 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5033 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5034 ; GFX11-NEXT: v_mov_b32_e32 v1, s6
5035 ; GFX11-NEXT: s_add_i32 s4, s6, 0x400
5036 ; GFX11-NEXT: v_pk_max_f16 v2, v0, v0
5037 ; GFX11-NEXT: v_mov_b32_e32 v3, s4
5038 ; GFX11-NEXT: s_mov_b32 s4, 0
5039 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
5040 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
5041 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
5042 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5043 ; GFX11-NEXT: v_pk_max_f16 v0, v1, v1
5044 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5045 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5046 ; GFX11-NEXT: v_pk_max_f16 v0, v0, v2
5047 ; GFX11-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
5048 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v3, s[0:3], 0 offen glc
5049 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5050 ; GFX11-NEXT: buffer_gl1_inv
5051 ; GFX11-NEXT: buffer_gl0_inv
5052 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
5053 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
5054 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
5055 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5056 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
5057 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1
5058 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
5059 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
5060 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5062 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5064 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5065 ; GFX10-NEXT: v_mov_b32_e32 v1, s18
5066 ; GFX10-NEXT: s_mov_b32 s11, s17
5067 ; GFX10-NEXT: s_mov_b32 s10, s16
5068 ; GFX10-NEXT: s_mov_b32 s9, s7
5069 ; GFX10-NEXT: s_mov_b32 s8, s6
5070 ; GFX10-NEXT: s_add_i32 s4, s18, 0x400
5071 ; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
5072 ; GFX10-NEXT: v_pk_max_f16 v2, v0, v0
5073 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
5074 ; GFX10-NEXT: s_mov_b32 s4, 0
5075 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
5076 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
5077 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5078 ; GFX10-NEXT: v_pk_max_f16 v0, v1, v1
5079 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5080 ; GFX10-NEXT: v_pk_max_f16 v0, v0, v2
5081 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
5082 ; GFX10-NEXT: v_mov_b32_e32 v4, v0
5083 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc
5084 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5085 ; GFX10-NEXT: buffer_gl1_inv
5086 ; GFX10-NEXT: buffer_gl0_inv
5087 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v1
5088 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
5089 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
5090 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
5091 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1
5092 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
5093 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
5094 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5096 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5098 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5099 ; GFX90A-NEXT: s_mov_b32 s11, s17
5100 ; GFX90A-NEXT: s_mov_b32 s10, s16
5101 ; GFX90A-NEXT: s_mov_b32 s9, s7
5102 ; GFX90A-NEXT: s_mov_b32 s8, s6
5103 ; GFX90A-NEXT: v_mov_b32_e32 v1, s18
5104 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
5105 ; GFX90A-NEXT: s_add_i32 s6, s18, 0x400
5106 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
5107 ; GFX90A-NEXT: v_pk_max_f16 v2, v0, v0
5108 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6
5109 ; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start
5110 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
5111 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5112 ; GFX90A-NEXT: v_pk_max_f16 v0, v1, v1
5113 ; GFX90A-NEXT: v_pk_max_f16 v0, v0, v2
5114 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[0:1], v[0:1] op_sel:[0,1]
5115 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc
5116 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5117 ; GFX90A-NEXT: buffer_wbinvl1
5118 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
5119 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5120 ; GFX90A-NEXT: v_mov_b32_e32 v1, v4
5121 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
5122 ; GFX90A-NEXT: s_cbranch_execnz .LBB13_1
5123 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
5124 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
5125 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
5127 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5129 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5130 ; GFX908-NEXT: s_mov_b32 s11, s17
5131 ; GFX908-NEXT: s_mov_b32 s10, s16
5132 ; GFX908-NEXT: s_mov_b32 s9, s7
5133 ; GFX908-NEXT: s_mov_b32 s8, s6
5134 ; GFX908-NEXT: v_mov_b32_e32 v1, s18
5135 ; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
5136 ; GFX908-NEXT: s_add_i32 s6, s18, 0x400
5137 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
5138 ; GFX908-NEXT: v_pk_max_f16 v2, v0, v0
5139 ; GFX908-NEXT: v_mov_b32_e32 v3, s6
5140 ; GFX908-NEXT: .LBB13_1: ; %atomicrmw.start
5141 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
5142 ; GFX908-NEXT: s_waitcnt vmcnt(0)
5143 ; GFX908-NEXT: v_pk_max_f16 v0, v1, v1
5144 ; GFX908-NEXT: v_pk_max_f16 v0, v0, v2
5145 ; GFX908-NEXT: v_mov_b32_e32 v5, v1
5146 ; GFX908-NEXT: v_mov_b32_e32 v4, v0
5147 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v3, s[8:11], 0 offen glc
5148 ; GFX908-NEXT: s_waitcnt vmcnt(0)
5149 ; GFX908-NEXT: buffer_wbinvl1
5150 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v1
5151 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5152 ; GFX908-NEXT: v_mov_b32_e32 v1, v4
5153 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
5154 ; GFX908-NEXT: s_cbranch_execnz .LBB13_1
5155 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
5156 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
5157 ; GFX908-NEXT: s_setpc_b64 s[30:31]
5159 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5161 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5162 ; GFX8-NEXT: s_mov_b32 s11, s17
5163 ; GFX8-NEXT: s_mov_b32 s10, s16
5164 ; GFX8-NEXT: s_mov_b32 s9, s7
5165 ; GFX8-NEXT: s_mov_b32 s8, s6
5166 ; GFX8-NEXT: v_mov_b32_e32 v1, s18
5167 ; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
5168 ; GFX8-NEXT: s_add_i32 s6, s18, 0x400
5169 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
5170 ; GFX8-NEXT: v_max_f16_sdwa v2, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5171 ; GFX8-NEXT: v_max_f16_e32 v3, v0, v0
5172 ; GFX8-NEXT: v_mov_b32_e32 v4, s6
5173 ; GFX8-NEXT: .LBB13_1: ; %atomicrmw.start
5174 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
5175 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5176 ; GFX8-NEXT: v_max_f16_sdwa v0, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5177 ; GFX8-NEXT: v_max_f16_e32 v5, v1, v1
5178 ; GFX8-NEXT: v_max_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5179 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v3
5180 ; GFX8-NEXT: v_or_b32_e32 v0, v5, v0
5181 ; GFX8-NEXT: v_mov_b32_e32 v6, v1
5182 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
5183 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc
5184 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5185 ; GFX8-NEXT: buffer_wbinvl1
5186 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
5187 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5188 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
5189 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
5190 ; GFX8-NEXT: s_cbranch_execnz .LBB13_1
5191 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
5192 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
5193 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5195 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5197 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5198 ; GFX7-NEXT: s_mov_b32 s11, s17
5199 ; GFX7-NEXT: s_mov_b32 s10, s16
5200 ; GFX7-NEXT: s_mov_b32 s9, s7
5201 ; GFX7-NEXT: s_mov_b32 s8, s6
5202 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
5203 ; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024
5204 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
5205 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v0
5206 ; GFX7-NEXT: s_add_i32 s6, s18, 0x400
5207 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
5208 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
5209 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5210 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v2
5211 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v2
5212 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
5213 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v5
5214 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
5215 ; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
5216 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
5217 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
5218 ; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
5219 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v4
5220 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v3
5221 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5222 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v0
5223 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
5224 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v5
5225 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
5226 ; GFX7-NEXT: v_or_b32_e32 v5, v3, v4
5227 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v7
5228 ; GFX7-NEXT: v_or_b32_e32 v4, v6, v3
5229 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
5230 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
5231 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc
5232 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5233 ; GFX7-NEXT: buffer_wbinvl1
5234 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v6
5235 ; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v6
5236 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
5237 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
5238 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5239 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
5240 ; GFX7-NEXT: s_cbranch_execnz .LBB13_1
5241 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
5242 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
5243 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5245 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2f16__offset:
5247 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5248 ; GFX6-NEXT: s_mov_b32 s11, s17
5249 ; GFX6-NEXT: s_mov_b32 s10, s16
5250 ; GFX6-NEXT: s_mov_b32 s9, s7
5251 ; GFX6-NEXT: s_mov_b32 s8, s6
5252 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
5253 ; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024
5254 ; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
5255 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v0
5256 ; GFX6-NEXT: s_add_i32 s6, s18, 0x400
5257 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
5258 ; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
5259 ; GFX6-NEXT: s_waitcnt vmcnt(0)
5260 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v2
5261 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
5262 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
5263 ; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v5
5264 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
5265 ; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
5266 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
5267 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
5268 ; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
5269 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v4
5270 ; GFX6-NEXT: s_waitcnt expcnt(0)
5271 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v3
5272 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
5273 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v0
5274 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
5275 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v5
5276 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
5277 ; GFX6-NEXT: v_or_b32_e32 v5, v3, v4
5278 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v7
5279 ; GFX6-NEXT: v_or_b32_e32 v4, v6, v3
5280 ; GFX6-NEXT: v_mov_b32_e32 v7, v5
5281 ; GFX6-NEXT: v_mov_b32_e32 v6, v4
5282 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v2, s[8:11], 0 offen glc
5283 ; GFX6-NEXT: s_waitcnt vmcnt(0)
5284 ; GFX6-NEXT: buffer_wbinvl1
5285 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v6
5286 ; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v6
5287 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
5288 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
5289 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5290 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
5291 ; GFX6-NEXT: s_cbranch_execnz .LBB13_1
5292 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
5293 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
5294 ; GFX6-NEXT: s_waitcnt expcnt(0)
5295 ; GFX6-NEXT: s_setpc_b64 s[30:31]
; Index 256 of <2 x half> (two 16-bit halves, i.e. 4 bytes per element) is a
; byte offset of 1024. That is the `offset:1024` on the initial load and the
; 0x400 added to form the cmpswap address in the expected output above.
5296 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
; The result is intentionally left unused (the "noret" case). For every
; target in the RUN lines, the expected output above is a load followed by a
; buffer_atomic_cmpswap retry loop (%atomicrmw.start .. %atomicrmw.end).
5297 %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
5301 define <2 x half> @buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall(ptr addrspace(7) %ptr, <2 x half> %val) #0 {
5302 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5304 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
5305 ; GFX12-NEXT: s_wait_expcnt 0x0
5306 ; GFX12-NEXT: s_wait_samplecnt 0x0
5307 ; GFX12-NEXT: s_wait_bvhcnt 0x0
5308 ; GFX12-NEXT: s_wait_kmcnt 0x0
5309 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
5310 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
5311 ; GFX12-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5312 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
5313 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
5314 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
5315 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
5316 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5317 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5318 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5319 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5320 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
5321 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5322 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
5323 ; GFX12-NEXT: ; implicit-def: $vgpr4
5324 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5325 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1
5326 ; GFX12-NEXT: ; %bb.2:
5327 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
5328 ; GFX12-NEXT: v_pk_max_num_f16 v8, v5, v5
5329 ; GFX12-NEXT: s_mov_b32 s1, 0
5330 ; GFX12-NEXT: .LBB14_3: ; %atomicrmw.start
5331 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
5332 ; GFX12-NEXT: ; Child Loop BB14_4 Depth 2
5333 ; GFX12-NEXT: s_wait_loadcnt 0x0
5334 ; GFX12-NEXT: v_pk_max_num_f16 v4, v6, v6
5335 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
5336 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
5337 ; GFX12-NEXT: s_wait_storecnt 0x0
5338 ; GFX12-NEXT: v_pk_max_num_f16 v5, v4, v8
5339 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
5340 ; GFX12-NEXT: v_mov_b32_e32 v4, v5
5341 ; GFX12-NEXT: v_mov_b32_e32 v5, v6
5342 ; GFX12-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5343 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
5344 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
5345 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
5346 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
5347 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
5348 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5349 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5350 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5351 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5352 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
5353 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
5354 ; GFX12-NEXT: s_wait_loadcnt 0x0
5355 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
5356 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5357 ; GFX12-NEXT: s_cbranch_execnz .LBB14_4
5358 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5359 ; GFX12-NEXT: s_mov_b32 exec_lo, s2
5360 ; GFX12-NEXT: s_wait_loadcnt 0x0
5361 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
5362 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
5363 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
5364 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
5365 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5366 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
5367 ; GFX12-NEXT: s_cbranch_execnz .LBB14_3
5368 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
5369 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
5370 ; GFX12-NEXT: v_mov_b32_e32 v0, v4
5371 ; GFX12-NEXT: s_setpc_b64 s[30:31]
5373 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5375 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5376 ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
5377 ; GFX940-NEXT: s_mov_b64 s[2:3], exec
5378 ; GFX940-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5379 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
5380 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
5381 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
5382 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
5383 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5384 ; GFX940-NEXT: s_nop 0
5385 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5386 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
5387 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5388 ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
5389 ; GFX940-NEXT: ; implicit-def: $vgpr4
5390 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
5391 ; GFX940-NEXT: s_cbranch_execnz .LBB14_1
5392 ; GFX940-NEXT: ; %bb.2:
5393 ; GFX940-NEXT: s_mov_b64 exec, s[2:3]
5394 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
5395 ; GFX940-NEXT: v_pk_max_f16 v9, v5, v5
5396 ; GFX940-NEXT: .LBB14_3: ; %atomicrmw.start
5397 ; GFX940-NEXT: ; =>This Loop Header: Depth=1
5398 ; GFX940-NEXT: ; Child Loop BB14_4 Depth 2
5399 ; GFX940-NEXT: s_waitcnt vmcnt(0)
5400 ; GFX940-NEXT: v_pk_max_f16 v4, v7, v7
5401 ; GFX940-NEXT: s_mov_b64 s[8:9], exec
5402 ; GFX940-NEXT: v_pk_max_f16 v6, v4, v9
5403 ; GFX940-NEXT: buffer_wbl2 sc1
5404 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
5405 ; GFX940-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5406 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
5407 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
5408 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
5409 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
5410 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
5411 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
5412 ; GFX940-NEXT: s_nop 0
5413 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
5414 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
5415 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
5416 ; GFX940-NEXT: s_waitcnt vmcnt(0)
5417 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
5418 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
5419 ; GFX940-NEXT: s_cbranch_execnz .LBB14_4
5420 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5421 ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
5422 ; GFX940-NEXT: s_waitcnt vmcnt(0)
5423 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
5424 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5425 ; GFX940-NEXT: v_mov_b32_e32 v7, v4
5426 ; GFX940-NEXT: buffer_inv sc1
5427 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
5428 ; GFX940-NEXT: s_cbranch_execnz .LBB14_3
5429 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
5430 ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
5431 ; GFX940-NEXT: v_mov_b32_e32 v0, v4
5432 ; GFX940-NEXT: s_setpc_b64 s[30:31]
5434 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5436 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5437 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
5438 ; GFX11-NEXT: s_mov_b32 s1, 0
5439 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
5440 ; GFX11-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5441 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
5442 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
5443 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
5444 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
5445 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5446 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5447 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5448 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5449 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
5450 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
5451 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
5452 ; GFX11-NEXT: ; implicit-def: $vgpr4
5453 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5454 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1
5455 ; GFX11-NEXT: ; %bb.2:
5456 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
5457 ; GFX11-NEXT: v_pk_max_f16 v8, v5, v5
5458 ; GFX11-NEXT: .p2align 6
5459 ; GFX11-NEXT: .LBB14_3: ; %atomicrmw.start
5460 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
5461 ; GFX11-NEXT: ; Child Loop BB14_4 Depth 2
5462 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5463 ; GFX11-NEXT: v_pk_max_f16 v4, v6, v6
5464 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
5465 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
5466 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5467 ; GFX11-NEXT: v_pk_max_f16 v5, v4, v8
5468 ; GFX11-NEXT: v_mov_b32_e32 v4, v5
5469 ; GFX11-NEXT: v_mov_b32_e32 v5, v6
5470 ; GFX11-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5471 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
5472 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
5473 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
5474 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
5475 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
5476 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
5477 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
5478 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
5479 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5480 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
5481 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
5482 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5483 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
5484 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
5485 ; GFX11-NEXT: s_cbranch_execnz .LBB14_4
5486 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5487 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
5488 ; GFX11-NEXT: s_waitcnt vmcnt(0)
5489 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
5490 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
5491 ; GFX11-NEXT: buffer_gl1_inv
5492 ; GFX11-NEXT: buffer_gl0_inv
5493 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
5494 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5495 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
5496 ; GFX11-NEXT: s_cbranch_execnz .LBB14_3
5497 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
5498 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
5499 ; GFX11-NEXT: v_mov_b32_e32 v0, v4
5500 ; GFX11-NEXT: s_setpc_b64 s[30:31]
5502 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5504 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5505 ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
5506 ; GFX10-NEXT: s_mov_b32 s5, 0
5507 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
5508 ; GFX10-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5509 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
5510 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
5511 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
5512 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
5513 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
5514 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
5515 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
5516 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
5517 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
5518 ; GFX10-NEXT: ; implicit-def: $vgpr4
5519 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5520 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
5521 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1
5522 ; GFX10-NEXT: ; %bb.2:
5523 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
5524 ; GFX10-NEXT: v_pk_max_f16 v8, v5, v5
5525 ; GFX10-NEXT: .LBB14_3: ; %atomicrmw.start
5526 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
5527 ; GFX10-NEXT: ; Child Loop BB14_4 Depth 2
5528 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5529 ; GFX10-NEXT: v_pk_max_f16 v4, v6, v6
5530 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
5531 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
5532 ; GFX10-NEXT: v_pk_max_f16 v5, v4, v8
5533 ; GFX10-NEXT: v_mov_b32_e32 v4, v5
5534 ; GFX10-NEXT: v_mov_b32_e32 v5, v6
5535 ; GFX10-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5536 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
5537 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
5538 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
5539 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
5540 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
5541 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
5542 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
5543 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
5544 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
5545 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5546 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
5547 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5548 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
5549 ; GFX10-NEXT: s_cbranch_execnz .LBB14_4
5550 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5551 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
5552 ; GFX10-NEXT: s_waitcnt vmcnt(0)
5553 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
5554 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
5555 ; GFX10-NEXT: buffer_gl1_inv
5556 ; GFX10-NEXT: buffer_gl0_inv
5557 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
5558 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
5559 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
5560 ; GFX10-NEXT: s_cbranch_execnz .LBB14_3
5561 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
5562 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
5563 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
5564 ; GFX10-NEXT: s_setpc_b64 s[30:31]
5566 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5568 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5569 ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
5570 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
5571 ; GFX90A-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5572 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
5573 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
5574 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
5575 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
5576 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5577 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5578 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5579 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5580 ; GFX90A-NEXT: s_nop 0
5581 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
5582 ; GFX90A-NEXT: ; implicit-def: $vgpr4
5583 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
5584 ; GFX90A-NEXT: s_cbranch_execnz .LBB14_1
5585 ; GFX90A-NEXT: ; %bb.2:
5586 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
5587 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
5588 ; GFX90A-NEXT: v_pk_max_f16 v9, v5, v5
5589 ; GFX90A-NEXT: .LBB14_3: ; %atomicrmw.start
5590 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
5591 ; GFX90A-NEXT: ; Child Loop BB14_4 Depth 2
5592 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5593 ; GFX90A-NEXT: v_pk_max_f16 v4, v7, v7
5594 ; GFX90A-NEXT: v_pk_max_f16 v6, v4, v9
5595 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
5596 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
5597 ; GFX90A-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5598 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
5599 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
5600 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
5601 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
5602 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
5603 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5604 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5605 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5606 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5607 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5608 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
5609 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
5610 ; GFX90A-NEXT: s_cbranch_execnz .LBB14_4
5611 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5612 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
5613 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
5614 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
5615 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5616 ; GFX90A-NEXT: v_mov_b32_e32 v7, v4
5617 ; GFX90A-NEXT: buffer_wbinvl1
5618 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
5619 ; GFX90A-NEXT: s_cbranch_execnz .LBB14_3
5620 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
5621 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
5622 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4
5623 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
5625 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5627 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5628 ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
5629 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
5630 ; GFX908-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5631 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
5632 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
5633 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
5634 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
5635 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5636 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5637 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5638 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5639 ; GFX908-NEXT: s_nop 0
5640 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
5641 ; GFX908-NEXT: ; implicit-def: $vgpr4
5642 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
5643 ; GFX908-NEXT: s_cbranch_execnz .LBB14_1
5644 ; GFX908-NEXT: ; %bb.2:
5645 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
5646 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
5647 ; GFX908-NEXT: v_pk_max_f16 v8, v5, v5
5648 ; GFX908-NEXT: .LBB14_3: ; %atomicrmw.start
5649 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
5650 ; GFX908-NEXT: ; Child Loop BB14_4 Depth 2
5651 ; GFX908-NEXT: s_waitcnt vmcnt(0)
5652 ; GFX908-NEXT: v_pk_max_f16 v4, v6, v6
5653 ; GFX908-NEXT: v_pk_max_f16 v5, v4, v8
5654 ; GFX908-NEXT: v_mov_b32_e32 v4, v5
5655 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
5656 ; GFX908-NEXT: v_mov_b32_e32 v5, v6
5657 ; GFX908-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5658 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
5659 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
5660 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
5661 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
5662 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
5663 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5664 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5665 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5666 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5667 ; GFX908-NEXT: s_waitcnt vmcnt(0)
5668 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
5669 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
5670 ; GFX908-NEXT: s_cbranch_execnz .LBB14_4
5671 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5672 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
5673 ; GFX908-NEXT: s_waitcnt vmcnt(0)
5674 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
5675 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5676 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
5677 ; GFX908-NEXT: buffer_wbinvl1
5678 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
5679 ; GFX908-NEXT: s_cbranch_execnz .LBB14_3
5680 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
5681 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
5682 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
5683 ; GFX908-NEXT: s_setpc_b64 s[30:31]
5685 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5687 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5688 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
5689 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
5690 ; GFX8-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5691 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
5692 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
5693 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
5694 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
5695 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5696 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5697 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5698 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5699 ; GFX8-NEXT: s_nop 0
5700 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
5701 ; GFX8-NEXT: ; implicit-def: $vgpr4
5702 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
5703 ; GFX8-NEXT: s_cbranch_execnz .LBB14_1
5704 ; GFX8-NEXT: ; %bb.2:
5705 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
5706 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
5707 ; GFX8-NEXT: v_max_f16_sdwa v8, v5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5708 ; GFX8-NEXT: v_max_f16_e32 v9, v5, v5
5709 ; GFX8-NEXT: .LBB14_3: ; %atomicrmw.start
5710 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
5711 ; GFX8-NEXT: ; Child Loop BB14_4 Depth 2
5712 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5713 ; GFX8-NEXT: v_max_f16_sdwa v4, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
5714 ; GFX8-NEXT: v_max_f16_e32 v5, v6, v6
5715 ; GFX8-NEXT: v_max_f16_sdwa v4, v4, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
5716 ; GFX8-NEXT: v_max_f16_e32 v5, v5, v9
5717 ; GFX8-NEXT: v_or_b32_e32 v5, v5, v4
5718 ; GFX8-NEXT: v_mov_b32_e32 v4, v5
5719 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
5720 ; GFX8-NEXT: v_mov_b32_e32 v5, v6
5721 ; GFX8-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5722 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
5723 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
5724 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
5725 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
5726 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
5727 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5728 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5729 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5730 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5731 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5732 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
5733 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
5734 ; GFX8-NEXT: s_cbranch_execnz .LBB14_4
5735 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5736 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
5737 ; GFX8-NEXT: s_waitcnt vmcnt(0)
5738 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
5739 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5740 ; GFX8-NEXT: v_mov_b32_e32 v6, v4
5741 ; GFX8-NEXT: buffer_wbinvl1
5742 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
5743 ; GFX8-NEXT: s_cbranch_execnz .LBB14_3
5744 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
5745 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
5746 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
5747 ; GFX8-NEXT: s_setpc_b64 s[30:31]
5749 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5751 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5752 ; GFX7-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
5753 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
5754 ; GFX7-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5755 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
5756 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
5757 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
5758 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
5759 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5760 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5761 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5762 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5763 ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
5764 ; GFX7-NEXT: ; implicit-def: $vgpr4
5765 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
5766 ; GFX7-NEXT: s_cbranch_execnz .LBB14_1
5767 ; GFX7-NEXT: ; %bb.2:
5768 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
5769 ; GFX7-NEXT: v_cvt_f16_f32_e32 v6, v6
5770 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v5
5771 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5772 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
5773 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
5774 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
5775 ; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v6
5776 ; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v8
5777 ; GFX7-NEXT: s_mov_b64 s[6:7], 0
5778 ; GFX7-NEXT: .LBB14_3: ; %atomicrmw.start
5779 ; GFX7-NEXT: ; =>This Loop Header: Depth=1
5780 ; GFX7-NEXT: ; Child Loop BB14_4 Depth 2
5781 ; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v5
5782 ; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
5783 ; GFX7-NEXT: s_mov_b64 s[12:13], exec
5784 ; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
5785 ; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v4
5786 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
5787 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v10
5788 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v11
5789 ; GFX7-NEXT: v_cvt_f16_f32_e32 v8, v6
5790 ; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v7
5791 ; GFX7-NEXT: v_or_b32_e32 v6, v4, v5
5792 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v8
5793 ; GFX7-NEXT: v_or_b32_e32 v5, v7, v4
5794 ; GFX7-NEXT: v_mov_b32_e32 v8, v6
5795 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
5796 ; GFX7-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5797 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
5798 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
5799 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
5800 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
5801 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
5802 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5803 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5804 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5805 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5806 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5807 ; GFX7-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
5808 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
5809 ; GFX7-NEXT: s_cbranch_execnz .LBB14_4
5810 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5811 ; GFX7-NEXT: s_mov_b64 exec, s[12:13]
5812 ; GFX7-NEXT: s_waitcnt vmcnt(0)
5813 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v7
5814 ; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v7
5815 ; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
5816 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
5817 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5818 ; GFX7-NEXT: buffer_wbinvl1
5819 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
5820 ; GFX7-NEXT: s_cbranch_execnz .LBB14_3
5821 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
5822 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
5823 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
5824 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
5825 ; GFX7-NEXT: s_setpc_b64 s[30:31]
5827 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2f16__offset__waterfall:
5829 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5830 ; GFX6-NEXT: v_add_i32_e32 v9, vcc, 0x400, v4
5831 ; GFX6-NEXT: s_mov_b64 s[6:7], exec
5832 ; GFX6-NEXT: .LBB14_1: ; =>This Inner Loop Header: Depth=1
5833 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
5834 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
5835 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
5836 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
5837 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5838 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5839 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5840 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5841 ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
5842 ; GFX6-NEXT: ; implicit-def: $vgpr4
5843 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
5844 ; GFX6-NEXT: s_cbranch_execnz .LBB14_1
5845 ; GFX6-NEXT: ; %bb.2:
5846 ; GFX6-NEXT: s_mov_b64 exec, s[6:7]
5847 ; GFX6-NEXT: v_cvt_f16_f32_e32 v6, v6
5848 ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v5
5849 ; GFX6-NEXT: s_waitcnt vmcnt(0)
5850 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
5851 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
5852 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
5853 ; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v6
5854 ; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v8
5855 ; GFX6-NEXT: s_mov_b64 s[6:7], 0
5856 ; GFX6-NEXT: .LBB14_3: ; %atomicrmw.start
5857 ; GFX6-NEXT: ; =>This Loop Header: Depth=1
5858 ; GFX6-NEXT: ; Child Loop BB14_4 Depth 2
5859 ; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v5
5860 ; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
5861 ; GFX6-NEXT: s_mov_b64 s[12:13], exec
5862 ; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
5863 ; GFX6-NEXT: s_waitcnt expcnt(0)
5864 ; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v4
5865 ; GFX6-NEXT: v_lshlrev_b32_e32 v5, 16, v5
5866 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v10
5867 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v11
5868 ; GFX6-NEXT: v_cvt_f16_f32_e32 v8, v6
5869 ; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v7
5870 ; GFX6-NEXT: v_or_b32_e32 v6, v4, v5
5871 ; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v8
5872 ; GFX6-NEXT: v_or_b32_e32 v5, v7, v4
5873 ; GFX6-NEXT: v_mov_b32_e32 v8, v6
5874 ; GFX6-NEXT: v_mov_b32_e32 v7, v5
5875 ; GFX6-NEXT: .LBB14_4: ; Parent Loop BB14_3 Depth=1
5876 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
5877 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
5878 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
5879 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
5880 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
5881 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
5882 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
5883 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
5884 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
5885 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
5886 ; GFX6-NEXT: buffer_atomic_cmpswap v[7:8], v9, s[8:11], 0 offen glc
5887 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
5888 ; GFX6-NEXT: s_cbranch_execnz .LBB14_4
5889 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB14_3 Depth=1
5890 ; GFX6-NEXT: s_mov_b64 exec, s[12:13]
5891 ; GFX6-NEXT: s_waitcnt vmcnt(0)
5892 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v7
5893 ; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v7
5894 ; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
5895 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
5896 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
5897 ; GFX6-NEXT: buffer_wbinvl1
5898 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
5899 ; GFX6-NEXT: s_cbranch_execnz .LBB14_3
5900 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
5901 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
5902 ; GFX6-NEXT: v_mov_b32_e32 v0, v4
5903 ; GFX6-NEXT: v_mov_b32_e32 v1, v5
5904 ; GFX6-NEXT: s_waitcnt expcnt(0)
5905 ; GFX6-NEXT: s_setpc_b64 s[30:31]
5906 %gep = getelementptr <2 x half>, ptr addrspace(7) %ptr, i32 256
5907 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x half> %val syncscope("agent") seq_cst
5908 ret <2 x half> %result
5911 ; --------------------------------------------------------------------
5913 ; --------------------------------------------------------------------
5915 define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
5916 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
5918 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
5919 ; GFX12-NEXT: s_wait_expcnt 0x0
5920 ; GFX12-NEXT: s_wait_samplecnt 0x0
5921 ; GFX12-NEXT: s_wait_bvhcnt 0x0
5922 ; GFX12-NEXT: s_wait_kmcnt 0x0
5923 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
5924 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
5925 ; GFX12-NEXT: s_mov_b32 s5, 0
5926 ; GFX12-NEXT: v_mov_b32_e32 v4, s4
5927 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
5928 ; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v1
5929 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
5930 ; GFX12-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
5931 ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
5932 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
5933 ; GFX12-NEXT: s_wait_loadcnt 0x0
5934 ; GFX12-NEXT: v_mov_b32_e32 v6, v0
5935 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
5936 ; GFX12-NEXT: s_wait_storecnt 0x0
5937 ; GFX12-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
5938 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5939 ; GFX12-NEXT: v_max_num_f32_e32 v1, v1, v3
5940 ; GFX12-NEXT: v_bfe_u32 v7, v1, 16, 1
5941 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v1
5942 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
5943 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
5944 ; GFX12-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
5945 ; GFX12-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
5946 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5947 ; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v2
5948 ; GFX12-NEXT: v_bfe_u32 v5, v0, 16, 1
5949 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
5950 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
5951 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
5952 ; GFX12-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
5953 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
5954 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5955 ; GFX12-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
5956 ; GFX12-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
5957 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
5958 ; GFX12-NEXT: s_wait_loadcnt 0x0
5959 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
5960 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
5961 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
5962 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5963 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
5964 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1
5965 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
5966 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
5967 ; GFX12-NEXT: s_setpc_b64 s[30:31]
5969 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
5971 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
5972 ; GFX940-NEXT: v_mov_b32_e32 v1, v0
5973 ; GFX940-NEXT: v_mov_b32_e32 v0, s6
5974 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
5975 ; GFX940-NEXT: s_add_i32 s4, s6, 0x400
5976 ; GFX940-NEXT: s_mov_b64 s[6:7], 0
5977 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v1
5978 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff
5979 ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
5980 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302
5981 ; GFX940-NEXT: v_mov_b32_e32 v4, s4
5982 ; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start
5983 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
5984 ; GFX940-NEXT: s_waitcnt vmcnt(0)
5985 ; GFX940-NEXT: v_mov_b32_e32 v7, v0
5986 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7
5987 ; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
5988 ; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
5989 ; GFX940-NEXT: v_max_f32_e32 v1, v1, v3
5990 ; GFX940-NEXT: v_bfe_u32 v5, v0, 16, 1
5991 ; GFX940-NEXT: v_bfe_u32 v8, v1, 16, 1
5992 ; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v0
5993 ; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v1
5994 ; GFX940-NEXT: v_add3_u32 v5, v5, v0, s8
5995 ; GFX940-NEXT: v_add3_u32 v8, v8, v1, s8
5996 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
5997 ; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
5998 ; GFX940-NEXT: buffer_wbl2 sc1
5999 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
6000 ; GFX940-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
6001 ; GFX940-NEXT: v_perm_b32 v6, v1, v0, s9
6002 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[6:7]
6003 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[0:3], 0 offen sc0
6004 ; GFX940-NEXT: s_waitcnt vmcnt(0)
6005 ; GFX940-NEXT: buffer_inv sc1
6006 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
6007 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6008 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
6009 ; GFX940-NEXT: s_cbranch_execnz .LBB15_1
6010 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
6011 ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
6012 ; GFX940-NEXT: s_setpc_b64 s[30:31]
6014 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6016 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6017 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
6018 ; GFX11-NEXT: s_add_i32 s4, s6, 0x400
6019 ; GFX11-NEXT: s_mov_b32 s5, 0
6020 ; GFX11-NEXT: v_mov_b32_e32 v4, s4
6021 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
6022 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v1
6023 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
6024 ; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6025 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
6026 ; GFX11-NEXT: .p2align 6
6027 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
6028 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
6029 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6030 ; GFX11-NEXT: v_mov_b32_e32 v6, v0
6031 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
6032 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6033 ; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
6034 ; GFX11-NEXT: v_max_f32_e32 v1, v1, v3
6035 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
6036 ; GFX11-NEXT: v_bfe_u32 v7, v1, 16, 1
6037 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v1
6038 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
6039 ; GFX11-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
6040 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6041 ; GFX11-NEXT: v_dual_cndmask_b32 v1, v7, v9 :: v_dual_lshlrev_b32 v0, 16, v6
6042 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v2
6043 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
6044 ; GFX11-NEXT: v_bfe_u32 v5, v0, 16, 1
6045 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
6046 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
6047 ; GFX11-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
6048 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6049 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
6050 ; GFX11-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
6051 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
6052 ; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v1, v6
6053 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v4, s[0:3], 0 offen glc
6054 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6055 ; GFX11-NEXT: buffer_gl1_inv
6056 ; GFX11-NEXT: buffer_gl0_inv
6057 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
6058 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
6059 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6060 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
6061 ; GFX11-NEXT: s_cbranch_execnz .LBB15_1
6062 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
6063 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
6064 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
6065 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6067 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6069 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6070 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
6071 ; GFX10-NEXT: v_mov_b32_e32 v0, s18
6072 ; GFX10-NEXT: s_mov_b32 s11, s17
6073 ; GFX10-NEXT: s_mov_b32 s10, s16
6074 ; GFX10-NEXT: s_mov_b32 s9, s7
6075 ; GFX10-NEXT: s_mov_b32 s8, s6
6076 ; GFX10-NEXT: s_add_i32 s4, s18, 0x400
6077 ; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
6078 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v1
6079 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6080 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
6081 ; GFX10-NEXT: s_mov_b32 s5, 0
6082 ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start
6083 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
6084 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6085 ; GFX10-NEXT: v_mov_b32_e32 v6, v0
6086 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
6087 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v6
6088 ; GFX10-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
6089 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
6090 ; GFX10-NEXT: v_max_f32_e32 v1, v1, v3
6091 ; GFX10-NEXT: v_bfe_u32 v5, v0, 16, 1
6092 ; GFX10-NEXT: v_bfe_u32 v7, v1, 16, 1
6093 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
6094 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v1
6095 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
6096 ; GFX10-NEXT: v_add3_u32 v5, v5, v0, 0x7fff
6097 ; GFX10-NEXT: v_add3_u32 v7, v7, v1, 0x7fff
6098 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
6099 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc_lo
6100 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v5, v8, s4
6101 ; GFX10-NEXT: v_perm_b32 v5, v1, v0, 0x7060302
6102 ; GFX10-NEXT: v_mov_b32_e32 v0, v5
6103 ; GFX10-NEXT: v_mov_b32_e32 v1, v6
6104 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc
6105 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6106 ; GFX10-NEXT: buffer_gl1_inv
6107 ; GFX10-NEXT: buffer_gl0_inv
6108 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v6
6109 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
6110 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
6111 ; GFX10-NEXT: s_cbranch_execnz .LBB15_1
6112 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
6113 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
6114 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6116 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6118 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6119 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
6120 ; GFX90A-NEXT: s_mov_b32 s11, s17
6121 ; GFX90A-NEXT: s_mov_b32 s10, s16
6122 ; GFX90A-NEXT: s_mov_b32 s9, s7
6123 ; GFX90A-NEXT: s_mov_b32 s8, s6
6124 ; GFX90A-NEXT: v_mov_b32_e32 v0, s18
6125 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
6126 ; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
6127 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
6128 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v1
6129 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
6130 ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6131 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
6132 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4
6133 ; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start
6134 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
6135 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
6136 ; GFX90A-NEXT: v_mov_b32_e32 v7, v0
6137 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v7
6138 ; GFX90A-NEXT: v_and_b32_e32 v1, 0xffff0000, v7
6139 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
6140 ; GFX90A-NEXT: v_max_f32_e32 v1, v1, v3
6141 ; GFX90A-NEXT: v_bfe_u32 v5, v0, 16, 1
6142 ; GFX90A-NEXT: v_bfe_u32 v8, v1, 16, 1
6143 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v0
6144 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v1
6145 ; GFX90A-NEXT: v_add3_u32 v5, v5, v0, s12
6146 ; GFX90A-NEXT: v_add3_u32 v8, v8, v1, s12
6147 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
6148 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6149 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v5, v6, s[4:5]
6150 ; GFX90A-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
6151 ; GFX90A-NEXT: v_perm_b32 v6, v1, v0, s13
6152 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[0,1]
6153 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc
6154 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
6155 ; GFX90A-NEXT: buffer_wbinvl1
6156 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
6157 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6158 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
6159 ; GFX90A-NEXT: s_cbranch_execnz .LBB15_1
6160 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
6161 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
6162 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
6164 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6166 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6167 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
6168 ; GFX908-NEXT: s_mov_b32 s11, s17
6169 ; GFX908-NEXT: s_mov_b32 s10, s16
6170 ; GFX908-NEXT: s_mov_b32 s9, s7
6171 ; GFX908-NEXT: s_mov_b32 s8, s6
6172 ; GFX908-NEXT: v_mov_b32_e32 v0, s18
6173 ; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
6174 ; GFX908-NEXT: s_add_i32 s4, s18, 0x400
6175 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
6176 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v1
6177 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff
6178 ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6179 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302
6180 ; GFX908-NEXT: v_mov_b32_e32 v4, s4
6181 ; GFX908-NEXT: .LBB15_1: ; %atomicrmw.start
6182 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
6183 ; GFX908-NEXT: s_waitcnt vmcnt(0)
6184 ; GFX908-NEXT: v_mov_b32_e32 v6, v0
6185 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v6
6186 ; GFX908-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
6187 ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
6188 ; GFX908-NEXT: v_max_f32_e32 v1, v1, v3
6189 ; GFX908-NEXT: v_bfe_u32 v5, v0, 16, 1
6190 ; GFX908-NEXT: v_bfe_u32 v8, v1, 16, 1
6191 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
6192 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v1
6193 ; GFX908-NEXT: v_add3_u32 v5, v5, v0, s12
6194 ; GFX908-NEXT: v_add3_u32 v8, v8, v1, s12
6195 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
6196 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6197 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
6198 ; GFX908-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
6199 ; GFX908-NEXT: v_perm_b32 v5, v1, v0, s13
6200 ; GFX908-NEXT: v_mov_b32_e32 v0, v5
6201 ; GFX908-NEXT: v_mov_b32_e32 v1, v6
6202 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc
6203 ; GFX908-NEXT: s_waitcnt vmcnt(0)
6204 ; GFX908-NEXT: buffer_wbinvl1
6205 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
6206 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6207 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
6208 ; GFX908-NEXT: s_cbranch_execnz .LBB15_1
6209 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
6210 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
6211 ; GFX908-NEXT: s_setpc_b64 s[30:31]
6213 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6215 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6216 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
6217 ; GFX8-NEXT: s_mov_b32 s11, s17
6218 ; GFX8-NEXT: s_mov_b32 s10, s16
6219 ; GFX8-NEXT: s_mov_b32 s9, s7
6220 ; GFX8-NEXT: s_mov_b32 s8, s6
6221 ; GFX8-NEXT: v_mov_b32_e32 v0, s18
6222 ; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
6223 ; GFX8-NEXT: s_add_i32 s4, s18, 0x400
6224 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
6225 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v1
6226 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
6227 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
6228 ; GFX8-NEXT: .LBB15_1: ; %atomicrmw.start
6229 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
6230 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6231 ; GFX8-NEXT: v_mov_b32_e32 v6, v0
6232 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v6
6233 ; GFX8-NEXT: v_and_b32_e32 v1, 0xffff0000, v6
6234 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
6235 ; GFX8-NEXT: v_max_f32_e32 v1, v1, v3
6236 ; GFX8-NEXT: v_bfe_u32 v5, v0, 16, 1
6237 ; GFX8-NEXT: v_bfe_u32 v8, v1, 16, 1
6238 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v0
6239 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v1
6240 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
6241 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
6242 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v1
6243 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
6244 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
6245 ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6246 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc
6247 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, v7, s[4:5]
6248 ; GFX8-NEXT: v_lshrrev_b32_e32 v1, 16, v1
6249 ; GFX8-NEXT: v_alignbit_b32 v5, v1, v0, 16
6250 ; GFX8-NEXT: v_mov_b32_e32 v0, v5
6251 ; GFX8-NEXT: v_mov_b32_e32 v1, v6
6252 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v4, s[8:11], 0 offen glc
6253 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6254 ; GFX8-NEXT: buffer_wbinvl1
6255 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
6256 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6257 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
6258 ; GFX8-NEXT: s_cbranch_execnz .LBB15_1
6259 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
6260 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
6261 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6263 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6265 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6266 ; GFX7-NEXT: s_mov_b32 s11, s17
6267 ; GFX7-NEXT: s_mov_b32 s10, s16
6268 ; GFX7-NEXT: s_mov_b32 s9, s7
6269 ; GFX7-NEXT: s_mov_b32 s8, s6
6270 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
6271 ; GFX7-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024
6272 ; GFX7-NEXT: s_add_i32 s6, s18, 0x400
6273 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
6274 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
6275 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
6276 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
6277 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6278 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6279 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6280 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6281 ; GFX7-NEXT: v_mov_b32_e32 v4, s6
6282 ; GFX7-NEXT: .LBB15_1: ; %atomicrmw.start
6283 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
6284 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
6285 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
6286 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6287 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
6288 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
6289 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
6290 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v3
6291 ; GFX7-NEXT: v_alignbit_b32 v1, v1, v0, 16
6292 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v5
6293 ; GFX7-NEXT: v_alignbit_b32 v0, v0, v6, 16
6294 ; GFX7-NEXT: v_mov_b32_e32 v6, v1
6295 ; GFX7-NEXT: v_mov_b32_e32 v5, v0
6296 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc
6297 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6298 ; GFX7-NEXT: buffer_wbinvl1
6299 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
6300 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
6301 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6302 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v5
6303 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
6304 ; GFX7-NEXT: s_cbranch_execnz .LBB15_1
6305 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
6306 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
6307 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6309 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset:
6311 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6312 ; GFX6-NEXT: s_mov_b32 s11, s17
6313 ; GFX6-NEXT: s_mov_b32 s10, s16
6314 ; GFX6-NEXT: s_mov_b32 s9, s7
6315 ; GFX6-NEXT: s_mov_b32 s8, s6
6316 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
6317 ; GFX6-NEXT: buffer_load_dword v4, v2, s[8:11], 0 offen offset:1024
6318 ; GFX6-NEXT: s_add_i32 s6, s18, 0x400
6319 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
6320 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
6321 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
6322 ; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
6323 ; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6324 ; GFX6-NEXT: s_waitcnt vmcnt(0)
6325 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
6326 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
6327 ; GFX6-NEXT: v_mov_b32_e32 v4, s6
6328 ; GFX6-NEXT: .LBB15_1: ; %atomicrmw.start
6329 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
6330 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
6331 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
6332 ; GFX6-NEXT: s_waitcnt expcnt(0)
6333 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6334 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v0
6335 ; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
6336 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v2
6337 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v3
6338 ; GFX6-NEXT: v_alignbit_b32 v1, v1, v0, 16
6339 ; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v5
6340 ; GFX6-NEXT: v_alignbit_b32 v0, v0, v6, 16
6341 ; GFX6-NEXT: v_mov_b32_e32 v6, v1
6342 ; GFX6-NEXT: v_mov_b32_e32 v5, v0
6343 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc
6344 ; GFX6-NEXT: s_waitcnt vmcnt(0)
6345 ; GFX6-NEXT: buffer_wbinvl1
6346 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
6347 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v5
6348 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6349 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v5
6350 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
6351 ; GFX6-NEXT: s_cbranch_execnz .LBB15_1
6352 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
6353 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
6354 ; GFX6-NEXT: s_waitcnt expcnt(0)
6355 ; GFX6-NEXT: s_setpc_b64 s[30:31]
6356 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
6357 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
6358 ret <2 x bfloat> %result
; No-return variant: agent-scope, seq_cst atomicrmw fmax of a <2 x bfloat> value
; through an inreg buffer fat pointer (ptr addrspace(7)) at element index 256.
; The atomic's result is bound to %unused and never read. For every run line the
; autogenerated assertions below expect the same shape: one buffer load at byte
; offset 1024, then a buffer_atomic_cmpswap retry loop (.LBB16_1) that repeats
; until the value returned by the cmpswap equals the value it was compared with.
; Do not edit the assertions by hand; regenerate them with
; utils/update_llc_test_checks.py.
6361 define void @buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset(ptr addrspace(7) inreg %ptr, <2 x bfloat> %val) #0 {
6362 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6364 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
6365 ; GFX12-NEXT: s_wait_expcnt 0x0
6366 ; GFX12-NEXT: s_wait_samplecnt 0x0
6367 ; GFX12-NEXT: s_wait_bvhcnt 0x0
6368 ; GFX12-NEXT: s_wait_kmcnt 0x0
6369 ; GFX12-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
6370 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
6371 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6372 ; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
6373 ; GFX12-NEXT: buffer_load_b32 v1, v1, s[0:3], null offen offset:1024
6374 ; GFX12-NEXT: s_mov_b32 s5, 0
6375 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
6376 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
6377 ; GFX12-NEXT: s_wait_loadcnt 0x0
6378 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6379 ; GFX12-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6380 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
6381 ; GFX12-NEXT: s_wait_storecnt 0x0
6382 ; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v3 :: v_dual_max_num_f32 v0, v0, v2
6383 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
6384 ; GFX12-NEXT: v_bfe_u32 v7, v5, 16, 1
6385 ; GFX12-NEXT: v_bfe_u32 v6, v0, 16, 1
6386 ; GFX12-NEXT: v_or_b32_e32 v8, 0x400000, v0
6387 ; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v5
6388 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
6389 ; GFX12-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
6390 ; GFX12-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
6391 ; GFX12-NEXT: v_cmp_u_f32_e64 s4, v0, v0
6392 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6393 ; GFX12-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
6394 ; GFX12-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
6395 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6396 ; GFX12-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
6397 ; GFX12-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
6398 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], null offen th:TH_ATOMIC_RETURN
6399 ; GFX12-NEXT: s_wait_loadcnt 0x0
6400 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
6401 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
6402 ; GFX12-NEXT: v_mov_b32_e32 v1, v5
6403 ; GFX12-NEXT: s_or_b32 s5, vcc_lo, s5
6404 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6405 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
6406 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1
6407 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
6408 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s5
6409 ; GFX12-NEXT: s_setpc_b64 s[30:31]
6411 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6413 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6414 ; GFX940-NEXT: v_mov_b32_e32 v1, s6
6415 ; GFX940-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen offset:1024
6416 ; GFX940-NEXT: s_add_i32 s4, s6, 0x400
6417 ; GFX940-NEXT: s_mov_b64 s[6:7], 0
6418 ; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6419 ; GFX940-NEXT: s_movk_i32 s8, 0x7fff
6420 ; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6421 ; GFX940-NEXT: s_mov_b32 s9, 0x7060302
6422 ; GFX940-NEXT: v_mov_b32_e32 v4, s4
6423 ; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start
6424 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
6425 ; GFX940-NEXT: s_waitcnt vmcnt(0)
6426 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6427 ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6428 ; GFX940-NEXT: v_max_f32_e32 v0, v0, v2
6429 ; GFX940-NEXT: v_max_f32_e32 v5, v5, v3
6430 ; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1
6431 ; GFX940-NEXT: v_bfe_u32 v8, v5, 16, 1
6432 ; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v0
6433 ; GFX940-NEXT: v_or_b32_e32 v9, 0x400000, v5
6434 ; GFX940-NEXT: v_add3_u32 v6, v6, v0, s8
6435 ; GFX940-NEXT: v_add3_u32 v8, v8, v5, s8
6436 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
6437 ; GFX940-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6438 ; GFX940-NEXT: buffer_wbl2 sc1
6439 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
6440 ; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
6441 ; GFX940-NEXT: v_perm_b32 v0, v5, v0, s9
6442 ; GFX940-NEXT: v_mov_b64_e32 v[6:7], v[0:1]
6443 ; GFX940-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[0:3], 0 offen sc0
6444 ; GFX940-NEXT: s_waitcnt vmcnt(0)
6445 ; GFX940-NEXT: buffer_inv sc1
6446 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
6447 ; GFX940-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6448 ; GFX940-NEXT: v_mov_b32_e32 v1, v6
6449 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[6:7]
6450 ; GFX940-NEXT: s_cbranch_execnz .LBB16_1
6451 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
6452 ; GFX940-NEXT: s_or_b64 exec, exec, s[6:7]
6453 ; GFX940-NEXT: s_setpc_b64 s[30:31]
6455 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6457 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6458 ; GFX11-NEXT: v_dual_mov_b32 v1, s6 :: v_dual_lshlrev_b32 v2, 16, v0
6459 ; GFX11-NEXT: s_add_i32 s4, s6, 0x400
6460 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6461 ; GFX11-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_and_b32 v3, 0xffff0000, v0
6462 ; GFX11-NEXT: buffer_load_b32 v1, v1, s[0:3], 0 offen offset:1024
6463 ; GFX11-NEXT: s_mov_b32 s5, 0
6464 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
6465 ; GFX11-NEXT: .p2align 6
6466 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
6467 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
6468 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6469 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6470 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6471 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
6472 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6473 ; GFX11-NEXT: v_dual_max_f32 v5, v5, v3 :: v_dual_max_f32 v0, v0, v2
6474 ; GFX11-NEXT: v_bfe_u32 v7, v5, 16, 1
6475 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
6476 ; GFX11-NEXT: v_bfe_u32 v6, v0, 16, 1
6477 ; GFX11-NEXT: v_or_b32_e32 v8, 0x400000, v0
6478 ; GFX11-NEXT: v_or_b32_e32 v9, 0x400000, v5
6479 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
6480 ; GFX11-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
6481 ; GFX11-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
6482 ; GFX11-NEXT: v_cmp_u_f32_e64 s4, v0, v0
6483 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6484 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
6485 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
6486 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6487 ; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
6488 ; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v5, v0
6489 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[5:6], v4, s[0:3], 0 offen glc
6490 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6491 ; GFX11-NEXT: buffer_gl1_inv
6492 ; GFX11-NEXT: buffer_gl0_inv
6493 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
6494 ; GFX11-NEXT: v_mov_b32_e32 v1, v5
6495 ; GFX11-NEXT: s_or_b32 s5, vcc_lo, s5
6496 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6497 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s5
6498 ; GFX11-NEXT: s_cbranch_execnz .LBB16_1
6499 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
6500 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
6501 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s5
6502 ; GFX11-NEXT: s_setpc_b64 s[30:31]
6504 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6506 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6507 ; GFX10-NEXT: v_mov_b32_e32 v1, s18
6508 ; GFX10-NEXT: s_mov_b32 s11, s17
6509 ; GFX10-NEXT: s_mov_b32 s10, s16
6510 ; GFX10-NEXT: s_mov_b32 s9, s7
6511 ; GFX10-NEXT: s_mov_b32 s8, s6
6512 ; GFX10-NEXT: s_add_i32 s4, s18, 0x400
6513 ; GFX10-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
6514 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6515 ; GFX10-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6516 ; GFX10-NEXT: v_mov_b32_e32 v4, s4
6517 ; GFX10-NEXT: s_mov_b32 s5, 0
6518 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
6519 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
6520 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6521 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6522 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6523 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
6524 ; GFX10-NEXT: v_max_f32_e32 v0, v0, v2
6525 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v3
6526 ; GFX10-NEXT: v_bfe_u32 v6, v0, 16, 1
6527 ; GFX10-NEXT: v_bfe_u32 v7, v5, 16, 1
6528 ; GFX10-NEXT: v_or_b32_e32 v8, 0x400000, v0
6529 ; GFX10-NEXT: v_or_b32_e32 v9, 0x400000, v5
6530 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
6531 ; GFX10-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
6532 ; GFX10-NEXT: v_add3_u32 v7, v7, v5, 0x7fff
6533 ; GFX10-NEXT: v_cmp_u_f32_e64 s4, v0, v0
6534 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v7, v9, vcc_lo
6535 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v6, v8, s4
6536 ; GFX10-NEXT: v_perm_b32 v0, v5, v0, 0x7060302
6537 ; GFX10-NEXT: v_mov_b32_e32 v6, v1
6538 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
6539 ; GFX10-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc
6540 ; GFX10-NEXT: s_waitcnt vmcnt(0)
6541 ; GFX10-NEXT: buffer_gl1_inv
6542 ; GFX10-NEXT: buffer_gl0_inv
6543 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v5, v1
6544 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
6545 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
6546 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
6547 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1
6548 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
6549 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
6550 ; GFX10-NEXT: s_setpc_b64 s[30:31]
6552 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6554 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6555 ; GFX90A-NEXT: s_mov_b32 s11, s17
6556 ; GFX90A-NEXT: s_mov_b32 s10, s16
6557 ; GFX90A-NEXT: s_mov_b32 s9, s7
6558 ; GFX90A-NEXT: s_mov_b32 s8, s6
6559 ; GFX90A-NEXT: v_mov_b32_e32 v1, s18
6560 ; GFX90A-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
6561 ; GFX90A-NEXT: s_add_i32 s4, s18, 0x400
6562 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
6563 ; GFX90A-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6564 ; GFX90A-NEXT: s_movk_i32 s12, 0x7fff
6565 ; GFX90A-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6566 ; GFX90A-NEXT: s_mov_b32 s13, 0x7060302
6567 ; GFX90A-NEXT: v_mov_b32_e32 v4, s4
6568 ; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start
6569 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
6570 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
6571 ; GFX90A-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6572 ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6573 ; GFX90A-NEXT: v_max_f32_e32 v0, v0, v2
6574 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v3
6575 ; GFX90A-NEXT: v_bfe_u32 v6, v0, 16, 1
6576 ; GFX90A-NEXT: v_bfe_u32 v8, v5, 16, 1
6577 ; GFX90A-NEXT: v_or_b32_e32 v7, 0x400000, v0
6578 ; GFX90A-NEXT: v_or_b32_e32 v9, 0x400000, v5
6579 ; GFX90A-NEXT: v_add3_u32 v6, v6, v0, s12
6580 ; GFX90A-NEXT: v_add3_u32 v8, v8, v5, s12
6581 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
6582 ; GFX90A-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6583 ; GFX90A-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
6584 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
6585 ; GFX90A-NEXT: v_perm_b32 v0, v5, v0, s13
6586 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
6587 ; GFX90A-NEXT: buffer_atomic_cmpswap v[6:7], v4, s[8:11], 0 offen glc
6588 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
6589 ; GFX90A-NEXT: buffer_wbinvl1
6590 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v6, v1
6591 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6592 ; GFX90A-NEXT: v_mov_b32_e32 v1, v6
6593 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
6594 ; GFX90A-NEXT: s_cbranch_execnz .LBB16_1
6595 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
6596 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
6597 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
6599 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6601 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6602 ; GFX908-NEXT: s_mov_b32 s11, s17
6603 ; GFX908-NEXT: s_mov_b32 s10, s16
6604 ; GFX908-NEXT: s_mov_b32 s9, s7
6605 ; GFX908-NEXT: s_mov_b32 s8, s6
6606 ; GFX908-NEXT: v_mov_b32_e32 v1, s18
6607 ; GFX908-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
6608 ; GFX908-NEXT: s_add_i32 s4, s18, 0x400
6609 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
6610 ; GFX908-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6611 ; GFX908-NEXT: s_movk_i32 s12, 0x7fff
6612 ; GFX908-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6613 ; GFX908-NEXT: s_mov_b32 s13, 0x7060302
6614 ; GFX908-NEXT: v_mov_b32_e32 v4, s4
6615 ; GFX908-NEXT: .LBB16_1: ; %atomicrmw.start
6616 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
6617 ; GFX908-NEXT: s_waitcnt vmcnt(0)
6618 ; GFX908-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6619 ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6620 ; GFX908-NEXT: v_max_f32_e32 v0, v0, v2
6621 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v3
6622 ; GFX908-NEXT: v_bfe_u32 v6, v0, 16, 1
6623 ; GFX908-NEXT: v_bfe_u32 v8, v5, 16, 1
6624 ; GFX908-NEXT: v_or_b32_e32 v7, 0x400000, v0
6625 ; GFX908-NEXT: v_or_b32_e32 v9, 0x400000, v5
6626 ; GFX908-NEXT: v_add3_u32 v6, v6, v0, s12
6627 ; GFX908-NEXT: v_add3_u32 v8, v8, v5, s12
6628 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
6629 ; GFX908-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6630 ; GFX908-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
6631 ; GFX908-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
6632 ; GFX908-NEXT: v_perm_b32 v0, v5, v0, s13
6633 ; GFX908-NEXT: v_mov_b32_e32 v6, v1
6634 ; GFX908-NEXT: v_mov_b32_e32 v5, v0
6635 ; GFX908-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc
6636 ; GFX908-NEXT: s_waitcnt vmcnt(0)
6637 ; GFX908-NEXT: buffer_wbinvl1
6638 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
6639 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6640 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
6641 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
6642 ; GFX908-NEXT: s_cbranch_execnz .LBB16_1
6643 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
6644 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
6645 ; GFX908-NEXT: s_setpc_b64 s[30:31]
6647 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6649 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6650 ; GFX8-NEXT: s_mov_b32 s11, s17
6651 ; GFX8-NEXT: s_mov_b32 s10, s16
6652 ; GFX8-NEXT: s_mov_b32 s9, s7
6653 ; GFX8-NEXT: s_mov_b32 s8, s6
6654 ; GFX8-NEXT: v_mov_b32_e32 v1, s18
6655 ; GFX8-NEXT: buffer_load_dword v1, v1, s[8:11], 0 offen offset:1024
6656 ; GFX8-NEXT: s_add_i32 s4, s18, 0x400
6657 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
6658 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v0
6659 ; GFX8-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
6660 ; GFX8-NEXT: v_mov_b32_e32 v4, s4
6661 ; GFX8-NEXT: .LBB16_1: ; %atomicrmw.start
6662 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
6663 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6664 ; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v1
6665 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v1
6666 ; GFX8-NEXT: v_max_f32_e32 v0, v0, v2
6667 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v3
6668 ; GFX8-NEXT: v_bfe_u32 v6, v0, 16, 1
6669 ; GFX8-NEXT: v_bfe_u32 v8, v5, 16, 1
6670 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v0
6671 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v5
6672 ; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0x7fff, v6
6673 ; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0x7fff, v8
6674 ; GFX8-NEXT: v_or_b32_e32 v9, 0x400000, v5
6675 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
6676 ; GFX8-NEXT: v_or_b32_e32 v7, 0x400000, v0
6677 ; GFX8-NEXT: v_cmp_u_f32_e64 s[4:5], v0, v0
6678 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
6679 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, v7, s[4:5]
6680 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
6681 ; GFX8-NEXT: v_alignbit_b32 v0, v5, v0, 16
6682 ; GFX8-NEXT: v_mov_b32_e32 v6, v1
6683 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
6684 ; GFX8-NEXT: buffer_atomic_cmpswap v[5:6], v4, s[8:11], 0 offen glc
6685 ; GFX8-NEXT: s_waitcnt vmcnt(0)
6686 ; GFX8-NEXT: buffer_wbinvl1
6687 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v5, v1
6688 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
6689 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
6690 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
6691 ; GFX8-NEXT: s_cbranch_execnz .LBB16_1
6692 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
6693 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
6694 ; GFX8-NEXT: s_setpc_b64 s[30:31]
6696 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6698 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6699 ; GFX7-NEXT: s_mov_b32 s11, s17
6700 ; GFX7-NEXT: s_mov_b32 s10, s16
6701 ; GFX7-NEXT: s_mov_b32 s9, s7
6702 ; GFX7-NEXT: s_mov_b32 s8, s6
6703 ; GFX7-NEXT: v_mov_b32_e32 v2, s18
6704 ; GFX7-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024
6705 ; GFX7-NEXT: s_add_i32 s6, s18, 0x400
6706 ; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
6707 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v0
6708 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
6709 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
6710 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6711 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6712 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
6713 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v2
6714 ; GFX7-NEXT: v_mov_b32_e32 v2, s6
6715 ; GFX7-NEXT: .LBB16_1: ; %atomicrmw.start
6716 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
6717 ; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
6718 ; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
6719 ; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
6720 ; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
6721 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
6722 ; GFX7-NEXT: v_max_f32_e32 v5, v5, v0
6723 ; GFX7-NEXT: v_max_f32_e32 v6, v6, v1
6724 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v3, 16
6725 ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v5
6726 ; GFX7-NEXT: v_alignbit_b32 v3, v3, v6, 16
6727 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
6728 ; GFX7-NEXT: v_mov_b32_e32 v5, v3
6729 ; GFX7-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc
6730 ; GFX7-NEXT: s_waitcnt vmcnt(0)
6731 ; GFX7-NEXT: buffer_wbinvl1
6732 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6733 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
6734 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6735 ; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
6736 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
6737 ; GFX7-NEXT: s_cbranch_execnz .LBB16_1
6738 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
6739 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
6740 ; GFX7-NEXT: s_setpc_b64 s[30:31]
6742 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_v2bf16__offset:
6744 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6745 ; GFX6-NEXT: s_mov_b32 s11, s17
6746 ; GFX6-NEXT: s_mov_b32 s10, s16
6747 ; GFX6-NEXT: s_mov_b32 s9, s7
6748 ; GFX6-NEXT: s_mov_b32 s8, s6
6749 ; GFX6-NEXT: v_mov_b32_e32 v2, s18
6750 ; GFX6-NEXT: buffer_load_dword v2, v2, s[8:11], 0 offen offset:1024
6751 ; GFX6-NEXT: s_add_i32 s6, s18, 0x400
6752 ; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
6753 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v0
6754 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
6755 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
6756 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v3
6757 ; GFX6-NEXT: s_waitcnt vmcnt(0)
6758 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
6759 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v2
6760 ; GFX6-NEXT: v_mov_b32_e32 v2, s6
6761 ; GFX6-NEXT: .LBB16_1: ; %atomicrmw.start
6762 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
6763 ; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
6764 ; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
6765 ; GFX6-NEXT: s_waitcnt expcnt(0)
6766 ; GFX6-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
6767 ; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
6768 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
6769 ; GFX6-NEXT: v_max_f32_e32 v5, v5, v0
6770 ; GFX6-NEXT: v_max_f32_e32 v6, v6, v1
6771 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v3, 16
6772 ; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v5
6773 ; GFX6-NEXT: v_alignbit_b32 v3, v3, v6, 16
6774 ; GFX6-NEXT: v_mov_b32_e32 v6, v4
6775 ; GFX6-NEXT: v_mov_b32_e32 v5, v3
6776 ; GFX6-NEXT: buffer_atomic_cmpswap v[5:6], v2, s[8:11], 0 offen glc
6777 ; GFX6-NEXT: s_waitcnt vmcnt(0)
6778 ; GFX6-NEXT: buffer_wbinvl1
6779 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v4
6780 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
6781 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
6782 ; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
6783 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
6784 ; GFX6-NEXT: s_cbranch_execnz .LBB16_1
6785 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
6786 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
6787 ; GFX6-NEXT: s_waitcnt expcnt(0)
6788 ; GFX6-NEXT: s_setpc_b64 s[30:31]
; Element index 256 of <2 x bfloat>; the assertions above address this location
; as byte offset 1024 (offset:1024 on the load, +0x400 for the cmpswap address).
6789 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
6790 %unused = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
6794 define <2 x bfloat> @buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall(ptr addrspace(7) %ptr, <2 x bfloat> %val) #0 {
6795 ; GFX12-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
6797 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
6798 ; GFX12-NEXT: s_wait_expcnt 0x0
6799 ; GFX12-NEXT: s_wait_samplecnt 0x0
6800 ; GFX12-NEXT: s_wait_bvhcnt 0x0
6801 ; GFX12-NEXT: s_wait_kmcnt 0x0
6802 ; GFX12-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
6803 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
6804 ; GFX12-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
6805 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
6806 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
6807 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
6808 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
6809 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6810 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6811 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6812 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6813 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
6814 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
6815 ; GFX12-NEXT: buffer_load_b32 v6, v4, s[4:7], null offen offset:1024
6816 ; GFX12-NEXT: ; implicit-def: $vgpr4
6817 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
6818 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1
6819 ; GFX12-NEXT: ; %bb.2:
6820 ; GFX12-NEXT: s_mov_b32 exec_lo, s1
6821 ; GFX12-NEXT: v_lshlrev_b32_e32 v8, 16, v5
6822 ; GFX12-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
6823 ; GFX12-NEXT: s_mov_b32 s1, 0
6824 ; GFX12-NEXT: .LBB17_3: ; %atomicrmw.start
6825 ; GFX12-NEXT: ; =>This Loop Header: Depth=1
6826 ; GFX12-NEXT: ; Child Loop BB17_4 Depth 2
6827 ; GFX12-NEXT: s_wait_loadcnt 0x0
6828 ; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6829 ; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6830 ; GFX12-NEXT: s_mov_b32 s2, exec_lo
6831 ; GFX12-NEXT: global_wb scope:SCOPE_DEV
6832 ; GFX12-NEXT: s_wait_storecnt 0x0
6833 ; GFX12-NEXT: v_dual_max_num_f32 v5, v5, v9 :: v_dual_max_num_f32 v4, v4, v8
6834 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
6835 ; GFX12-NEXT: v_bfe_u32 v11, v5, 16, 1
6836 ; GFX12-NEXT: v_bfe_u32 v10, v4, 16, 1
6837 ; GFX12-NEXT: v_or_b32_e32 v12, 0x400000, v4
6838 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
6839 ; GFX12-NEXT: v_or_b32_e32 v13, 0x400000, v5
6840 ; GFX12-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
6841 ; GFX12-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
6842 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
6843 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
6844 ; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
6845 ; GFX12-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
6846 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6847 ; GFX12-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
6848 ; GFX12-NEXT: v_mov_b32_e32 v4, v5
6849 ; GFX12-NEXT: v_mov_b32_e32 v5, v6
6850 ; GFX12-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
6851 ; GFX12-NEXT: ; => This Inner Loop Header: Depth=2
6852 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
6853 ; GFX12-NEXT: v_readfirstlane_b32 s5, v1
6854 ; GFX12-NEXT: v_readfirstlane_b32 s6, v2
6855 ; GFX12-NEXT: v_readfirstlane_b32 s7, v3
6856 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6857 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6858 ; GFX12-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6859 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6860 ; GFX12-NEXT: s_and_b32 s0, vcc_lo, s0
6861 ; GFX12-NEXT: s_and_saveexec_b32 s0, s0
6862 ; GFX12-NEXT: s_wait_loadcnt 0x0
6863 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], null offen th:TH_ATOMIC_RETURN
6864 ; GFX12-NEXT: s_xor_b32 exec_lo, exec_lo, s0
6865 ; GFX12-NEXT: s_cbranch_execnz .LBB17_4
6866 ; GFX12-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
6867 ; GFX12-NEXT: s_mov_b32 exec_lo, s2
6868 ; GFX12-NEXT: s_wait_loadcnt 0x0
6869 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
6870 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
6871 ; GFX12-NEXT: global_inv scope:SCOPE_DEV
6872 ; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1
6873 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
6874 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
6875 ; GFX12-NEXT: s_cbranch_execnz .LBB17_3
6876 ; GFX12-NEXT: ; %bb.6: ; %atomicrmw.end
6877 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1
6878 ; GFX12-NEXT: v_mov_b32_e32 v0, v4
6879 ; GFX12-NEXT: s_setpc_b64 s[30:31]
6881 ; GFX940-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
6883 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6884 ; GFX940-NEXT: v_add_u32_e32 v8, 0x400, v4
6885 ; GFX940-NEXT: s_mov_b64 s[2:3], exec
6886 ; GFX940-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
6887 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
6888 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
6889 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
6890 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
6891 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
6892 ; GFX940-NEXT: s_nop 0
6893 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
6894 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
6895 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
6896 ; GFX940-NEXT: buffer_load_dword v7, v4, s[4:7], 0 offen offset:1024
6897 ; GFX940-NEXT: ; implicit-def: $vgpr4
6898 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
6899 ; GFX940-NEXT: s_cbranch_execnz .LBB17_1
6900 ; GFX940-NEXT: ; %bb.2:
6901 ; GFX940-NEXT: s_mov_b64 exec, s[2:3]
6902 ; GFX940-NEXT: s_mov_b64 s[2:3], 0
6903 ; GFX940-NEXT: v_lshlrev_b32_e32 v9, 16, v5
6904 ; GFX940-NEXT: s_movk_i32 s10, 0x7fff
6905 ; GFX940-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
6906 ; GFX940-NEXT: s_mov_b32 s11, 0x7060302
6907 ; GFX940-NEXT: .LBB17_3: ; %atomicrmw.start
6908 ; GFX940-NEXT: ; =>This Loop Header: Depth=1
6909 ; GFX940-NEXT: ; Child Loop BB17_4 Depth 2
6910 ; GFX940-NEXT: s_waitcnt vmcnt(0)
6911 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v7
6912 ; GFX940-NEXT: v_max_f32_e32 v4, v4, v9
6913 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1
6914 ; GFX940-NEXT: v_add3_u32 v5, v5, v4, s10
6915 ; GFX940-NEXT: v_or_b32_e32 v6, 0x400000, v4
6916 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
6917 ; GFX940-NEXT: s_mov_b64 s[8:9], exec
6918 ; GFX940-NEXT: buffer_wbl2 sc1
6919 ; GFX940-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
6920 ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
6921 ; GFX940-NEXT: v_max_f32_e32 v5, v5, v10
6922 ; GFX940-NEXT: v_bfe_u32 v6, v5, 16, 1
6923 ; GFX940-NEXT: v_add3_u32 v6, v6, v5, s10
6924 ; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v5
6925 ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
6926 ; GFX940-NEXT: s_nop 1
6927 ; GFX940-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
6928 ; GFX940-NEXT: v_perm_b32 v6, v5, v4, s11
6929 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[6:7]
6930 ; GFX940-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
6931 ; GFX940-NEXT: ; => This Inner Loop Header: Depth=2
6932 ; GFX940-NEXT: v_readfirstlane_b32 s4, v0
6933 ; GFX940-NEXT: v_readfirstlane_b32 s5, v1
6934 ; GFX940-NEXT: v_readfirstlane_b32 s6, v2
6935 ; GFX940-NEXT: v_readfirstlane_b32 s7, v3
6936 ; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, s[4:5], v[0:1]
6937 ; GFX940-NEXT: s_nop 0
6938 ; GFX940-NEXT: v_cmp_eq_u64_e64 s[0:1], s[6:7], v[2:3]
6939 ; GFX940-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
6940 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], s[0:1]
6941 ; GFX940-NEXT: s_waitcnt vmcnt(0)
6942 ; GFX940-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[4:7], 0 offen sc0
6943 ; GFX940-NEXT: s_xor_b64 exec, exec, s[0:1]
6944 ; GFX940-NEXT: s_cbranch_execnz .LBB17_4
6945 ; GFX940-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
6946 ; GFX940-NEXT: s_mov_b64 exec, s[8:9]
6947 ; GFX940-NEXT: s_waitcnt vmcnt(0)
6948 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
6949 ; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
6950 ; GFX940-NEXT: v_mov_b32_e32 v7, v4
6951 ; GFX940-NEXT: buffer_inv sc1
6952 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
6953 ; GFX940-NEXT: s_cbranch_execnz .LBB17_3
6954 ; GFX940-NEXT: ; %bb.6: ; %atomicrmw.end
6955 ; GFX940-NEXT: s_or_b64 exec, exec, s[2:3]
6956 ; GFX940-NEXT: v_mov_b32_e32 v0, v4
6957 ; GFX940-NEXT: s_setpc_b64 s[30:31]
6959 ; GFX11-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
6961 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
6962 ; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
6963 ; GFX11-NEXT: s_mov_b32 s1, 0
6964 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
6965 ; GFX11-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
6966 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
6967 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
6968 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
6969 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
6970 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
6971 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
6972 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
6973 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
6974 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
6975 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
6976 ; GFX11-NEXT: buffer_load_b32 v6, v4, s[4:7], 0 offen offset:1024
6977 ; GFX11-NEXT: ; implicit-def: $vgpr4
6978 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
6979 ; GFX11-NEXT: s_cbranch_execnz .LBB17_1
6980 ; GFX11-NEXT: ; %bb.2:
6981 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
6982 ; GFX11-NEXT: v_lshlrev_b32_e32 v8, 16, v5
6983 ; GFX11-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
6984 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x1
6985 ; GFX11-NEXT: .p2align 6
6986 ; GFX11-NEXT: .LBB17_3: ; %atomicrmw.start
6987 ; GFX11-NEXT: ; =>This Loop Header: Depth=1
6988 ; GFX11-NEXT: ; Child Loop BB17_4 Depth 2
6989 ; GFX11-NEXT: s_waitcnt vmcnt(0)
6990 ; GFX11-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
6991 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v6
6992 ; GFX11-NEXT: s_mov_b32 s2, exec_lo
6993 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
6994 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6995 ; GFX11-NEXT: v_dual_max_f32 v5, v5, v9 :: v_dual_max_f32 v4, v4, v8
6996 ; GFX11-NEXT: v_bfe_u32 v11, v5, 16, 1
6997 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
6998 ; GFX11-NEXT: v_bfe_u32 v10, v4, 16, 1
6999 ; GFX11-NEXT: v_or_b32_e32 v12, 0x400000, v4
7000 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
7001 ; GFX11-NEXT: v_or_b32_e32 v13, 0x400000, v5
7002 ; GFX11-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
7003 ; GFX11-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
7004 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
7005 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
7006 ; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
7007 ; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
7008 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7009 ; GFX11-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
7010 ; GFX11-NEXT: v_mov_b32_e32 v4, v5
7011 ; GFX11-NEXT: v_mov_b32_e32 v5, v6
7012 ; GFX11-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7013 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2
7014 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
7015 ; GFX11-NEXT: v_readfirstlane_b32 s5, v1
7016 ; GFX11-NEXT: v_readfirstlane_b32 s6, v2
7017 ; GFX11-NEXT: v_readfirstlane_b32 s7, v3
7018 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
7019 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[4:5], v[0:1]
7020 ; GFX11-NEXT: v_cmp_eq_u64_e64 s0, s[6:7], v[2:3]
7021 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
7022 ; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0
7023 ; GFX11-NEXT: s_and_saveexec_b32 s0, s0
7024 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7025 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v7, s[4:7], 0 offen glc
7026 ; GFX11-NEXT: s_xor_b32 exec_lo, exec_lo, s0
7027 ; GFX11-NEXT: s_cbranch_execnz .LBB17_4
7028 ; GFX11-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7029 ; GFX11-NEXT: s_mov_b32 exec_lo, s2
7030 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7031 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
7032 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
7033 ; GFX11-NEXT: buffer_gl1_inv
7034 ; GFX11-NEXT: buffer_gl0_inv
7035 ; GFX11-NEXT: s_or_b32 s1, vcc_lo, s1
7036 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7037 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1
7038 ; GFX11-NEXT: s_cbranch_execnz .LBB17_3
7039 ; GFX11-NEXT: ; %bb.6: ; %atomicrmw.end
7040 ; GFX11-NEXT: s_set_inst_prefetch_distance 0x2
7041 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s1
7042 ; GFX11-NEXT: v_mov_b32_e32 v0, v4
7043 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7045 ; GFX10-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
7047 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7048 ; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x400, v4
7049 ; GFX10-NEXT: s_mov_b32 s5, 0
7050 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
7051 ; GFX10-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
7052 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
7053 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
7054 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
7055 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
7056 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
7057 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
7058 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
7059 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
7060 ; GFX10-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
7061 ; GFX10-NEXT: ; implicit-def: $vgpr4
7062 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
7063 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
7064 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1
7065 ; GFX10-NEXT: ; %bb.2:
7066 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
7067 ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 16, v5
7068 ; GFX10-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
7069 ; GFX10-NEXT: .LBB17_3: ; %atomicrmw.start
7070 ; GFX10-NEXT: ; =>This Loop Header: Depth=1
7071 ; GFX10-NEXT: ; Child Loop BB17_4 Depth 2
7072 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7073 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 16, v6
7074 ; GFX10-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
7075 ; GFX10-NEXT: s_mov_b32 s6, exec_lo
7076 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
7077 ; GFX10-NEXT: v_max_f32_e32 v4, v4, v8
7078 ; GFX10-NEXT: v_max_f32_e32 v5, v5, v9
7079 ; GFX10-NEXT: v_bfe_u32 v10, v4, 16, 1
7080 ; GFX10-NEXT: v_bfe_u32 v11, v5, 16, 1
7081 ; GFX10-NEXT: v_or_b32_e32 v12, 0x400000, v4
7082 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v4, v4
7083 ; GFX10-NEXT: v_or_b32_e32 v13, 0x400000, v5
7084 ; GFX10-NEXT: v_add3_u32 v10, v10, v4, 0x7fff
7085 ; GFX10-NEXT: v_add3_u32 v11, v11, v5, 0x7fff
7086 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v10, v12, vcc_lo
7087 ; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5
7088 ; GFX10-NEXT: v_cndmask_b32_e32 v5, v11, v13, vcc_lo
7089 ; GFX10-NEXT: v_perm_b32 v5, v5, v4, 0x7060302
7090 ; GFX10-NEXT: v_mov_b32_e32 v4, v5
7091 ; GFX10-NEXT: v_mov_b32_e32 v5, v6
7092 ; GFX10-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7093 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2
7094 ; GFX10-NEXT: v_readfirstlane_b32 s8, v0
7095 ; GFX10-NEXT: v_readfirstlane_b32 s9, v1
7096 ; GFX10-NEXT: v_readfirstlane_b32 s10, v2
7097 ; GFX10-NEXT: v_readfirstlane_b32 s11, v3
7098 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[0:1]
7099 ; GFX10-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3]
7100 ; GFX10-NEXT: s_and_b32 s4, vcc_lo, s4
7101 ; GFX10-NEXT: s_and_saveexec_b32 s4, s4
7102 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7103 ; GFX10-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
7104 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
7105 ; GFX10-NEXT: s_xor_b32 exec_lo, exec_lo, s4
7106 ; GFX10-NEXT: s_cbranch_execnz .LBB17_4
7107 ; GFX10-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7108 ; GFX10-NEXT: s_mov_b32 exec_lo, s6
7109 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7110 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v6
7111 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
7112 ; GFX10-NEXT: buffer_gl1_inv
7113 ; GFX10-NEXT: buffer_gl0_inv
7114 ; GFX10-NEXT: s_or_b32 s5, vcc_lo, s5
7115 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
7116 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s5
7117 ; GFX10-NEXT: s_cbranch_execnz .LBB17_3
7118 ; GFX10-NEXT: ; %bb.6: ; %atomicrmw.end
7119 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s5
7120 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
7121 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7123 ; GFX90A-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
7125 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7126 ; GFX90A-NEXT: v_add_u32_e32 v8, 0x400, v4
7127 ; GFX90A-NEXT: s_mov_b64 s[6:7], exec
7128 ; GFX90A-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
7129 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
7130 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
7131 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
7132 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
7133 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7134 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7135 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7136 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7137 ; GFX90A-NEXT: s_nop 0
7138 ; GFX90A-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
7139 ; GFX90A-NEXT: ; implicit-def: $vgpr4
7140 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
7141 ; GFX90A-NEXT: s_cbranch_execnz .LBB17_1
7142 ; GFX90A-NEXT: ; %bb.2:
7143 ; GFX90A-NEXT: s_mov_b64 exec, s[6:7]
7144 ; GFX90A-NEXT: s_mov_b64 s[6:7], 0
7145 ; GFX90A-NEXT: v_lshlrev_b32_e32 v9, 16, v5
7146 ; GFX90A-NEXT: s_movk_i32 s14, 0x7fff
7147 ; GFX90A-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
7148 ; GFX90A-NEXT: s_mov_b32 s15, 0x7060302
7149 ; GFX90A-NEXT: .LBB17_3: ; %atomicrmw.start
7150 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1
7151 ; GFX90A-NEXT: ; Child Loop BB17_4 Depth 2
7152 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
7153 ; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 16, v7
7154 ; GFX90A-NEXT: v_max_f32_e32 v4, v4, v9
7155 ; GFX90A-NEXT: v_bfe_u32 v5, v4, 16, 1
7156 ; GFX90A-NEXT: v_add3_u32 v5, v5, v4, s14
7157 ; GFX90A-NEXT: v_or_b32_e32 v6, 0x400000, v4
7158 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
7159 ; GFX90A-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc
7160 ; GFX90A-NEXT: v_and_b32_e32 v5, 0xffff0000, v7
7161 ; GFX90A-NEXT: v_max_f32_e32 v5, v5, v10
7162 ; GFX90A-NEXT: v_bfe_u32 v6, v5, 16, 1
7163 ; GFX90A-NEXT: v_add3_u32 v6, v6, v5, s14
7164 ; GFX90A-NEXT: v_or_b32_e32 v11, 0x400000, v5
7165 ; GFX90A-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
7166 ; GFX90A-NEXT: v_cndmask_b32_e32 v5, v6, v11, vcc
7167 ; GFX90A-NEXT: v_perm_b32 v6, v5, v4, s15
7168 ; GFX90A-NEXT: s_mov_b64 s[12:13], exec
7169 ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
7170 ; GFX90A-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7171 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
7172 ; GFX90A-NEXT: v_readfirstlane_b32 s8, v0
7173 ; GFX90A-NEXT: v_readfirstlane_b32 s9, v1
7174 ; GFX90A-NEXT: v_readfirstlane_b32 s10, v2
7175 ; GFX90A-NEXT: v_readfirstlane_b32 s11, v3
7176 ; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7177 ; GFX90A-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7178 ; GFX90A-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7179 ; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7180 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
7181 ; GFX90A-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
7182 ; GFX90A-NEXT: s_xor_b64 exec, exec, s[4:5]
7183 ; GFX90A-NEXT: s_cbranch_execnz .LBB17_4
7184 ; GFX90A-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7185 ; GFX90A-NEXT: s_mov_b64 exec, s[12:13]
7186 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
7187 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
7188 ; GFX90A-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7189 ; GFX90A-NEXT: v_mov_b32_e32 v7, v4
7190 ; GFX90A-NEXT: buffer_wbinvl1
7191 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[6:7]
7192 ; GFX90A-NEXT: s_cbranch_execnz .LBB17_3
7193 ; GFX90A-NEXT: ; %bb.6: ; %atomicrmw.end
7194 ; GFX90A-NEXT: s_or_b64 exec, exec, s[6:7]
7195 ; GFX90A-NEXT: v_mov_b32_e32 v0, v4
7196 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
7198 ; GFX908-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
7200 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7201 ; GFX908-NEXT: v_add_u32_e32 v7, 0x400, v4
7202 ; GFX908-NEXT: s_mov_b64 s[6:7], exec
7203 ; GFX908-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
7204 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
7205 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
7206 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
7207 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
7208 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7209 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7210 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7211 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7212 ; GFX908-NEXT: s_nop 0
7213 ; GFX908-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
7214 ; GFX908-NEXT: ; implicit-def: $vgpr4
7215 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
7216 ; GFX908-NEXT: s_cbranch_execnz .LBB17_1
7217 ; GFX908-NEXT: ; %bb.2:
7218 ; GFX908-NEXT: s_mov_b64 exec, s[6:7]
7219 ; GFX908-NEXT: s_mov_b64 s[6:7], 0
7220 ; GFX908-NEXT: v_lshlrev_b32_e32 v8, 16, v5
7221 ; GFX908-NEXT: s_movk_i32 s14, 0x7fff
7222 ; GFX908-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
7223 ; GFX908-NEXT: s_mov_b32 s15, 0x7060302
7224 ; GFX908-NEXT: .LBB17_3: ; %atomicrmw.start
7225 ; GFX908-NEXT: ; =>This Loop Header: Depth=1
7226 ; GFX908-NEXT: ; Child Loop BB17_4 Depth 2
7227 ; GFX908-NEXT: s_waitcnt vmcnt(0)
7228 ; GFX908-NEXT: v_lshlrev_b32_e32 v4, 16, v6
7229 ; GFX908-NEXT: v_max_f32_e32 v4, v4, v8
7230 ; GFX908-NEXT: v_bfe_u32 v5, v4, 16, 1
7231 ; GFX908-NEXT: v_add3_u32 v5, v5, v4, s14
7232 ; GFX908-NEXT: v_or_b32_e32 v10, 0x400000, v4
7233 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
7234 ; GFX908-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
7235 ; GFX908-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
7236 ; GFX908-NEXT: v_max_f32_e32 v5, v5, v9
7237 ; GFX908-NEXT: v_bfe_u32 v10, v5, 16, 1
7238 ; GFX908-NEXT: v_add3_u32 v10, v10, v5, s14
7239 ; GFX908-NEXT: v_or_b32_e32 v11, 0x400000, v5
7240 ; GFX908-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
7241 ; GFX908-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
7242 ; GFX908-NEXT: v_perm_b32 v5, v5, v4, s15
7243 ; GFX908-NEXT: v_mov_b32_e32 v4, v5
7244 ; GFX908-NEXT: s_mov_b64 s[12:13], exec
7245 ; GFX908-NEXT: v_mov_b32_e32 v5, v6
7246 ; GFX908-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7247 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
7248 ; GFX908-NEXT: v_readfirstlane_b32 s8, v0
7249 ; GFX908-NEXT: v_readfirstlane_b32 s9, v1
7250 ; GFX908-NEXT: v_readfirstlane_b32 s10, v2
7251 ; GFX908-NEXT: v_readfirstlane_b32 s11, v3
7252 ; GFX908-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7253 ; GFX908-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7254 ; GFX908-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7255 ; GFX908-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7256 ; GFX908-NEXT: s_waitcnt vmcnt(0)
7257 ; GFX908-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
7258 ; GFX908-NEXT: s_xor_b64 exec, exec, s[4:5]
7259 ; GFX908-NEXT: s_cbranch_execnz .LBB17_4
7260 ; GFX908-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7261 ; GFX908-NEXT: s_mov_b64 exec, s[12:13]
7262 ; GFX908-NEXT: s_waitcnt vmcnt(0)
7263 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
7264 ; GFX908-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7265 ; GFX908-NEXT: v_mov_b32_e32 v6, v4
7266 ; GFX908-NEXT: buffer_wbinvl1
7267 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[6:7]
7268 ; GFX908-NEXT: s_cbranch_execnz .LBB17_3
7269 ; GFX908-NEXT: ; %bb.6: ; %atomicrmw.end
7270 ; GFX908-NEXT: s_or_b64 exec, exec, s[6:7]
7271 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
7272 ; GFX908-NEXT: s_setpc_b64 s[30:31]
7274 ; GFX8-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
7276 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7277 ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 0x400, v4
7278 ; GFX8-NEXT: s_mov_b64 s[6:7], exec
7279 ; GFX8-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
7280 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
7281 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
7282 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
7283 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
7284 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7285 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7286 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7287 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7288 ; GFX8-NEXT: s_nop 0
7289 ; GFX8-NEXT: buffer_load_dword v6, v4, s[8:11], 0 offen offset:1024
7290 ; GFX8-NEXT: ; implicit-def: $vgpr4
7291 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
7292 ; GFX8-NEXT: s_cbranch_execnz .LBB17_1
7293 ; GFX8-NEXT: ; %bb.2:
7294 ; GFX8-NEXT: s_mov_b64 exec, s[6:7]
7295 ; GFX8-NEXT: s_mov_b64 s[6:7], 0
7296 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v5
7297 ; GFX8-NEXT: v_and_b32_e32 v9, 0xffff0000, v5
7298 ; GFX8-NEXT: .LBB17_3: ; %atomicrmw.start
7299 ; GFX8-NEXT: ; =>This Loop Header: Depth=1
7300 ; GFX8-NEXT: ; Child Loop BB17_4 Depth 2
7301 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7302 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v6
7303 ; GFX8-NEXT: v_max_f32_e32 v4, v4, v8
7304 ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1
7305 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, v5, v4
7306 ; GFX8-NEXT: v_add_u32_e32 v5, vcc, 0x7fff, v5
7307 ; GFX8-NEXT: v_or_b32_e32 v10, 0x400000, v4
7308 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v4, v4
7309 ; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v10, vcc
7310 ; GFX8-NEXT: v_and_b32_e32 v5, 0xffff0000, v6
7311 ; GFX8-NEXT: v_max_f32_e32 v5, v5, v9
7312 ; GFX8-NEXT: v_bfe_u32 v10, v5, 16, 1
7313 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, v10, v5
7314 ; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0x7fff, v10
7315 ; GFX8-NEXT: v_or_b32_e32 v11, 0x400000, v5
7316 ; GFX8-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
7317 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v10, v11, vcc
7318 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v5
7319 ; GFX8-NEXT: v_alignbit_b32 v5, v5, v4, 16
7320 ; GFX8-NEXT: v_mov_b32_e32 v4, v5
7321 ; GFX8-NEXT: s_mov_b64 s[12:13], exec
7322 ; GFX8-NEXT: v_mov_b32_e32 v5, v6
7323 ; GFX8-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7324 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2
7325 ; GFX8-NEXT: v_readfirstlane_b32 s8, v0
7326 ; GFX8-NEXT: v_readfirstlane_b32 s9, v1
7327 ; GFX8-NEXT: v_readfirstlane_b32 s10, v2
7328 ; GFX8-NEXT: v_readfirstlane_b32 s11, v3
7329 ; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7330 ; GFX8-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7331 ; GFX8-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7332 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7333 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7334 ; GFX8-NEXT: buffer_atomic_cmpswap v[4:5], v7, s[8:11], 0 offen glc
7335 ; GFX8-NEXT: s_xor_b64 exec, exec, s[4:5]
7336 ; GFX8-NEXT: s_cbranch_execnz .LBB17_4
7337 ; GFX8-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7338 ; GFX8-NEXT: s_mov_b64 exec, s[12:13]
7339 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7340 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
7341 ; GFX8-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7342 ; GFX8-NEXT: v_mov_b32_e32 v6, v4
7343 ; GFX8-NEXT: buffer_wbinvl1
7344 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[6:7]
7345 ; GFX8-NEXT: s_cbranch_execnz .LBB17_3
7346 ; GFX8-NEXT: ; %bb.6: ; %atomicrmw.end
7347 ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7]
7348 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
7349 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7351 ; GFX7-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
7353 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7354 ; GFX7-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
7355 ; GFX7-NEXT: s_mov_b64 s[6:7], exec
7356 ; GFX7-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
7357 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
7358 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
7359 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
7360 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
7361 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7362 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7363 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7364 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7365 ; GFX7-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
7366 ; GFX7-NEXT: ; implicit-def: $vgpr4
7367 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
7368 ; GFX7-NEXT: s_cbranch_execnz .LBB17_1
7369 ; GFX7-NEXT: ; %bb.2:
7370 ; GFX7-NEXT: s_mov_b64 exec, s[6:7]
7371 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
7372 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
7373 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7374 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
7375 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
7376 ; GFX7-NEXT: s_mov_b64 s[6:7], 0
7377 ; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
7378 ; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
7379 ; GFX7-NEXT: .LBB17_3: ; %atomicrmw.start
7380 ; GFX7-NEXT: ; =>This Loop Header: Depth=1
7381 ; GFX7-NEXT: ; Child Loop BB17_4 Depth 2
7382 ; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v4
7383 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
7384 ; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v7
7385 ; GFX7-NEXT: v_max_f32_e32 v4, v4, v9
7386 ; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
7387 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
7388 ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
7389 ; GFX7-NEXT: v_max_f32_e32 v7, v7, v10
7390 ; GFX7-NEXT: v_alignbit_b32 v5, v5, v6, 16
7391 ; GFX7-NEXT: v_alignbit_b32 v4, v4, v7, 16
7392 ; GFX7-NEXT: v_mov_b32_e32 v7, v5
7393 ; GFX7-NEXT: s_mov_b64 s[12:13], exec
7394 ; GFX7-NEXT: v_mov_b32_e32 v6, v4
7395 ; GFX7-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7396 ; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
7397 ; GFX7-NEXT: v_readfirstlane_b32 s8, v0
7398 ; GFX7-NEXT: v_readfirstlane_b32 s9, v1
7399 ; GFX7-NEXT: v_readfirstlane_b32 s10, v2
7400 ; GFX7-NEXT: v_readfirstlane_b32 s11, v3
7401 ; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7402 ; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7403 ; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7404 ; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7405 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7406 ; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
7407 ; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
7408 ; GFX7-NEXT: s_cbranch_execnz .LBB17_4
7409 ; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7410 ; GFX7-NEXT: s_mov_b64 exec, s[12:13]
7411 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7412 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
7413 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
7414 ; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7415 ; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v6
7416 ; GFX7-NEXT: buffer_wbinvl1
7417 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
7418 ; GFX7-NEXT: s_cbranch_execnz .LBB17_3
7419 ; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
7420 ; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
7421 ; GFX7-NEXT: v_mov_b32_e32 v0, v7
7422 ; GFX7-NEXT: v_mov_b32_e32 v1, v4
7423 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7425 ; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_v2bf16__offset__waterfall:
7427 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7428 ; GFX6-NEXT: v_add_i32_e32 v8, vcc, 0x400, v4
7429 ; GFX6-NEXT: s_mov_b64 s[6:7], exec
7430 ; GFX6-NEXT: .LBB17_1: ; =>This Inner Loop Header: Depth=1
7431 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
7432 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
7433 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
7434 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
7435 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7436 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7437 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7438 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7439 ; GFX6-NEXT: buffer_load_dword v7, v4, s[8:11], 0 offen offset:1024
7440 ; GFX6-NEXT: ; implicit-def: $vgpr4
7441 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
7442 ; GFX6-NEXT: s_cbranch_execnz .LBB17_1
7443 ; GFX6-NEXT: ; %bb.2:
7444 ; GFX6-NEXT: s_mov_b64 exec, s[6:7]
7445 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v6
7446 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v5
7447 ; GFX6-NEXT: s_waitcnt vmcnt(0)
7448 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v7
7449 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
7450 ; GFX6-NEXT: s_mov_b64 s[6:7], 0
7451 ; GFX6-NEXT: v_and_b32_e32 v9, 0xffff0000, v6
7452 ; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v5
7453 ; GFX6-NEXT: .LBB17_3: ; %atomicrmw.start
7454 ; GFX6-NEXT: ; =>This Loop Header: Depth=1
7455 ; GFX6-NEXT: ; Child Loop BB17_4 Depth 2
7456 ; GFX6-NEXT: v_mul_f32_e32 v5, 1.0, v4
7457 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
7458 ; GFX6-NEXT: v_mul_f32_e32 v6, 1.0, v7
7459 ; GFX6-NEXT: v_max_f32_e32 v4, v4, v9
7460 ; GFX6-NEXT: v_and_b32_e32 v7, 0xffff0000, v6
7461 ; GFX6-NEXT: v_lshrrev_b32_e32 v5, 16, v5
7462 ; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
7463 ; GFX6-NEXT: v_max_f32_e32 v7, v7, v10
7464 ; GFX6-NEXT: v_alignbit_b32 v5, v5, v6, 16
7465 ; GFX6-NEXT: v_alignbit_b32 v4, v4, v7, 16
7466 ; GFX6-NEXT: v_mov_b32_e32 v7, v5
7467 ; GFX6-NEXT: s_mov_b64 s[12:13], exec
7468 ; GFX6-NEXT: v_mov_b32_e32 v6, v4
7469 ; GFX6-NEXT: .LBB17_4: ; Parent Loop BB17_3 Depth=1
7470 ; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
7471 ; GFX6-NEXT: v_readfirstlane_b32 s8, v0
7472 ; GFX6-NEXT: v_readfirstlane_b32 s9, v1
7473 ; GFX6-NEXT: v_readfirstlane_b32 s10, v2
7474 ; GFX6-NEXT: v_readfirstlane_b32 s11, v3
7475 ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
7476 ; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
7477 ; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
7478 ; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
7479 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
7480 ; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v8, s[8:11], 0 offen glc
7481 ; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
7482 ; GFX6-NEXT: s_cbranch_execnz .LBB17_4
7483 ; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB17_3 Depth=1
7484 ; GFX6-NEXT: s_mov_b64 exec, s[12:13]
7485 ; GFX6-NEXT: s_waitcnt vmcnt(0)
7486 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
7487 ; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v6
7488 ; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
7489 ; GFX6-NEXT: s_waitcnt expcnt(0)
7490 ; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v6
7491 ; GFX6-NEXT: buffer_wbinvl1
7492 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
7493 ; GFX6-NEXT: s_cbranch_execnz .LBB17_3
7494 ; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
7495 ; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
7496 ; GFX6-NEXT: v_mov_b32_e32 v0, v7
7497 ; GFX6-NEXT: v_mov_b32_e32 v1, v4
7498 ; GFX6-NEXT: s_setpc_b64 s[30:31]
7499 %gep = getelementptr <2 x bfloat>, ptr addrspace(7) %ptr, i32 256
7500 %result = atomicrmw fmax ptr addrspace(7) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
7501 ret <2 x bfloat> %result
7504 ; --------------------------------------------------------------------
7506 ; --------------------------------------------------------------------
7508 define float @buffer_fat_ptr_system_atomic_fmax_ret_f32__offset(ptr addrspace(7) inreg %ptr, float %val) #0 {
7509 ; GFX12-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7511 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
7512 ; GFX12-NEXT: s_wait_expcnt 0x0
7513 ; GFX12-NEXT: s_wait_samplecnt 0x0
7514 ; GFX12-NEXT: s_wait_bvhcnt 0x0
7515 ; GFX12-NEXT: s_wait_kmcnt 0x0
7516 ; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
7517 ; GFX12-NEXT: s_add_co_i32 s4, s6, 0x400
7518 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
7519 ; GFX12-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_num_f32 v2, v1, v1
7520 ; GFX12-NEXT: buffer_load_b32 v0, v0, s[0:3], null offen offset:1024
7521 ; GFX12-NEXT: s_mov_b32 s4, 0
7522 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
7523 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
7524 ; GFX12-NEXT: s_wait_loadcnt 0x0
7525 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
7526 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
7527 ; GFX12-NEXT: s_wait_storecnt 0x0
7528 ; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
7529 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7530 ; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v2
7531 ; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
7532 ; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], null offen th:TH_ATOMIC_RETURN
7533 ; GFX12-NEXT: s_wait_loadcnt 0x0
7534 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
7535 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
7536 ; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
7537 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7538 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
7539 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1
7540 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
7541 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
7542 ; GFX12-NEXT: s_setpc_b64 s[30:31]
7544 ; GFX940-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7546 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7547 ; GFX940-NEXT: v_mov_b32_e32 v1, v0
7548 ; GFX940-NEXT: v_mov_b32_e32 v0, s6
7549 ; GFX940-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:1024
7550 ; GFX940-NEXT: s_addk_i32 s6, 0x400
7551 ; GFX940-NEXT: s_mov_b64 s[4:5], 0
7552 ; GFX940-NEXT: v_max_f32_e32 v2, v1, v1
7553 ; GFX940-NEXT: v_mov_b32_e32 v3, s6
7554 ; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
7555 ; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
7556 ; GFX940-NEXT: s_waitcnt vmcnt(0)
7557 ; GFX940-NEXT: v_mov_b32_e32 v5, v0
7558 ; GFX940-NEXT: v_max_f32_e32 v0, v5, v5
7559 ; GFX940-NEXT: v_max_f32_e32 v4, v0, v2
7560 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
7561 ; GFX940-NEXT: buffer_wbl2 sc0 sc1
7562 ; GFX940-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[0:3], 0 offen sc0
7563 ; GFX940-NEXT: s_waitcnt vmcnt(0)
7564 ; GFX940-NEXT: buffer_inv sc0 sc1
7565 ; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
7566 ; GFX940-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7567 ; GFX940-NEXT: s_andn2_b64 exec, exec, s[4:5]
7568 ; GFX940-NEXT: s_cbranch_execnz .LBB18_1
7569 ; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
7570 ; GFX940-NEXT: s_or_b64 exec, exec, s[4:5]
7571 ; GFX940-NEXT: s_setpc_b64 s[30:31]
7573 ; GFX11-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7575 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7576 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s6
7577 ; GFX11-NEXT: s_add_i32 s4, s6, 0x400
7578 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
7579 ; GFX11-NEXT: v_dual_mov_b32 v3, s4 :: v_dual_max_f32 v2, v1, v1
7580 ; GFX11-NEXT: buffer_load_b32 v0, v0, s[0:3], 0 offen offset:1024
7581 ; GFX11-NEXT: s_mov_b32 s4, 0
7582 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
7583 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
7584 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7585 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
7586 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
7587 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7588 ; GFX11-NEXT: v_max_f32_e32 v0, v5, v5
7589 ; GFX11-NEXT: v_max_f32_e32 v4, v0, v2
7590 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
7591 ; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
7592 ; GFX11-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v3, s[0:3], 0 offen glc
7593 ; GFX11-NEXT: s_waitcnt vmcnt(0)
7594 ; GFX11-NEXT: buffer_gl1_inv
7595 ; GFX11-NEXT: buffer_gl0_inv
7596 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
7597 ; GFX11-NEXT: s_or_b32 s4, vcc_lo, s4
7598 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7599 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
7600 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1
7601 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
7602 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s4
7603 ; GFX11-NEXT: s_setpc_b64 s[30:31]
7605 ; GFX10-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7607 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7608 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
7609 ; GFX10-NEXT: v_mov_b32_e32 v0, s18
7610 ; GFX10-NEXT: s_mov_b32 s11, s17
7611 ; GFX10-NEXT: s_mov_b32 s10, s16
7612 ; GFX10-NEXT: s_mov_b32 s9, s7
7613 ; GFX10-NEXT: s_mov_b32 s8, s6
7614 ; GFX10-NEXT: s_add_i32 s4, s18, 0x400
7615 ; GFX10-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
7616 ; GFX10-NEXT: v_max_f32_e32 v2, v1, v1
7617 ; GFX10-NEXT: v_mov_b32_e32 v3, s4
7618 ; GFX10-NEXT: s_mov_b32 s4, 0
7619 ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
7620 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
7621 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7622 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
7623 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
7624 ; GFX10-NEXT: v_max_f32_e32 v0, v5, v5
7625 ; GFX10-NEXT: v_max_f32_e32 v4, v0, v2
7626 ; GFX10-NEXT: v_mov_b32_e32 v0, v4
7627 ; GFX10-NEXT: v_mov_b32_e32 v1, v5
7628 ; GFX10-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
7629 ; GFX10-NEXT: s_waitcnt vmcnt(0)
7630 ; GFX10-NEXT: buffer_gl1_inv
7631 ; GFX10-NEXT: buffer_gl0_inv
7632 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
7633 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
7634 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
7635 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1
7636 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
7637 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
7638 ; GFX10-NEXT: s_setpc_b64 s[30:31]
7640 ; GFX90A-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7642 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7643 ; GFX90A-NEXT: v_mov_b32_e32 v1, v0
7644 ; GFX90A-NEXT: s_mov_b32 s11, s17
7645 ; GFX90A-NEXT: s_mov_b32 s10, s16
7646 ; GFX90A-NEXT: s_mov_b32 s9, s7
7647 ; GFX90A-NEXT: s_mov_b32 s8, s6
7648 ; GFX90A-NEXT: v_mov_b32_e32 v0, s18
7649 ; GFX90A-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
7650 ; GFX90A-NEXT: s_add_i32 s6, s18, 0x400
7651 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
7652 ; GFX90A-NEXT: v_max_f32_e32 v2, v1, v1
7653 ; GFX90A-NEXT: v_mov_b32_e32 v3, s6
7654 ; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start
7655 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
7656 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
7657 ; GFX90A-NEXT: v_mov_b32_e32 v5, v0
7658 ; GFX90A-NEXT: v_max_f32_e32 v0, v5, v5
7659 ; GFX90A-NEXT: v_max_f32_e32 v4, v0, v2
7660 ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
7661 ; GFX90A-NEXT: buffer_wbl2
7662 ; GFX90A-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
7663 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
7664 ; GFX90A-NEXT: buffer_invl2
7665 ; GFX90A-NEXT: buffer_wbinvl1
7666 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
7667 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7668 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
7669 ; GFX90A-NEXT: s_cbranch_execnz .LBB18_1
7670 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
7671 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
7672 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
7674 ; GFX908-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7676 ; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7677 ; GFX908-NEXT: v_mov_b32_e32 v1, v0
7678 ; GFX908-NEXT: s_mov_b32 s11, s17
7679 ; GFX908-NEXT: s_mov_b32 s10, s16
7680 ; GFX908-NEXT: s_mov_b32 s9, s7
7681 ; GFX908-NEXT: s_mov_b32 s8, s6
7682 ; GFX908-NEXT: v_mov_b32_e32 v0, s18
7683 ; GFX908-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
7684 ; GFX908-NEXT: s_add_i32 s6, s18, 0x400
7685 ; GFX908-NEXT: s_mov_b64 s[4:5], 0
7686 ; GFX908-NEXT: v_max_f32_e32 v2, v1, v1
7687 ; GFX908-NEXT: v_mov_b32_e32 v3, s6
7688 ; GFX908-NEXT: .LBB18_1: ; %atomicrmw.start
7689 ; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
7690 ; GFX908-NEXT: s_waitcnt vmcnt(0)
7691 ; GFX908-NEXT: v_mov_b32_e32 v5, v0
7692 ; GFX908-NEXT: v_max_f32_e32 v0, v5, v5
7693 ; GFX908-NEXT: v_max_f32_e32 v4, v0, v2
7694 ; GFX908-NEXT: v_mov_b32_e32 v0, v4
7695 ; GFX908-NEXT: v_mov_b32_e32 v1, v5
7696 ; GFX908-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
7697 ; GFX908-NEXT: s_waitcnt vmcnt(0)
7698 ; GFX908-NEXT: buffer_wbinvl1
7699 ; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
7700 ; GFX908-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7701 ; GFX908-NEXT: s_andn2_b64 exec, exec, s[4:5]
7702 ; GFX908-NEXT: s_cbranch_execnz .LBB18_1
7703 ; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
7704 ; GFX908-NEXT: s_or_b64 exec, exec, s[4:5]
7705 ; GFX908-NEXT: s_setpc_b64 s[30:31]
7707 ; GFX8-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7709 ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7710 ; GFX8-NEXT: v_mov_b32_e32 v1, v0
7711 ; GFX8-NEXT: s_mov_b32 s11, s17
7712 ; GFX8-NEXT: s_mov_b32 s10, s16
7713 ; GFX8-NEXT: s_mov_b32 s9, s7
7714 ; GFX8-NEXT: s_mov_b32 s8, s6
7715 ; GFX8-NEXT: v_mov_b32_e32 v0, s18
7716 ; GFX8-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
7717 ; GFX8-NEXT: s_add_i32 s6, s18, 0x400
7718 ; GFX8-NEXT: s_mov_b64 s[4:5], 0
7719 ; GFX8-NEXT: v_mul_f32_e32 v2, 1.0, v1
7720 ; GFX8-NEXT: v_mov_b32_e32 v3, s6
7721 ; GFX8-NEXT: .LBB18_1: ; %atomicrmw.start
7722 ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
7723 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7724 ; GFX8-NEXT: v_mov_b32_e32 v5, v0
7725 ; GFX8-NEXT: v_mul_f32_e32 v0, 1.0, v5
7726 ; GFX8-NEXT: v_max_f32_e32 v4, v0, v2
7727 ; GFX8-NEXT: v_mov_b32_e32 v0, v4
7728 ; GFX8-NEXT: v_mov_b32_e32 v1, v5
7729 ; GFX8-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
7730 ; GFX8-NEXT: s_waitcnt vmcnt(0)
7731 ; GFX8-NEXT: buffer_wbinvl1
7732 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
7733 ; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7734 ; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5]
7735 ; GFX8-NEXT: s_cbranch_execnz .LBB18_1
7736 ; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end
7737 ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
7738 ; GFX8-NEXT: s_setpc_b64 s[30:31]
7740 ; GFX7-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7742 ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7743 ; GFX7-NEXT: v_mov_b32_e32 v1, v0
7744 ; GFX7-NEXT: s_mov_b32 s11, s17
7745 ; GFX7-NEXT: s_mov_b32 s10, s16
7746 ; GFX7-NEXT: s_mov_b32 s9, s7
7747 ; GFX7-NEXT: s_mov_b32 s8, s6
7748 ; GFX7-NEXT: v_mov_b32_e32 v0, s18
7749 ; GFX7-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
7750 ; GFX7-NEXT: s_add_i32 s6, s18, 0x400
7751 ; GFX7-NEXT: s_mov_b64 s[4:5], 0
7752 ; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v1
7753 ; GFX7-NEXT: v_mov_b32_e32 v3, s6
7754 ; GFX7-NEXT: .LBB18_1: ; %atomicrmw.start
7755 ; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
7756 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7757 ; GFX7-NEXT: v_mov_b32_e32 v5, v0
7758 ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v5
7759 ; GFX7-NEXT: v_max_f32_e32 v4, v0, v2
7760 ; GFX7-NEXT: v_mov_b32_e32 v0, v4
7761 ; GFX7-NEXT: v_mov_b32_e32 v1, v5
7762 ; GFX7-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
7763 ; GFX7-NEXT: s_waitcnt vmcnt(0)
7764 ; GFX7-NEXT: buffer_wbinvl1
7765 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
7766 ; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7767 ; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
7768 ; GFX7-NEXT: s_cbranch_execnz .LBB18_1
7769 ; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
7770 ; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
7771 ; GFX7-NEXT: s_setpc_b64 s[30:31]
7773 ; GFX6-LABEL: buffer_fat_ptr_system_atomic_fmax_ret_f32__offset:
7775 ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7776 ; GFX6-NEXT: v_mov_b32_e32 v1, v0
7777 ; GFX6-NEXT: s_mov_b32 s11, s17
7778 ; GFX6-NEXT: s_mov_b32 s10, s16
7779 ; GFX6-NEXT: s_mov_b32 s9, s7
7780 ; GFX6-NEXT: s_mov_b32 s8, s6
7781 ; GFX6-NEXT: v_mov_b32_e32 v0, s18
7782 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen offset:1024
7783 ; GFX6-NEXT: s_add_i32 s6, s18, 0x400
7784 ; GFX6-NEXT: s_mov_b64 s[4:5], 0
7785 ; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v1
7786 ; GFX6-NEXT: v_mov_b32_e32 v3, s6
7787 ; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
7788 ; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
7789 ; GFX6-NEXT: s_waitcnt vmcnt(0)
7790 ; GFX6-NEXT: v_mov_b32_e32 v5, v0
7791 ; GFX6-NEXT: s_waitcnt expcnt(0)
7792 ; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v5
7793 ; GFX6-NEXT: v_max_f32_e32 v4, v0, v2
7794 ; GFX6-NEXT: v_mov_b32_e32 v0, v4
7795 ; GFX6-NEXT: v_mov_b32_e32 v1, v5
7796 ; GFX6-NEXT: buffer_atomic_cmpswap v[0:1], v3, s[8:11], 0 offen glc
7797 ; GFX6-NEXT: s_waitcnt vmcnt(0)
7798 ; GFX6-NEXT: buffer_wbinvl1
7799 ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
7800 ; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
7801 ; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
7802 ; GFX6-NEXT: s_cbranch_execnz .LBB18_1
7803 ; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
7804 ; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
7805 ; GFX6-NEXT: s_waitcnt expcnt(0)
7806 ; GFX6-NEXT: s_setpc_b64 s[30:31]
7807 %gep = getelementptr float, ptr addrspace(7) %ptr, i32 256
7808 %result = atomicrmw fmax ptr addrspace(7) %gep, float %val seq_cst
; Attribute group #0, attached via `#0` to the function definitions in this
; test (for example @buffer_fat_ptr_agent_atomic_fmax_ret_f32__offset at the
; top of the file). It marks them nounwind and sets the string attribute
; "amdgpu-unsafe-fp-atomics" to "true".
; NOTE(review): presumably the string attribute is what allows the backend to
; pick native floating-point buffer atomics where a target has them -- confirm
; against the AMDGPU backend documentation before relying on this.
7812 attributes #0 = { nounwind "amdgpu-unsafe-fp-atomics"="true" }