1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
8 ; RUN: llc -march=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
9 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
10 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
11 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
12 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
13 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
; External helper that is only declared in this file. The check lines of the
; *_div_value_* kernel further down call it through a GOT-relative address and
; feed its float result into the fmax; "div" presumably stands for
; "divergent" -- TODO confirm.
15 declare float @div.float.value()
; Kernel under test: a single `atomicrmw fmax` of the constant 4.0 to the
; global (addrspace(1)) pointer received as the kernel argument, with agent
; syncscope and monotonic ordering.
;
; What the autogenerated check lines below expect, for every target listed in
; the RUN lines at the top of the file:
;   * v_mbcnt over exec followed by a compare against 0 masks execution down
;     to the first active lane, so the atomic is issued once per wave;
;   * the fmax is expanded into a compare-and-swap retry loop
;     (%atomicrmw.start): load the current value, take the max with 4.0,
;     cmpswap, and repeat until the returned value equals the expected one.
; The Iterative and DPP strategy runs expect the same instruction sequences
; for this kernel in the lines visible here.
; NOTE(review): attribute group #0 is defined outside this excerpt; the
; "unsafe" suffix presumably refers to what that group sets -- confirm there.
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
17 define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
18 ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
20 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
21 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
22 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
23 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
24 ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3
25 ; GFX7LESS-NEXT: ; %bb.1:
26 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
27 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
28 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
29 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
30 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
31 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
32 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
33 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
34 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
35 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
36 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
37 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
38 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
39 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
40 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
41 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
42 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
43 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
44 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
45 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
46 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
47 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
48 ; GFX7LESS-NEXT: .LBB0_3:
49 ; GFX7LESS-NEXT: s_endpgm
51 ; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
53 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
54 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
55 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
56 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
57 ; GFX9-NEXT: s_cbranch_execz .LBB0_3
59 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
60 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
61 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
62 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
63 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
64 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
65 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
66 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
67 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
68 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
69 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
70 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
71 ; GFX9-NEXT: s_waitcnt vmcnt(0)
72 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
73 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
74 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
75 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
76 ; GFX9-NEXT: s_cbranch_execnz .LBB0_2
80 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
82 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
83 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
84 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
85 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
86 ; GFX1064-NEXT: s_cbranch_execz .LBB0_3
87 ; GFX1064-NEXT: ; %bb.1:
88 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
89 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0
90 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
91 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
92 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
93 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
94 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
95 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
96 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
97 ; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
98 ; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
99 ; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
100 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
101 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
102 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
103 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
104 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
105 ; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
106 ; GFX1064-NEXT: .LBB0_3:
107 ; GFX1064-NEXT: s_endpgm
109 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
111 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
112 ; GFX1032-NEXT: s_mov_b32 s2, 0
113 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
114 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
115 ; GFX1032-NEXT: s_cbranch_execz .LBB0_3
116 ; GFX1032-NEXT: ; %bb.1:
117 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
118 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0
119 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
120 ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
121 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
122 ; GFX1032-NEXT: v_mov_b32_e32 v1, s3
123 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
124 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
125 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
126 ; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
127 ; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
128 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
129 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
130 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
131 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
132 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
133 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
134 ; GFX1032-NEXT: .LBB0_3:
135 ; GFX1032-NEXT: s_endpgm
137 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
139 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
140 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
141 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
142 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
143 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
144 ; GFX1164-NEXT: s_cbranch_execz .LBB0_3
145 ; GFX1164-NEXT: ; %bb.1:
146 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
147 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0
148 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
149 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
150 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
151 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
152 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
153 ; GFX1164-NEXT: .LBB0_2: ; %atomicrmw.start
154 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
155 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
156 ; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
157 ; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
158 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
159 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
160 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
161 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
162 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
163 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
164 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
165 ; GFX1164-NEXT: s_cbranch_execnz .LBB0_2
166 ; GFX1164-NEXT: .LBB0_3:
167 ; GFX1164-NEXT: s_endpgm
169 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
171 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
172 ; GFX1132-NEXT: s_mov_b32 s2, 0
173 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
174 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
175 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
176 ; GFX1132-NEXT: s_cbranch_execz .LBB0_3
177 ; GFX1132-NEXT: ; %bb.1:
178 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
179 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0
180 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
181 ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
182 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX1132-NEXT: v_mov_b32_e32 v1, s3
184 ; GFX1132-NEXT: .LBB0_2: ; %atomicrmw.start
185 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
186 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
187 ; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
188 ; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
189 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
190 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
191 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
192 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
193 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
194 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
195 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
196 ; GFX1132-NEXT: s_cbranch_execnz .LBB0_2
197 ; GFX1132-NEXT: .LBB0_3:
198 ; GFX1132-NEXT: s_endpgm
200 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
202 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
203 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
204 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
205 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
206 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
207 ; GFX9-DPP-NEXT: ; %bb.1:
208 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
209 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
210 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
211 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
213 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
214 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
215 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
216 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
217 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
218 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
219 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
220 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
221 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
222 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
223 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
224 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
225 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
226 ; GFX9-DPP-NEXT: .LBB0_3:
227 ; GFX9-DPP-NEXT: s_endpgm
229 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
230 ; GFX1064-DPP: ; %bb.0:
231 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
232 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
233 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
234 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
235 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
236 ; GFX1064-DPP-NEXT: ; %bb.1:
237 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
238 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
239 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
240 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
241 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
242 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
243 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
244 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
245 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
246 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
247 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
248 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
249 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
250 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
251 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
252 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
253 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
254 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
255 ; GFX1064-DPP-NEXT: .LBB0_3:
256 ; GFX1064-DPP-NEXT: s_endpgm
258 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
259 ; GFX1032-DPP: ; %bb.0:
260 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
261 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
262 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
263 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
264 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
265 ; GFX1032-DPP-NEXT: ; %bb.1:
266 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
267 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
268 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
269 ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
270 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
271 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
272 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
273 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
274 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
275 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
276 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
277 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
278 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
279 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
280 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
281 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
282 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
283 ; GFX1032-DPP-NEXT: .LBB0_3:
284 ; GFX1032-DPP-NEXT: s_endpgm
286 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
287 ; GFX1164-DPP: ; %bb.0:
288 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
289 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
290 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
291 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
292 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
293 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_3
294 ; GFX1164-DPP-NEXT: ; %bb.1:
295 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
296 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
297 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
298 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
299 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
300 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
301 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
302 ; GFX1164-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
303 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
304 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
305 ; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
306 ; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
307 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
308 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
309 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
310 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
311 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
312 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
313 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
314 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB0_2
315 ; GFX1164-DPP-NEXT: .LBB0_3:
316 ; GFX1164-DPP-NEXT: s_endpgm
318 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
319 ; GFX1132-DPP: ; %bb.0:
320 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
321 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
322 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
323 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
324 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
325 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_3
326 ; GFX1132-DPP-NEXT: ; %bb.1:
327 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
328 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
329 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
330 ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
331 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
332 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
333 ; GFX1132-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
334 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
335 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
336 ; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
337 ; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
338 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
339 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
340 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
341 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
342 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
343 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
344 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
345 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB0_2
346 ; GFX1132-DPP-NEXT: .LBB0_3:
347 ; GFX1132-DPP-NEXT: s_endpgm
; The operation under test; every check line above is expected llc output for
; this single instruction.
348 %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
352 define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
353 ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
355 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
356 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
357 ; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
358 ; GFX7LESS-NEXT: s_mov_b32 s42, -1
359 ; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
360 ; GFX7LESS-NEXT: s_add_u32 s40, s40, s11
361 ; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
362 ; GFX7LESS-NEXT: s_mov_b32 s14, s10
363 ; GFX7LESS-NEXT: s_mov_b32 s13, s9
364 ; GFX7LESS-NEXT: s_mov_b32 s12, s8
365 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7]
366 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
367 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
368 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
369 ; GFX7LESS-NEXT: s_add_u32 s8, s4, 44
370 ; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0
371 ; GFX7LESS-NEXT: s_getpc_b64 s[4:5]
372 ; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
373 ; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
374 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
375 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
376 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
377 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
378 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
379 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
380 ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3]
381 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
382 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
383 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
384 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
385 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
386 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
387 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
388 ; GFX7LESS-NEXT: .LBB1_1: ; %atomicrmw.start
389 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
390 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
391 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
392 ; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
393 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
394 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
395 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
396 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
397 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
398 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
399 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
400 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
401 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
402 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_1
403 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
404 ; GFX7LESS-NEXT: s_endpgm
406 ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
408 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
409 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
410 ; GFX9-NEXT: s_mov_b32 s38, -1
411 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
412 ; GFX9-NEXT: s_add_u32 s36, s36, s11
413 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
414 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
415 ; GFX9-NEXT: s_mov_b32 s12, s8
416 ; GFX9-NEXT: s_add_u32 s8, s34, 44
417 ; GFX9-NEXT: s_mov_b32 s13, s9
418 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
419 ; GFX9-NEXT: s_getpc_b64 s[4:5]
420 ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
421 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
422 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
423 ; GFX9-NEXT: s_mov_b32 s14, s10
424 ; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
425 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
426 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
427 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
428 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
429 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
430 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
431 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
432 ; GFX9-NEXT: s_mov_b32 s32, 0
433 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
434 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
435 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
436 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
437 ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
438 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
439 ; GFX9-NEXT: s_ff1_i32_b32 s2, s1
440 ; GFX9-NEXT: s_ff1_i32_b32 s3, s0
441 ; GFX9-NEXT: s_add_i32 s2, s2, 32
442 ; GFX9-NEXT: s_min_u32 s2, s3, s2
443 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
444 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
445 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
446 ; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
447 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
448 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
449 ; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
450 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
451 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
452 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
453 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
454 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
455 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
456 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
457 ; GFX9-NEXT: s_cbranch_execz .LBB1_5
458 ; GFX9-NEXT: ; %bb.3:
459 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
460 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
461 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
462 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
463 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
464 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
465 ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start
466 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
467 ; GFX9-NEXT: s_waitcnt vmcnt(0)
468 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
469 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
470 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
471 ; GFX9-NEXT: s_waitcnt vmcnt(0)
472 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
473 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
474 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
475 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
476 ; GFX9-NEXT: s_cbranch_execnz .LBB1_4
477 ; GFX9-NEXT: .LBB1_5:
478 ; GFX9-NEXT: s_endpgm
480 ; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
482 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
483 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
484 ; GFX1064-NEXT: s_mov_b32 s38, -1
485 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
486 ; GFX1064-NEXT: s_add_u32 s36, s36, s11
487 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
488 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
489 ; GFX1064-NEXT: s_mov_b32 s12, s8
490 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
491 ; GFX1064-NEXT: s_mov_b32 s13, s9
492 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
493 ; GFX1064-NEXT: s_getpc_b64 s[4:5]
494 ; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
495 ; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
496 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
497 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
498 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
499 ; GFX1064-NEXT: s_mov_b32 s14, s10
500 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
501 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
502 ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
503 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
504 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
505 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
506 ; GFX1064-NEXT: s_mov_b32 s32, 0
507 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
508 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
509 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
510 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
511 ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
512 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
513 ; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
514 ; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
515 ; GFX1064-NEXT: s_add_i32 s2, s2, 32
516 ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
517 ; GFX1064-NEXT: s_min_u32 s2, s3, s2
518 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
519 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
520 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
521 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
522 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
523 ; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
524 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
525 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
526 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
527 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
528 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
529 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
530 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
531 ; GFX1064-NEXT: s_cbranch_execz .LBB1_5
532 ; GFX1064-NEXT: ; %bb.3:
533 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
534 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
535 ; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
536 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
537 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
538 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
539 ; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
540 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
541 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
542 ; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
543 ; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
544 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
545 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
546 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
547 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
548 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
549 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
550 ; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
551 ; GFX1064-NEXT: .LBB1_5:
552 ; GFX1064-NEXT: s_endpgm
554 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
556 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
557 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
558 ; GFX1032-NEXT: s_mov_b32 s38, -1
559 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
560 ; GFX1032-NEXT: s_add_u32 s36, s36, s11
561 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
562 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
563 ; GFX1032-NEXT: s_mov_b32 s12, s8
564 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
565 ; GFX1032-NEXT: s_mov_b32 s13, s9
566 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
567 ; GFX1032-NEXT: s_getpc_b64 s[4:5]
568 ; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
569 ; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
570 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
571 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
572 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
573 ; GFX1032-NEXT: s_mov_b32 s14, s10
574 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
575 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
576 ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
577 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
578 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
579 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
580 ; GFX1032-NEXT: s_mov_b32 s32, 0
581 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
582 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
583 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
584 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
585 ; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
586 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
587 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
588 ; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
589 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
590 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
591 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
592 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
593 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
594 ; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
595 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
596 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
597 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
598 ; GFX1032-NEXT: s_mov_b32 s2, 0
599 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
600 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
601 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
602 ; GFX1032-NEXT: s_cbranch_execz .LBB1_5
603 ; GFX1032-NEXT: ; %bb.3:
604 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
605 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
606 ; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
607 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
608 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
609 ; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
610 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
611 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
612 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
613 ; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
614 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
615 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
616 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
617 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
618 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
619 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
620 ; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
621 ; GFX1032-NEXT: .LBB1_5:
622 ; GFX1032-NEXT: s_endpgm
624 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
626 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
627 ; GFX1164-NEXT: s_mov_b32 s12, s8
628 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
629 ; GFX1164-NEXT: s_mov_b32 s13, s9
630 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
631 ; GFX1164-NEXT: s_getpc_b64 s[4:5]
632 ; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
633 ; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
634 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
635 ; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
636 ; GFX1164-NEXT: s_mov_b32 s14, s10
637 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
638 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
639 ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
640 ; GFX1164-NEXT: s_mov_b32 s32, 0
641 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
642 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
643 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
644 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
645 ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
646 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
647 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
648 ; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
649 ; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
650 ; GFX1164-NEXT: s_add_i32 s2, s2, 32
651 ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
652 ; GFX1164-NEXT: s_min_u32 s2, s3, s2
653 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
654 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
655 ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
656 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
657 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
658 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
659 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
660 ; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
661 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
662 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
663 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
664 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
665 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
666 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
667 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
668 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
669 ; GFX1164-NEXT: s_cbranch_execz .LBB1_5
670 ; GFX1164-NEXT: ; %bb.3:
671 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
672 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
673 ; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
674 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
675 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
676 ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
677 ; GFX1164-NEXT: .LBB1_4: ; %atomicrmw.start
678 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
679 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
680 ; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
681 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
682 ; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
683 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
684 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
685 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
686 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
687 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
688 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
689 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
690 ; GFX1164-NEXT: s_cbranch_execnz .LBB1_4
691 ; GFX1164-NEXT: .LBB1_5:
692 ; GFX1164-NEXT: s_endpgm
694 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
696 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
697 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
698 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
699 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
700 ; GFX1132-NEXT: s_getpc_b64 s[4:5]
701 ; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
702 ; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
703 ; GFX1132-NEXT: s_mov_b32 s12, s13
704 ; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
705 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
706 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
707 ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
708 ; GFX1132-NEXT: s_mov_b32 s13, s14
709 ; GFX1132-NEXT: s_mov_b32 s14, s15
710 ; GFX1132-NEXT: s_mov_b32 s32, 0
711 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
712 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
713 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
714 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
715 ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
716 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
717 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
718 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
719 ; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
720 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
721 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
722 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
723 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
724 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
725 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
726 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
727 ; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
728 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
729 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
730 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
731 ; GFX1132-NEXT: s_mov_b32 s2, 0
732 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
733 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
734 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
735 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
736 ; GFX1132-NEXT: s_cbranch_execz .LBB1_5
737 ; GFX1132-NEXT: ; %bb.3:
738 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
739 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
740 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
742 ; GFX1132-NEXT: .LBB1_4: ; %atomicrmw.start
743 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
744 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
745 ; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
746 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
747 ; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
748 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
749 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
750 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
751 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
752 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
753 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
754 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
755 ; GFX1132-NEXT: s_cbranch_execnz .LBB1_4
756 ; GFX1132-NEXT: .LBB1_5:
757 ; GFX1132-NEXT: s_endpgm
759 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
761 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
762 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
763 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
764 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
765 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
766 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
767 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
768 ; GFX9-DPP-NEXT: s_mov_b32 s12, s8
769 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
770 ; GFX9-DPP-NEXT: s_mov_b32 s13, s9
771 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
772 ; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
773 ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
774 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
775 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
776 ; GFX9-DPP-NEXT: s_mov_b32 s14, s10
777 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
778 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
779 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
780 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
781 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
782 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
783 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
784 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
785 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
786 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
787 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
788 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
789 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
790 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
791 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
792 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
793 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
794 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
795 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
796 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
797 ; GFX9-DPP-NEXT: s_nop 0
798 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
799 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
800 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
801 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
802 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
803 ; GFX9-DPP-NEXT: s_nop 1
804 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
805 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
806 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
807 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
808 ; GFX9-DPP-NEXT: s_nop 1
809 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
810 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
811 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
812 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
813 ; GFX9-DPP-NEXT: s_nop 1
814 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
815 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
816 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
817 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
818 ; GFX9-DPP-NEXT: s_nop 1
819 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
820 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
821 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
822 ; GFX9-DPP-NEXT: s_nop 1
823 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
824 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
825 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4
826 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
827 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
828 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
829 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
830 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3
831 ; GFX9-DPP-NEXT: ; %bb.1:
832 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
833 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
834 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
835 ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
836 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
837 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
838 ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
839 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
840 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
841 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
842 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6
843 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
844 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
845 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
846 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
847 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
848 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
849 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2
850 ; GFX9-DPP-NEXT: .LBB1_3:
851 ; GFX9-DPP-NEXT: s_endpgm
853 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
854 ; GFX1064-DPP: ; %bb.0:
855 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
856 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
857 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
858 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
859 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
860 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
861 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
862 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
863 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
864 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
865 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
866 ; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
867 ; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
868 ; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
869 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
870 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
871 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
872 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
873 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
874 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
875 ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
876 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
877 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
878 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
879 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
880 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
881 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
882 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
883 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
884 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
885 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
886 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
887 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
888 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
889 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
890 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
891 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
892 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
893 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
894 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
895 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
896 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
897 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
898 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
899 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
900 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
901 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
902 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
903 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
904 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
905 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
906 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
907 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
908 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
909 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
910 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32
911 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0
912 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2
913 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3
914 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
915 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
916 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
917 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
918 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
919 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
920 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
921 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
922 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
923 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
924 ; GFX1064-DPP-NEXT: ; %bb.1:
925 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
926 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
927 ; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
928 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
929 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
930 ; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
931 ; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
932 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
933 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
934 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
935 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
936 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
937 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
938 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
939 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
940 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
941 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
942 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
943 ; GFX1064-DPP-NEXT: .LBB1_3:
944 ; GFX1064-DPP-NEXT: s_endpgm
946 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
947 ; GFX1032-DPP: ; %bb.0:
948 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
949 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
950 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
951 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
952 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
953 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
954 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
955 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
956 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
957 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
958 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
959 ; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
960 ; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
961 ; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
962 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
963 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
964 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
965 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
966 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
967 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
968 ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
969 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
970 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
971 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
972 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
973 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
974 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
975 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
976 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
977 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
978 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
979 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
980 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
981 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
982 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
983 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
984 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
985 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
986 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
987 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
988 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
989 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
990 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
991 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
992 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
993 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
994 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
995 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
996 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
997 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
998 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
999 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
1000 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
1001 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
1002 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
1003 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
1004 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1005 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
1006 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
1007 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
1008 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
1009 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
1010 ; GFX1032-DPP-NEXT: ; %bb.1:
1011 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1012 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
1013 ; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
1014 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
1015 ; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
1016 ; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
1017 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1018 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
1019 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
1020 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
1021 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1022 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
1023 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1024 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
1025 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
1026 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1027 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
1028 ; GFX1032-DPP-NEXT: .LBB1_3:
1029 ; GFX1032-DPP-NEXT: s_endpgm
1031 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
1032 ; GFX1164-DPP: ; %bb.0:
1033 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
1034 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
1035 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
1036 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
1037 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
1038 ; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
1039 ; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1040 ; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1041 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
1042 ; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
1043 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
1044 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
1045 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
1046 ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
1047 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
1048 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1049 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
1050 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1051 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000
1052 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
1053 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
1054 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
1055 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000
1056 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
1057 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1058 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
1059 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
1060 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
1061 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
1062 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
1063 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
1064 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
1065 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1066 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
1067 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
1068 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
1069 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1070 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1071 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
1072 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1073 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
1074 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
1075 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1076 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1077 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
1078 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
1079 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1080 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1081 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
1082 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1083 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
1084 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
1085 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1086 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1087 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
1088 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1089 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
1090 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
1091 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1092 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1093 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
1094 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1095 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
1096 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1097 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1098 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1099 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
1100 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
1101 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
1102 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_3
1103 ; GFX1164-DPP-NEXT: ; %bb.1:
1104 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1105 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
1106 ; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
1107 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
1108 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1109 ; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
1110 ; GFX1164-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
1111 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1112 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
1113 ; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
1114 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
1115 ; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
1116 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
1117 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
1118 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
1119 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
1120 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1121 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1122 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1123 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB1_2
1124 ; GFX1164-DPP-NEXT: .LBB1_3:
1125 ; GFX1164-DPP-NEXT: s_endpgm
1127 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
1128 ; GFX1132-DPP: ; %bb.0:
1129 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
1130 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
1131 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
1132 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
1133 ; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
1134 ; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1135 ; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1136 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
1137 ; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
1138 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
1139 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
1140 ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
1141 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
1142 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
1143 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
1144 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1145 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
1146 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
1147 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000
1148 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
1149 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
1150 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
1151 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000
1152 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
1153 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
1154 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1155 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
1156 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
1157 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
1158 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1159 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
1160 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
1161 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1162 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
1163 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1164 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1165 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
1166 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
1167 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1168 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1169 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
1170 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1171 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
1172 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1173 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1174 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
1175 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
1176 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1177 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
1178 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
1179 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
1180 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
1181 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1182 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
1183 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
1184 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
1185 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
1186 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
1187 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_3
1188 ; GFX1132-DPP-NEXT: ; %bb.1:
1189 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1190 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
1191 ; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
1192 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1193 ; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
1194 ; GFX1132-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
1195 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1196 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
1197 ; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
1198 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
1199 ; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
1200 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
1201 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
1202 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
1203 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
1204 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
1205 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1206 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
1207 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB1_2
1208 ; GFX1132-DPP-NEXT: .LBB1_3:
1209 ; GFX1132-DPP-NEXT: s_endpgm
1210 %divValue = call float @div.float.value()
1211 %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
1215 define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
1216 ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1217 ; GFX7LESS: ; %bb.0:
1218 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1219 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
1220 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1221 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
1222 ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3
1223 ; GFX7LESS-NEXT: ; %bb.1:
1224 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
1225 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1226 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
1227 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
1228 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
1229 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1230 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
1231 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
1232 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
1233 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
1234 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
1235 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
1236 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
1237 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
1238 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
1239 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
1240 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
1241 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
1242 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1243 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
1244 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
1245 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
1246 ; GFX7LESS-NEXT: .LBB2_3:
1247 ; GFX7LESS-NEXT: s_endpgm
1249 ; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1251 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1252 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1253 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1254 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
1255 ; GFX9-NEXT: s_cbranch_execz .LBB2_3
1256 ; GFX9-NEXT: ; %bb.1:
1257 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1258 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
1259 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1260 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1261 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
1262 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1263 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1264 ; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
1265 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1266 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
1267 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
1268 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1269 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1270 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1271 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1272 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1273 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
1274 ; GFX9-NEXT: s_cbranch_execnz .LBB2_2
1275 ; GFX9-NEXT: .LBB2_3:
1276 ; GFX9-NEXT: s_endpgm
1278 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1280 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1281 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1282 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1283 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
1284 ; GFX1064-NEXT: s_cbranch_execz .LBB2_3
1285 ; GFX1064-NEXT: ; %bb.1:
1286 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1287 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0
1288 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1289 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
1290 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1291 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
1292 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
1293 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
1294 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
1295 ; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
1296 ; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
1297 ; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1298 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
1299 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1300 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
1301 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1302 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
1303 ; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
1304 ; GFX1064-NEXT: .LBB2_3:
1305 ; GFX1064-NEXT: s_endpgm
1307 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1309 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1310 ; GFX1032-NEXT: s_mov_b32 s2, 0
1311 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1312 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
1313 ; GFX1032-NEXT: s_cbranch_execz .LBB2_3
1314 ; GFX1032-NEXT: ; %bb.1:
1315 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1316 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0
1317 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
1319 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1320 ; GFX1032-NEXT: v_mov_b32_e32 v1, s3
1321 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
1322 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
1323 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
1324 ; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
1325 ; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1326 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
1327 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1328 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
1329 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
1330 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1331 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
1332 ; GFX1032-NEXT: .LBB2_3:
1333 ; GFX1032-NEXT: s_endpgm
1335 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1337 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1338 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
1339 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1340 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1341 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
1342 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3
1343 ; GFX1164-NEXT: ; %bb.1:
1344 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1345 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0
1346 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1347 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
1348 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1349 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
1350 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
1351 ; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
1352 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
1353 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1354 ; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
1355 ; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
1356 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
1357 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1358 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1359 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
1360 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1361 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1362 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1363 ; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
1364 ; GFX1164-NEXT: .LBB2_3:
1365 ; GFX1164-NEXT: s_endpgm
1367 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1369 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1370 ; GFX1132-NEXT: s_mov_b32 s2, 0
1371 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
1372 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
1373 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
1374 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3
1375 ; GFX1132-NEXT: ; %bb.1:
1376 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1377 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0
1378 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
1379 ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
1380 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
1381 ; GFX1132-NEXT: v_mov_b32_e32 v1, s3
1382 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
1383 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
1384 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1385 ; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
1386 ; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
1387 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
1388 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
1389 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1390 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
1391 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
1392 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1393 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
1394 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
1395 ; GFX1132-NEXT: .LBB2_3:
1396 ; GFX1132-NEXT: s_endpgm
1398 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1399 ; GFX9-DPP: ; %bb.0:
1400 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1401 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1402 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1403 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
1404 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
1405 ; GFX9-DPP-NEXT: ; %bb.1:
1406 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1407 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
1408 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
1409 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
1410 ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
1411 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
1412 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
1413 ; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1414 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1415 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
1416 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
1417 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1418 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
1419 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1420 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1421 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
1422 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
1423 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
1424 ; GFX9-DPP-NEXT: .LBB2_3:
1425 ; GFX9-DPP-NEXT: s_endpgm
1427 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1428 ; GFX1064-DPP: ; %bb.0:
1429 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1430 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1431 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1432 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
1433 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
1434 ; GFX1064-DPP-NEXT: ; %bb.1:
1435 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1436 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
1437 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
1438 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
1439 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
1440 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
1441 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
1442 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1443 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1444 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
1445 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
1446 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1447 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
1448 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1449 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
1450 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1451 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
1452 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
1453 ; GFX1064-DPP-NEXT: .LBB2_3:
1454 ; GFX1064-DPP-NEXT: s_endpgm
1456 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1457 ; GFX1032-DPP: ; %bb.0:
1458 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1459 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
1460 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1461 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
1462 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
1463 ; GFX1032-DPP-NEXT: ; %bb.1:
1464 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
1465 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
1466 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
1467 ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
1468 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
1469 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
1470 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1471 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1472 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
1473 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
1474 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
1475 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
1476 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1477 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
1478 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
1479 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1480 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
1481 ; GFX1032-DPP-NEXT: .LBB2_3:
1482 ; GFX1032-DPP-NEXT: s_endpgm
1484 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1485 ; GFX1164-DPP: ; %bb.0:
1486 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1487 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
1488 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1489 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1490 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
1491 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
1492 ; GFX1164-DPP-NEXT: ; %bb.1:
1493 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1494 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
1495 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1496 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
1497 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1498 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
1499 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
1500 ; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1501 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1502 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1503 ; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
1504 ; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
1505 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
1506 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
1507 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1508 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
1509 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1510 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1511 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1512 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
1513 ; GFX1164-DPP-NEXT: .LBB2_3:
1514 ; GFX1164-DPP-NEXT: s_endpgm
1516 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
1517 ; GFX1132-DPP: ; %bb.0:
1518 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1519 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
1520 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
1521 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
1522 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
1523 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
1524 ; GFX1132-DPP-NEXT: ; %bb.1:
1525 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
1526 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
1527 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1528 ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
1529 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1530 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
1531 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1532 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1533 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1534 ; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
1535 ; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
1536 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
1537 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
1538 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1539 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
1540 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
1541 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1542 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
1543 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
1544 ; GFX1132-DPP-NEXT: .LBB2_3:
1545 ; GFX1132-DPP-NEXT: s_endpgm
1546 %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
1551 define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
1552 ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1553 ; GFX7LESS: ; %bb.0:
1554 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
1555 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
1556 ; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
1557 ; GFX7LESS-NEXT: s_mov_b32 s42, -1
1558 ; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
1559 ; GFX7LESS-NEXT: s_add_u32 s40, s40, s11
1560 ; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
1561 ; GFX7LESS-NEXT: s_mov_b32 s14, s10
1562 ; GFX7LESS-NEXT: s_mov_b32 s13, s9
1563 ; GFX7LESS-NEXT: s_mov_b32 s12, s8
1564 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7]
1565 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
1566 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
1567 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
1568 ; GFX7LESS-NEXT: s_add_u32 s8, s4, 44
1569 ; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0
1570 ; GFX7LESS-NEXT: s_getpc_b64 s[4:5]
1571 ; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1572 ; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1573 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
1574 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1575 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1576 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
1577 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
1578 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
1579 ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3]
1580 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
1581 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
1582 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1583 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
1584 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
1585 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
1586 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
1587 ; GFX7LESS-NEXT: .LBB3_1: ; %atomicrmw.start
1588 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
1589 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
1590 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
1591 ; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
1592 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
1593 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
1594 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
1595 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
1596 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
1597 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
1598 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1599 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
1600 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
1601 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_1
1602 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
1603 ; GFX7LESS-NEXT: s_endpgm
1605 ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1607 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1608 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1609 ; GFX9-NEXT: s_mov_b32 s38, -1
1610 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
1611 ; GFX9-NEXT: s_add_u32 s36, s36, s11
1612 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
1613 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
1614 ; GFX9-NEXT: s_mov_b32 s12, s8
1615 ; GFX9-NEXT: s_add_u32 s8, s34, 44
1616 ; GFX9-NEXT: s_mov_b32 s13, s9
1617 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
1618 ; GFX9-NEXT: s_getpc_b64 s[4:5]
1619 ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1620 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1621 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
1622 ; GFX9-NEXT: s_mov_b32 s14, s10
1623 ; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
1624 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1625 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1626 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
1627 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
1628 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
1629 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
1630 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
1631 ; GFX9-NEXT: s_mov_b32 s32, 0
1632 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1633 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
1634 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
1635 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
1636 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
1637 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1638 ; GFX9-NEXT: s_ff1_i32_b32 s2, s1
1639 ; GFX9-NEXT: s_ff1_i32_b32 s3, s0
1640 ; GFX9-NEXT: s_add_i32 s2, s2, 32
1641 ; GFX9-NEXT: s_min_u32 s2, s3, s2
1642 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
1643 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
1644 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
1645 ; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
1646 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
1647 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
1648 ; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
1649 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
1650 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
1651 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1652 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1653 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1654 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
1655 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1656 ; GFX9-NEXT: s_cbranch_execz .LBB3_5
1657 ; GFX9-NEXT: ; %bb.3:
1658 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1659 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1660 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
1661 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
1662 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1663 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
1664 ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start
1665 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1666 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1667 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
1668 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
1669 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1670 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1671 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1672 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1673 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1674 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
1675 ; GFX9-NEXT: s_cbranch_execnz .LBB3_4
1676 ; GFX9-NEXT: .LBB3_5:
1677 ; GFX9-NEXT: s_endpgm
1679 ; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1681 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1682 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1683 ; GFX1064-NEXT: s_mov_b32 s38, -1
1684 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
1685 ; GFX1064-NEXT: s_add_u32 s36, s36, s11
1686 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
1687 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
1688 ; GFX1064-NEXT: s_mov_b32 s12, s8
1689 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
1690 ; GFX1064-NEXT: s_mov_b32 s13, s9
1691 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
1692 ; GFX1064-NEXT: s_getpc_b64 s[4:5]
1693 ; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1694 ; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1695 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1696 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
1697 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1698 ; GFX1064-NEXT: s_mov_b32 s14, s10
1699 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
1700 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
1701 ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
1702 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
1703 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
1704 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
1705 ; GFX1064-NEXT: s_mov_b32 s32, 0
1706 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1707 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
1708 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
1709 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
1710 ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
1711 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
1712 ; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
1713 ; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
1714 ; GFX1064-NEXT: s_add_i32 s2, s2, 32
1715 ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
1716 ; GFX1064-NEXT: s_min_u32 s2, s3, s2
1717 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
1718 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
1719 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
1720 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
1721 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
1722 ; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
1723 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
1724 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
1725 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1726 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1727 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1728 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
1729 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1730 ; GFX1064-NEXT: s_cbranch_execz .LBB3_5
1731 ; GFX1064-NEXT: ; %bb.3:
1732 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1733 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
1734 ; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
1735 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
1736 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1737 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
1738 ; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start
1739 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
1740 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
1741 ; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
1742 ; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
1743 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1744 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
1745 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1746 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
1747 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1748 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
1749 ; GFX1064-NEXT: s_cbranch_execnz .LBB3_4
1750 ; GFX1064-NEXT: .LBB3_5:
1751 ; GFX1064-NEXT: s_endpgm
1753 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1755 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1756 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1757 ; GFX1032-NEXT: s_mov_b32 s38, -1
1758 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
1759 ; GFX1032-NEXT: s_add_u32 s36, s36, s11
1760 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
1761 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
1762 ; GFX1032-NEXT: s_mov_b32 s12, s8
1763 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
1764 ; GFX1032-NEXT: s_mov_b32 s13, s9
1765 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
1766 ; GFX1032-NEXT: s_getpc_b64 s[4:5]
1767 ; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1768 ; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1769 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1770 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
1771 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1772 ; GFX1032-NEXT: s_mov_b32 s14, s10
1773 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
1774 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
1775 ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
1776 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
1777 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
1778 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
1779 ; GFX1032-NEXT: s_mov_b32 s32, 0
1780 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1781 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
1782 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
1783 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
1784 ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
1785 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
1786 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
1787 ; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
1788 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
1789 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
1790 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
1791 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
1792 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
1793 ; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
1794 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
1795 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
1796 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1797 ; GFX1032-NEXT: s_mov_b32 s2, 0
1798 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1799 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
1800 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
1801 ; GFX1032-NEXT: s_cbranch_execz .LBB3_5
1802 ; GFX1032-NEXT: ; %bb.3:
1803 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1804 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
1805 ; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
1806 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1807 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
1808 ; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start
1809 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
1810 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
1811 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
1812 ; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
1813 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1814 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
1815 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1816 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
1817 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
1818 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1819 ; GFX1032-NEXT: s_cbranch_execnz .LBB3_4
1820 ; GFX1032-NEXT: .LBB3_5:
1821 ; GFX1032-NEXT: s_endpgm
1823 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1825 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
1826 ; GFX1164-NEXT: s_mov_b32 s12, s8
1827 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
1828 ; GFX1164-NEXT: s_mov_b32 s13, s9
1829 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
1830 ; GFX1164-NEXT: s_getpc_b64 s[4:5]
1831 ; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1832 ; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1833 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
1834 ; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
1835 ; GFX1164-NEXT: s_mov_b32 s14, s10
1836 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
1837 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
1838 ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
1839 ; GFX1164-NEXT: s_mov_b32 s32, 0
1840 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1841 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
1842 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
1843 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
1844 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
1845 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
1846 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
1847 ; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
1848 ; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
1849 ; GFX1164-NEXT: s_add_i32 s2, s2, 32
1850 ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
1851 ; GFX1164-NEXT: s_min_u32 s2, s3, s2
1852 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
1853 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1854 ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
1855 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
1856 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
1857 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1858 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
1859 ; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
1860 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
1861 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
1862 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1863 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
1864 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1865 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1866 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
1867 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1868 ; GFX1164-NEXT: s_cbranch_execz .LBB3_5
1869 ; GFX1164-NEXT: ; %bb.3:
1870 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1871 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
1872 ; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
1873 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
1874 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1875 ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
1876 ; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start
1877 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
1878 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1879 ; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
1880 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
1881 ; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
1882 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1883 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1884 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1885 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
1886 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1887 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1888 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1889 ; GFX1164-NEXT: s_cbranch_execnz .LBB3_4
1890 ; GFX1164-NEXT: .LBB3_5:
1891 ; GFX1164-NEXT: s_endpgm
1893 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1895 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
1896 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
1897 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
1898 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
1899 ; GFX1132-NEXT: s_getpc_b64 s[4:5]
1900 ; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1901 ; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1902 ; GFX1132-NEXT: s_mov_b32 s12, s13
1903 ; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
1904 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
1905 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
1906 ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
1907 ; GFX1132-NEXT: s_mov_b32 s13, s14
1908 ; GFX1132-NEXT: s_mov_b32 s14, s15
1909 ; GFX1132-NEXT: s_mov_b32 s32, 0
1910 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
1911 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
1912 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
1913 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
1914 ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
1915 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
1916 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1917 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
1918 ; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
1919 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
1920 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
1921 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1922 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
1923 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
1924 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1925 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
1926 ; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
1927 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
1928 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
1929 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1930 ; GFX1132-NEXT: s_mov_b32 s2, 0
1931 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
1932 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
1933 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
1934 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
1935 ; GFX1132-NEXT: s_cbranch_execz .LBB3_5
1936 ; GFX1132-NEXT: ; %bb.3:
1937 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1938 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
1939 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
1940 ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
1941 ; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start
1942 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
1943 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
1944 ; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
1945 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
1946 ; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
1947 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1948 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
1949 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1950 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
1951 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
1952 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1953 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
1954 ; GFX1132-NEXT: s_cbranch_execnz .LBB3_4
1955 ; GFX1132-NEXT: .LBB3_5:
1956 ; GFX1132-NEXT: s_endpgm
1958 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
1959 ; GFX9-DPP: ; %bb.0:
1960 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1961 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1962 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
1963 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
1964 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
1965 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
1966 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
1967 ; GFX9-DPP-NEXT: s_mov_b32 s12, s8
1968 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
1969 ; GFX9-DPP-NEXT: s_mov_b32 s13, s9
1970 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
1971 ; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
1972 ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
1973 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
1974 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
1975 ; GFX9-DPP-NEXT: s_mov_b32 s14, s10
1976 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
1977 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1978 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1979 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
1980 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
1981 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
1982 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
1983 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
1984 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
1985 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
1986 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
1987 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
1988 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
1989 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
1990 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
1991 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
1992 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
1993 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1994 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
1995 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
1996 ; GFX9-DPP-NEXT: s_nop 0
1997 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
1998 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
1999 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
2000 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
2001 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2002 ; GFX9-DPP-NEXT: s_nop 1
2003 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
2004 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
2005 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
2006 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2007 ; GFX9-DPP-NEXT: s_nop 1
2008 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
2009 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
2010 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
2011 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2012 ; GFX9-DPP-NEXT: s_nop 1
2013 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
2014 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
2015 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
2016 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2017 ; GFX9-DPP-NEXT: s_nop 1
2018 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
2019 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
2020 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
2021 ; GFX9-DPP-NEXT: s_nop 1
2022 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
2023 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
2024 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2025 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
2026 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
2027 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2028 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
2029 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3
2030 ; GFX9-DPP-NEXT: ; %bb.1:
2031 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2032 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
2033 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
2034 ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
2035 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2036 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
2037 ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2038 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2039 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
2040 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2041 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6
2042 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2043 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
2044 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2045 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2046 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
2047 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2048 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2
2049 ; GFX9-DPP-NEXT: .LBB3_3:
2050 ; GFX9-DPP-NEXT: s_endpgm
2052 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
2053 ; GFX1064-DPP: ; %bb.0:
2054 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2055 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2056 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
2057 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
2058 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
2059 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
2060 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
2061 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
2062 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
2063 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
2064 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
2065 ; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
2066 ; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2067 ; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2068 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2069 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
2070 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2071 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
2072 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
2073 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2074 ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
2075 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
2076 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
2077 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
2078 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
2079 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2080 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2081 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2082 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
2083 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
2084 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
2085 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
2086 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
2087 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
2088 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2089 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
2090 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
2091 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2092 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
2093 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
2094 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
2095 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2096 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2097 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2098 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
2099 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2100 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2101 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2102 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
2103 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2104 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2105 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
2106 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
2107 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
2108 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2109 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32
2110 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0
2111 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2
2112 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3
2113 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
2114 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2115 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2116 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
2117 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
2118 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
2119 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
2120 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2121 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
2122 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
2123 ; GFX1064-DPP-NEXT: ; %bb.1:
2124 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2125 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
2126 ; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
2127 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
2128 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2129 ; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
2130 ; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2131 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2132 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
2133 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2134 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
2135 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2136 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
2137 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2138 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
2139 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2140 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2141 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2
2142 ; GFX1064-DPP-NEXT: .LBB3_3:
2143 ; GFX1064-DPP-NEXT: s_endpgm
2145 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
2146 ; GFX1032-DPP: ; %bb.0:
2147 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2148 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2149 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
2150 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
2151 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
2152 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
2153 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
2154 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
2155 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
2156 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
2157 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
2158 ; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
2159 ; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2160 ; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2161 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2162 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
2163 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2164 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
2165 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
2166 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2167 ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
2168 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
2169 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
2170 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
2171 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
2172 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2173 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2174 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
2175 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
2176 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
2177 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
2178 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2179 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
2180 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2181 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
2182 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
2183 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
2184 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2185 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
2186 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
2187 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
2188 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2189 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2190 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2191 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
2192 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2193 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
2194 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2195 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
2196 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2197 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2198 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
2199 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
2200 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
2201 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
2202 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
2203 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2204 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
2205 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
2206 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2207 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
2208 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
2209 ; GFX1032-DPP-NEXT: ; %bb.1:
2210 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2211 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
2212 ; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
2213 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2214 ; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
2215 ; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2216 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2217 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
2218 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2219 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
2220 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2221 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
2222 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2223 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
2224 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
2225 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
2226 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2
2227 ; GFX1032-DPP-NEXT: .LBB3_3:
2228 ; GFX1032-DPP-NEXT: s_endpgm
2230 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
2231 ; GFX1164-DPP: ; %bb.0:
2232 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
2233 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
2234 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
2235 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
2236 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
2237 ; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
2238 ; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2239 ; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2240 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
2241 ; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
2242 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
2243 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
2244 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2245 ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
2246 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
2247 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2248 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2249 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2250 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000
2251 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
2252 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
2253 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
2254 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000
2255 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
2256 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2257 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
2258 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
2259 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
2260 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
2261 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
2262 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
2263 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
2264 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2265 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
2266 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
2267 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
2268 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2269 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2270 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
2271 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2272 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
2273 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
2274 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2275 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2276 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
2277 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
2278 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2279 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2280 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
2281 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2282 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
2283 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
2284 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2285 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2286 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
2287 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2288 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
2289 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
2290 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2291 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2292 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
2293 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2294 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
2295 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
2296 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2297 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2298 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
2299 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
2300 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
2301 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
2302 ; GFX1164-DPP-NEXT: ; %bb.1:
2303 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
2304 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
2305 ; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
2306 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
2307 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2308 ; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
2309 ; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2310 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2311 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2312 ; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2313 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
2314 ; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
2315 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
2316 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2317 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
2318 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
2319 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2320 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2321 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
2322 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2
2323 ; GFX1164-DPP-NEXT: .LBB3_3:
2324 ; GFX1164-DPP-NEXT: s_endpgm
2326 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
2327 ; GFX1132-DPP: ; %bb.0:
2328 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
2329 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
2330 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
2331 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
2332 ; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
2333 ; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2334 ; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2335 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
2336 ; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
2337 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
2338 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2339 ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
2340 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
2341 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
2342 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
2343 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2344 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2345 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
2346 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000
2347 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
2348 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
2349 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2350 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000
2351 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2352 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
2353 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2354 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
2355 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
2356 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
2357 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2358 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
2359 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
2360 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2361 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
2362 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2363 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2364 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
2365 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
2366 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2367 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2368 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
2369 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2370 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
2371 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2372 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2373 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
2374 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
2375 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2376 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
2377 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
2378 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
2379 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
2380 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2381 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
2382 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
2383 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
2384 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
2385 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
2386 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
2387 ; GFX1132-DPP-NEXT: ; %bb.1:
2388 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
2389 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
2390 ; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
2391 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2392 ; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
2393 ; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2394 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2395 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
2396 ; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
2397 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
2398 ; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
2399 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
2400 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
2401 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
2402 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
2403 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
2404 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2405 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2406 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2
2407 ; GFX1132-DPP-NEXT: .LBB3_3:
2408 ; GFX1132-DPP-NEXT: s_endpgm
2409 %divValue = call float @div.float.value()
2410 %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
2415 define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
2416 ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2417 ; GFX7LESS: ; %bb.0:
2418 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2419 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2420 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2421 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
2422 ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3
2423 ; GFX7LESS-NEXT: ; %bb.1:
2424 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2425 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
2426 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
2427 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
2428 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
2429 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
2430 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
2431 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
2432 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
2433 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
2434 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
2435 ; GFX7LESS-NEXT: v_max_f32_e32 v0, 4.0, v0
2436 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
2437 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v1
2438 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v0
2439 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[2:3], off, s[0:3], 0 glc
2440 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
2441 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
2442 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2443 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v2
2444 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
2445 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
2446 ; GFX7LESS-NEXT: .LBB4_3:
2447 ; GFX7LESS-NEXT: s_endpgm
2449 ; GFX9-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2451 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2452 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2453 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2454 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
2455 ; GFX9-NEXT: s_cbranch_execz .LBB4_3
2456 ; GFX9-NEXT: ; %bb.1:
2457 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2458 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
2459 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
2460 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2461 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
2462 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2463 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2464 ; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
2465 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2466 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
2467 ; GFX9-NEXT: v_max_f32_e32 v0, 4.0, v0
2468 ; GFX9-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2469 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2470 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2471 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2472 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2473 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
2474 ; GFX9-NEXT: s_cbranch_execnz .LBB4_2
2475 ; GFX9-NEXT: .LBB4_3:
2476 ; GFX9-NEXT: s_endpgm
2478 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2480 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2481 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2482 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2483 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
2484 ; GFX1064-NEXT: s_cbranch_execz .LBB4_3
2485 ; GFX1064-NEXT: ; %bb.1:
2486 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2487 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0
2488 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2489 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
2490 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2491 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
2492 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
2493 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
2494 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
2495 ; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
2496 ; GFX1064-NEXT: v_max_f32_e32 v0, 4.0, v0
2497 ; GFX1064-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2498 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
2499 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2500 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
2501 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2502 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
2503 ; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
2504 ; GFX1064-NEXT: .LBB4_3:
2505 ; GFX1064-NEXT: s_endpgm
2507 ; GFX1032-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2509 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2510 ; GFX1032-NEXT: s_mov_b32 s2, 0
2511 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2512 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
2513 ; GFX1032-NEXT: s_cbranch_execz .LBB4_3
2514 ; GFX1032-NEXT: ; %bb.1:
2515 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2516 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0
2517 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2518 ; GFX1032-NEXT: s_load_dword s3, s[0:1], 0x0
2519 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2520 ; GFX1032-NEXT: v_mov_b32_e32 v1, s3
2521 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
2522 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
2523 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
2524 ; GFX1032-NEXT: v_max_f32_e32 v0, 4.0, v0
2525 ; GFX1032-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2526 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
2527 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2528 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
2529 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
2530 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
2531 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
2532 ; GFX1032-NEXT: .LBB4_3:
2533 ; GFX1032-NEXT: s_endpgm
2535 ; GFX1164-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2537 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2538 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec
2539 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2540 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2541 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
2542 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3
2543 ; GFX1164-NEXT: ; %bb.1:
2544 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2545 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0
2546 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
2547 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
2548 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
2549 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
2550 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
2551 ; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
2552 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
2553 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2554 ; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
2555 ; GFX1164-NEXT: v_max_f32_e32 v0, 4.0, v0
2556 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
2557 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
2558 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2559 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
2560 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2561 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2562 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
2563 ; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
2564 ; GFX1164-NEXT: .LBB4_3:
2565 ; GFX1164-NEXT: s_endpgm
2567 ; GFX1132-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2569 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2570 ; GFX1132-NEXT: s_mov_b32 s2, 0
2571 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo
2572 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
2573 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
2574 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3
2575 ; GFX1132-NEXT: ; %bb.1:
2576 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2577 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0
2578 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
2579 ; GFX1132-NEXT: s_load_b32 s3, s[0:1], 0x0
2580 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
2581 ; GFX1132-NEXT: v_mov_b32_e32 v1, s3
2582 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
2583 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
2584 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2585 ; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
2586 ; GFX1132-NEXT: v_max_f32_e32 v0, 4.0, v0
2587 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
2588 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
2589 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2590 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
2591 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
2592 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2593 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2594 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
2595 ; GFX1132-NEXT: .LBB4_3:
2596 ; GFX1132-NEXT: s_endpgm
2598 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2599 ; GFX9-DPP: ; %bb.0:
2600 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2601 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2602 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2603 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
2604 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
2605 ; GFX9-DPP-NEXT: ; %bb.1:
2606 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2607 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
2608 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
2609 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2610 ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
2611 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2612 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
2613 ; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2614 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2615 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2616 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
2617 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2618 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
2619 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2620 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2621 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
2622 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2623 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
2624 ; GFX9-DPP-NEXT: .LBB4_3:
2625 ; GFX9-DPP-NEXT: s_endpgm
2627 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2628 ; GFX1064-DPP: ; %bb.0:
2629 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2630 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2631 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2632 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
2633 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
2634 ; GFX1064-DPP-NEXT: ; %bb.1:
2635 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2636 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
2637 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2638 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
2639 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2640 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
2641 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
2642 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2643 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2644 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2645 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
2646 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2647 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
2648 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2649 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
2650 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2651 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2652 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
2653 ; GFX1064-DPP-NEXT: .LBB4_3:
2654 ; GFX1064-DPP-NEXT: s_endpgm
2656 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2657 ; GFX1032-DPP: ; %bb.0:
2658 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2659 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
2660 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2661 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
2662 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
2663 ; GFX1032-DPP-NEXT: ; %bb.1:
2664 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
2665 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
2666 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2667 ; GFX1032-DPP-NEXT: s_load_dword s3, s[0:1], 0x0
2668 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2669 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s3
2670 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2671 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2672 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2673 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
2674 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2675 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
2676 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2677 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
2678 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
2679 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
2680 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
2681 ; GFX1032-DPP-NEXT: .LBB4_3:
2682 ; GFX1032-DPP-NEXT: s_endpgm
2684 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2685 ; GFX1164-DPP: ; %bb.0:
2686 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2687 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
2688 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2689 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2690 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
2691 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
2692 ; GFX1164-DPP-NEXT: ; %bb.1:
2693 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2694 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
2695 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2696 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
2697 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2698 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
2699 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
2700 ; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2701 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2702 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2703 ; GFX1164-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2704 ; GFX1164-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
2705 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
2706 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2707 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2708 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
2709 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2710 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2711 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
2712 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
2713 ; GFX1164-DPP-NEXT: .LBB4_3:
2714 ; GFX1164-DPP-NEXT: s_endpgm
2716 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_defalut_scope_unsafe:
2717 ; GFX1132-DPP: ; %bb.0:
2718 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2719 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
2720 ; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
2721 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
2722 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
2723 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
2724 ; GFX1132-DPP-NEXT: ; %bb.1:
2725 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
2726 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
2727 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2728 ; GFX1132-DPP-NEXT: s_load_b32 s3, s[0:1], 0x0
2729 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2730 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, s3
2731 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2732 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2733 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2734 ; GFX1132-DPP-NEXT: v_max_f32_e32 v0, v1, v1
2735 ; GFX1132-DPP-NEXT: v_max_f32_e32 v0, 4.0, v0
2736 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] glc
2737 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
2738 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2739 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
2740 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
2741 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2742 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2743 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
2744 ; GFX1132-DPP-NEXT: .LBB4_3:
2745 ; GFX1132-DPP-NEXT: s_endpgm
2746 %result = atomicrmw fmax ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
2750 define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
2751 ; GFX7LESS-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
2752 ; GFX7LESS: ; %bb.0:
2753 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
2754 ; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
2755 ; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
2756 ; GFX7LESS-NEXT: s_mov_b32 s42, -1
2757 ; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
2758 ; GFX7LESS-NEXT: s_add_u32 s40, s40, s11
2759 ; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
2760 ; GFX7LESS-NEXT: s_mov_b32 s14, s10
2761 ; GFX7LESS-NEXT: s_mov_b32 s13, s9
2762 ; GFX7LESS-NEXT: s_mov_b32 s12, s8
2763 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[6:7]
2764 ; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x9
2765 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
2766 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
2767 ; GFX7LESS-NEXT: s_add_u32 s8, s4, 44
2768 ; GFX7LESS-NEXT: s_addc_u32 s9, s5, 0
2769 ; GFX7LESS-NEXT: s_getpc_b64 s[4:5]
2770 ; GFX7LESS-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2771 ; GFX7LESS-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2772 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
2773 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2774 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2775 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
2776 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
2777 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
2778 ; GFX7LESS-NEXT: s_mov_b64 s[6:7], s[2:3]
2779 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
2780 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
2781 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
2782 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
2783 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[36:39], 0
2784 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
2785 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 1.0, v0
2786 ; GFX7LESS-NEXT: .LBB5_1: ; %atomicrmw.start
2787 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
2788 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
2789 ; GFX7LESS-NEXT: v_mul_f32_e32 v0, 1.0, v1
2790 ; GFX7LESS-NEXT: v_max_f32_e32 v0, v0, v2
2791 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
2792 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
2793 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
2794 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
2795 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
2796 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
2797 ; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2798 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
2799 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
2800 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_1
2801 ; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
2802 ; GFX7LESS-NEXT: s_endpgm
2804 ; GFX9-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
2806 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2807 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2808 ; GFX9-NEXT: s_mov_b32 s38, -1
2809 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
2810 ; GFX9-NEXT: s_add_u32 s36, s36, s11
2811 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
2812 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
2813 ; GFX9-NEXT: s_mov_b32 s12, s8
2814 ; GFX9-NEXT: s_add_u32 s8, s34, 44
2815 ; GFX9-NEXT: s_mov_b32 s13, s9
2816 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
2817 ; GFX9-NEXT: s_getpc_b64 s[4:5]
2818 ; GFX9-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2819 ; GFX9-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2820 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
2821 ; GFX9-NEXT: s_mov_b32 s14, s10
2822 ; GFX9-NEXT: s_mov_b64 s[10:11], s[6:7]
2823 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2824 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2825 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
2826 ; GFX9-NEXT: s_mov_b64 s[6:7], s[2:3]
2827 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
2828 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
2829 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
2830 ; GFX9-NEXT: s_mov_b32 s32, 0
2831 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2832 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
2833 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
2834 ; GFX9-NEXT: v_mov_b32_e32 v2, 0xff800000
2835 ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
2836 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2837 ; GFX9-NEXT: s_ff1_i32_b32 s2, s1
2838 ; GFX9-NEXT: s_ff1_i32_b32 s3, s0
2839 ; GFX9-NEXT: s_add_i32 s2, s2, 32
2840 ; GFX9-NEXT: s_min_u32 s2, s3, s2
2841 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
2842 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
2843 ; GFX9-NEXT: v_max_f32_e32 v1, v2, v2
2844 ; GFX9-NEXT: v_max_f32_e64 v2, s4, s4
2845 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
2846 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
2847 ; GFX9-NEXT: v_max_f32_e32 v2, v1, v2
2848 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
2849 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
2850 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2851 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2852 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2853 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
2854 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
2855 ; GFX9-NEXT: s_cbranch_execz .LBB5_5
2856 ; GFX9-NEXT: ; %bb.3:
2857 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2858 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2859 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
2860 ; GFX9-NEXT: v_max_f32_e32 v2, v2, v2
2861 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2862 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
2863 ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
2864 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2865 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2866 ; GFX9-NEXT: v_max_f32_e32 v0, v1, v1
2867 ; GFX9-NEXT: v_max_f32_e32 v0, v0, v2
2868 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2869 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2870 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2871 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2872 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2873 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
2874 ; GFX9-NEXT: s_cbranch_execnz .LBB5_4
2875 ; GFX9-NEXT: .LBB5_5:
2876 ; GFX9-NEXT: s_endpgm
2878 ; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
2880 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2881 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2882 ; GFX1064-NEXT: s_mov_b32 s38, -1
2883 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
2884 ; GFX1064-NEXT: s_add_u32 s36, s36, s11
2885 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
2886 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
2887 ; GFX1064-NEXT: s_mov_b32 s12, s8
2888 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
2889 ; GFX1064-NEXT: s_mov_b32 s13, s9
2890 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
2891 ; GFX1064-NEXT: s_getpc_b64 s[4:5]
2892 ; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2893 ; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2894 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2895 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
2896 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2897 ; GFX1064-NEXT: s_mov_b32 s14, s10
2898 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[6:7]
2899 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
2900 ; GFX1064-NEXT: s_mov_b64 s[6:7], s[2:3]
2901 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
2902 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
2903 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
2904 ; GFX1064-NEXT: s_mov_b32 s32, 0
2905 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2906 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
2907 ; GFX1064-NEXT: v_mov_b32_e32 v2, 0xff800000
2908 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
2909 ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
2910 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
2911 ; GFX1064-NEXT: s_ff1_i32_b32 s2, s1
2912 ; GFX1064-NEXT: s_ff1_i32_b32 s3, s0
2913 ; GFX1064-NEXT: s_add_i32 s2, s2, 32
2914 ; GFX1064-NEXT: v_max_f32_e32 v1, v2, v2
2915 ; GFX1064-NEXT: s_min_u32 s2, s3, s2
2916 ; GFX1064-NEXT: v_readlane_b32 s3, v0, s2
2917 ; GFX1064-NEXT: v_max_f32_e64 v2, s3, s3
2918 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
2919 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
2920 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
2921 ; GFX1064-NEXT: v_max_f32_e32 v2, v1, v2
2922 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
2923 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
2924 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2925 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2926 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2927 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
2928 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
2929 ; GFX1064-NEXT: s_cbranch_execz .LBB5_5
2930 ; GFX1064-NEXT: ; %bb.3:
2931 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2932 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
2933 ; GFX1064-NEXT: v_max_f32_e32 v2, v2, v2
2934 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
2935 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2936 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
2937 ; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start
2938 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
2939 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
2940 ; GFX1064-NEXT: v_max_f32_e32 v0, v1, v1
2941 ; GFX1064-NEXT: v_max_f32_e32 v0, v0, v2
2942 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2943 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
2944 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2945 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
2946 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2947 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
2948 ; GFX1064-NEXT: s_cbranch_execnz .LBB5_4
2949 ; GFX1064-NEXT: .LBB5_5:
2950 ; GFX1064-NEXT: s_endpgm
2952 ; GFX1032-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
2954 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2955 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2956 ; GFX1032-NEXT: s_mov_b32 s38, -1
2957 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
2958 ; GFX1032-NEXT: s_add_u32 s36, s36, s11
2959 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
2960 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
2961 ; GFX1032-NEXT: s_mov_b32 s12, s8
2962 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
2963 ; GFX1032-NEXT: s_mov_b32 s13, s9
2964 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
2965 ; GFX1032-NEXT: s_getpc_b64 s[4:5]
2966 ; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
2967 ; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
2968 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2969 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
2970 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2971 ; GFX1032-NEXT: s_mov_b32 s14, s10
2972 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[6:7]
2973 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
2974 ; GFX1032-NEXT: s_mov_b64 s[6:7], s[2:3]
2975 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
2976 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
2977 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
2978 ; GFX1032-NEXT: s_mov_b32 s32, 0
2979 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2980 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
2981 ; GFX1032-NEXT: v_mov_b32_e32 v2, 0xff800000
2982 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
2983 ; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
2984 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
2985 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
2986 ; GFX1032-NEXT: v_max_f32_e32 v1, v2, v2
2987 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
2988 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
2989 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
2990 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
2991 ; GFX1032-NEXT: v_max_f32_e64 v2, s2, s2
2992 ; GFX1032-NEXT: v_max_f32_e32 v2, v1, v2
2993 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
2994 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
2995 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2996 ; GFX1032-NEXT: s_mov_b32 s2, 0
2997 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2998 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
2999 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
3000 ; GFX1032-NEXT: s_cbranch_execz .LBB5_5
3001 ; GFX1032-NEXT: ; %bb.3:
3002 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3003 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
3004 ; GFX1032-NEXT: v_max_f32_e32 v2, v2, v2
3005 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
3006 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
3007 ; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start
3008 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
3009 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
3010 ; GFX1032-NEXT: v_max_f32_e32 v0, v1, v1
3011 ; GFX1032-NEXT: v_max_f32_e32 v0, v0, v2
3012 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
3013 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
3014 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
3015 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
3016 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
3017 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
3018 ; GFX1032-NEXT: s_cbranch_execnz .LBB5_4
3019 ; GFX1032-NEXT: .LBB5_5:
3020 ; GFX1032-NEXT: s_endpgm
3022 ; GFX1164-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3024 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
3025 ; GFX1164-NEXT: s_mov_b32 s12, s8
3026 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
3027 ; GFX1164-NEXT: s_mov_b32 s13, s9
3028 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
3029 ; GFX1164-NEXT: s_getpc_b64 s[4:5]
3030 ; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3031 ; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3032 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
3033 ; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
3034 ; GFX1164-NEXT: s_mov_b32 s14, s10
3035 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[6:7]
3036 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
3037 ; GFX1164-NEXT: s_mov_b64 s[6:7], s[2:3]
3038 ; GFX1164-NEXT: s_mov_b32 s32, 0
3039 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
3040 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
3041 ; GFX1164-NEXT: v_mov_b32_e32 v2, 0xff800000
3042 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
3043 ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
3044 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
3045 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1)
3046 ; GFX1164-NEXT: s_ctz_i32_b32 s2, s1
3047 ; GFX1164-NEXT: s_ctz_i32_b32 s3, s0
3048 ; GFX1164-NEXT: s_add_i32 s2, s2, 32
3049 ; GFX1164-NEXT: v_max_f32_e32 v1, v2, v2
3050 ; GFX1164-NEXT: s_min_u32 s2, s3, s2
3051 ; GFX1164-NEXT: v_readlane_b32 s3, v0, s2
3052 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3053 ; GFX1164-NEXT: v_max_f32_e64 v2, s3, s3
3054 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
3055 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
3056 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3057 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
3058 ; GFX1164-NEXT: v_max_f32_e32 v2, v1, v2
3059 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
3060 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
3061 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3062 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
3063 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3064 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3065 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
3066 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3067 ; GFX1164-NEXT: s_cbranch_execz .LBB5_5
3068 ; GFX1164-NEXT: ; %bb.3:
3069 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3070 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
3071 ; GFX1164-NEXT: v_max_f32_e32 v2, v2, v2
3072 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
3073 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
3074 ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
3075 ; GFX1164-NEXT: .LBB5_4: ; %atomicrmw.start
3076 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
3077 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
3078 ; GFX1164-NEXT: v_max_f32_e32 v0, v1, v1
3079 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
3080 ; GFX1164-NEXT: v_max_f32_e32 v0, v0, v2
3081 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
3082 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
3083 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3084 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
3085 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3086 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3087 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
3088 ; GFX1164-NEXT: s_cbranch_execnz .LBB5_4
3089 ; GFX1164-NEXT: .LBB5_5:
3090 ; GFX1164-NEXT: s_endpgm
3092 ; GFX1132-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3094 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
3095 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
3096 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
3097 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
3098 ; GFX1132-NEXT: s_getpc_b64 s[4:5]
3099 ; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3100 ; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3101 ; GFX1132-NEXT: s_mov_b32 s12, s13
3102 ; GFX1132-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
3103 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[6:7]
3104 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
3105 ; GFX1132-NEXT: s_mov_b64 s[6:7], s[2:3]
3106 ; GFX1132-NEXT: s_mov_b32 s13, s14
3107 ; GFX1132-NEXT: s_mov_b32 s14, s15
3108 ; GFX1132-NEXT: s_mov_b32 s32, 0
3109 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
3110 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[16:17]
3111 ; GFX1132-NEXT: v_mov_b32_e32 v2, 0xff800000
3112 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
3113 ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
3114 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
3115 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3116 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
3117 ; GFX1132-NEXT: v_max_f32_e32 v1, v2, v2
3118 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
3119 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
3120 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3121 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
3122 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
3123 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3124 ; GFX1132-NEXT: v_max_f32_e64 v2, s2, s2
3125 ; GFX1132-NEXT: v_max_f32_e32 v2, v1, v2
3126 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
3127 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
3128 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3129 ; GFX1132-NEXT: s_mov_b32 s2, 0
3130 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
3131 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
3132 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
3133 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
3134 ; GFX1132-NEXT: s_cbranch_execz .LBB5_5
3135 ; GFX1132-NEXT: ; %bb.3:
3136 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3137 ; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_max_f32 v2, v2, v2
3138 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
3139 ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
3140 ; GFX1132-NEXT: .LBB5_4: ; %atomicrmw.start
3141 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
3142 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
3143 ; GFX1132-NEXT: v_max_f32_e32 v0, v1, v1
3144 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
3145 ; GFX1132-NEXT: v_max_f32_e32 v0, v0, v2
3146 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
3147 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
3148 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
3149 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
3150 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
3151 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3152 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
3153 ; GFX1132-NEXT: s_cbranch_execnz .LBB5_4
3154 ; GFX1132-NEXT: .LBB5_5:
3155 ; GFX1132-NEXT: s_endpgm
3157 ; GFX9-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3158 ; GFX9-DPP: ; %bb.0:
3159 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3160 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3161 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
3162 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
3163 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s11
3164 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
3165 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
3166 ; GFX9-DPP-NEXT: s_mov_b32 s12, s8
3167 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
3168 ; GFX9-DPP-NEXT: s_mov_b32 s13, s9
3169 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
3170 ; GFX9-DPP-NEXT: s_getpc_b64 s[4:5]
3171 ; GFX9-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3172 ; GFX9-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3173 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
3174 ; GFX9-DPP-NEXT: s_mov_b32 s14, s10
3175 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
3176 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3177 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3178 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3179 ; GFX9-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
3180 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
3181 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
3182 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
3183 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
3184 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
3185 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3186 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
3187 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
3188 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v0
3189 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
3190 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
3191 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
3192 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3193 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3194 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
3195 ; GFX9-DPP-NEXT: s_nop 0
3196 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
3197 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v3
3198 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
3199 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
3200 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3201 ; GFX9-DPP-NEXT: s_nop 1
3202 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
3203 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
3204 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
3205 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3206 ; GFX9-DPP-NEXT: s_nop 1
3207 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
3208 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
3209 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
3210 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3211 ; GFX9-DPP-NEXT: s_nop 1
3212 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
3213 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
3214 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
3215 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3216 ; GFX9-DPP-NEXT: s_nop 1
3217 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
3218 ; GFX9-DPP-NEXT: v_max_f32_e32 v5, v5, v5
3219 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v5
3220 ; GFX9-DPP-NEXT: s_nop 1
3221 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
3222 ; GFX9-DPP-NEXT: v_max_f32_e32 v4, v4, v4
3223 ; GFX9-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3224 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
3225 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
3226 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
3227 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
3228 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
3229 ; GFX9-DPP-NEXT: ; %bb.1:
3230 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3231 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
3232 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
3233 ; GFX9-DPP-NEXT: v_max_f32_e64 v6, s4, s4
3234 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
3235 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
3236 ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3237 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3238 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
3239 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v1, v1
3240 ; GFX9-DPP-NEXT: v_max_f32_e32 v0, v0, v6
3241 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
3242 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
3243 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3244 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3245 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
3246 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
3247 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
3248 ; GFX9-DPP-NEXT: .LBB5_3:
3249 ; GFX9-DPP-NEXT: s_endpgm
3251 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3252 ; GFX1064-DPP: ; %bb.0:
3253 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3254 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3255 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
3256 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
3257 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s11
3258 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
3259 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
3260 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s8
3261 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
3262 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s9
3263 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
3264 ; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
3265 ; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3266 ; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3267 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3268 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
3269 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3270 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s10
3271 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
3272 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3273 ; GFX1064-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
3274 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
3275 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
3276 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
3277 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
3278 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
3279 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3280 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3281 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
3282 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
3283 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
3284 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
3285 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
3286 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
3287 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3288 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
3289 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
3290 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3291 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v3
3292 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
3293 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
3294 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3295 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3296 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3297 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
3298 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3299 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3300 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3301 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
3302 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3303 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3304 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
3305 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
3306 ; GFX1064-DPP-NEXT: v_max_f32_e32 v4, v4, v4
3307 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3308 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 32
3309 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 0
3310 ; GFX1064-DPP-NEXT: v_max_f32_e64 v3, s2, s2
3311 ; GFX1064-DPP-NEXT: v_max_f32_e64 v4, s3, s3
3312 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
3313 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3314 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3315 ; GFX1064-DPP-NEXT: v_max_f32_e32 v3, v4, v3
3316 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
3317 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v0
3318 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
3319 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
3320 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
3321 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
3322 ; GFX1064-DPP-NEXT: ; %bb.1:
3323 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3324 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
3325 ; GFX1064-DPP-NEXT: v_max_f32_e32 v6, v0, v0
3326 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
3327 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
3328 ; GFX1064-DPP-NEXT: global_load_dword v1, v2, s[0:1]
3329 ; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3330 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3331 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
3332 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v1, v1
3333 ; GFX1064-DPP-NEXT: v_max_f32_e32 v0, v0, v6
3334 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
3335 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
3336 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3337 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
3338 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3339 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
3340 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2
3341 ; GFX1064-DPP-NEXT: .LBB5_3:
3342 ; GFX1064-DPP-NEXT: s_endpgm
3344 ; GFX1032-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3345 ; GFX1032-DPP: ; %bb.0:
3346 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3347 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3348 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
3349 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
3350 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s11
3351 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
3352 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
3353 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s8
3354 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
3355 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s9
3356 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
3357 ; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
3358 ; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3359 ; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3360 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3361 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
3362 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3363 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s10
3364 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
3365 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3366 ; GFX1032-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
3367 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
3368 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
3369 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
3370 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
3371 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
3372 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3373 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
3374 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
3375 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
3376 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
3377 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3378 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0xff800000
3379 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3380 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
3381 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
3382 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
3383 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3384 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v3
3385 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v4, v3
3386 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
3387 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3388 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3389 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3390 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
3391 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3392 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 0xff800000
3393 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3394 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
3395 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3396 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3397 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
3398 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
3399 ; GFX1032-DPP-NEXT: v_max_f32_e32 v4, v4, v4
3400 ; GFX1032-DPP-NEXT: v_max_f32_e32 v3, v3, v4
3401 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
3402 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
3403 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
3404 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
3405 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
3406 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
3407 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
3408 ; GFX1032-DPP-NEXT: ; %bb.1:
3409 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3410 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
3411 ; GFX1032-DPP-NEXT: v_max_f32_e32 v6, v0, v0
3412 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
3413 ; GFX1032-DPP-NEXT: global_load_dword v1, v2, s[0:1]
3414 ; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3415 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3416 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
3417 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v1, v1
3418 ; GFX1032-DPP-NEXT: v_max_f32_e32 v0, v0, v6
3419 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
3420 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
3421 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
3422 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
3423 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
3424 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
3425 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2
3426 ; GFX1032-DPP-NEXT: .LBB5_3:
3427 ; GFX1032-DPP-NEXT: s_endpgm
3429 ; GFX1164-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3430 ; GFX1164-DPP: ; %bb.0:
3431 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
3432 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s8
3433 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
3434 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s9
3435 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
3436 ; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
3437 ; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3438 ; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3439 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
3440 ; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
3441 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s10
3442 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
3443 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3444 ; GFX1164-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
3445 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
3446 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
3447 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3448 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3449 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000
3450 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
3451 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
3452 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
3453 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000
3454 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
3455 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3456 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
3457 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
3458 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
3459 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
3460 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
3461 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v1
3462 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v2, v1
3463 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3464 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3465 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
3466 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
3467 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3468 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3469 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3470 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3471 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
3472 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0xff800000
3473 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3474 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3475 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3476 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v3, v3
3477 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3478 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3479 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
3480 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3481 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
3482 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
3483 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3484 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3485 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
3486 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3487 ; GFX1164-DPP-NEXT: v_max_f32_e32 v2, v2, v2
3488 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
3489 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3490 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3491 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
3492 ; GFX1164-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3493 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
3494 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
3495 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3496 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3497 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v1
3498 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
3499 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
3500 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_3
3501 ; GFX1164-DPP-NEXT: ; %bb.1:
3502 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3503 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
3504 ; GFX1164-DPP-NEXT: v_max_f32_e32 v6, v4, v4
3505 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
3506 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
3507 ; GFX1164-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
3508 ; GFX1164-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3509 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3510 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
3511 ; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3512 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
3513 ; GFX1164-DPP-NEXT: v_max_f32_e32 v4, v4, v6
3514 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
3515 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
3516 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
3517 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
3518 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3519 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3520 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
3521 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB5_2
3522 ; GFX1164-DPP-NEXT: .LBB5_3:
3523 ; GFX1164-DPP-NEXT: s_endpgm
3525 ; GFX1132-DPP-LABEL: global_atomic_fmax_uni_address_div_value_defalut_scope_unsafe:
3526 ; GFX1132-DPP: ; %bb.0:
3527 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
3528 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
3529 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
3530 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
3531 ; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
3532 ; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
3533 ; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
3534 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
3535 ; GFX1132-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
3536 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[6:7]
3537 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3538 ; GFX1132-DPP-NEXT: s_mov_b64 s[6:7], s[2:3]
3539 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
3540 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
3541 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
3542 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
3543 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3544 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
3545 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, 0xff800000
3546 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
3547 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
3548 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3549 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0xff800000
3550 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3551 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
3552 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3553 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
3554 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_mov_b32 v3, 0xff800000
3555 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v1
3556 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3557 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v2, v1
3558 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3559 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3560 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
3561 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3562 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3563 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3564 ; GFX1132-DPP-NEXT: v_dual_max_f32 v2, v3, v3 :: v_dual_mov_b32 v3, 0xff800000
3565 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3566 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3567 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3568 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3569 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v3, v3
3570 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3571 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3572 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
3573 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
3574 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3575 ; GFX1132-DPP-NEXT: v_max_f32_e32 v2, v2, v2
3576 ; GFX1132-DPP-NEXT: v_max_f32_e32 v1, v1, v2
3577 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
3578 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
3579 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3580 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v1
3581 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
3582 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
3583 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
3584 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
3585 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_3
3586 ; GFX1132-DPP-NEXT: ; %bb.1:
3587 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3588 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
3589 ; GFX1132-DPP-NEXT: v_max_f32_e32 v6, v4, v4
3590 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
3591 ; GFX1132-DPP-NEXT: global_load_b32 v5, v0, s[0:1]
3592 ; GFX1132-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3593 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3594 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
3595 ; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v5, v5
3596 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
3597 ; GFX1132-DPP-NEXT: v_max_f32_e32 v4, v4, v6
3598 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v0, v[4:5], s[0:1] glc
3599 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
3600 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
3601 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
3602 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
3603 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3604 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
3605 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB5_2
3606 ; GFX1132-DPP-NEXT: .LBB5_3:
3607 ; GFX1132-DPP-NEXT: s_endpgm
3608 %divValue = call float @div.float.value()
3609 %result = atomicrmw fmax ptr addrspace(1) %ptr, float %divValue monotonic, align 4
3613 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" } ; group #0: f32 denormal mode is preserve-sign for both listed entries; amdgpu-unsafe-fp-atomics is set to true