1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
8 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
11 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
12 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-wavefrontsize32,+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
13 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
; External helpers: declared here, with no body in this file.
; The expected output of the *_div_value_* kernels reaches @div.float.value
; through a GOT-relative address (s_getpc_b64 plus gotpcrel32 relocations)
; and calls it with s_swappc_b64; its return value is then read lane by lane.
; NOTE(review): the "div" prefix presumably stands for "divergent" -- confirm.
15 declare float @div.float.value()
16 declare double @div.double.value()
; Uniform address, uniform value: every lane adds the same constant (float 4.0)
; to the same addrspace(1) location taken from the kernel argument, using
; atomicrmw fadd with syncscope("agent"), monotonic ordering and align 4.
;
; What the expected output below has in common across the RUN configurations:
;   * v_mbcnt_* followed by a compare against 0 guards the atomic, so the
;     memory update is issued only from lanes whose mbcnt result is 0;
;   * the addend is s_bcnt1 of the saved exec mask, converted with
;     v_cvt_f32_ubyte0 and multiplied by 4.0.
; Where the expectations differ:
;   * the run without -mcpu, gfx900 and gfx1010 expect an %atomicrmw.start loop
;     around buffer_atomic_cmpswap / global_atomic_cmpswap;
;   * gfx1100 expects a single global_atomic_add_f32 and no loop.
; For this kernel the Iterative and DPP expectations show the same instructions
; wherever both are listed.
18 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
19 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
21 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
22 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
23 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
24 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
25 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
26 ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_3
27 ; GFX7LESS-NEXT: ; %bb.1:
28 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
29 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5]
30 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
31 ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
32 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
33 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
34 ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
35 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
36 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
37 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
38 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
39 ; GFX7LESS-NEXT: .LBB0_2: ; %atomicrmw.start
40 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
41 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
42 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
43 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
44 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
45 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
46 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
47 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
48 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
49 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
50 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
51 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB0_2
52 ; GFX7LESS-NEXT: .LBB0_3:
53 ; GFX7LESS-NEXT: s_endpgm
55 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
57 ; GFX9-NEXT: s_mov_b64 s[4:5], exec
58 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
59 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
60 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
61 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
62 ; GFX9-NEXT: s_cbranch_execz .LBB0_3
64 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
65 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
66 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
67 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
68 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
69 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
70 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0
71 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
72 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
74 ; GFX9-NEXT: .LBB0_2: ; %atomicrmw.start
75 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
76 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
77 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
78 ; GFX9-NEXT: s_waitcnt vmcnt(0)
79 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
80 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
81 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
82 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
83 ; GFX9-NEXT: s_cbranch_execnz .LBB0_2
87 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
89 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec
90 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
91 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
92 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
93 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
94 ; GFX1064-NEXT: s_cbranch_execz .LBB0_3
95 ; GFX1064-NEXT: ; %bb.1:
96 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
97 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
98 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
99 ; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
100 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
101 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
102 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
103 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
104 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
105 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
106 ; GFX1064-NEXT: .LBB0_2: ; %atomicrmw.start
107 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
108 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
109 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
110 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
111 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
112 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
113 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
114 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
115 ; GFX1064-NEXT: s_cbranch_execnz .LBB0_2
116 ; GFX1064-NEXT: .LBB0_3:
117 ; GFX1064-NEXT: s_endpgm
119 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
121 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo
122 ; GFX1032-NEXT: s_mov_b32 s4, 0
123 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
124 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
125 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
126 ; GFX1032-NEXT: s_cbranch_execz .LBB0_3
127 ; GFX1032-NEXT: ; %bb.1:
128 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
129 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5
130 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
131 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
132 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
133 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
134 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
135 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
136 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2
137 ; GFX1032-NEXT: .LBB0_2: ; %atomicrmw.start
138 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
139 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
140 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
141 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
142 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
143 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
144 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
145 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
146 ; GFX1032-NEXT: s_cbranch_execnz .LBB0_2
147 ; GFX1032-NEXT: .LBB0_3:
148 ; GFX1032-NEXT: s_endpgm
150 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
152 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
153 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec
154 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
155 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
156 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
157 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
158 ; GFX1164-NEXT: s_cbranch_execz .LBB0_2
159 ; GFX1164-NEXT: ; %bb.1:
160 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
161 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
162 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0
163 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
164 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
165 ; GFX1164-NEXT: v_mul_f32_e32 v0, 4.0, v0
166 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
167 ; GFX1164-NEXT: global_atomic_add_f32 v1, v0, s[2:3]
168 ; GFX1164-NEXT: .LBB0_2:
169 ; GFX1164-NEXT: s_nop 0
170 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
171 ; GFX1164-NEXT: s_endpgm
173 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
175 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
176 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo
177 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
178 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
179 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
180 ; GFX1132-NEXT: s_cbranch_execz .LBB0_2
181 ; GFX1132-NEXT: ; %bb.1:
182 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
183 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s0
184 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
185 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
186 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
187 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
188 ; GFX1132-NEXT: global_atomic_add_f32 v1, v0, s[2:3]
189 ; GFX1132-NEXT: .LBB0_2:
190 ; GFX1132-NEXT: s_nop 0
191 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
192 ; GFX1132-NEXT: s_endpgm
194 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
195 ; GFX7LESS-DPP: ; %bb.0:
196 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec
197 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
198 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
199 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
200 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
201 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB0_3
202 ; GFX7LESS-DPP-NEXT: ; %bb.1:
203 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
204 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5]
205 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
206 ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
207 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
208 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
209 ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
210 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
211 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
212 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
213 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
214 ; GFX7LESS-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
215 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
216 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
217 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
218 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
219 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
220 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
221 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
222 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
223 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
224 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
225 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
226 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB0_2
227 ; GFX7LESS-DPP-NEXT: .LBB0_3:
228 ; GFX7LESS-DPP-NEXT: s_endpgm
230 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
232 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
233 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
234 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
235 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
236 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
237 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB0_3
238 ; GFX9-DPP-NEXT: ; %bb.1:
239 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
240 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
241 ; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
242 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
243 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
244 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
245 ; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
246 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
247 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
248 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
249 ; GFX9-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
250 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
251 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
252 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
253 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
254 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
255 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
256 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
257 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
258 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB0_2
259 ; GFX9-DPP-NEXT: .LBB0_3:
260 ; GFX9-DPP-NEXT: s_endpgm
262 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
263 ; GFX1064-DPP: ; %bb.0:
264 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
265 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
266 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
267 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
268 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
269 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB0_3
270 ; GFX1064-DPP-NEXT: ; %bb.1:
271 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
272 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
273 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
274 ; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
275 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
276 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
277 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
278 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
279 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
280 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
281 ; GFX1064-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
282 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
283 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
284 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
285 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
286 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
287 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
288 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
289 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
290 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB0_2
291 ; GFX1064-DPP-NEXT: .LBB0_3:
292 ; GFX1064-DPP-NEXT: s_endpgm
294 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
295 ; GFX1032-DPP: ; %bb.0:
296 ; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
297 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
298 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
299 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
300 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
301 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB0_3
302 ; GFX1032-DPP-NEXT: ; %bb.1:
303 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
304 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5
305 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
306 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
307 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
308 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
309 ; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
310 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
311 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
312 ; GFX1032-DPP-NEXT: .LBB0_2: ; %atomicrmw.start
313 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
314 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
315 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
316 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
317 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
318 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
319 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
320 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
321 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB0_2
322 ; GFX1032-DPP-NEXT: .LBB0_3:
323 ; GFX1032-DPP-NEXT: s_endpgm
325 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
326 ; GFX1164-DPP: ; %bb.0:
327 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
328 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
329 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
330 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
331 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
332 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
333 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB0_2
334 ; GFX1164-DPP-NEXT: ; %bb.1:
335 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
336 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
337 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
338 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
339 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
340 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v0, 4.0, v0
341 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
342 ; GFX1164-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3]
343 ; GFX1164-DPP-NEXT: .LBB0_2:
344 ; GFX1164-DPP-NEXT: s_nop 0
345 ; GFX1164-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
346 ; GFX1164-DPP-NEXT: s_endpgm
348 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
349 ; GFX1132-DPP: ; %bb.0:
350 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
351 ; GFX1132-DPP-NEXT: s_mov_b32 s1, exec_lo
352 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
353 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
354 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
355 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB0_2
356 ; GFX1132-DPP-NEXT: ; %bb.1:
357 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x24
358 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s0
359 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
360 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s0
361 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
362 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
363 ; GFX1132-DPP-NEXT: global_atomic_add_f32 v1, v0, s[2:3]
364 ; GFX1132-DPP-NEXT: .LBB0_2:
365 ; GFX1132-DPP-NEXT: s_nop 0
366 ; GFX1132-DPP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
367 ; GFX1132-DPP-NEXT: s_endpgm
; The IR under test: %ptr is the kernel argument and 4.0 is an immediate, so
; neither the address nor the addend varies from lane to lane.
368 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic, align 4
372 define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
373 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
375 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
376 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
377 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
378 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
379 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
380 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
381 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
382 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
383 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
384 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
385 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
386 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
387 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
388 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
389 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
390 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
391 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
392 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
393 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
394 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
395 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
396 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
397 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
398 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
399 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
400 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
401 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
402 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
403 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
404 ; GFX7LESS-NEXT: .LBB1_1: ; %ComputeLoop
405 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
406 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
407 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
408 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
409 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
410 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
411 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
412 ; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
413 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB1_1
414 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
415 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
416 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
417 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
418 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
419 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
420 ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_5
421 ; GFX7LESS-NEXT: ; %bb.3:
422 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
423 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
424 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
425 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
426 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
427 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
428 ; GFX7LESS-NEXT: .LBB1_4: ; %atomicrmw.start
429 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
430 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
431 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
432 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
433 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
434 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
435 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
436 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
437 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
438 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
439 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
440 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
441 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB1_4
442 ; GFX7LESS-NEXT: .LBB1_5:
443 ; GFX7LESS-NEXT: s_endpgm
445 ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
447 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
448 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
449 ; GFX9-NEXT: s_mov_b32 s38, -1
450 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
451 ; GFX9-NEXT: s_add_u32 s36, s36, s9
452 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
453 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
454 ; GFX9-NEXT: s_mov_b32 s14, s8
455 ; GFX9-NEXT: s_add_u32 s8, s34, 44
456 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
457 ; GFX9-NEXT: s_getpc_b64 s[2:3]
458 ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
459 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
460 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
461 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
462 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
463 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
464 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
465 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
466 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
467 ; GFX9-NEXT: s_mov_b32 s12, s6
468 ; GFX9-NEXT: s_mov_b32 s13, s7
469 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
470 ; GFX9-NEXT: s_mov_b32 s32, 0
471 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
472 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
473 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
474 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
475 ; GFX9-NEXT: .LBB1_1: ; %ComputeLoop
476 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
477 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
478 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
479 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
480 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
481 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
482 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
483 ; GFX9-NEXT: s_cbranch_scc1 .LBB1_1
484 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
485 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
486 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
487 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
488 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
489 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
490 ; GFX9-NEXT: s_cbranch_execz .LBB1_5
491 ; GFX9-NEXT: ; %bb.3:
492 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
493 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
494 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
495 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
496 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
497 ; GFX9-NEXT: .LBB1_4: ; %atomicrmw.start
498 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
499 ; GFX9-NEXT: s_waitcnt vmcnt(0)
500 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
501 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
502 ; GFX9-NEXT: s_waitcnt vmcnt(0)
503 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
504 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
505 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
506 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
507 ; GFX9-NEXT: s_cbranch_execnz .LBB1_4
508 ; GFX9-NEXT: .LBB1_5:
509 ; GFX9-NEXT: s_endpgm
511 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
513 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
514 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
515 ; GFX1064-NEXT: s_mov_b32 s38, -1
516 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
517 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
518 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
519 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
520 ; GFX1064-NEXT: s_mov_b32 s14, s8
521 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
522 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
523 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
524 ; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
525 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
526 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
527 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
528 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
529 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
530 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
531 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
532 ; GFX1064-NEXT: s_mov_b32 s12, s6
533 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
534 ; GFX1064-NEXT: s_mov_b32 s13, s7
535 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
536 ; GFX1064-NEXT: s_mov_b32 s32, 0
537 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
538 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
539 ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
540 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
541 ; GFX1064-NEXT: .LBB1_1: ; %ComputeLoop
542 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
543 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
544 ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
545 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
546 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
547 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
548 ; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
549 ; GFX1064-NEXT: s_cbranch_scc1 .LBB1_1
550 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
551 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
552 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
553 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
554 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
555 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
556 ; GFX1064-NEXT: s_cbranch_execz .LBB1_5
557 ; GFX1064-NEXT: ; %bb.3:
558 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
559 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
560 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
561 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
562 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
563 ; GFX1064-NEXT: .LBB1_4: ; %atomicrmw.start
564 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
565 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
566 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
567 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
568 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
569 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
570 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
571 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
572 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
573 ; GFX1064-NEXT: s_cbranch_execnz .LBB1_4
574 ; GFX1064-NEXT: .LBB1_5:
575 ; GFX1064-NEXT: s_endpgm
577 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
579 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
580 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
581 ; GFX1032-NEXT: s_mov_b32 s38, -1
582 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
583 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
584 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
585 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
586 ; GFX1032-NEXT: s_mov_b32 s14, s8
587 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
588 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
589 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
590 ; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
591 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
592 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
593 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
594 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
595 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
596 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
597 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
598 ; GFX1032-NEXT: s_mov_b32 s12, s6
599 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
600 ; GFX1032-NEXT: s_mov_b32 s13, s7
601 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
602 ; GFX1032-NEXT: s_mov_b32 s32, 0
603 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
604 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
605 ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
606 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
607 ; GFX1032-NEXT: .LBB1_1: ; %ComputeLoop
608 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
609 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
610 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
611 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
612 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
613 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
614 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
615 ; GFX1032-NEXT: s_cbranch_scc1 .LBB1_1
616 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
617 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
618 ; GFX1032-NEXT: s_mov_b32 s2, 0
619 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
620 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
621 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
622 ; GFX1032-NEXT: s_cbranch_execz .LBB1_5
623 ; GFX1032-NEXT: ; %bb.3:
624 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
625 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
626 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
627 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
628 ; GFX1032-NEXT: .LBB1_4: ; %atomicrmw.start
629 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
630 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
631 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
632 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
633 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
634 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
635 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
636 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
637 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
638 ; GFX1032-NEXT: s_cbranch_execnz .LBB1_4
639 ; GFX1032-NEXT: .LBB1_5:
640 ; GFX1032-NEXT: s_endpgm
642 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
644 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
645 ; GFX1164-NEXT: s_mov_b32 s14, s8
646 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
647 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
648 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
649 ; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
650 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
651 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
652 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
653 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
654 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
655 ; GFX1164-NEXT: s_mov_b32 s12, s6
656 ; GFX1164-NEXT: s_mov_b32 s13, s7
657 ; GFX1164-NEXT: s_mov_b32 s32, 0
658 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
659 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
660 ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1
661 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
662 ; GFX1164-NEXT: .LBB1_1: ; %ComputeLoop
663 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
664 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
665 ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
666 ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
667 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
668 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
669 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
670 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
671 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
672 ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
673 ; GFX1164-NEXT: s_cbranch_scc1 .LBB1_1
674 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
675 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
676 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
677 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
678 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
679 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
680 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
681 ; GFX1164-NEXT: s_cbranch_execz .LBB1_4
682 ; GFX1164-NEXT: ; %bb.3:
683 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
684 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0
685 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
686 ; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
687 ; GFX1164-NEXT: .LBB1_4:
688 ; GFX1164-NEXT: s_endpgm
690 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
692 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
693 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
694 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
695 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
696 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
697 ; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
698 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
699 ; GFX1132-NEXT: s_mov_b32 s12, s13
700 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
701 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
702 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
703 ; GFX1132-NEXT: s_mov_b32 s13, s14
704 ; GFX1132-NEXT: s_mov_b32 s14, s15
705 ; GFX1132-NEXT: s_mov_b32 s32, 0
706 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
707 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
708 ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1
709 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
710 ; GFX1132-NEXT: .LBB1_1: ; %ComputeLoop
711 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
712 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
713 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
714 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
715 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
716 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
717 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
718 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
719 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
720 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
721 ; GFX1132-NEXT: s_cbranch_scc1 .LBB1_1
722 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
723 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
724 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
725 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
726 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
727 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
728 ; GFX1132-NEXT: s_cbranch_execz .LBB1_4
729 ; GFX1132-NEXT: ; %bb.3:
730 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
731 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0
732 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
733 ; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
734 ; GFX1132-NEXT: .LBB1_4:
735 ; GFX1132-NEXT: s_endpgm
737 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
738 ; GFX7LESS-DPP: ; %bb.0:
739 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
740 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
741 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
742 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
743 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
744 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
745 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
746 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
747 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
748 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
749 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
750 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
751 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
752 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
753 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
754 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
755 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
756 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
757 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
758 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
759 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
760 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
761 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
762 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
763 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
764 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
765 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
766 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
767 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
768 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
769 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
770 ; GFX7LESS-DPP-NEXT: .LBB1_1: ; %atomicrmw.start
771 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
772 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
773 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
774 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
775 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
776 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
777 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
778 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
779 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
780 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
781 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
782 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
783 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB1_1
784 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
785 ; GFX7LESS-DPP-NEXT: s_endpgm
787 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
789 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
790 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
791 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
792 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
793 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
794 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
795 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
796 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
797 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
798 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
799 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
800 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
801 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
802 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
803 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
804 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
805 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
806 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
807 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
808 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
809 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
810 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
811 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
812 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
813 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
814 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
815 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
816 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
817 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
818 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
819 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
820 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
821 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
822 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
823 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
824 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
825 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
826 ; GFX9-DPP-NEXT: s_nop 1
827 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
828 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
829 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
830 ; GFX9-DPP-NEXT: s_nop 1
831 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
832 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
833 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
834 ; GFX9-DPP-NEXT: s_nop 1
835 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
836 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
837 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
838 ; GFX9-DPP-NEXT: s_nop 1
839 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
840 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
841 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
842 ; GFX9-DPP-NEXT: s_nop 1
843 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
844 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
845 ; GFX9-DPP-NEXT: s_nop 1
846 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
847 ; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
848 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
849 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
850 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
851 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
852 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB1_3
853 ; GFX9-DPP-NEXT: ; %bb.1:
854 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
855 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
856 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
857 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
858 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
859 ; GFX9-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
860 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
861 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
862 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
863 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
864 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
865 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
866 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
867 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
868 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
869 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB1_2
870 ; GFX9-DPP-NEXT: .LBB1_3:
871 ; GFX9-DPP-NEXT: s_endpgm
873 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
874 ; GFX1064-DPP: ; %bb.0:
875 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
876 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
877 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
878 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
879 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
880 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
881 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
882 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
883 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
884 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
885 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
886 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
887 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
888 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
889 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
890 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
891 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
892 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
893 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
894 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
895 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
896 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
897 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
898 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
899 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
900 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
901 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
902 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
903 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
904 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
905 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
906 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
907 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
908 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
909 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
910 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
911 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
912 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
913 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
914 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
915 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
916 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
917 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
918 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
919 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
920 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
921 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
922 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
923 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
924 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
925 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
926 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
927 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
928 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
929 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
930 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
931 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
932 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
933 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
934 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB1_3
935 ; GFX1064-DPP-NEXT: ; %bb.1:
936 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
937 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
938 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
939 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
940 ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
941 ; GFX1064-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
942 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
943 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
944 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
945 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
946 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
947 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
948 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
949 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
950 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
951 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB1_2
952 ; GFX1064-DPP-NEXT: .LBB1_3:
953 ; GFX1064-DPP-NEXT: s_endpgm
955 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
956 ; GFX1032-DPP: ; %bb.0:
957 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
958 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
959 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
960 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
961 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
962 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
963 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
964 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
965 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
966 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
967 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
968 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
969 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
970 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
971 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
972 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
973 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
974 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
975 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
976 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
977 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
978 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
979 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
980 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
981 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
982 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
983 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
984 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
985 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
986 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
987 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
988 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
989 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
990 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
991 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
992 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
993 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
994 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
995 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
996 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
997 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
998 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
999 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
1000 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
1001 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
1002 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
1003 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
1004 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
1005 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
1006 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1007 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
1008 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
1009 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1010 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
1011 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB1_3
1012 ; GFX1032-DPP-NEXT: ; %bb.1:
1013 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1014 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
1015 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
1016 ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
1017 ; GFX1032-DPP-NEXT: .LBB1_2: ; %atomicrmw.start
1018 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1019 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
1020 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1021 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
1022 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
1023 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1024 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
1025 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
1026 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1027 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB1_2
1028 ; GFX1032-DPP-NEXT: .LBB1_3:
1029 ; GFX1032-DPP-NEXT: s_endpgm
1031 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
1032 ; GFX1164-DPP: ; %bb.0:
1033 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
1034 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
1035 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
1036 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
1037 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
1038 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1039 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1040 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
1041 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
1042 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
1043 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
1044 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
1045 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
1046 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
1047 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1048 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
1049 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1050 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
1051 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
1052 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
1053 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
1054 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
1055 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
1056 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1057 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
1058 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
1059 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
1060 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1061 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
1062 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
1063 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
1064 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1065 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
1066 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
1067 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
1068 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1069 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
1070 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
1071 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1072 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
1073 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
1074 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1075 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
1076 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
1077 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1078 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
1079 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
1080 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1081 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
1082 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
1083 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
1084 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
1085 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
1086 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
1087 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1088 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
1089 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
1090 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
1091 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB1_2
1092 ; GFX1164-DPP-NEXT: ; %bb.1:
1093 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1094 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
1095 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1096 ; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
1097 ; GFX1164-DPP-NEXT: .LBB1_2:
1098 ; GFX1164-DPP-NEXT: s_endpgm
1100 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
1101 ; GFX1132-DPP: ; %bb.0:
1102 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
1103 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
1104 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
1105 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
1106 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
1107 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1108 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1109 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
1110 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
1111 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
1112 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
1113 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
1114 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
1115 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
1116 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1117 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
1118 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
1119 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
1120 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
1121 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
1122 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
1123 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
1124 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
1125 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
1126 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1127 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
1128 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
1129 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
1130 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
1131 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1132 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
1133 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
1134 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
1135 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1136 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
1137 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
1138 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1139 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
1140 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
1141 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1142 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
1143 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
1144 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
1145 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
1146 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
1147 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
1148 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1149 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
1150 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
1151 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
1152 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB1_2
1153 ; GFX1132-DPP-NEXT: ; %bb.1:
1154 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1155 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
1156 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1157 ; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
1158 ; GFX1132-DPP-NEXT: .LBB1_2:
1159 ; GFX1132-DPP-NEXT: s_endpgm
1160 %divValue = call float @div.float.value()
1161 %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic, align 4
1165 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
1166 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1167 ; GFX7LESS: ; %bb.0:
1168 ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1169 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1170 ; GFX7LESS-NEXT: s_mov_b32 s14, -1
1171 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
1172 ; GFX7LESS-NEXT: s_add_u32 s12, s12, s9
1173 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
1174 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
1175 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
1176 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
1177 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1178 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
1179 ; GFX7LESS-NEXT: s_cbranch_execz .LBB2_3
1180 ; GFX7LESS-NEXT: ; %bb.1:
1181 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1]
1182 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
1183 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
1184 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
1185 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
1186 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
1187 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1188 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
1189 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
1190 ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
1191 ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1192 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
1193 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1194 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
1195 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
1196 ; GFX7LESS-NEXT: .LBB2_2: ; %atomicrmw.start
1197 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
1198 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
1199 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
1200 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
1201 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
1202 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
1203 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
1204 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
1205 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1206 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
1207 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
1208 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB2_2
1209 ; GFX7LESS-NEXT: .LBB2_3:
1210 ; GFX7LESS-NEXT: s_endpgm
1212 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1214 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1215 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1216 ; GFX9-NEXT: s_mov_b32 s14, -1
1217 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
1218 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000
1219 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
1220 ; GFX9-NEXT: s_add_u32 s12, s12, s9
1221 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
1222 ; GFX9-NEXT: s_addc_u32 s13, s13, 0
1223 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1224 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
1225 ; GFX9-NEXT: s_cbranch_execz .LBB2_3
1226 ; GFX9-NEXT: ; %bb.1:
1227 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
1228 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
1229 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
1230 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000
1231 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
1232 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1233 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
1234 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1235 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1236 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
1237 ; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1238 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1239 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
1240 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
1241 ; GFX9-NEXT: .LBB2_2: ; %atomicrmw.start
1242 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1243 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
1244 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1245 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1246 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1247 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1248 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1249 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
1250 ; GFX9-NEXT: s_cbranch_execnz .LBB2_2
1251 ; GFX9-NEXT: .LBB2_3:
1252 ; GFX9-NEXT: s_endpgm
1254 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1256 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
1257 ; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1258 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
1259 ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1260 ; GFX1064-NEXT: s_mov_b32 s14, -1
1261 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
1262 ; GFX1064-NEXT: s_add_u32 s12, s12, s9
1263 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
1264 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0
1265 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1266 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
1267 ; GFX1064-NEXT: s_cbranch_execz .LBB2_3
1268 ; GFX1064-NEXT: ; %bb.1:
1269 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
1270 ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
1271 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
1272 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
1273 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1274 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1275 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1276 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
1277 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1278 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
1279 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
1280 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
1281 ; GFX1064-NEXT: .LBB2_2: ; %atomicrmw.start
1282 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
1283 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
1284 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1285 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
1286 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1287 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
1288 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1289 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
1290 ; GFX1064-NEXT: s_cbranch_execnz .LBB2_2
1291 ; GFX1064-NEXT: .LBB2_3:
1292 ; GFX1064-NEXT: s_endpgm
1294 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1296 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
1297 ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1298 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
1299 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1300 ; GFX1032-NEXT: s_mov_b32 s14, -1
1301 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
1302 ; GFX1032-NEXT: s_add_u32 s12, s12, s9
1303 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1304 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0
1305 ; GFX1032-NEXT: s_mov_b32 s4, 0
1306 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
1307 ; GFX1032-NEXT: s_cbranch_execz .LBB2_3
1308 ; GFX1032-NEXT: ; %bb.1:
1309 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0
1310 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
1311 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
1312 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
1313 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1314 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1315 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1316 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
1317 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1318 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2
1319 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
1320 ; GFX1032-NEXT: .LBB2_2: ; %atomicrmw.start
1321 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
1322 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
1323 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1324 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
1325 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1326 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
1327 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
1328 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
1329 ; GFX1032-NEXT: s_cbranch_execnz .LBB2_2
1330 ; GFX1032-NEXT: .LBB2_3:
1331 ; GFX1032-NEXT: s_endpgm
1333 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1335 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
1336 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
1337 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
1338 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
1339 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
1340 ; GFX1164-NEXT: s_clause 0x1
1341 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
1342 ; GFX1164-NEXT: scratch_store_b32 off, v1, off
1343 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
1344 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
1345 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
1346 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
1347 ; GFX1164-NEXT: s_cbranch_execz .LBB2_3
1348 ; GFX1164-NEXT: ; %bb.1:
1349 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1350 ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
1351 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1352 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
1353 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1354 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
1355 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1356 ; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1357 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1358 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
1359 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
1360 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
1361 ; GFX1164-NEXT: .LBB2_2: ; %atomicrmw.start
1362 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
1363 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
1364 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
1365 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1366 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1367 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1368 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
1369 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1370 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1371 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1372 ; GFX1164-NEXT: s_cbranch_execnz .LBB2_2
1373 ; GFX1164-NEXT: .LBB2_3:
1374 ; GFX1164-NEXT: s_endpgm
1376 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1378 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
1379 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1380 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
1381 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
1382 ; GFX1132-NEXT: s_mov_b32 s4, 0
1383 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
1384 ; GFX1132-NEXT: s_clause 0x1
1385 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
1386 ; GFX1132-NEXT: scratch_store_b32 off, v1, off
1387 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
1388 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
1389 ; GFX1132-NEXT: s_cbranch_execz .LBB2_3
1390 ; GFX1132-NEXT: ; %bb.1:
1391 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
1392 ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
1393 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1394 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
1395 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
1396 ; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
1397 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1398 ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1399 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
1400 ; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0
1401 ; GFX1132-NEXT: .LBB2_2: ; %atomicrmw.start
1402 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
1403 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
1404 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
1405 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1406 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
1407 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1408 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
1409 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
1410 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1411 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
1412 ; GFX1132-NEXT: s_cbranch_execnz .LBB2_2
1413 ; GFX1132-NEXT: .LBB2_3:
1414 ; GFX1132-NEXT: s_endpgm
1416 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1417 ; GFX7LESS-DPP: ; %bb.0:
1418 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1419 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1420 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
1421 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
1422 ; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9
1423 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
1424 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec
1425 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
1426 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
1427 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1428 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
1429 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB2_3
1430 ; GFX7LESS-DPP-NEXT: ; %bb.1:
1431 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1]
1432 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
1433 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
1434 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
1435 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
1436 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
1437 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
1438 ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
1439 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
1440 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
1441 ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1442 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
1443 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
1444 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
1445 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
1446 ; GFX7LESS-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1447 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1448 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1449 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
1450 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
1451 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
1452 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
1453 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
1454 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
1455 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1456 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
1457 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
1458 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB2_2
1459 ; GFX7LESS-DPP-NEXT: .LBB2_3:
1460 ; GFX7LESS-DPP-NEXT: s_endpgm
1462 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1463 ; GFX9-DPP: ; %bb.0:
1464 ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1465 ; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1466 ; GFX9-DPP-NEXT: s_mov_b32 s14, -1
1467 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
1468 ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000
1469 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
1470 ; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9
1471 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
1472 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0
1473 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1474 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
1475 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB2_3
1476 ; GFX9-DPP-NEXT: ; %bb.1:
1477 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
1478 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
1479 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
1480 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
1481 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
1482 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1483 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
1484 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
1485 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
1486 ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
1487 ; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1488 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
1489 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
1490 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
1491 ; GFX9-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1492 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1493 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1494 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1495 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
1496 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1497 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1498 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
1499 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
1500 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB2_2
1501 ; GFX9-DPP-NEXT: .LBB2_3:
1502 ; GFX9-DPP-NEXT: s_endpgm
1504 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1505 ; GFX1064-DPP: ; %bb.0:
1506 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec
1507 ; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1508 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
1509 ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1510 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1
1511 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000
1512 ; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9
1513 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
1514 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0
1515 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1516 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
1517 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB2_3
1518 ; GFX1064-DPP-NEXT: ; %bb.1:
1519 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
1520 ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
1521 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
1522 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
1523 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1524 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1525 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
1526 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
1527 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
1528 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
1529 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
1530 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
1531 ; GFX1064-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1532 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1533 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1534 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1535 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
1536 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1537 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
1538 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1539 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
1540 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB2_2
1541 ; GFX1064-DPP-NEXT: .LBB2_3:
1542 ; GFX1064-DPP-NEXT: s_endpgm
1544 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1545 ; GFX1032-DPP: ; %bb.0:
1546 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
1547 ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
1548 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
1549 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
1550 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
1551 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
1552 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
1553 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1554 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
1555 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
1556 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
1557 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB2_3
1558 ; GFX1032-DPP-NEXT: ; %bb.1:
1559 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0
1560 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
1561 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
1562 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
1563 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
1564 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1565 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
1566 ; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
1567 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
1568 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
1569 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
1570 ; GFX1032-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1571 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1572 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1573 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1574 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
1575 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1576 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
1577 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
1578 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
1579 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB2_2
1580 ; GFX1032-DPP-NEXT: .LBB2_3:
1581 ; GFX1032-DPP-NEXT: s_endpgm
1583 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1584 ; GFX1164-DPP: ; %bb.0:
1585 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
1586 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
1587 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
1588 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
1589 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
1590 ; GFX1164-DPP-NEXT: s_clause 0x1
1591 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
1592 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
1593 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
1594 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
1595 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
1596 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
1597 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB2_3
1598 ; GFX1164-DPP-NEXT: ; %bb.1:
1599 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
1600 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
1601 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1602 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
1603 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1604 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
1605 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
1606 ; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1607 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
1608 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
1609 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
1610 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
1611 ; GFX1164-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1612 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1613 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
1614 ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1615 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1616 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
1617 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1618 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
1619 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1620 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1621 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1622 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB2_2
1623 ; GFX1164-DPP-NEXT: .LBB2_3:
1624 ; GFX1164-DPP-NEXT: s_endpgm
1626 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
1627 ; GFX1132-DPP: ; %bb.0:
1628 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
1629 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1630 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
1631 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
1632 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
1633 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
1634 ; GFX1132-DPP-NEXT: s_clause 0x1
1635 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
1636 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
1637 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
1638 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
1639 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB2_3
1640 ; GFX1132-DPP-NEXT: ; %bb.1:
1641 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
1642 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
1643 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
1644 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
1645 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1646 ; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
1647 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
1648 ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
1649 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
1650 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0
1651 ; GFX1132-DPP-NEXT: .LBB2_2: ; %atomicrmw.start
1652 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
1653 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
1654 ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
1655 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1656 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
1657 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1658 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
1659 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
1660 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1661 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
1662 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB2_2
1663 ; GFX1132-DPP-NEXT: .LBB2_3:
1664 ; GFX1132-DPP-NEXT: s_endpgm
1665 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") monotonic
1670 define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
1671 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
1672 ; GFX7LESS: ; %bb.0:
1673 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
1674 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1675 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1676 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
1677 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
1678 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
1679 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
1680 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
1681 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
1682 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
1683 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
1684 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
1685 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
1686 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1687 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1688 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
1689 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1690 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1691 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
1692 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
1693 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
1694 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
1695 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
1696 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
1697 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
1698 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1699 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
1700 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
1701 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
1702 ; GFX7LESS-NEXT: .LBB3_1: ; %ComputeLoop
1703 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
1704 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
1705 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
1706 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
1707 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
1708 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
1709 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
1710 ; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
1711 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB3_1
1712 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
1713 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
1714 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
1715 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1716 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
1717 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1718 ; GFX7LESS-NEXT: s_cbranch_execz .LBB3_5
1719 ; GFX7LESS-NEXT: ; %bb.3:
1720 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
1721 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
1722 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
1723 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
1724 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
1725 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
1726 ; GFX7LESS-NEXT: .LBB3_4: ; %atomicrmw.start
1727 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
1728 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
1729 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
1730 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
1731 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
1732 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
1733 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
1734 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
1735 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
1736 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
1737 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
1738 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
1739 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB3_4
1740 ; GFX7LESS-NEXT: .LBB3_5:
1741 ; GFX7LESS-NEXT: s_endpgm
1743 ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
1745 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1746 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1747 ; GFX9-NEXT: s_mov_b32 s38, -1
1748 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
1749 ; GFX9-NEXT: s_add_u32 s36, s36, s9
1750 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
1751 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
1752 ; GFX9-NEXT: s_mov_b32 s14, s8
1753 ; GFX9-NEXT: s_add_u32 s8, s34, 44
1754 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
1755 ; GFX9-NEXT: s_getpc_b64 s[2:3]
1756 ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1757 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1758 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
1759 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
1760 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1761 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1762 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
1763 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
1764 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
1765 ; GFX9-NEXT: s_mov_b32 s12, s6
1766 ; GFX9-NEXT: s_mov_b32 s13, s7
1767 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
1768 ; GFX9-NEXT: s_mov_b32 s32, 0
1769 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1770 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
1771 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
1772 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
1773 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop
1774 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1775 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
1776 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
1777 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
1778 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
1779 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
1780 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
1781 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1
1782 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
1783 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1784 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1785 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1786 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
1787 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1788 ; GFX9-NEXT: s_cbranch_execz .LBB3_5
1789 ; GFX9-NEXT: ; %bb.3:
1790 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1791 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
1792 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
1793 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1794 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
1795 ; GFX9-NEXT: .LBB3_4: ; %atomicrmw.start
1796 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1797 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1798 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
1799 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1800 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1801 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1802 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1803 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
1804 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
1805 ; GFX9-NEXT: s_cbranch_execnz .LBB3_4
1806 ; GFX9-NEXT: .LBB3_5:
1807 ; GFX9-NEXT: s_endpgm
1809 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
1811 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1812 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1813 ; GFX1064-NEXT: s_mov_b32 s38, -1
1814 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
1815 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
1816 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
1817 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
1818 ; GFX1064-NEXT: s_mov_b32 s14, s8
1819 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
1820 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
1821 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
1822 ; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1823 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1824 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1825 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
1826 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1827 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
1828 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
1829 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
1830 ; GFX1064-NEXT: s_mov_b32 s12, s6
1831 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
1832 ; GFX1064-NEXT: s_mov_b32 s13, s7
1833 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
1834 ; GFX1064-NEXT: s_mov_b32 s32, 0
1835 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1836 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
1837 ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
1838 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
1839 ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop
1840 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
1841 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
1842 ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
1843 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
1844 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
1845 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
1846 ; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
1847 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1
1848 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
1849 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1850 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1851 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1852 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
1853 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1854 ; GFX1064-NEXT: s_cbranch_execz .LBB3_5
1855 ; GFX1064-NEXT: ; %bb.3:
1856 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1857 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
1858 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
1859 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
1860 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
1861 ; GFX1064-NEXT: .LBB3_4: ; %atomicrmw.start
1862 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
1863 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
1864 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
1865 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1866 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
1867 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1868 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
1869 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1870 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
1871 ; GFX1064-NEXT: s_cbranch_execnz .LBB3_4
1872 ; GFX1064-NEXT: .LBB3_5:
1873 ; GFX1064-NEXT: s_endpgm
1875 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
1877 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
1878 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
1879 ; GFX1032-NEXT: s_mov_b32 s38, -1
1880 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
1881 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
1882 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
1883 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
1884 ; GFX1032-NEXT: s_mov_b32 s14, s8
1885 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
1886 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
1887 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
1888 ; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1889 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1890 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
1891 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
1892 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
1893 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
1894 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
1895 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
1896 ; GFX1032-NEXT: s_mov_b32 s12, s6
1897 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
1898 ; GFX1032-NEXT: s_mov_b32 s13, s7
1899 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
1900 ; GFX1032-NEXT: s_mov_b32 s32, 0
1901 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1902 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
1903 ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
1904 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
1905 ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop
1906 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
1907 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
1908 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
1909 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
1910 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
1911 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
1912 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
1913 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1
1914 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
1915 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1916 ; GFX1032-NEXT: s_mov_b32 s2, 0
1917 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1918 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
1919 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
1920 ; GFX1032-NEXT: s_cbranch_execz .LBB3_5
1921 ; GFX1032-NEXT: ; %bb.3:
1922 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
1923 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
1924 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
1925 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
1926 ; GFX1032-NEXT: .LBB3_4: ; %atomicrmw.start
1927 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
1928 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
1929 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
1930 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
1931 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
1932 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
1933 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
1934 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
1935 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
1936 ; GFX1032-NEXT: s_cbranch_execnz .LBB3_4
1937 ; GFX1032-NEXT: .LBB3_5:
1938 ; GFX1032-NEXT: s_endpgm
1940 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
1942 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
1943 ; GFX1164-NEXT: s_mov_b32 s14, s8
1944 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
1945 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
1946 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
1947 ; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
1948 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
1949 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
1950 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
1951 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
1952 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
1953 ; GFX1164-NEXT: s_mov_b32 s12, s6
1954 ; GFX1164-NEXT: s_mov_b32 s13, s7
1955 ; GFX1164-NEXT: s_mov_b32 s32, 0
1956 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1957 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
1958 ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1
1959 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
1960 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop
1961 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
1962 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1963 ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
1964 ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
1965 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
1966 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1967 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
1968 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
1969 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
1970 ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
1971 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1
1972 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
1973 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
1974 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
1975 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1976 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
1977 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
1978 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
1979 ; GFX1164-NEXT: s_cbranch_execz .LBB3_5
1980 ; GFX1164-NEXT: ; %bb.3:
1981 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
1982 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
1983 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
1984 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
1985 ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
1986 ; GFX1164-NEXT: .LBB3_4: ; %atomicrmw.start
1987 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
1988 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1989 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
1990 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
1991 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
1992 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
1993 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
1994 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
1995 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1996 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
1997 ; GFX1164-NEXT: s_cbranch_execnz .LBB3_4
1998 ; GFX1164-NEXT: .LBB3_5:
1999 ; GFX1164-NEXT: s_endpgm
2001 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2003 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
2004 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
2005 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
2006 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
2007 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
2008 ; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2009 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2010 ; GFX1132-NEXT: s_mov_b32 s12, s13
2011 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
2012 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
2013 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
2014 ; GFX1132-NEXT: s_mov_b32 s13, s14
2015 ; GFX1132-NEXT: s_mov_b32 s14, s15
2016 ; GFX1132-NEXT: s_mov_b32 s32, 0
2017 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
2018 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
2019 ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1
2020 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
2021 ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop
2022 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
2023 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2024 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
2025 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
2026 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
2027 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
2028 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
2029 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
2030 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
2031 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
2032 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1
2033 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
2034 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2035 ; GFX1132-NEXT: s_mov_b32 s2, 0
2036 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
2037 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
2038 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
2039 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
2040 ; GFX1132-NEXT: s_cbranch_execz .LBB3_5
2041 ; GFX1132-NEXT: ; %bb.3:
2042 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
2043 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
2044 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
2045 ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
2046 ; GFX1132-NEXT: .LBB3_4: ; %atomicrmw.start
2047 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
2048 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
2049 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
2050 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
2051 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
2052 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2053 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
2054 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
2055 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2056 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2057 ; GFX1132-NEXT: s_cbranch_execnz .LBB3_4
2058 ; GFX1132-NEXT: .LBB3_5:
2059 ; GFX1132-NEXT: s_endpgm
2061 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2062 ; GFX7LESS-DPP: ; %bb.0:
2063 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
2064 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
2065 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
2066 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
2067 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
2068 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
2069 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
2070 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
2071 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
2072 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
2073 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
2074 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
2075 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
2076 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
2077 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
2078 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2079 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2080 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
2081 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2082 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2083 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
2084 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
2085 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2086 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
2087 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
2088 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
2089 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
2090 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
2091 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2092 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
2093 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
2094 ; GFX7LESS-DPP-NEXT: .LBB3_1: ; %atomicrmw.start
2095 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2096 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
2097 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
2098 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
2099 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
2100 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
2101 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
2102 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
2103 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
2104 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2105 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
2106 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
2107 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB3_1
2108 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
2109 ; GFX7LESS-DPP-NEXT: s_endpgm
2111 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2112 ; GFX9-DPP: ; %bb.0:
2113 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2114 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2115 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
2116 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
2117 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
2118 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
2119 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
2120 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
2121 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
2122 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
2123 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
2124 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2125 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2126 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
2127 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
2128 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2129 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2130 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2131 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
2132 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
2133 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
2134 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
2135 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
2136 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
2137 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2138 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2139 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
2140 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
2141 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2142 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2143 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
2144 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
2145 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
2146 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
2147 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
2148 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2149 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2150 ; GFX9-DPP-NEXT: s_nop 1
2151 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
2152 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
2153 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2154 ; GFX9-DPP-NEXT: s_nop 1
2155 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
2156 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
2157 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2158 ; GFX9-DPP-NEXT: s_nop 1
2159 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
2160 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
2161 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2162 ; GFX9-DPP-NEXT: s_nop 1
2163 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
2164 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
2165 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2166 ; GFX9-DPP-NEXT: s_nop 1
2167 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
2168 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
2169 ; GFX9-DPP-NEXT: s_nop 1
2170 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
2171 ; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
2172 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
2173 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
2174 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
2175 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
2176 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB3_3
2177 ; GFX9-DPP-NEXT: ; %bb.1:
2178 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2179 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
2180 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
2181 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2182 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
2183 ; GFX9-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2184 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2185 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
2186 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
2187 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
2188 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
2189 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2190 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2191 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
2192 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2193 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB3_2
2194 ; GFX9-DPP-NEXT: .LBB3_3:
2195 ; GFX9-DPP-NEXT: s_endpgm
2197 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2198 ; GFX1064-DPP: ; %bb.0:
2199 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2200 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2201 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
2202 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
2203 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
2204 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
2205 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
2206 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
2207 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
2208 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
2209 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
2210 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2211 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2212 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2213 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
2214 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2215 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
2216 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2217 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
2218 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
2219 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
2220 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
2221 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
2222 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
2223 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2224 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2225 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2226 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2227 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
2228 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
2229 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
2230 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
2231 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
2232 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2233 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
2234 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2235 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
2236 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
2237 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
2238 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
2239 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2240 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
2241 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
2242 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
2243 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
2244 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
2245 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
2246 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
2247 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
2248 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
2249 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
2250 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2251 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2252 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
2253 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
2254 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
2255 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
2256 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2257 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
2258 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB3_3
2259 ; GFX1064-DPP-NEXT: ; %bb.1:
2260 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2261 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
2262 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
2263 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2264 ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
2265 ; GFX1064-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2266 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2267 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
2268 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2269 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
2270 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
2271 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2272 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
2273 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2274 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2275 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB3_2
2276 ; GFX1064-DPP-NEXT: .LBB3_3:
2277 ; GFX1064-DPP-NEXT: s_endpgm
2279 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2280 ; GFX1032-DPP: ; %bb.0:
2281 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
2282 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
2283 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
2284 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
2285 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
2286 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
2287 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
2288 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
2289 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
2290 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
2291 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
2292 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2293 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2294 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
2295 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
2296 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
2297 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
2298 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2299 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
2300 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
2301 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
2302 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
2303 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
2304 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
2305 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2306 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
2307 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
2308 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2309 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
2310 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
2311 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2312 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
2313 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2314 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
2315 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
2316 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2317 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
2318 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
2319 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
2320 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
2321 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
2322 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
2323 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
2324 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
2325 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
2326 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
2327 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
2328 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
2329 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
2330 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2331 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
2332 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
2333 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2334 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
2335 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB3_3
2336 ; GFX1032-DPP-NEXT: ; %bb.1:
2337 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
2338 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
2339 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2340 ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
2341 ; GFX1032-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2342 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2343 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
2344 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2345 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
2346 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
2347 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2348 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
2349 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
2350 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
2351 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB3_2
2352 ; GFX1032-DPP-NEXT: .LBB3_3:
2353 ; GFX1032-DPP-NEXT: s_endpgm
2355 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2356 ; GFX1164-DPP: ; %bb.0:
2357 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
2358 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
2359 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
2360 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
2361 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
2362 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2363 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2364 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
2365 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
2366 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
2367 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2368 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
2369 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
2370 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
2371 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2372 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
2373 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2374 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
2375 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
2376 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
2377 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
2378 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
2379 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
2380 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2381 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
2382 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
2383 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2384 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2385 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
2386 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
2387 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
2388 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2389 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
2390 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2391 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
2392 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2393 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
2394 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
2395 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2396 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
2397 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
2398 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2399 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
2400 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
2401 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2402 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
2403 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
2404 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
2405 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
2406 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
2407 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
2408 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
2409 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
2410 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
2411 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2412 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
2413 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
2414 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
2415 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB3_3
2416 ; GFX1164-DPP-NEXT: ; %bb.1:
2417 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
2418 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
2419 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
2420 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2421 ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
2422 ; GFX1164-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2423 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2424 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2425 ; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0
2426 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
2427 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2428 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
2429 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
2430 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2431 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2432 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
2433 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB3_2
2434 ; GFX1164-DPP-NEXT: .LBB3_3:
2435 ; GFX1164-DPP-NEXT: s_endpgm
2437 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
2438 ; GFX1132-DPP: ; %bb.0:
2439 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
2440 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
2441 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
2442 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
2443 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
2444 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
2445 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
2446 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
2447 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
2448 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
2449 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
2450 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
2451 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
2452 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
2453 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2454 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
2455 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
2456 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
2457 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
2458 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
2459 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2460 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
2461 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
2462 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
2463 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2464 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
2465 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2466 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
2467 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
2468 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2469 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
2470 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
2471 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
2472 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
2473 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
2474 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
2475 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2476 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
2477 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
2478 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2479 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
2480 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
2481 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
2482 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
2483 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
2484 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
2485 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
2486 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
2487 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
2488 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
2489 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
2490 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB3_3
2491 ; GFX1132-DPP-NEXT: ; %bb.1:
2492 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
2493 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
2494 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2495 ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
2496 ; GFX1132-DPP-NEXT: .LBB3_2: ; %atomicrmw.start
2497 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2498 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
2499 ; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0
2500 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
2501 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
2502 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
2503 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
2504 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
2505 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2506 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
2507 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB3_2
2508 ; GFX1132-DPP-NEXT: .LBB3_3:
2509 ; GFX1132-DPP-NEXT: s_endpgm
2510 %divValue = call float @div.float.value() strictfp
2511 %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("one-as") monotonic
2515 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
2516 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2517 ; GFX7LESS: ; %bb.0:
2518 ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2519 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2520 ; GFX7LESS-NEXT: s_mov_b32 s14, -1
2521 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
2522 ; GFX7LESS-NEXT: s_add_u32 s12, s12, s9
2523 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
2524 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
2525 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
2526 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
2527 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2528 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
2529 ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_3
2530 ; GFX7LESS-NEXT: ; %bb.1:
2531 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1]
2532 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2533 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
2534 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
2535 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
2536 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
2537 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
2538 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
2539 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
2540 ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
2541 ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2542 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
2543 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
2544 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
2545 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
2546 ; GFX7LESS-NEXT: .LBB4_2: ; %atomicrmw.start
2547 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
2548 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
2549 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
2550 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
2551 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
2552 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
2553 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
2554 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
2555 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2556 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
2557 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
2558 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB4_2
2559 ; GFX7LESS-NEXT: .LBB4_3:
2560 ; GFX7LESS-NEXT: s_endpgm
2562 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2564 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2565 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2566 ; GFX9-NEXT: s_mov_b32 s14, -1
2567 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
2568 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000
2569 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
2570 ; GFX9-NEXT: s_add_u32 s12, s12, s9
2571 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
2572 ; GFX9-NEXT: s_addc_u32 s13, s13, 0
2573 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2574 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
2575 ; GFX9-NEXT: s_cbranch_execz .LBB4_3
2576 ; GFX9-NEXT: ; %bb.1:
2577 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
2578 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
2579 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
2580 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000
2581 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
2582 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2583 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
2584 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
2585 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2586 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
2587 ; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2588 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2589 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
2590 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
2591 ; GFX9-NEXT: .LBB4_2: ; %atomicrmw.start
2592 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2593 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
2594 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2595 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2596 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2597 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2598 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
2599 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
2600 ; GFX9-NEXT: s_cbranch_execnz .LBB4_2
2601 ; GFX9-NEXT: .LBB4_3:
2602 ; GFX9-NEXT: s_endpgm
2604 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2606 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
2607 ; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2608 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
2609 ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2610 ; GFX1064-NEXT: s_mov_b32 s14, -1
2611 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
2612 ; GFX1064-NEXT: s_add_u32 s12, s12, s9
2613 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
2614 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0
2615 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2616 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
2617 ; GFX1064-NEXT: s_cbranch_execz .LBB4_3
2618 ; GFX1064-NEXT: ; %bb.1:
2619 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
2620 ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
2621 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
2622 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
2623 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2624 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2625 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2626 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
2627 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2628 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
2629 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
2630 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
2631 ; GFX1064-NEXT: .LBB4_2: ; %atomicrmw.start
2632 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
2633 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
2634 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2635 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
2636 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2637 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
2638 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2639 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
2640 ; GFX1064-NEXT: s_cbranch_execnz .LBB4_2
2641 ; GFX1064-NEXT: .LBB4_3:
2642 ; GFX1064-NEXT: s_endpgm
2644 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2646 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
2647 ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2648 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
2649 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2650 ; GFX1032-NEXT: s_mov_b32 s14, -1
2651 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
2652 ; GFX1032-NEXT: s_add_u32 s12, s12, s9
2653 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2654 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0
2655 ; GFX1032-NEXT: s_mov_b32 s4, 0
2656 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
2657 ; GFX1032-NEXT: s_cbranch_execz .LBB4_3
2658 ; GFX1032-NEXT: ; %bb.1:
2659 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0
2660 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
2661 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
2662 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
2663 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2664 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2665 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2666 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
2667 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2668 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2
2669 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
2670 ; GFX1032-NEXT: .LBB4_2: ; %atomicrmw.start
2671 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
2672 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
2673 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2674 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
2675 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2676 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
2677 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
2678 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
2679 ; GFX1032-NEXT: s_cbranch_execnz .LBB4_2
2680 ; GFX1032-NEXT: .LBB4_3:
2681 ; GFX1032-NEXT: s_endpgm
2683 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2685 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
2686 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
2687 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
2688 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2689 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
2690 ; GFX1164-NEXT: s_clause 0x1
2691 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
2692 ; GFX1164-NEXT: scratch_store_b32 off, v1, off
2693 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
2694 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2695 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
2696 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
2697 ; GFX1164-NEXT: s_cbranch_execz .LBB4_3
2698 ; GFX1164-NEXT: ; %bb.1:
2699 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
2700 ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
2701 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2702 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
2703 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
2704 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
2705 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2706 ; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2707 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
2708 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
2709 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
2710 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
2711 ; GFX1164-NEXT: .LBB4_2: ; %atomicrmw.start
2712 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
2713 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
2714 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
2715 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
2716 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
2717 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2718 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
2719 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2720 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2721 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
2722 ; GFX1164-NEXT: s_cbranch_execnz .LBB4_2
2723 ; GFX1164-NEXT: .LBB4_3:
2724 ; GFX1164-NEXT: s_endpgm
2726 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2728 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
2729 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2730 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
2731 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2732 ; GFX1132-NEXT: s_mov_b32 s4, 0
2733 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
2734 ; GFX1132-NEXT: s_clause 0x1
2735 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
2736 ; GFX1132-NEXT: scratch_store_b32 off, v1, off
2737 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
2738 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
2739 ; GFX1132-NEXT: s_cbranch_execz .LBB4_3
2740 ; GFX1132-NEXT: ; %bb.1:
2741 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
2742 ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
2743 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2744 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
2745 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
2746 ; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
2747 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2748 ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2749 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
2750 ; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0
2751 ; GFX1132-NEXT: .LBB4_2: ; %atomicrmw.start
2752 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
2753 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
2754 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
2755 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
2756 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
2757 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2758 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
2759 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
2760 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2761 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
2762 ; GFX1132-NEXT: s_cbranch_execnz .LBB4_2
2763 ; GFX1132-NEXT: .LBB4_3:
2764 ; GFX1132-NEXT: s_endpgm
2766 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2767 ; GFX7LESS-DPP: ; %bb.0:
2768 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2769 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2770 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
2771 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
2772 ; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9
2773 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
2774 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec
2775 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
2776 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
2777 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2778 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
2779 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB4_3
2780 ; GFX7LESS-DPP-NEXT: ; %bb.1:
2781 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1]
2782 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
2783 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
2784 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
2785 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
2786 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
2787 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
2788 ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
2789 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
2790 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
2791 ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2792 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
2793 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
2794 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
2795 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
2796 ; GFX7LESS-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2797 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2798 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2799 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
2800 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
2801 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
2802 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
2803 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
2804 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
2805 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
2806 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
2807 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
2808 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB4_2
2809 ; GFX7LESS-DPP-NEXT: .LBB4_3:
2810 ; GFX7LESS-DPP-NEXT: s_endpgm
2812 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2813 ; GFX9-DPP: ; %bb.0:
2814 ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2815 ; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2816 ; GFX9-DPP-NEXT: s_mov_b32 s14, -1
2817 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
2818 ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000
2819 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
2820 ; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9
2821 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
2822 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0
2823 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2824 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
2825 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB4_3
2826 ; GFX9-DPP-NEXT: ; %bb.1:
2827 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
2828 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
2829 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
2830 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
2831 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
2832 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2833 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
2834 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
2835 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2836 ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
2837 ; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2838 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
2839 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
2840 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
2841 ; GFX9-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2842 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2843 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2844 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2845 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
2846 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2847 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2848 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
2849 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2850 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB4_2
2851 ; GFX9-DPP-NEXT: .LBB4_3:
2852 ; GFX9-DPP-NEXT: s_endpgm
2854 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2855 ; GFX1064-DPP: ; %bb.0:
2856 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec
2857 ; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2858 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
2859 ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2860 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1
2861 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000
2862 ; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9
2863 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
2864 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0
2865 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2866 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
2867 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB4_3
2868 ; GFX1064-DPP-NEXT: ; %bb.1:
2869 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
2870 ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
2871 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
2872 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
2873 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2874 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2875 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2876 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
2877 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
2878 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
2879 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
2880 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
2881 ; GFX1064-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2882 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2883 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2884 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2885 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
2886 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2887 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
2888 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2889 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
2890 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB4_2
2891 ; GFX1064-DPP-NEXT: .LBB4_3:
2892 ; GFX1064-DPP-NEXT: s_endpgm
2894 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2895 ; GFX1032-DPP: ; %bb.0:
2896 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
2897 ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
2898 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
2899 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
2900 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
2901 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
2902 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
2903 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
2904 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
2905 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
2906 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
2907 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB4_3
2908 ; GFX1032-DPP-NEXT: ; %bb.1:
2909 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0
2910 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
2911 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
2912 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
2913 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
2914 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2915 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2916 ; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
2917 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
2918 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
2919 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
2920 ; GFX1032-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2921 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2922 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2923 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
2924 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
2925 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
2926 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
2927 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
2928 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
2929 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB4_2
2930 ; GFX1032-DPP-NEXT: .LBB4_3:
2931 ; GFX1032-DPP-NEXT: s_endpgm
2933 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2934 ; GFX1164-DPP: ; %bb.0:
2935 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
2936 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
2937 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
2938 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2939 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
2940 ; GFX1164-DPP-NEXT: s_clause 0x1
2941 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
2942 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
2943 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
2944 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
2945 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
2946 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
2947 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB4_3
2948 ; GFX1164-DPP-NEXT: ; %bb.1:
2949 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2950 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
2951 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2952 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
2953 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2954 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
2955 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
2956 ; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2957 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
2958 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
2959 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
2960 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
2961 ; GFX1164-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
2962 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
2963 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
2964 ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
2965 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
2966 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
2967 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
2968 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
2969 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
2970 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2971 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
2972 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB4_2
2973 ; GFX1164-DPP-NEXT: .LBB4_3:
2974 ; GFX1164-DPP-NEXT: s_endpgm
2976 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
2977 ; GFX1132-DPP: ; %bb.0:
2978 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
2979 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2980 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
2981 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
2982 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
2983 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
2984 ; GFX1132-DPP-NEXT: s_clause 0x1
2985 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
2986 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
2987 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
2988 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
2989 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB4_3
2990 ; GFX1132-DPP-NEXT: ; %bb.1:
2991 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
2992 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
2993 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
2994 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
2995 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
2996 ; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
2997 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
2998 ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
2999 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
3000 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0
3001 ; GFX1132-DPP-NEXT: .LBB4_2: ; %atomicrmw.start
3002 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3003 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
3004 ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
3005 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
3006 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
3007 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
3008 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
3009 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
3010 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3011 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
3012 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB4_2
3013 ; GFX1132-DPP-NEXT: .LBB4_3:
3014 ; GFX1132-DPP-NEXT: s_endpgm
3015 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") monotonic
3020 define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
3021 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3022 ; GFX7LESS: ; %bb.0:
3023 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
3024 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3025 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3026 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
3027 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
3028 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
3029 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
3030 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
3031 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
3032 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
3033 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
3034 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
3035 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
3036 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3037 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3038 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3039 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3040 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3041 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
3042 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
3043 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
3044 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
3045 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
3046 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
3047 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
3048 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
3049 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
3050 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
3051 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
3052 ; GFX7LESS-NEXT: .LBB5_1: ; %ComputeLoop
3053 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
3054 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
3055 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
3056 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
3057 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
3058 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
3059 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
3060 ; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
3061 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB5_1
3062 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
3063 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3064 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3065 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3066 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
3067 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3068 ; GFX7LESS-NEXT: s_cbranch_execz .LBB5_5
3069 ; GFX7LESS-NEXT: ; %bb.3:
3070 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
3071 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
3072 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
3073 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
3074 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
3075 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
3076 ; GFX7LESS-NEXT: .LBB5_4: ; %atomicrmw.start
3077 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
3078 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
3079 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
3080 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
3081 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
3082 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
3083 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
3084 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
3085 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
3086 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3087 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
3088 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
3089 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB5_4
3090 ; GFX7LESS-NEXT: .LBB5_5:
3091 ; GFX7LESS-NEXT: s_endpgm
3093 ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3095 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3096 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3097 ; GFX9-NEXT: s_mov_b32 s38, -1
3098 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
3099 ; GFX9-NEXT: s_add_u32 s36, s36, s9
3100 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
3101 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
3102 ; GFX9-NEXT: s_mov_b32 s14, s8
3103 ; GFX9-NEXT: s_add_u32 s8, s34, 44
3104 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
3105 ; GFX9-NEXT: s_getpc_b64 s[2:3]
3106 ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3107 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3108 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3109 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
3110 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3111 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3112 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
3113 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
3114 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
3115 ; GFX9-NEXT: s_mov_b32 s12, s6
3116 ; GFX9-NEXT: s_mov_b32 s13, s7
3117 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
3118 ; GFX9-NEXT: s_mov_b32 s32, 0
3119 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3120 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
3121 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
3122 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
3123 ; GFX9-NEXT: .LBB5_1: ; %ComputeLoop
3124 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3125 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
3126 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
3127 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
3128 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
3129 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
3130 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
3131 ; GFX9-NEXT: s_cbranch_scc1 .LBB5_1
3132 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
3133 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3134 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3135 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3136 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
3137 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3138 ; GFX9-NEXT: s_cbranch_execz .LBB5_5
3139 ; GFX9-NEXT: ; %bb.3:
3140 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3141 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
3142 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
3143 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3144 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
3145 ; GFX9-NEXT: .LBB5_4: ; %atomicrmw.start
3146 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3147 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3148 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
3149 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
3150 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3151 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3152 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3153 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
3154 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
3155 ; GFX9-NEXT: s_cbranch_execnz .LBB5_4
3156 ; GFX9-NEXT: .LBB5_5:
3157 ; GFX9-NEXT: s_endpgm
3159 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3161 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3162 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3163 ; GFX1064-NEXT: s_mov_b32 s38, -1
3164 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
3165 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
3166 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
3167 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
3168 ; GFX1064-NEXT: s_mov_b32 s14, s8
3169 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
3170 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
3171 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
3172 ; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3173 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3174 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3175 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3176 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3177 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
3178 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
3179 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
3180 ; GFX1064-NEXT: s_mov_b32 s12, s6
3181 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
3182 ; GFX1064-NEXT: s_mov_b32 s13, s7
3183 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
3184 ; GFX1064-NEXT: s_mov_b32 s32, 0
3185 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
3186 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
3187 ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
3188 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
3189 ; GFX1064-NEXT: .LBB5_1: ; %ComputeLoop
3190 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
3191 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
3192 ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
3193 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
3194 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
3195 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
3196 ; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
3197 ; GFX1064-NEXT: s_cbranch_scc1 .LBB5_1
3198 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
3199 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3200 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3201 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3202 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
3203 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3204 ; GFX1064-NEXT: s_cbranch_execz .LBB5_5
3205 ; GFX1064-NEXT: ; %bb.3:
3206 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3207 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
3208 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
3209 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
3210 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
3211 ; GFX1064-NEXT: .LBB5_4: ; %atomicrmw.start
3212 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
3213 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
3214 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
3215 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
3216 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
3217 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3218 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
3219 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3220 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
3221 ; GFX1064-NEXT: s_cbranch_execnz .LBB5_4
3222 ; GFX1064-NEXT: .LBB5_5:
3223 ; GFX1064-NEXT: s_endpgm
3225 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3227 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3228 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3229 ; GFX1032-NEXT: s_mov_b32 s38, -1
3230 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
3231 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
3232 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
3233 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
3234 ; GFX1032-NEXT: s_mov_b32 s14, s8
3235 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
3236 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
3237 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
3238 ; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3239 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3240 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3241 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3242 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3243 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
3244 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
3245 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
3246 ; GFX1032-NEXT: s_mov_b32 s12, s6
3247 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
3248 ; GFX1032-NEXT: s_mov_b32 s13, s7
3249 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
3250 ; GFX1032-NEXT: s_mov_b32 s32, 0
3251 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
3252 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
3253 ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
3254 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
3255 ; GFX1032-NEXT: .LBB5_1: ; %ComputeLoop
3256 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
3257 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
3258 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
3259 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
3260 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
3261 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
3262 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
3263 ; GFX1032-NEXT: s_cbranch_scc1 .LBB5_1
3264 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
3265 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3266 ; GFX1032-NEXT: s_mov_b32 s2, 0
3267 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3268 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
3269 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
3270 ; GFX1032-NEXT: s_cbranch_execz .LBB5_5
3271 ; GFX1032-NEXT: ; %bb.3:
3272 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3273 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
3274 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
3275 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
3276 ; GFX1032-NEXT: .LBB5_4: ; %atomicrmw.start
3277 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
3278 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
3279 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
3280 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
3281 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
3282 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
3283 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
3284 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
3285 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
3286 ; GFX1032-NEXT: s_cbranch_execnz .LBB5_4
3287 ; GFX1032-NEXT: .LBB5_5:
3288 ; GFX1032-NEXT: s_endpgm
3290 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3292 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
3293 ; GFX1164-NEXT: s_mov_b32 s14, s8
3294 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
3295 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
3296 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
3297 ; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3298 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3299 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
3300 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
3301 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
3302 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
3303 ; GFX1164-NEXT: s_mov_b32 s12, s6
3304 ; GFX1164-NEXT: s_mov_b32 s13, s7
3305 ; GFX1164-NEXT: s_mov_b32 s32, 0
3306 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
3307 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
3308 ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1
3309 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
3310 ; GFX1164-NEXT: .LBB5_1: ; %ComputeLoop
3311 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
3312 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3313 ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
3314 ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
3315 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
3316 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3317 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
3318 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
3319 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
3320 ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
3321 ; GFX1164-NEXT: s_cbranch_scc1 .LBB5_1
3322 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
3323 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3324 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
3325 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3326 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3327 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
3328 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3329 ; GFX1164-NEXT: s_cbranch_execz .LBB5_4
3330 ; GFX1164-NEXT: ; %bb.3:
3331 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3332 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0
3333 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
3334 ; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
3335 ; GFX1164-NEXT: .LBB5_4:
3336 ; GFX1164-NEXT: s_endpgm
3338 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3340 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
3341 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
3342 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
3343 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
3344 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
3345 ; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3346 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3347 ; GFX1132-NEXT: s_mov_b32 s12, s13
3348 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
3349 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
3350 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
3351 ; GFX1132-NEXT: s_mov_b32 s13, s14
3352 ; GFX1132-NEXT: s_mov_b32 s14, s15
3353 ; GFX1132-NEXT: s_mov_b32 s32, 0
3354 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
3355 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
3356 ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1
3357 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
3358 ; GFX1132-NEXT: .LBB5_1: ; %ComputeLoop
3359 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
3360 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3361 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
3362 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
3363 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
3364 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
3365 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
3366 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
3367 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
3368 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
3369 ; GFX1132-NEXT: s_cbranch_scc1 .LBB5_1
3370 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
3371 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3372 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
3373 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
3374 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
3375 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
3376 ; GFX1132-NEXT: s_cbranch_execz .LBB5_4
3377 ; GFX1132-NEXT: ; %bb.3:
3378 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3379 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0
3380 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
3381 ; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
3382 ; GFX1132-NEXT: .LBB5_4:
3383 ; GFX1132-NEXT: s_endpgm
3385 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3386 ; GFX7LESS-DPP: ; %bb.0:
3387 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
3388 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
3389 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
3390 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
3391 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
3392 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
3393 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
3394 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
3395 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
3396 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
3397 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
3398 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
3399 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
3400 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
3401 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
3402 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3403 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3404 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3405 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3406 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3407 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
3408 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
3409 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3410 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
3411 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
3412 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
3413 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
3414 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
3415 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3416 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
3417 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
3418 ; GFX7LESS-DPP-NEXT: .LBB5_1: ; %atomicrmw.start
3419 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3420 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
3421 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
3422 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
3423 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
3424 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
3425 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
3426 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
3427 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
3428 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3429 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
3430 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
3431 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB5_1
3432 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
3433 ; GFX7LESS-DPP-NEXT: s_endpgm
3435 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3436 ; GFX9-DPP: ; %bb.0:
3437 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3438 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3439 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
3440 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
3441 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
3442 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
3443 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
3444 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
3445 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
3446 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
3447 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
3448 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3449 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3450 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3451 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
3452 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3453 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3454 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3455 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
3456 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
3457 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
3458 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
3459 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
3460 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
3461 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
3462 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3463 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
3464 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
3465 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3466 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3467 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
3468 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
3469 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
3470 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
3471 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
3472 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3473 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3474 ; GFX9-DPP-NEXT: s_nop 1
3475 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
3476 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
3477 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3478 ; GFX9-DPP-NEXT: s_nop 1
3479 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
3480 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
3481 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3482 ; GFX9-DPP-NEXT: s_nop 1
3483 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
3484 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
3485 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3486 ; GFX9-DPP-NEXT: s_nop 1
3487 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
3488 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
3489 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3490 ; GFX9-DPP-NEXT: s_nop 1
3491 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
3492 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
3493 ; GFX9-DPP-NEXT: s_nop 1
3494 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
3495 ; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
3496 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
3497 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
3498 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
3499 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
3500 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB5_3
3501 ; GFX9-DPP-NEXT: ; %bb.1:
3502 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3503 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
3504 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
3505 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
3506 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
3507 ; GFX9-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3508 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3509 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
3510 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
3511 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
3512 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
3513 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3514 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3515 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
3516 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
3517 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB5_2
3518 ; GFX9-DPP-NEXT: .LBB5_3:
3519 ; GFX9-DPP-NEXT: s_endpgm
3521 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3522 ; GFX1064-DPP: ; %bb.0:
3523 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3524 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3525 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
3526 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
3527 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
3528 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
3529 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
3530 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
3531 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
3532 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
3533 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
3534 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3535 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3536 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3537 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3538 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3539 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
3540 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3541 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
3542 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
3543 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
3544 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
3545 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
3546 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
3547 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
3548 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3549 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3550 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3551 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
3552 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
3553 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
3554 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
3555 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
3556 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3557 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
3558 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3559 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
3560 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
3561 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
3562 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
3563 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3564 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
3565 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
3566 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
3567 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
3568 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
3569 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
3570 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
3571 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
3572 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
3573 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
3574 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3575 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3576 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
3577 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
3578 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3579 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
3580 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3581 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
3582 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB5_3
3583 ; GFX1064-DPP-NEXT: ; %bb.1:
3584 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3585 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
3586 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
3587 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
3588 ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
3589 ; GFX1064-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3590 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3591 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
3592 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
3593 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
3594 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
3595 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3596 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
3597 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3598 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
3599 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB5_2
3600 ; GFX1064-DPP-NEXT: .LBB5_3:
3601 ; GFX1064-DPP-NEXT: s_endpgm
3603 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3604 ; GFX1032-DPP: ; %bb.0:
3605 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3606 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3607 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
3608 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
3609 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
3610 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
3611 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
3612 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
3613 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
3614 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
3615 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
3616 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3617 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3618 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3619 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3620 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3621 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
3622 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3623 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
3624 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
3625 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
3626 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
3627 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
3628 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
3629 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
3630 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
3631 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
3632 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3633 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
3634 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
3635 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3636 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
3637 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3638 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
3639 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
3640 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3641 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
3642 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
3643 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
3644 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
3645 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
3646 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
3647 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
3648 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
3649 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
3650 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
3651 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
3652 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
3653 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
3654 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3655 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
3656 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
3657 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
3658 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
3659 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB5_3
3660 ; GFX1032-DPP-NEXT: ; %bb.1:
3661 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3662 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
3663 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
3664 ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
3665 ; GFX1032-DPP-NEXT: .LBB5_2: ; %atomicrmw.start
3666 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
3667 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
3668 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
3669 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
3670 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
3671 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
3672 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
3673 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
3674 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
3675 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB5_2
3676 ; GFX1032-DPP-NEXT: .LBB5_3:
3677 ; GFX1032-DPP-NEXT: s_endpgm
3679 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3680 ; GFX1164-DPP: ; %bb.0:
3681 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
3682 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
3683 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
3684 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
3685 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
3686 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3687 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3688 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
3689 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
3690 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
3691 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3692 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
3693 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
3694 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
3695 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
3696 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
3697 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3698 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
3699 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
3700 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
3701 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
3702 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
3703 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
3704 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3705 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
3706 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
3707 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3708 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3709 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
3710 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
3711 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3712 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3713 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
3714 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3715 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3716 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3717 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
3718 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3719 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3720 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
3721 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
3722 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3723 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
3724 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
3725 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3726 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
3727 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
3728 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3729 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
3730 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
3731 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
3732 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
3733 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
3734 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
3735 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3736 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
3737 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
3738 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
3739 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB5_2
3740 ; GFX1164-DPP-NEXT: ; %bb.1:
3741 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3742 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
3743 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
3744 ; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
3745 ; GFX1164-DPP-NEXT: .LBB5_2:
3746 ; GFX1164-DPP-NEXT: s_endpgm
3748 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
3749 ; GFX1132-DPP: ; %bb.0:
3750 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
3751 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
3752 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
3753 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
3754 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
3755 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3756 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3757 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
3758 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
3759 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
3760 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
3761 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
3762 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
3763 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
3764 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
3765 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
3766 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
3767 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
3768 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
3769 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
3770 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3771 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
3772 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
3773 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
3774 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3775 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
3776 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3777 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
3778 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
3779 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3780 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
3781 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
3782 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
3783 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
3784 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
3785 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
3786 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3787 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
3788 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
3789 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
3790 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
3791 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
3792 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
3793 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
3794 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
3795 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
3796 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3797 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
3798 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
3799 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
3800 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB5_2
3801 ; GFX1132-DPP-NEXT: ; %bb.1:
3802 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
3803 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
3804 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
3805 ; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
3806 ; GFX1132-DPP-NEXT: .LBB5_2:
3807 ; GFX1132-DPP-NEXT: s_endpgm
3808 %divValue = call float @div.float.value()
3809 %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
3814 define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
3815 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
3816 ; GFX7LESS: ; %bb.0:
3817 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
3818 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3819 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3820 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
3821 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
3822 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
3823 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
3824 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
3825 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
3826 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
3827 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
3828 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
3829 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
3830 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3831 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3832 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3833 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3834 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3835 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
3836 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
3837 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
3838 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
3839 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
3840 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
3841 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
3842 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
3843 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
3844 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
3845 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
3846 ; GFX7LESS-NEXT: .LBB6_1: ; %ComputeLoop
3847 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
3848 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
3849 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
3850 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
3851 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
3852 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
3853 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
3854 ; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
3855 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB6_1
3856 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
3857 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
3858 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
3859 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3860 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
3861 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3862 ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_5
3863 ; GFX7LESS-NEXT: ; %bb.3:
3864 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
3865 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
3866 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
3867 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
3868 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
3869 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
3870 ; GFX7LESS-NEXT: .LBB6_4: ; %atomicrmw.start
3871 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
3872 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
3873 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
3874 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
3875 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
3876 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
3877 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
3878 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
3879 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
3880 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
3881 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
3882 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
3883 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_4
3884 ; GFX7LESS-NEXT: .LBB6_5:
3885 ; GFX7LESS-NEXT: s_endpgm
3887 ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
3889 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3890 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3891 ; GFX9-NEXT: s_mov_b32 s38, -1
3892 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
3893 ; GFX9-NEXT: s_add_u32 s36, s36, s9
3894 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
3895 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
3896 ; GFX9-NEXT: s_mov_b32 s14, s8
3897 ; GFX9-NEXT: s_add_u32 s8, s34, 44
3898 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
3899 ; GFX9-NEXT: s_getpc_b64 s[2:3]
3900 ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3901 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3902 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3903 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
3904 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3905 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3906 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
3907 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
3908 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
3909 ; GFX9-NEXT: s_mov_b32 s12, s6
3910 ; GFX9-NEXT: s_mov_b32 s13, s7
3911 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
3912 ; GFX9-NEXT: s_mov_b32 s32, 0
3913 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3914 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
3915 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
3916 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
3917 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop
3918 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3919 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
3920 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
3921 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
3922 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
3923 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
3924 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
3925 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1
3926 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
3927 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3928 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3929 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3930 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
3931 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3932 ; GFX9-NEXT: s_cbranch_execz .LBB6_5
3933 ; GFX9-NEXT: ; %bb.3:
3934 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
3935 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
3936 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
3937 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3938 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
3939 ; GFX9-NEXT: .LBB6_4: ; %atomicrmw.start
3940 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3941 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3942 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
3943 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
3944 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3945 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
3946 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
3947 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
3948 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
3949 ; GFX9-NEXT: s_cbranch_execnz .LBB6_4
3950 ; GFX9-NEXT: .LBB6_5:
3951 ; GFX9-NEXT: s_endpgm
3953 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
3955 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
3956 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
3957 ; GFX1064-NEXT: s_mov_b32 s38, -1
3958 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
3959 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
3960 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
3961 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
3962 ; GFX1064-NEXT: s_mov_b32 s14, s8
3963 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
3964 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
3965 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
3966 ; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
3967 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
3968 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
3969 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
3970 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
3971 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
3972 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
3973 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
3974 ; GFX1064-NEXT: s_mov_b32 s12, s6
3975 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
3976 ; GFX1064-NEXT: s_mov_b32 s13, s7
3977 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
3978 ; GFX1064-NEXT: s_mov_b32 s32, 0
3979 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
3980 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
3981 ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
3982 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
3983 ; GFX1064-NEXT: .LBB6_1: ; %ComputeLoop
3984 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
3985 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
3986 ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
3987 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
3988 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
3989 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
3990 ; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
3991 ; GFX1064-NEXT: s_cbranch_scc1 .LBB6_1
3992 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
3993 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
3994 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
3995 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
3996 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
3997 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
3998 ; GFX1064-NEXT: s_cbranch_execz .LBB6_5
3999 ; GFX1064-NEXT: ; %bb.3:
4000 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
4001 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
4002 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
4003 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
4004 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
4005 ; GFX1064-NEXT: .LBB6_4: ; %atomicrmw.start
4006 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
4007 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
4008 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
4009 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4010 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
4011 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4012 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
4013 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4014 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
4015 ; GFX1064-NEXT: s_cbranch_execnz .LBB6_4
4016 ; GFX1064-NEXT: .LBB6_5:
4017 ; GFX1064-NEXT: s_endpgm
4019 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4021 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
4022 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
4023 ; GFX1032-NEXT: s_mov_b32 s38, -1
4024 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
4025 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
4026 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
4027 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
4028 ; GFX1032-NEXT: s_mov_b32 s14, s8
4029 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
4030 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
4031 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
4032 ; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4033 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4034 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
4035 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
4036 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
4037 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
4038 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
4039 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
4040 ; GFX1032-NEXT: s_mov_b32 s12, s6
4041 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
4042 ; GFX1032-NEXT: s_mov_b32 s13, s7
4043 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
4044 ; GFX1032-NEXT: s_mov_b32 s32, 0
4045 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
4046 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
4047 ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
4048 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
4049 ; GFX1032-NEXT: .LBB6_1: ; %ComputeLoop
4050 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
4051 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
4052 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
4053 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
4054 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
4055 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
4056 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
4057 ; GFX1032-NEXT: s_cbranch_scc1 .LBB6_1
4058 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
4059 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4060 ; GFX1032-NEXT: s_mov_b32 s2, 0
4061 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4062 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
4063 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
4064 ; GFX1032-NEXT: s_cbranch_execz .LBB6_5
4065 ; GFX1032-NEXT: ; %bb.3:
4066 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
4067 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
4068 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
4069 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
4070 ; GFX1032-NEXT: .LBB6_4: ; %atomicrmw.start
4071 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
4072 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
4073 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
4074 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4075 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
4076 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
4077 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
4078 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
4079 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
4080 ; GFX1032-NEXT: s_cbranch_execnz .LBB6_4
4081 ; GFX1032-NEXT: .LBB6_5:
4082 ; GFX1032-NEXT: s_endpgm
4084 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4086 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
4087 ; GFX1164-NEXT: s_mov_b32 s14, s8
4088 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
4089 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
4090 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
4091 ; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4092 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4093 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
4094 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
4095 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
4096 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
4097 ; GFX1164-NEXT: s_mov_b32 s12, s6
4098 ; GFX1164-NEXT: s_mov_b32 s13, s7
4099 ; GFX1164-NEXT: s_mov_b32 s32, 0
4100 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
4101 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
4102 ; GFX1164-NEXT: v_bfrev_b32_e32 v1, 1
4103 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
4104 ; GFX1164-NEXT: .LBB6_1: ; %ComputeLoop
4105 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
4106 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4107 ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
4108 ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
4109 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
4110 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4111 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
4112 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
4113 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
4114 ; GFX1164-NEXT: v_add_f32_e32 v1, s4, v1
4115 ; GFX1164-NEXT: s_cbranch_scc1 .LBB6_1
4116 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
4117 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4118 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
4119 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4120 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4121 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
4122 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
4123 ; GFX1164-NEXT: s_cbranch_execz .LBB6_4
4124 ; GFX1164-NEXT: ; %bb.3:
4125 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
4126 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0
4127 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
4128 ; GFX1164-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
4129 ; GFX1164-NEXT: .LBB6_4:
4130 ; GFX1164-NEXT: s_endpgm
4132 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4134 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
4135 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
4136 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
4137 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
4138 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
4139 ; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4140 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4141 ; GFX1132-NEXT: s_mov_b32 s12, s13
4142 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
4143 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
4144 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
4145 ; GFX1132-NEXT: s_mov_b32 s13, s14
4146 ; GFX1132-NEXT: s_mov_b32 s14, s15
4147 ; GFX1132-NEXT: s_mov_b32 s32, 0
4148 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
4149 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
4150 ; GFX1132-NEXT: v_bfrev_b32_e32 v1, 1
4151 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
4152 ; GFX1132-NEXT: .LBB6_1: ; %ComputeLoop
4153 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
4154 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4155 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
4156 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
4157 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
4158 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
4159 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
4160 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
4161 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
4162 ; GFX1132-NEXT: v_add_f32_e32 v1, s2, v1
4163 ; GFX1132-NEXT: s_cbranch_scc1 .LBB6_1
4164 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
4165 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4166 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
4167 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
4168 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
4169 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
4170 ; GFX1132-NEXT: s_cbranch_execz .LBB6_4
4171 ; GFX1132-NEXT: ; %bb.3:
4172 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
4173 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0
4174 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
4175 ; GFX1132-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
4176 ; GFX1132-NEXT: .LBB6_4:
4177 ; GFX1132-NEXT: s_endpgm
4179 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4180 ; GFX7LESS-DPP: ; %bb.0:
4181 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
4182 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
4183 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
4184 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
4185 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
4186 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
4187 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
4188 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
4189 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
4190 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
4191 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
4192 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
4193 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
4194 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
4195 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
4196 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4197 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4198 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
4199 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
4200 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
4201 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
4202 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
4203 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
4204 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
4205 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
4206 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
4207 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
4208 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
4209 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
4210 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
4211 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
4212 ; GFX7LESS-DPP-NEXT: .LBB6_1: ; %atomicrmw.start
4213 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4214 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
4215 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
4216 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
4217 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
4218 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
4219 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
4220 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
4221 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
4222 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
4223 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
4224 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
4225 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB6_1
4226 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
4227 ; GFX7LESS-DPP-NEXT: s_endpgm
4229 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4230 ; GFX9-DPP: ; %bb.0:
4231 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
4232 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
4233 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
4234 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
4235 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
4236 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
4237 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
4238 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
4239 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
4240 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
4241 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
4242 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4243 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4244 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
4245 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
4246 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
4247 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
4248 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
4249 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
4250 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
4251 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
4252 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
4253 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
4254 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
4255 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
4256 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
4257 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
4258 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
4259 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4260 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4261 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
4262 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
4263 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
4264 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
4265 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
4266 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4267 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4268 ; GFX9-DPP-NEXT: s_nop 1
4269 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
4270 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
4271 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4272 ; GFX9-DPP-NEXT: s_nop 1
4273 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
4274 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
4275 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4276 ; GFX9-DPP-NEXT: s_nop 1
4277 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
4278 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
4279 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4280 ; GFX9-DPP-NEXT: s_nop 1
4281 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
4282 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
4283 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4284 ; GFX9-DPP-NEXT: s_nop 1
4285 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
4286 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
4287 ; GFX9-DPP-NEXT: s_nop 1
4288 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
4289 ; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
4290 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
4291 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
4292 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
4293 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
4294 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
4295 ; GFX9-DPP-NEXT: ; %bb.1:
4296 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
4297 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
4298 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
4299 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
4300 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
4301 ; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
4302 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4303 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
4304 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
4305 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
4306 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
4307 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4308 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4309 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
4310 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
4311 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
4312 ; GFX9-DPP-NEXT: .LBB6_3:
4313 ; GFX9-DPP-NEXT: s_endpgm
4315 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4316 ; GFX1064-DPP: ; %bb.0:
4317 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
4318 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
4319 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
4320 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
4321 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
4322 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
4323 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
4324 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
4325 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
4326 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
4327 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
4328 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4329 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4330 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
4331 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
4332 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
4333 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
4334 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
4335 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
4336 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
4337 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
4338 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
4339 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
4340 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
4341 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
4342 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
4343 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4344 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4345 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
4346 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
4347 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
4348 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
4349 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
4350 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4351 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
4352 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4353 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
4354 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
4355 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
4356 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
4357 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4358 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
4359 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
4360 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
4361 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
4362 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
4363 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
4364 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
4365 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
4366 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
4367 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
4368 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4369 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4370 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
4371 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
4372 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
4373 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
4374 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4375 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
4376 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
4377 ; GFX1064-DPP-NEXT: ; %bb.1:
4378 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
4379 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
4380 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
4381 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
4382 ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
4383 ; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
4384 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4385 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
4386 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
4387 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
4388 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
4389 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4390 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
4391 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4392 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
4393 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
4394 ; GFX1064-DPP-NEXT: .LBB6_3:
4395 ; GFX1064-DPP-NEXT: s_endpgm
4397 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4398 ; GFX1032-DPP: ; %bb.0:
4399 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
4400 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
4401 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
4402 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
4403 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
4404 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
4405 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
4406 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
4407 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
4408 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
4409 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
4410 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4411 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4412 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
4413 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
4414 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
4415 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
4416 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
4417 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
4418 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
4419 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
4420 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
4421 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
4422 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
4423 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
4424 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
4425 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
4426 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4427 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
4428 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
4429 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
4430 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
4431 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
4432 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
4433 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
4434 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4435 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
4436 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
4437 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
4438 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
4439 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
4440 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
4441 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
4442 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
4443 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
4444 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
4445 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
4446 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
4447 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
4448 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4449 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
4450 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
4451 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4452 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
4453 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
4454 ; GFX1032-DPP-NEXT: ; %bb.1:
4455 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
4456 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
4457 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
4458 ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
4459 ; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
4460 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4461 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
4462 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
4463 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
4464 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
4465 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
4466 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
4467 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
4468 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
4469 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
4470 ; GFX1032-DPP-NEXT: .LBB6_3:
4471 ; GFX1032-DPP-NEXT: s_endpgm
4473 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4474 ; GFX1164-DPP: ; %bb.0:
4475 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
4476 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
4477 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
4478 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
4479 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
4480 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4481 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4482 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
4483 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
4484 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
4485 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
4486 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
4487 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
4488 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
4489 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
4490 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
4491 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4492 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
4493 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
4494 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
4495 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
4496 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
4497 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
4498 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4499 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
4500 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
4501 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4502 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4503 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
4504 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
4505 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
4506 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4507 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
4508 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4509 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
4510 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4511 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
4512 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
4513 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4514 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
4515 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
4516 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4517 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
4518 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
4519 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4520 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
4521 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
4522 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
4523 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
4524 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
4525 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
4526 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
4527 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
4528 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
4529 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4530 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
4531 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
4532 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
4533 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_2
4534 ; GFX1164-DPP-NEXT: ; %bb.1:
4535 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
4536 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
4537 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
4538 ; GFX1164-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
4539 ; GFX1164-DPP-NEXT: .LBB6_2:
4540 ; GFX1164-DPP-NEXT: s_endpgm
4542 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
4543 ; GFX1132-DPP: ; %bb.0:
4544 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
4545 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
4546 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
4547 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
4548 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
4549 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
4550 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
4551 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
4552 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
4553 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
4554 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
4555 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
4556 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
4557 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
4558 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
4559 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
4560 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
4561 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
4562 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
4563 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
4564 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
4565 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
4566 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
4567 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
4568 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4569 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
4570 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4571 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
4572 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
4573 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
4574 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
4575 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
4576 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
4577 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
4578 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
4579 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
4580 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4581 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
4582 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
4583 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
4584 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
4585 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
4586 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
4587 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
4588 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
4589 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
4590 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
4591 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
4592 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
4593 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
4594 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_2
4595 ; GFX1132-DPP-NEXT: ; %bb.1:
4596 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
4597 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
4598 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
4599 ; GFX1132-DPP-NEXT: global_atomic_add_f32 v4, v0, s[0:1]
4600 ; GFX1132-DPP-NEXT: .LBB6_2:
4601 ; GFX1132-DPP-NEXT: s_endpgm
4602 %divValue = call float @div.float.value() strictfp
4603 %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue syncscope("agent") monotonic
; Kernel under test: a single `atomicrmw fadd` of the constant 4.0 to the
; global (addrspace 1) pointer passed as the kernel argument, using monotonic
; ordering and no explicit syncscope.
;
; For every configuration whose expectations are visible below, the expected
; output shows that:
;   * only the lane whose mbcnt result equals 0 performs the memory update;
;   * the value added is 4.0 multiplied by the active-lane count (s_bcnt1 of
;     the exec mask, converted through f64 to f32);
;   * the fadd itself is emitted as a compare-and-swap retry loop
;     (%atomicrmw.start) rather than a native floating-point atomic add.
;
; NOTE(review): the strictfp behaviour suggested by the function name
; presumably comes from attribute group #2, which is defined outside this
; excerpt - confirm.
;
; The assertion lines are machine generated; do not edit them by hand,
; regenerate them with utils/update_llc_test_checks.py instead.
4607 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
4608 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4609 ; GFX7LESS: ; %bb.0:
4610 ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4611 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4612 ; GFX7LESS-NEXT: s_mov_b32 s14, -1
4613 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
4614 ; GFX7LESS-NEXT: s_add_u32 s12, s12, s9
4615 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
4616 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
4617 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
4618 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
4619 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4620 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
4621 ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_3
4622 ; GFX7LESS-NEXT: ; %bb.1:
4623 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[0:1]
4624 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
4625 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
4626 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
4627 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
4628 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
4629 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
4630 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0x0
4631 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
4632 ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
4633 ; GFX7LESS-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4634 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
4635 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
4636 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s2
4637 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
4638 ; GFX7LESS-NEXT: .LBB7_2: ; %atomicrmw.start
4639 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
4640 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
4641 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
4642 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
4643 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
4644 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
4645 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
4646 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
4647 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4648 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
4649 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
4650 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_2
4651 ; GFX7LESS-NEXT: .LBB7_3:
4652 ; GFX7LESS-NEXT: s_endpgm
4654 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4656 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4657 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4658 ; GFX9-NEXT: s_mov_b32 s14, -1
4659 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
4660 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000
4661 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
4662 ; GFX9-NEXT: s_add_u32 s12, s12, s9
4663 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
4664 ; GFX9-NEXT: s_addc_u32 s13, s13, 0
4665 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4666 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
4667 ; GFX9-NEXT: s_cbranch_execz .LBB7_3
4668 ; GFX9-NEXT: ; %bb.1:
4669 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
4670 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
4671 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
4672 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000
4673 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
4674 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
4675 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
4676 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
4677 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4678 ; GFX9-NEXT: s_load_dword s4, s[0:1], 0x0
4679 ; GFX9-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4680 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
4681 ; GFX9-NEXT: v_mov_b32_e32 v1, s4
4682 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
4683 ; GFX9-NEXT: .LBB7_2: ; %atomicrmw.start
4684 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
4685 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
4686 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4687 ; GFX9-NEXT: s_waitcnt vmcnt(0)
4688 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4689 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4690 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
4691 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
4692 ; GFX9-NEXT: s_cbranch_execnz .LBB7_2
4693 ; GFX9-NEXT: .LBB7_3:
4694 ; GFX9-NEXT: s_endpgm
4696 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4698 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
4699 ; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4700 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
4701 ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4702 ; GFX1064-NEXT: s_mov_b32 s14, -1
4703 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
4704 ; GFX1064-NEXT: s_add_u32 s12, s12, s9
4705 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
4706 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0
4707 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4708 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
4709 ; GFX1064-NEXT: s_cbranch_execz .LBB7_3
4710 ; GFX1064-NEXT: ; %bb.1:
4711 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
4712 ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
4713 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
4714 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
4715 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
4716 ; GFX1064-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4717 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
4718 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
4719 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
4720 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
4721 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
4722 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
4723 ; GFX1064-NEXT: .LBB7_2: ; %atomicrmw.start
4724 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
4725 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
4726 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4727 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
4728 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4729 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
4730 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4731 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
4732 ; GFX1064-NEXT: s_cbranch_execnz .LBB7_2
4733 ; GFX1064-NEXT: .LBB7_3:
4734 ; GFX1064-NEXT: s_endpgm
4736 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4738 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
4739 ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4740 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
4741 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4742 ; GFX1032-NEXT: s_mov_b32 s14, -1
4743 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
4744 ; GFX1032-NEXT: s_add_u32 s12, s12, s9
4745 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4746 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0
4747 ; GFX1032-NEXT: s_mov_b32 s4, 0
4748 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
4749 ; GFX1032-NEXT: s_cbranch_execz .LBB7_3
4750 ; GFX1032-NEXT: ; %bb.1:
4751 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s0
4752 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
4753 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
4754 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
4755 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
4756 ; GFX1032-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4757 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
4758 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
4759 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
4760 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2
4761 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
4762 ; GFX1032-NEXT: .LBB7_2: ; %atomicrmw.start
4763 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
4764 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
4765 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4766 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
4767 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
4768 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
4769 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
4770 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
4771 ; GFX1032-NEXT: s_cbranch_execnz .LBB7_2
4772 ; GFX1032-NEXT: .LBB7_3:
4773 ; GFX1032-NEXT: s_endpgm
4775 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4777 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
4778 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
4779 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
4780 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
4781 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
4782 ; GFX1164-NEXT: s_clause 0x1
4783 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
4784 ; GFX1164-NEXT: scratch_store_b32 off, v1, off
4785 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
4786 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
4787 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
4788 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
4789 ; GFX1164-NEXT: s_cbranch_execz .LBB7_3
4790 ; GFX1164-NEXT: ; %bb.1:
4791 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
4792 ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
4793 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
4794 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
4795 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
4796 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
4797 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
4798 ; GFX1164-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4799 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
4800 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
4801 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
4802 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
4803 ; GFX1164-NEXT: .LBB7_2: ; %atomicrmw.start
4804 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
4805 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
4806 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
4807 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
4808 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
4809 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4810 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
4811 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4812 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4813 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
4814 ; GFX1164-NEXT: s_cbranch_execnz .LBB7_2
4815 ; GFX1164-NEXT: .LBB7_3:
4816 ; GFX1164-NEXT: s_endpgm
4818 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4820 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
4821 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4822 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
4823 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
4824 ; GFX1132-NEXT: s_mov_b32 s4, 0
4825 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
4826 ; GFX1132-NEXT: s_clause 0x1
4827 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
4828 ; GFX1132-NEXT: scratch_store_b32 off, v1, off
4829 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
4830 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
4831 ; GFX1132-NEXT: s_cbranch_execz .LBB7_3
4832 ; GFX1132-NEXT: ; %bb.1:
4833 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
4834 ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
4835 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
4836 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
4837 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
4838 ; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
4839 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
4840 ; GFX1132-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4841 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
4842 ; GFX1132-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0
4843 ; GFX1132-NEXT: .LBB7_2: ; %atomicrmw.start
4844 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
4845 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
4846 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
4847 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
4848 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
4849 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
4850 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
4851 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
4852 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
4853 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
4854 ; GFX1132-NEXT: s_cbranch_execnz .LBB7_2
4855 ; GFX1132-NEXT: .LBB7_3:
4856 ; GFX1132-NEXT: s_endpgm
4858 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4859 ; GFX7LESS-DPP: ; %bb.0:
4860 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4861 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4862 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
4863 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
4864 ; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9
4865 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
4866 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec
4867 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
4868 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
4869 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4870 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
4871 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB7_3
4872 ; GFX7LESS-DPP-NEXT: ; %bb.1:
4873 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[0:1]
4874 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
4875 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
4876 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
4877 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
4878 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
4879 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
4880 ; GFX7LESS-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
4881 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
4882 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
4883 ; GFX7LESS-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4884 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
4885 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
4886 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s2
4887 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
4888 ; GFX7LESS-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
4889 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4890 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
4891 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
4892 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
4893 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
4894 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
4895 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
4896 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
4897 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
4898 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
4899 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
4900 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB7_2
4901 ; GFX7LESS-DPP-NEXT: .LBB7_3:
4902 ; GFX7LESS-DPP-NEXT: s_endpgm
4904 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4905 ; GFX9-DPP: ; %bb.0:
4906 ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4907 ; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4908 ; GFX9-DPP-NEXT: s_mov_b32 s14, -1
4909 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
4910 ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000
4911 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
4912 ; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9
4913 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
4914 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0
4915 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4916 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
4917 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB7_3
4918 ; GFX9-DPP-NEXT: ; %bb.1:
4919 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
4920 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
4921 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
4922 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
4923 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
4924 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
4925 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
4926 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
4927 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
4928 ; GFX9-DPP-NEXT: s_load_dword s4, s[0:1], 0x0
4929 ; GFX9-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4930 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
4931 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s4
4932 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
4933 ; GFX9-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
4934 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4935 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
4936 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4937 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
4938 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4939 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4940 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
4941 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
4942 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_2
4943 ; GFX9-DPP-NEXT: .LBB7_3:
4944 ; GFX9-DPP-NEXT: s_endpgm
4946 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4947 ; GFX1064-DPP: ; %bb.0:
4948 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec
4949 ; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4950 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
4951 ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4952 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1
4953 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000
4954 ; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9
4955 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
4956 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0
4957 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
4958 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
4959 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB7_3
4960 ; GFX1064-DPP-NEXT: ; %bb.1:
4961 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
4962 ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
4963 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
4964 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
4965 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
4966 ; GFX1064-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
4967 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
4968 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
4969 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
4970 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
4971 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
4972 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
4973 ; GFX1064-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
4974 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
4975 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
4976 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
4977 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
4978 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
4979 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
4980 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
4981 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
4982 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_2
4983 ; GFX1064-DPP-NEXT: .LBB7_3:
4984 ; GFX1064-DPP-NEXT: s_endpgm
4986 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
4987 ; GFX1032-DPP: ; %bb.0:
4988 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
4989 ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
4990 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
4991 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
4992 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
4993 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
4994 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
4995 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
4996 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
4997 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
4998 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
4999 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB7_3
5000 ; GFX1032-DPP-NEXT: ; %bb.1:
5001 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s0
5002 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
5003 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
5004 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
5005 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
5006 ; GFX1032-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
5007 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
5008 ; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
5009 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
5010 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
5011 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
5012 ; GFX1032-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
5013 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5014 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
5015 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
5016 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
5017 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
5018 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
5019 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
5020 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
5021 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_2
5022 ; GFX1032-DPP-NEXT: .LBB7_3:
5023 ; GFX1032-DPP-NEXT: s_endpgm
5025 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
5026 ; GFX1164-DPP: ; %bb.0:
5027 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
5028 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
5029 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
5030 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
5031 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
5032 ; GFX1164-DPP-NEXT: s_clause 0x1
5033 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
5034 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
5035 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
5036 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
5037 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
5038 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
5039 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB7_3
5040 ; GFX1164-DPP-NEXT: ; %bb.1:
5041 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
5042 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
5043 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
5044 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
5045 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
5046 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
5047 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2)
5048 ; GFX1164-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
5049 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
5050 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
5051 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
5052 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
5053 ; GFX1164-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
5054 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5055 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
5056 ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
5057 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
5058 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
5059 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5060 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
5061 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5062 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5063 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
5064 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_2
5065 ; GFX1164-DPP-NEXT: .LBB7_3:
5066 ; GFX1164-DPP-NEXT: s_endpgm
5068 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
5069 ; GFX1132-DPP: ; %bb.0:
5070 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
5071 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5072 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
5073 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
5074 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
5075 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
5076 ; GFX1132-DPP-NEXT: s_clause 0x1
5077 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
5078 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
5079 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
5080 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
5081 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB7_3
5082 ; GFX1132-DPP-NEXT: ; %bb.1:
5083 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
5084 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
5085 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
5086 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
5087 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
5088 ; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
5089 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
5090 ; GFX1132-DPP-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
5091 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
5092 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mul_f32 v2, 4.0, v0
5093 ; GFX1132-DPP-NEXT: .LBB7_2: ; %atomicrmw.start
5094 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5095 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
5096 ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
5097 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
5098 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
5099 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
5100 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
5101 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
5102 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5103 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
5104 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_2
5105 ; GFX1132-DPP-NEXT: .LBB7_3:
5106 ; GFX1132-DPP-NEXT: s_endpgm
; The operation being exercised: the same constant (4.0) is added by every
; lane to the same address, and the result value is not used afterwards in the
; visible part of this function.
5107 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4
5111 define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
5112 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5113 ; GFX7LESS: ; %bb.0:
5114 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
5115 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5116 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5117 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
5118 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
5119 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
5120 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
5121 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
5122 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
5123 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
5124 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
5125 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
5126 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
5127 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5128 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5129 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5130 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5131 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5132 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
5133 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
5134 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
5135 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
5136 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
5137 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
5138 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
5139 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
5140 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
5141 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
5142 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v2, 1
5143 ; GFX7LESS-NEXT: .LBB8_1: ; %ComputeLoop
5144 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
5145 ; GFX7LESS-NEXT: s_ff1_i32_b64 s2, s[0:1]
5146 ; GFX7LESS-NEXT: v_readlane_b32 s4, v0, s2
5147 ; GFX7LESS-NEXT: s_lshl_b64 s[2:3], 1, s2
5148 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
5149 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[2:3], s[0:1], 0
5150 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[2:3]
5151 ; GFX7LESS-NEXT: v_add_f32_e32 v2, s4, v2
5152 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB8_1
5153 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
5154 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
5155 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
5156 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
5157 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
5158 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
5159 ; GFX7LESS-NEXT: s_cbranch_execz .LBB8_5
5160 ; GFX7LESS-NEXT: ; %bb.3:
5161 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
5162 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
5163 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
5164 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
5165 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[0:3], 0
5166 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
5167 ; GFX7LESS-NEXT: .LBB8_4: ; %atomicrmw.start
5168 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
5169 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
5170 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
5171 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
5172 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
5173 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
5174 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
5175 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
5176 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
5177 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
5178 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
5179 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
5180 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_4
5181 ; GFX7LESS-NEXT: .LBB8_5:
5182 ; GFX7LESS-NEXT: s_endpgm
5184 ; GFX9-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5186 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5187 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5188 ; GFX9-NEXT: s_mov_b32 s38, -1
5189 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
5190 ; GFX9-NEXT: s_add_u32 s36, s36, s9
5191 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
5192 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
5193 ; GFX9-NEXT: s_mov_b32 s14, s8
5194 ; GFX9-NEXT: s_add_u32 s8, s34, 44
5195 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
5196 ; GFX9-NEXT: s_getpc_b64 s[2:3]
5197 ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5198 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5199 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5200 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
5201 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5202 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5203 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
5204 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
5205 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
5206 ; GFX9-NEXT: s_mov_b32 s12, s6
5207 ; GFX9-NEXT: s_mov_b32 s13, s7
5208 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
5209 ; GFX9-NEXT: s_mov_b32 s32, 0
5210 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5211 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
5212 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
5213 ; GFX9-NEXT: v_bfrev_b32_e32 v2, 1
5214 ; GFX9-NEXT: .LBB8_1: ; %ComputeLoop
5215 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5216 ; GFX9-NEXT: s_ff1_i32_b64 s2, s[0:1]
5217 ; GFX9-NEXT: v_readlane_b32 s4, v0, s2
5218 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s2
5219 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
5220 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
5221 ; GFX9-NEXT: v_add_f32_e32 v2, s4, v2
5222 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1
5223 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
5224 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5225 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5226 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
5227 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
5228 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
5229 ; GFX9-NEXT: s_cbranch_execz .LBB8_5
5230 ; GFX9-NEXT: ; %bb.3:
5231 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
5232 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
5233 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
5234 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
5235 ; GFX9-NEXT: global_load_dword v1, v3, s[0:1]
5236 ; GFX9-NEXT: .LBB8_4: ; %atomicrmw.start
5237 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
5238 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5239 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
5240 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
5241 ; GFX9-NEXT: s_waitcnt vmcnt(0)
5242 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5243 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5244 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
5245 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
5246 ; GFX9-NEXT: s_cbranch_execnz .LBB8_4
5247 ; GFX9-NEXT: .LBB8_5:
5248 ; GFX9-NEXT: s_endpgm
5250 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5252 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5253 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5254 ; GFX1064-NEXT: s_mov_b32 s38, -1
5255 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
5256 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
5257 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
5258 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
5259 ; GFX1064-NEXT: s_mov_b32 s14, s8
5260 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
5261 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
5262 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
5263 ; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5264 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5265 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5266 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5267 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5268 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
5269 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
5270 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
5271 ; GFX1064-NEXT: s_mov_b32 s12, s6
5272 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
5273 ; GFX1064-NEXT: s_mov_b32 s13, s7
5274 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
5275 ; GFX1064-NEXT: s_mov_b32 s32, 0
5276 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
5277 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
5278 ; GFX1064-NEXT: v_bfrev_b32_e32 v2, 1
5279 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
5280 ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop
5281 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
5282 ; GFX1064-NEXT: s_ff1_i32_b64 s2, s[0:1]
5283 ; GFX1064-NEXT: v_readlane_b32 s4, v0, s2
5284 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s2
5285 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
5286 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
5287 ; GFX1064-NEXT: v_add_f32_e32 v2, s4, v2
5288 ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1
5289 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
5290 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5291 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5292 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
5293 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
5294 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
5295 ; GFX1064-NEXT: s_cbranch_execz .LBB8_5
5296 ; GFX1064-NEXT: ; %bb.3:
5297 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
5298 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
5299 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
5300 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
5301 ; GFX1064-NEXT: global_load_dword v1, v3, s[0:1]
5302 ; GFX1064-NEXT: .LBB8_4: ; %atomicrmw.start
5303 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
5304 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
5305 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
5306 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
5307 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
5308 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5309 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
5310 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5311 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
5312 ; GFX1064-NEXT: s_cbranch_execnz .LBB8_4
5313 ; GFX1064-NEXT: .LBB8_5:
5314 ; GFX1064-NEXT: s_endpgm
5316 ; GFX1032-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5318 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5319 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5320 ; GFX1032-NEXT: s_mov_b32 s38, -1
5321 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
5322 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
5323 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
5324 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
5325 ; GFX1032-NEXT: s_mov_b32 s14, s8
5326 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
5327 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
5328 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
5329 ; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5330 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5331 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5332 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5333 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5334 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
5335 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
5336 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
5337 ; GFX1032-NEXT: s_mov_b32 s12, s6
5338 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
5339 ; GFX1032-NEXT: s_mov_b32 s13, s7
5340 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
5341 ; GFX1032-NEXT: s_mov_b32 s32, 0
5342 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
5343 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
5344 ; GFX1032-NEXT: v_bfrev_b32_e32 v2, 1
5345 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
5346 ; GFX1032-NEXT: .LBB8_1: ; %ComputeLoop
5347 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
5348 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
5349 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
5350 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
5351 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
5352 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
5353 ; GFX1032-NEXT: v_add_f32_e32 v2, s2, v2
5354 ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1
5355 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
5356 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5357 ; GFX1032-NEXT: s_mov_b32 s2, 0
5358 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
5359 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
5360 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
5361 ; GFX1032-NEXT: s_cbranch_execz .LBB8_5
5362 ; GFX1032-NEXT: ; %bb.3:
5363 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
5364 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
5365 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
5366 ; GFX1032-NEXT: global_load_dword v1, v3, s[0:1]
5367 ; GFX1032-NEXT: .LBB8_4: ; %atomicrmw.start
5368 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
5369 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
5370 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
5371 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
5372 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
5373 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
5374 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
5375 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
5376 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
5377 ; GFX1032-NEXT: s_cbranch_execnz .LBB8_4
5378 ; GFX1032-NEXT: .LBB8_5:
5379 ; GFX1032-NEXT: s_endpgm
5381 ; GFX1164-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5383 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
5384 ; GFX1164-NEXT: s_mov_b32 s14, s8
5385 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
5386 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
5387 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
5388 ; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5389 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5390 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
5391 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
5392 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
5393 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
5394 ; GFX1164-NEXT: s_mov_b32 s12, s6
5395 ; GFX1164-NEXT: s_mov_b32 s13, s7
5396 ; GFX1164-NEXT: s_mov_b32 s32, 0
5397 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
5398 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
5399 ; GFX1164-NEXT: v_bfrev_b32_e32 v2, 1
5400 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
5401 ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop
5402 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
5403 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5404 ; GFX1164-NEXT: s_ctz_i32_b64 s2, s[0:1]
5405 ; GFX1164-NEXT: v_readlane_b32 s4, v0, s2
5406 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s2
5407 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5408 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
5409 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
5410 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
5411 ; GFX1164-NEXT: v_add_f32_e32 v2, s4, v2
5412 ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1
5413 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
5414 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5415 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
5416 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5417 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5418 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
5419 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
5420 ; GFX1164-NEXT: s_cbranch_execz .LBB8_5
5421 ; GFX1164-NEXT: ; %bb.3:
5422 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
5423 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
5424 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
5425 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
5426 ; GFX1164-NEXT: global_load_b32 v1, v3, s[0:1]
5427 ; GFX1164-NEXT: .LBB8_4: ; %atomicrmw.start
5428 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
5429 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
5430 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
5431 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
5432 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
5433 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5434 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
5435 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5436 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5437 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
5438 ; GFX1164-NEXT: s_cbranch_execnz .LBB8_4
5439 ; GFX1164-NEXT: .LBB8_5:
5440 ; GFX1164-NEXT: s_endpgm
5442 ; GFX1132-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5444 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
5445 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
5446 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
5447 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
5448 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
5449 ; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5450 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5451 ; GFX1132-NEXT: s_mov_b32 s12, s13
5452 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
5453 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
5454 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
5455 ; GFX1132-NEXT: s_mov_b32 s13, s14
5456 ; GFX1132-NEXT: s_mov_b32 s14, s15
5457 ; GFX1132-NEXT: s_mov_b32 s32, 0
5458 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
5459 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
5460 ; GFX1132-NEXT: v_bfrev_b32_e32 v2, 1
5461 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
5462 ; GFX1132-NEXT: .LBB8_1: ; %ComputeLoop
5463 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
5464 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5465 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
5466 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
5467 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
5468 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
5469 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
5470 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
5471 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
5472 ; GFX1132-NEXT: v_add_f32_e32 v2, s2, v2
5473 ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1
5474 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
5475 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5476 ; GFX1132-NEXT: s_mov_b32 s2, 0
5477 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
5478 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
5479 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
5480 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
5481 ; GFX1132-NEXT: s_cbranch_execz .LBB8_5
5482 ; GFX1132-NEXT: ; %bb.3:
5483 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
5484 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
5485 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
5486 ; GFX1132-NEXT: global_load_b32 v1, v3, s[0:1]
5487 ; GFX1132-NEXT: .LBB8_4: ; %atomicrmw.start
5488 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
5489 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
5490 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
5491 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
5492 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
5493 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
5494 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
5495 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
5496 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5497 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
5498 ; GFX1132-NEXT: s_cbranch_execnz .LBB8_4
5499 ; GFX1132-NEXT: .LBB8_5:
5500 ; GFX1132-NEXT: s_endpgm
5502 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5503 ; GFX7LESS-DPP: ; %bb.0:
5504 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
5505 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
5506 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
5507 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
5508 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
5509 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
5510 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
5511 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
5512 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
5513 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
5514 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
5515 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
5516 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
5517 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
5518 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
5519 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5520 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5521 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5522 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5523 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5524 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
5525 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
5526 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
5527 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
5528 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
5529 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
5530 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
5531 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
5532 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
5533 ; GFX7LESS-DPP-NEXT: buffer_load_dword v2, off, s[36:39], 0
5534 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
5535 ; GFX7LESS-DPP-NEXT: .LBB8_1: ; %atomicrmw.start
5536 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5537 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
5538 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v1, v2, v0
5539 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
5540 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v2
5541 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v1
5542 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[36:39], 0 glc
5543 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
5544 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
5545 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
5546 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v3
5547 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
5548 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB8_1
5549 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
5550 ; GFX7LESS-DPP-NEXT: s_endpgm
5552 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5553 ; GFX9-DPP: ; %bb.0:
5554 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5555 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5556 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
5557 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
5558 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
5559 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
5560 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
5561 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
5562 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
5563 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
5564 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
5565 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5566 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5567 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5568 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
5569 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5570 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5571 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
5572 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
5573 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
5574 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
5575 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
5576 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
5577 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
5578 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
5579 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
5580 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0
5581 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1
5582 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5583 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5584 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
5585 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v0
5586 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
5587 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
5588 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
5589 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5590 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5591 ; GFX9-DPP-NEXT: s_nop 1
5592 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:1 row_mask:0xf bank_mask:0xf
5593 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
5594 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5595 ; GFX9-DPP-NEXT: s_nop 1
5596 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:2 row_mask:0xf bank_mask:0xf
5597 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
5598 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5599 ; GFX9-DPP-NEXT: s_nop 1
5600 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:4 row_mask:0xf bank_mask:0xf
5601 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
5602 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5603 ; GFX9-DPP-NEXT: s_nop 1
5604 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_shr:8 row_mask:0xf bank_mask:0xf
5605 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
5606 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5607 ; GFX9-DPP-NEXT: s_nop 1
5608 ; GFX9-DPP-NEXT: v_mov_b32_dpp v5, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
5609 ; GFX9-DPP-NEXT: v_add_f32_e32 v4, v4, v5
5610 ; GFX9-DPP-NEXT: s_nop 1
5611 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
5612 ; GFX9-DPP-NEXT: v_add_f32_e32 v3, v4, v3
5613 ; GFX9-DPP-NEXT: v_readlane_b32 s4, v3, 63
5614 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
5615 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
5616 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
5617 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
5618 ; GFX9-DPP-NEXT: ; %bb.1:
5619 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
5620 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, 0
5621 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
5622 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
5623 ; GFX9-DPP-NEXT: global_load_dword v1, v2, s[0:1]
5624 ; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
5625 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5626 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
5627 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, s4, v1
5628 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
5629 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
5630 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5631 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5632 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
5633 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
5634 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
5635 ; GFX9-DPP-NEXT: .LBB8_3:
5636 ; GFX9-DPP-NEXT: s_endpgm
5638 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5639 ; GFX1064-DPP: ; %bb.0:
5640 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5641 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5642 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
5643 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
5644 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
5645 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
5646 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
5647 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
5648 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
5649 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
5650 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
5651 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5652 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5653 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5654 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5655 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5656 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
5657 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
5658 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
5659 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
5660 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
5661 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
5662 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
5663 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
5664 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
5665 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
5666 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5667 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5668 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
5669 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v0
5670 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
5671 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
5672 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
5673 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5674 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
5675 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5676 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v4, v3
5677 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
5678 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
5679 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
5680 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5681 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
5682 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
5683 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
5684 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v5
5685 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v3
5686 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
5687 ; GFX1064-DPP-NEXT: v_add_f32_e32 v3, v3, v4
5688 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
5689 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v3, 32
5690 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
5691 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5692 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5693 ; GFX1064-DPP-NEXT: v_add_f32_e64 v3, s2, s3
5694 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
5695 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
5696 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v3
5697 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
5698 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
5699 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
5700 ; GFX1064-DPP-NEXT: ; %bb.1:
5701 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
5702 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
5703 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
5704 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
5705 ; GFX1064-DPP-NEXT: global_load_dword v1, v6, s[0:1]
5706 ; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
5707 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5708 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
5709 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
5710 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
5711 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
5712 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
5713 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
5714 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5715 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
5716 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
5717 ; GFX1064-DPP-NEXT: .LBB8_3:
5718 ; GFX1064-DPP-NEXT: s_endpgm
5720 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5721 ; GFX1032-DPP: ; %bb.0:
5722 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
5723 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
5724 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
5725 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
5726 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
5727 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
5728 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
5729 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
5730 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
5731 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
5732 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
5733 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5734 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5735 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
5736 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
5737 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
5738 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
5739 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
5740 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
5741 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
5742 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
5743 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
5744 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
5745 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
5746 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
5747 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
5748 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
5749 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5750 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
5751 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v0
5752 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
5753 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
5754 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
5755 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
5756 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
5757 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5758 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v4, v3
5759 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
5760 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:2 row_mask:0xf bank_mask:0xf
5761 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
5762 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v5, 1
5763 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v3 row_xmask:4 row_mask:0xf bank_mask:0xf
5764 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
5765 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v5, v3 row_xmask:8 row_mask:0xf bank_mask:0xf
5766 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v5
5767 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v3
5768 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
5769 ; GFX1032-DPP-NEXT: v_add_f32_e32 v3, v3, v4
5770 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
5771 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5772 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v3
5773 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
5774 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
5775 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
5776 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
5777 ; GFX1032-DPP-NEXT: ; %bb.1:
5778 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
5779 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
5780 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
5781 ; GFX1032-DPP-NEXT: global_load_dword v1, v6, s[0:1]
5782 ; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
5783 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5784 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
5785 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
5786 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v6, v[0:1], s[0:1] glc
5787 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
5788 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
5789 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
5790 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
5791 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
5792 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
5793 ; GFX1032-DPP-NEXT: .LBB8_3:
5794 ; GFX1032-DPP-NEXT: s_endpgm
5796 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5797 ; GFX1164-DPP: ; %bb.0:
5798 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
5799 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
5800 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
5801 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
5802 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
5803 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5804 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5805 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
5806 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
5807 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
5808 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
5809 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
5810 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
5811 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
5812 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
5813 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
5814 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5815 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v1, 1
5816 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
5817 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
5818 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
5819 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
5820 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
5821 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5822 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
5823 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
5824 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5825 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5826 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v2, v1
5827 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v2, 1
5828 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
5829 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5830 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
5831 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5832 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
5833 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5834 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
5835 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
5836 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5837 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v3
5838 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v1
5839 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5840 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
5841 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
5842 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5843 ; GFX1164-DPP-NEXT: v_permlane64_b32 v2, v1
5844 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
5845 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
5846 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
5847 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
5848 ; GFX1164-DPP-NEXT: v_add_f32_e32 v1, v1, v2
5849 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
5850 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
5851 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0
5852 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5853 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v1
5854 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
5855 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
5856 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
5857 ; GFX1164-DPP-NEXT: ; %bb.1:
5858 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
5859 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
5860 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
5861 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
5862 ; GFX1164-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
5863 ; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
5864 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5865 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
5866 ; GFX1164-DPP-NEXT: v_add_f32_e32 v4, v5, v0
5867 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
5868 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
5869 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v5
5870 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v4
5871 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
5872 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5873 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
5874 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
5875 ; GFX1164-DPP-NEXT: .LBB8_3:
5876 ; GFX1164-DPP-NEXT: s_endpgm
5878 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
5879 ; GFX1132-DPP: ; %bb.0:
5880 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
5881 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
5882 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
5883 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
5884 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
5885 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
5886 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
5887 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
5888 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
5889 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
5890 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
5891 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
5892 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
5893 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
5894 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
5895 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
5896 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
5897 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v1, 1
5898 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
5899 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v0
5900 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
5901 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
5902 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
5903 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
5904 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
5905 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v1, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
5906 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5907 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v2, v1
5908 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v2, 1
5909 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5910 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:2 row_mask:0xf bank_mask:0xf
5911 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
5912 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
5913 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
5914 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v1 row_xmask:4 row_mask:0xf bank_mask:0xf
5915 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
5916 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5917 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v1 row_xmask:8 row_mask:0xf bank_mask:0xf
5918 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v3
5919 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
5920 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, v1
5921 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v2, v2, -1, -1
5922 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
5923 ; GFX1132-DPP-NEXT: v_add_f32_e32 v1, v1, v2
5924 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
5925 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v4, exec_lo, 0
5926 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
5927 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v1
5928 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
5929 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
5930 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v4
5931 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
5932 ; GFX1132-DPP-NEXT: ; %bb.1:
5933 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
5934 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
5935 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
5936 ; GFX1132-DPP-NEXT: global_load_b32 v5, v6, s[0:1]
5937 ; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
5938 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
5939 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
5940 ; GFX1132-DPP-NEXT: v_add_f32_e32 v4, v5, v0
5941 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v4, v6, v[4:5], s[0:1] glc
5942 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
5943 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v4, v5
5944 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v4
5945 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
5946 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5947 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
5948 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
5949 ; GFX1132-DPP-NEXT: .LBB8_3:
5950 ; GFX1132-DPP-NEXT: s_endpgm
5951 %divValue = call float @div.float.value() strictfp
5952 %result = atomicrmw fadd ptr addrspace(1) %ptr, float %divValue monotonic, align 4
5956 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
5957 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
5958 ; GFX7LESS: ; %bb.0:
5959 ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
5960 ; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
5961 ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
5962 ; GFX7LESS-NEXT: s_mov_b32 s50, -1
5963 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
5964 ; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
5965 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
5966 ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
5967 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
5968 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
5969 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0
5970 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3
5971 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
5972 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
5973 ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
5974 ; GFX7LESS-NEXT: ; %bb.1:
5975 ; GFX7LESS-NEXT: s_mov_b32 s33, s8
5976 ; GFX7LESS-NEXT: s_mov_b32 s40, s7
5977 ; GFX7LESS-NEXT: s_mov_b32 s41, s6
5978 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
5979 ; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9
5980 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
5981 ; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0
5982 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v3, 20, v2
5983 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
5984 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
5985 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v4, 10, v1
5986 ; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[1:2], s2
5987 ; GFX7LESS-NEXT: v_or_b32_e32 v4, v0, v4
5988 ; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0
5989 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
5990 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
5991 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
5992 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v4, v3
5993 ; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
5994 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
5995 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
5996 ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
5997 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
5998 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
5999 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
6000 ; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
6001 ; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
6002 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
6003 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
6004 ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6005 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6006 ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6007 ; GFX7LESS-NEXT: s_waitcnt expcnt(2)
6008 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
6009 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
6010 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
6011 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
6012 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
6013 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
6014 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
6015 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
6016 ; GFX7LESS-NEXT: s_mov_b32 s12, s41
6017 ; GFX7LESS-NEXT: s_mov_b32 s13, s40
6018 ; GFX7LESS-NEXT: s_mov_b32 s14, s33
6019 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
6020 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
6021 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
6022 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
6023 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42
6024 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43
6025 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
6026 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
6027 ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
6028 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
6029 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
6030 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
6031 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6032 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45]
6033 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
6034 ; GFX7LESS-NEXT: .LBB9_3:
6035 ; GFX7LESS-NEXT: s_endpgm
6037 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6039 ; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6040 ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6041 ; GFX9-NEXT: s_mov_b32 s50, -1
6042 ; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
6043 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
6044 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000
6045 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0
6046 ; GFX9-NEXT: s_add_u32 s48, s48, s9
6047 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3
6048 ; GFX9-NEXT: s_addc_u32 s49, s49, 0
6049 ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
6050 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
6051 ; GFX9-NEXT: s_movk_i32 s32, 0x800
6052 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
6053 ; GFX9-NEXT: s_cbranch_execz .LBB9_3
6054 ; GFX9-NEXT: ; %bb.1:
6055 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
6056 ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24
6057 ; GFX9-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
6058 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
6059 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
6060 ; GFX9-NEXT: s_mov_b32 s33, s8
6061 ; GFX9-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
6062 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6063 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6064 ; GFX9-NEXT: s_mov_b32 s40, s7
6065 ; GFX9-NEXT: s_mov_b32 s41, s6
6066 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
6067 ; GFX9-NEXT: s_mov_b64 s[44:45], 0
6068 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6069 ; GFX9-NEXT: v_mov_b32_e32 v4, s1
6070 ; GFX9-NEXT: v_mov_b32_e32 v3, s0
6071 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
6072 ; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
6073 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
6074 ; GFX9-NEXT: s_waitcnt vmcnt(0)
6075 ; GFX9-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42]
6076 ; GFX9-NEXT: s_add_u32 s8, s36, 44
6077 ; GFX9-NEXT: s_addc_u32 s9, s37, 0
6078 ; GFX9-NEXT: s_getpc_b64 s[0:1]
6079 ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6080 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6081 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6082 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
6083 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4
6084 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0
6085 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
6086 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
6087 ; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
6088 ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
6089 ; GFX9-NEXT: s_mov_b32 s12, s41
6090 ; GFX9-NEXT: s_mov_b32 s13, s40
6091 ; GFX9-NEXT: s_mov_b32 s14, s33
6092 ; GFX9-NEXT: v_mov_b32_e32 v31, v40
6093 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
6094 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
6095 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
6096 ; GFX9-NEXT: v_mov_b32_e32 v2, s42
6097 ; GFX9-NEXT: v_mov_b32_e32 v3, s43
6098 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
6099 ; GFX9-NEXT: v_mov_b32_e32 v5, 8
6100 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
6101 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
6102 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6103 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
6104 ; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0
6105 ; GFX9-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4
6106 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
6107 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
6108 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6109 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
6110 ; GFX9-NEXT: s_cbranch_execnz .LBB9_2
6111 ; GFX9-NEXT: .LBB9_3:
6112 ; GFX9-NEXT: s_endpgm
6114 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6116 ; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6117 ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6118 ; GFX1064-NEXT: s_mov_b32 s50, -1
6119 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
6120 ; GFX1064-NEXT: s_add_u32 s48, s48, s9
6121 ; GFX1064-NEXT: s_mov_b32 s33, s8
6122 ; GFX1064-NEXT: s_mov_b64 s[8:9], exec
6123 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0
6124 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
6125 ; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
6126 ; GFX1064-NEXT: s_movk_i32 s32, 0x800
6127 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3
6128 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
6129 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
6130 ; GFX1064-NEXT: s_cbranch_execz .LBB9_3
6131 ; GFX1064-NEXT: ; %bb.1:
6132 ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
6133 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
6134 ; GFX1064-NEXT: s_mov_b32 s40, s7
6135 ; GFX1064-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
6136 ; GFX1064-NEXT: s_mov_b32 s41, s6
6137 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
6138 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3]
6139 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0
6140 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
6141 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6142 ; GFX1064-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
6143 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2
6144 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1
6145 ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3
6146 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
6147 ; GFX1064-NEXT: v_mov_b32_e32 v2, s1
6148 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0
6149 ; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
6150 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
6151 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
6152 ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6153 ; GFX1064-NEXT: s_add_u32 s8, s36, 44
6154 ; GFX1064-NEXT: s_addc_u32 s9, s37, 0
6155 ; GFX1064-NEXT: s_getpc_b64 s[0:1]
6156 ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6157 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6158 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
6159 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
6160 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6161 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40
6162 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8
6163 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
6164 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42
6165 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8
6166 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
6167 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0
6168 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
6169 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
6170 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
6171 ; GFX1064-NEXT: s_mov_b32 s12, s41
6172 ; GFX1064-NEXT: s_mov_b32 s13, s40
6173 ; GFX1064-NEXT: s_mov_b32 s14, s33
6174 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
6175 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
6176 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
6177 ; GFX1064-NEXT: v_mov_b32_e32 v3, s43
6178 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
6179 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
6180 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
6181 ; GFX1064-NEXT: s_clause 0x1
6182 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
6183 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
6184 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
6185 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
6186 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6187 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
6188 ; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
6189 ; GFX1064-NEXT: .LBB9_3:
6190 ; GFX1064-NEXT: s_endpgm
6192 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6194 ; GFX1032-NEXT: s_mov_b32 s33, s8
6195 ; GFX1032-NEXT: s_mov_b32 s8, exec_lo
6196 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6197 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
6198 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6199 ; GFX1032-NEXT: s_mov_b32 s50, -1
6200 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
6201 ; GFX1032-NEXT: s_add_u32 s48, s48, s9
6202 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
6203 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0
6204 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
6205 ; GFX1032-NEXT: s_mov_b32 s44, 0
6206 ; GFX1032-NEXT: s_movk_i32 s32, 0x400
6207 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
6208 ; GFX1032-NEXT: s_cbranch_execz .LBB9_3
6209 ; GFX1032-NEXT: ; %bb.1:
6210 ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
6211 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s8
6212 ; GFX1032-NEXT: s_mov_b32 s40, s7
6213 ; GFX1032-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
6214 ; GFX1032-NEXT: s_mov_b32 s41, s6
6215 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
6216 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3]
6217 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
6218 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6219 ; GFX1032-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
6220 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2
6221 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1
6222 ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3
6223 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
6224 ; GFX1032-NEXT: v_mov_b32_e32 v2, s1
6225 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0
6226 ; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
6227 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
6228 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
6229 ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6230 ; GFX1032-NEXT: s_add_u32 s8, s36, 44
6231 ; GFX1032-NEXT: s_addc_u32 s9, s37, 0
6232 ; GFX1032-NEXT: s_getpc_b64 s[0:1]
6233 ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6234 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6235 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
6236 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
6237 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6238 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40
6239 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8
6240 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
6241 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42
6242 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8
6243 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
6244 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0
6245 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
6246 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
6247 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
6248 ; GFX1032-NEXT: s_mov_b32 s12, s41
6249 ; GFX1032-NEXT: s_mov_b32 s13, s40
6250 ; GFX1032-NEXT: s_mov_b32 s14, s33
6251 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
6252 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
6253 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
6254 ; GFX1032-NEXT: v_mov_b32_e32 v3, s43
6255 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
6256 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
6257 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
6258 ; GFX1032-NEXT: s_clause 0x1
6259 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
6260 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
6261 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
6262 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
6263 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
6264 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
6265 ; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
6266 ; GFX1032-NEXT: .LBB9_3:
6267 ; GFX1032-NEXT: s_endpgm
6269 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6271 ; GFX1164-NEXT: s_mov_b32 s33, s8
6272 ; GFX1164-NEXT: s_mov_b64 s[8:9], exec
6273 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0
6274 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
6275 ; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
6276 ; GFX1164-NEXT: s_mov_b32 s32, 32
6277 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
6278 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6279 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
6280 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
6281 ; GFX1164-NEXT: s_cbranch_execz .LBB9_3
6282 ; GFX1164-NEXT: ; %bb.1:
6283 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
6284 ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
6285 ; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
6286 ; GFX1164-NEXT: s_mov_b32 s40, s7
6287 ; GFX1164-NEXT: s_mov_b32 s41, s6
6288 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
6289 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3]
6290 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0
6291 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
6292 ; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
6293 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
6294 ; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
6295 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
6296 ; GFX1164-NEXT: v_mov_b32_e32 v2, s1
6297 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
6298 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
6299 ; GFX1164-NEXT: .p2align 6
6300 ; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
6301 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
6302 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
6303 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
6304 ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6305 ; GFX1164-NEXT: s_add_u32 s8, s36, 44
6306 ; GFX1164-NEXT: s_addc_u32 s9, s37, 0
6307 ; GFX1164-NEXT: s_getpc_b64 s[0:1]
6308 ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6309 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6310 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40
6311 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
6312 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8
6313 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8
6314 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
6315 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0
6316 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
6317 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35]
6318 ; GFX1164-NEXT: s_mov_b32 s12, s41
6319 ; GFX1164-NEXT: s_mov_b32 s13, s40
6320 ; GFX1164-NEXT: s_mov_b32 s14, s33
6321 ; GFX1164-NEXT: s_clause 0x1
6322 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
6323 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
6324 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0
6325 ; GFX1164-NEXT: v_mov_b32_e32 v2, s42
6326 ; GFX1164-NEXT: v_mov_b32_e32 v3, s43
6327 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
6328 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
6329 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
6330 ; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
6331 ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
6332 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6333 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
6334 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6335 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
6336 ; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
6337 ; GFX1164-NEXT: .LBB9_3:
6338 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
6339 ; GFX1164-NEXT: s_endpgm
6341 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6343 ; GFX1132-NEXT: s_mov_b32 s6, exec_lo
6344 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0
6345 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
6346 ; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
6347 ; GFX1132-NEXT: s_mov_b32 s44, 0
6348 ; GFX1132-NEXT: s_mov_b32 s32, 32
6349 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
6350 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
6351 ; GFX1132-NEXT: s_cbranch_execz .LBB9_3
6352 ; GFX1132-NEXT: ; %bb.1:
6353 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s6
6354 ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
6355 ; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
6356 ; GFX1132-NEXT: s_mov_b32 s33, s15
6357 ; GFX1132-NEXT: s_mov_b32 s40, s14
6358 ; GFX1132-NEXT: s_mov_b32 s41, s13
6359 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
6360 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3]
6361 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
6362 ; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
6363 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
6364 ; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
6365 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
6366 ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
6367 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
6368 ; GFX1132-NEXT: .p2align 6
6369 ; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
6370 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
6371 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
6372 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
6373 ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6374 ; GFX1132-NEXT: s_add_u32 s8, s36, 44
6375 ; GFX1132-NEXT: s_addc_u32 s9, s37, 0
6376 ; GFX1132-NEXT: s_getpc_b64 s[0:1]
6377 ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6378 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6379 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
6380 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
6381 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
6382 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0
6383 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
6384 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35]
6385 ; GFX1132-NEXT: s_mov_b32 s12, s41
6386 ; GFX1132-NEXT: s_mov_b32 s13, s40
6387 ; GFX1132-NEXT: s_mov_b32 s14, s33
6388 ; GFX1132-NEXT: s_clause 0x1
6389 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
6390 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
6391 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
6392 ; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
6393 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
6394 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
6395 ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
6396 ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
6397 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6398 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
6399 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
6400 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
6401 ; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
6402 ; GFX1132-NEXT: .LBB9_3:
6403 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
6404 ; GFX1132-NEXT: s_endpgm
6406 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6407 ; GFX7LESS-DPP: ; %bb.0:
6408 ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800
6409 ; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6410 ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6411 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1
6412 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000
6413 ; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9
6414 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0
6415 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
6416 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
6417 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], exec
6418 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v3, s0, 0
6419 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v3, s1, v3
6420 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
6421 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
6422 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB9_3
6423 ; GFX7LESS-DPP-NEXT: ; %bb.1:
6424 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8
6425 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7
6426 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6
6427 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
6428 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x9
6429 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[0:1]
6430 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0
6431 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2
6432 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
6433 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6434 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1
6435 ; GFX7LESS-DPP-NEXT: v_cvt_f64_u32_e32 v[1:2], s2
6436 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v4, v0, v4
6437 ; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], v[1:2], 4.0
6438 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
6439 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0
6440 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1
6441 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v4, v3
6442 ; GFX7LESS-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
6443 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
6444 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
6445 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
6446 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
6447 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
6448 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
6449 ; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
6450 ; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
6451 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
6452 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
6453 ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6454 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6455 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6456 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2)
6457 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8
6458 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0
6459 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0
6460 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8
6461 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0
6462 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0
6463 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
6464 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
6465 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41
6466 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40
6467 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
6468 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
6469 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
6470 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
6471 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
6472 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42
6473 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43
6474 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
6475 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
6476 ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
6477 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
6478 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
6479 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
6480 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6481 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
6482 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB9_2
6483 ; GFX7LESS-DPP-NEXT: .LBB9_3:
6484 ; GFX7LESS-DPP-NEXT: s_endpgm
6486 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6487 ; GFX9-DPP: ; %bb.0:
6488 ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6489 ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6490 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1
6491 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
6492 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
6493 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
6494 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s0, 0
6495 ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
6496 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s1, v3
6497 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
6498 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
6499 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
6500 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
6501 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
6502 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
6503 ; GFX9-DPP-NEXT: ; %bb.1:
6504 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
6505 ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24
6506 ; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
6507 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
6508 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
6509 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8
6510 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
6511 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
6512 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6513 ; GFX9-DPP-NEXT: s_mov_b32 s40, s7
6514 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6
6515 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
6516 ; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
6517 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
6518 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, s1
6519 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s0
6520 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
6521 ; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
6522 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
6523 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
6524 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[3:4], v[41:42]
6525 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
6526 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
6527 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
6528 ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6529 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6530 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6531 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
6532 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:4
6533 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0
6534 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
6535 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
6536 ; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
6537 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
6538 ; GFX9-DPP-NEXT: s_mov_b32 s12, s41
6539 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40
6540 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33
6541 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
6542 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
6543 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
6544 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
6545 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
6546 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
6547 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
6548 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
6549 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
6550 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
6551 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
6552 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
6553 ; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0
6554 ; GFX9-DPP-NEXT: buffer_load_dword v4, off, s[48:51], 0 offset:4
6555 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
6556 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
6557 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6558 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
6559 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
6560 ; GFX9-DPP-NEXT: .LBB9_3:
6561 ; GFX9-DPP-NEXT: s_endpgm
6563 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6564 ; GFX1064-DPP: ; %bb.0:
6565 ; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6566 ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6567 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
6568 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
6569 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
6570 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
6571 ; GFX1064-DPP-NEXT: s_mov_b64 s[8:9], exec
6572 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
6573 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
6574 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
6575 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
6576 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, s9, v3
6577 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
6578 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
6579 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
6580 ; GFX1064-DPP-NEXT: ; %bb.1:
6581 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
6582 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
6583 ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
6584 ; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
6585 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
6586 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
6587 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
6588 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
6589 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
6590 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6591 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
6592 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2
6593 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1
6594 ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
6595 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
6596 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
6597 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
6598 ; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
6599 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
6600 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
6601 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6602 ; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44
6603 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0
6604 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
6605 ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6606 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6607 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
6608 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
6609 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6610 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
6611 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
6612 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
6613 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
6614 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
6615 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
6616 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
6617 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
6618 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
6619 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
6620 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
6621 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
6622 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
6623 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
6624 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
6625 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
6626 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
6627 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
6628 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
6629 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
6630 ; GFX1064-DPP-NEXT: s_clause 0x1
6631 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
6632 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
6633 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
6634 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
6635 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6636 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
6637 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
6638 ; GFX1064-DPP-NEXT: .LBB9_3:
6639 ; GFX1064-DPP-NEXT: s_endpgm
6641 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6642 ; GFX1032-DPP: ; %bb.0:
6643 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
6644 ; GFX1032-DPP-NEXT: s_mov_b32 s8, exec_lo
6645 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6646 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, s8, 0
6647 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6648 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
6649 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
6650 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
6651 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
6652 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
6653 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
6654 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
6655 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
6656 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
6657 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
6658 ; GFX1032-DPP-NEXT: ; %bb.1:
6659 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
6660 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s8
6661 ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
6662 ; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[3:4], s0
6663 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
6664 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
6665 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
6666 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
6667 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
6668 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[3:4], 4.0
6669 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2
6670 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1
6671 ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
6672 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
6673 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
6674 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
6675 ; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
6676 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
6677 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
6678 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6679 ; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44
6680 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0
6681 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
6682 ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6683 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6684 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
6685 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
6686 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6687 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
6688 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
6689 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
6690 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
6691 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
6692 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
6693 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
6694 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
6695 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
6696 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
6697 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
6698 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
6699 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
6700 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
6701 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
6702 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
6703 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
6704 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
6705 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
6706 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
6707 ; GFX1032-DPP-NEXT: s_clause 0x1
6708 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
6709 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
6710 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
6711 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
6712 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
6713 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
6714 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
6715 ; GFX1032-DPP-NEXT: .LBB9_3:
6716 ; GFX1032-DPP-NEXT: s_endpgm
6718 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6719 ; GFX1164-DPP: ; %bb.0:
6720 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
6721 ; GFX1164-DPP-NEXT: s_mov_b64 s[8:9], exec
6722 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
6723 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0
6724 ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
6725 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
6726 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
6727 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
6728 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s9, v0
6729 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
6730 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
6731 ; GFX1164-DPP-NEXT: ; %bb.1:
6732 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[8:9]
6733 ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
6734 ; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
6735 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
6736 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
6737 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
6738 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
6739 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
6740 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
6741 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
6742 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
6743 ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
6744 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
6745 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
6746 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
6747 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
6748 ; GFX1164-DPP-NEXT: .p2align 6
6749 ; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
6750 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
6751 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
6752 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
6753 ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6754 ; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44
6755 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0
6756 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
6757 ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6758 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6759 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
6760 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
6761 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
6762 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
6763 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
6764 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
6765 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
6766 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
6767 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
6768 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
6769 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
6770 ; GFX1164-DPP-NEXT: s_clause 0x1
6771 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
6772 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
6773 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
6774 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
6775 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
6776 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
6777 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
6778 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
6779 ; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
6780 ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
6781 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6782 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
6783 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
6784 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
6785 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
6786 ; GFX1164-DPP-NEXT: .LBB9_3:
6787 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
6788 ; GFX1164-DPP-NEXT: s_endpgm
6790 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
6791 ; GFX1132-DPP: ; %bb.0:
6792 ; GFX1132-DPP-NEXT: s_mov_b32 s6, exec_lo
6793 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
6794 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0
6795 ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
6796 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
6797 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
6798 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
6799 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
6800 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
6801 ; GFX1132-DPP-NEXT: ; %bb.1:
6802 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s6
6803 ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
6804 ; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
6805 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
6806 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
6807 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
6808 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
6809 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
6810 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
6811 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
6812 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
6813 ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
6814 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
6815 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
6816 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
6817 ; GFX1132-DPP-NEXT: .p2align 6
6818 ; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
6819 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
6820 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
6821 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
6822 ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
6823 ; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44
6824 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0
6825 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
6826 ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6827 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6828 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
6829 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
6830 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
6831 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
6832 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
6833 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
6834 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
6835 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
6836 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
6837 ; GFX1132-DPP-NEXT: s_clause 0x1
6838 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
6839 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
6840 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
6841 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
6842 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
6843 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
6844 ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
6845 ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
6846 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
6847 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
6848 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
6849 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
6850 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
6851 ; GFX1132-DPP-NEXT: .LBB9_3:
6852 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
6853 ; GFX1132-DPP-NEXT: s_endpgm
6854 %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
6858 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
6859 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
6860 ; GFX7LESS: ; %bb.0:
6861 ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
6862 ; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6863 ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6864 ; GFX7LESS-NEXT: s_mov_b32 s50, -1
6865 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
6866 ; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
6867 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
6868 ; GFX7LESS-NEXT: s_mov_b32 s33, s8
6869 ; GFX7LESS-NEXT: s_mov_b32 s40, s7
6870 ; GFX7LESS-NEXT: s_mov_b32 s41, s6
6871 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
6872 ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
6873 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
6874 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
6875 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
6876 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
6877 ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
6878 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
6879 ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6880 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
6881 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
6882 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
6883 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
6884 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
6885 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
6886 ; GFX7LESS-NEXT: s_mov_b32 s12, s41
6887 ; GFX7LESS-NEXT: s_mov_b32 s13, s40
6888 ; GFX7LESS-NEXT: s_mov_b32 s14, s33
6889 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
6890 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
6891 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
6892 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
6893 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
6894 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
6895 ; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0
6896 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1
6897 ; GFX7LESS-NEXT: .LBB10_1: ; %ComputeLoop
6898 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
6899 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
6900 ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
6901 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
6902 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
6903 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
6904 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
6905 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
6906 ; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
6907 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB10_1
6908 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
6909 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
6910 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
6911 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
6912 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
6913 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
6914 ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_5
6915 ; GFX7LESS-NEXT: ; %bb.3:
6916 ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9
6917 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
6918 ; GFX7LESS-NEXT: s_mov_b32 s46, -1
6919 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
6920 ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
6921 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
6922 ; GFX7LESS-NEXT: .LBB10_4: ; %atomicrmw.start
6923 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
6924 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
6925 ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
6926 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
6927 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
6928 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
6929 ; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
6930 ; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
6931 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
6932 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
6933 ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
6934 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
6935 ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6936 ; GFX7LESS-NEXT: s_waitcnt expcnt(2)
6937 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
6938 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
6939 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
6940 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
6941 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
6942 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
6943 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
6944 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
6945 ; GFX7LESS-NEXT: s_mov_b32 s12, s41
6946 ; GFX7LESS-NEXT: s_mov_b32 s13, s40
6947 ; GFX7LESS-NEXT: s_mov_b32 s14, s33
6948 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
6949 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
6950 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
6951 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
6952 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
6953 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
6954 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
6955 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
6956 ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
6957 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
6958 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
6959 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
6960 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
6961 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
6962 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_4
6963 ; GFX7LESS-NEXT: .LBB10_5:
6964 ; GFX7LESS-NEXT: s_endpgm
6966 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
6968 ; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
6969 ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
6970 ; GFX9-NEXT: s_mov_b32 s50, -1
6971 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000
6972 ; GFX9-NEXT: s_add_u32 s48, s48, s9
6973 ; GFX9-NEXT: s_addc_u32 s49, s49, 0
6974 ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
6975 ; GFX9-NEXT: s_mov_b32 s33, s8
6976 ; GFX9-NEXT: s_add_u32 s8, s36, 44
6977 ; GFX9-NEXT: s_addc_u32 s9, s37, 0
6978 ; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
6979 ; GFX9-NEXT: s_getpc_b64 s[0:1]
6980 ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
6981 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
6982 ; GFX9-NEXT: s_mov_b32 s40, s7
6983 ; GFX9-NEXT: s_mov_b32 s41, s6
6984 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
6985 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
6986 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
6987 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
6988 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
6989 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
6990 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
6991 ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
6992 ; GFX9-NEXT: s_mov_b32 s12, s41
6993 ; GFX9-NEXT: s_mov_b32 s13, s40
6994 ; GFX9-NEXT: s_mov_b32 s14, s33
6995 ; GFX9-NEXT: v_mov_b32_e32 v31, v40
6996 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
6997 ; GFX9-NEXT: s_movk_i32 s32, 0x800
6998 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
6999 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
7000 ; GFX9-NEXT: v_mov_b32_e32 v41, 0
7001 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
7002 ; GFX9-NEXT: v_bfrev_b32_e32 v42, 1
7003 ; GFX9-NEXT: .LBB10_1: ; %ComputeLoop
7004 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7005 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
7006 ; GFX9-NEXT: v_readlane_b32 s3, v1, s4
7007 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4
7008 ; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
7009 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
7010 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
7011 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
7012 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1
7013 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
7014 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7015 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7016 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
7017 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
7018 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
7019 ; GFX9-NEXT: s_cbranch_execz .LBB10_5
7020 ; GFX9-NEXT: ; %bb.3:
7021 ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24
7022 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
7023 ; GFX9-NEXT: s_mov_b64 s[44:45], 0
7024 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7025 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
7026 ; GFX9-NEXT: .LBB10_4: ; %atomicrmw.start
7027 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
7028 ; GFX9-NEXT: s_waitcnt vmcnt(0)
7029 ; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7030 ; GFX9-NEXT: s_add_u32 s8, s36, 44
7031 ; GFX9-NEXT: s_addc_u32 s9, s37, 0
7032 ; GFX9-NEXT: s_getpc_b64 s[0:1]
7033 ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7034 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7035 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7036 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
7037 ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
7038 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
7039 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
7040 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
7041 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
7042 ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
7043 ; GFX9-NEXT: s_mov_b32 s12, s41
7044 ; GFX9-NEXT: s_mov_b32 s13, s40
7045 ; GFX9-NEXT: s_mov_b32 s14, s33
7046 ; GFX9-NEXT: v_mov_b32_e32 v31, v40
7047 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
7048 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
7049 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
7050 ; GFX9-NEXT: v_mov_b32_e32 v2, s42
7051 ; GFX9-NEXT: v_mov_b32_e32 v3, s43
7052 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
7053 ; GFX9-NEXT: v_mov_b32_e32 v5, 8
7054 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
7055 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
7056 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
7057 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
7058 ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
7059 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
7060 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
7061 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
7062 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
7063 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
7064 ; GFX9-NEXT: s_cbranch_execnz .LBB10_4
7065 ; GFX9-NEXT: .LBB10_5:
7066 ; GFX9-NEXT: s_endpgm
7068 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7070 ; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
7071 ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
7072 ; GFX1064-NEXT: s_mov_b32 s50, -1
7073 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
7074 ; GFX1064-NEXT: s_add_u32 s48, s48, s9
7075 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
7076 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0
7077 ; GFX1064-NEXT: s_mov_b32 s33, s8
7078 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
7079 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
7080 ; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
7081 ; GFX1064-NEXT: s_getpc_b64 s[0:1]
7082 ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7083 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7084 ; GFX1064-NEXT: s_mov_b32 s40, s7
7085 ; GFX1064-NEXT: s_mov_b32 s41, s6
7086 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7087 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7088 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7089 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
7090 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
7091 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
7092 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
7093 ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
7094 ; GFX1064-NEXT: s_mov_b32 s12, s41
7095 ; GFX1064-NEXT: s_mov_b32 s13, s40
7096 ; GFX1064-NEXT: s_mov_b32 s14, s33
7097 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
7098 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40
7099 ; GFX1064-NEXT: s_movk_i32 s32, 0x800
7100 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
7101 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
7102 ; GFX1064-NEXT: v_mov_b32_e32 v41, 0
7103 ; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1
7104 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
7105 ; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop
7106 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
7107 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
7108 ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
7109 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
7110 ; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
7111 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
7112 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
7113 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
7114 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1
7115 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
7116 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7117 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7118 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
7119 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
7120 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
7121 ; GFX1064-NEXT: s_cbranch_execz .LBB10_5
7122 ; GFX1064-NEXT: ; %bb.3:
7123 ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
7124 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0
7125 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0
7126 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
7127 ; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
7128 ; GFX1064-NEXT: .LBB10_4: ; %atomicrmw.start
7129 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
7130 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
7131 ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7132 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
7133 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
7134 ; GFX1064-NEXT: s_getpc_b64 s[0:1]
7135 ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7136 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7137 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
7138 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
7139 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7140 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40
7141 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8
7142 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
7143 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42
7144 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8
7145 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
7146 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0
7147 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
7148 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
7149 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
7150 ; GFX1064-NEXT: s_mov_b32 s12, s41
7151 ; GFX1064-NEXT: s_mov_b32 s13, s40
7152 ; GFX1064-NEXT: s_mov_b32 s14, s33
7153 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
7154 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
7155 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
7156 ; GFX1064-NEXT: v_mov_b32_e32 v3, s43
7157 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
7158 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
7159 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
7160 ; GFX1064-NEXT: s_clause 0x1
7161 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
7162 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
7163 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
7164 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
7165 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
7166 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
7167 ; GFX1064-NEXT: s_cbranch_execnz .LBB10_4
7168 ; GFX1064-NEXT: .LBB10_5:
7169 ; GFX1064-NEXT: s_endpgm
7171 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7173 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
7174 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
7175 ; GFX1032-NEXT: s_mov_b32 s50, -1
7176 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
7177 ; GFX1032-NEXT: s_add_u32 s48, s48, s9
7178 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
7179 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0
7180 ; GFX1032-NEXT: s_mov_b32 s33, s8
7181 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
7182 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
7183 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
7184 ; GFX1032-NEXT: s_getpc_b64 s[0:1]
7185 ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7186 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7187 ; GFX1032-NEXT: s_mov_b32 s40, s7
7188 ; GFX1032-NEXT: s_mov_b32 s41, s6
7189 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7190 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7191 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7192 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
7193 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
7194 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
7195 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
7196 ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
7197 ; GFX1032-NEXT: s_mov_b32 s12, s41
7198 ; GFX1032-NEXT: s_mov_b32 s13, s40
7199 ; GFX1032-NEXT: s_mov_b32 s14, s33
7200 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
7201 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40
7202 ; GFX1032-NEXT: s_movk_i32 s32, 0x400
7203 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
7204 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
7205 ; GFX1032-NEXT: v_mov_b32_e32 v41, 0
7206 ; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1
7207 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
7208 ; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop
7209 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
7210 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
7211 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
7212 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
7213 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
7214 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
7215 ; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
7216 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
7217 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1
7218 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
7219 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7220 ; GFX1032-NEXT: s_mov_b32 s44, 0
7221 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
7222 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
7223 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
7224 ; GFX1032-NEXT: s_cbranch_execz .LBB10_5
7225 ; GFX1032-NEXT: ; %bb.3:
7226 ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
7227 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0
7228 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
7229 ; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
7230 ; GFX1032-NEXT: .LBB10_4: ; %atomicrmw.start
7231 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
7232 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
7233 ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7234 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
7235 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
7236 ; GFX1032-NEXT: s_getpc_b64 s[0:1]
7237 ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7238 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7239 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
7240 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
7241 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7242 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40
7243 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8
7244 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
7245 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42
7246 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8
7247 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
7248 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0
7249 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
7250 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
7251 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
7252 ; GFX1032-NEXT: s_mov_b32 s12, s41
7253 ; GFX1032-NEXT: s_mov_b32 s13, s40
7254 ; GFX1032-NEXT: s_mov_b32 s14, s33
7255 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
7256 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
7257 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
7258 ; GFX1032-NEXT: v_mov_b32_e32 v3, s43
7259 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
7260 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
7261 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
7262 ; GFX1032-NEXT: s_clause 0x1
7263 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
7264 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
7265 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
7266 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
7267 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
7268 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
7269 ; GFX1032-NEXT: s_cbranch_execnz .LBB10_4
7270 ; GFX1032-NEXT: .LBB10_5:
7271 ; GFX1032-NEXT: s_endpgm
7273 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7275 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
7276 ; GFX1164-NEXT: s_mov_b32 s33, s8
7277 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
7278 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
7279 ; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
7280 ; GFX1164-NEXT: s_getpc_b64 s[0:1]
7281 ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7282 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7283 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
7284 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
7285 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
7286 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
7287 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
7288 ; GFX1164-NEXT: s_mov_b32 s12, s6
7289 ; GFX1164-NEXT: s_mov_b32 s13, s7
7290 ; GFX1164-NEXT: s_mov_b32 s14, s33
7291 ; GFX1164-NEXT: s_mov_b32 s32, 32
7292 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0
7293 ; GFX1164-NEXT: s_mov_b32 s40, s7
7294 ; GFX1164-NEXT: s_mov_b32 s41, s6
7295 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
7296 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
7297 ; GFX1164-NEXT: v_mov_b32_e32 v41, 0
7298 ; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1
7299 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
7300 ; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop
7301 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
7302 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
7303 ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
7304 ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
7305 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
7306 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
7307 ; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
7308 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
7309 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
7310 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7311 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
7312 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1
7313 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
7314 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7315 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
7316 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7317 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7318 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
7319 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
7320 ; GFX1164-NEXT: s_cbranch_execz .LBB10_5
7321 ; GFX1164-NEXT: ; %bb.3:
7322 ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
7323 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0
7324 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0
7325 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
7326 ; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43]
7327 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
7328 ; GFX1164-NEXT: .p2align 6
7329 ; GFX1164-NEXT: .LBB10_4: ; %atomicrmw.start
7330 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
7331 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
7332 ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7333 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
7334 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
7335 ; GFX1164-NEXT: s_getpc_b64 s[0:1]
7336 ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7337 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7338 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40
7339 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
7340 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8
7341 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8
7342 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
7343 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0
7344 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
7345 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
7346 ; GFX1164-NEXT: s_mov_b32 s12, s41
7347 ; GFX1164-NEXT: s_mov_b32 s13, s40
7348 ; GFX1164-NEXT: s_mov_b32 s14, s33
7349 ; GFX1164-NEXT: s_clause 0x1
7350 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
7351 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
7352 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0
7353 ; GFX1164-NEXT: v_mov_b32_e32 v2, s42
7354 ; GFX1164-NEXT: v_mov_b32_e32 v3, s43
7355 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
7356 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
7357 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
7358 ; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
7359 ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
7360 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
7361 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
7362 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
7363 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
7364 ; GFX1164-NEXT: s_cbranch_execnz .LBB10_4
7365 ; GFX1164-NEXT: .LBB10_5:
7366 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
7367 ; GFX1164-NEXT: s_endpgm
7369 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7371 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
7372 ; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
7373 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
7374 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
7375 ; GFX1132-NEXT: s_getpc_b64 s[0:1]
7376 ; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7377 ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7378 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
7379 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
7380 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
7381 ; GFX1132-NEXT: s_mov_b32 s40, s14
7382 ; GFX1132-NEXT: s_mov_b32 s41, s13
7383 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
7384 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
7385 ; GFX1132-NEXT: s_mov_b32 s12, s13
7386 ; GFX1132-NEXT: s_mov_b32 s13, s14
7387 ; GFX1132-NEXT: s_mov_b32 s14, s15
7388 ; GFX1132-NEXT: s_mov_b32 s32, 32
7389 ; GFX1132-NEXT: s_mov_b32 s33, s15
7390 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0
7391 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
7392 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
7393 ; GFX1132-NEXT: v_mov_b32_e32 v41, 0
7394 ; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1
7395 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
7396 ; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop
7397 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
7398 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
7399 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
7400 ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
7401 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
7402 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
7403 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7404 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
7405 ; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
7406 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
7407 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1
7408 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
7409 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7410 ; GFX1132-NEXT: s_mov_b32 s44, 0
7411 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
7412 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
7413 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
7414 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
7415 ; GFX1132-NEXT: s_cbranch_execz .LBB10_5
7416 ; GFX1132-NEXT: ; %bb.3:
7417 ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
7418 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0
7419 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
7420 ; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43]
7421 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
7422 ; GFX1132-NEXT: .p2align 6
7423 ; GFX1132-NEXT: .LBB10_4: ; %atomicrmw.start
7424 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
7425 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
7426 ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7427 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
7428 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
7429 ; GFX1132-NEXT: s_getpc_b64 s[0:1]
7430 ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7431 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7432 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
7433 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
7434 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
7435 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0
7436 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
7437 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
7438 ; GFX1132-NEXT: s_mov_b32 s12, s41
7439 ; GFX1132-NEXT: s_mov_b32 s13, s40
7440 ; GFX1132-NEXT: s_mov_b32 s14, s33
7441 ; GFX1132-NEXT: s_clause 0x1
7442 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
7443 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
7444 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
7445 ; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
7446 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
7447 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
7448 ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
7449 ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
7450 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
7451 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
7452 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
7453 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
7454 ; GFX1132-NEXT: s_cbranch_execnz .LBB10_4
7455 ; GFX1132-NEXT: .LBB10_5:
7456 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
7457 ; GFX1132-NEXT: s_endpgm
7459 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7460 ; GFX7LESS-DPP: ; %bb.0:
7461 ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800
7462 ; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
7463 ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
7464 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1
7465 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000
7466 ; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9
7467 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0
7468 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8
7469 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7
7470 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6
7471 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
7472 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
7473 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
7474 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
7475 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000
7476 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1
7477 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
7478 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
7479 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
7480 ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7481 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7482 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7483 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7484 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7485 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
7486 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2
7487 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7488 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
7489 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41
7490 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40
7491 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
7492 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
7493 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7494 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7495 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
7496 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7497 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0
7498 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1
7499 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
7500 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0
7501 ; GFX7LESS-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
7502 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
7503 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
7504 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41]
7505 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
7506 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
7507 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
7508 ; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
7509 ; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
7510 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
7511 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
7512 ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7513 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7514 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7515 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2)
7516 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8
7517 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0
7518 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0
7519 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8
7520 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0
7521 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0
7522 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7523 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
7524 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41
7525 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40
7526 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
7527 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
7528 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7529 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7530 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
7531 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44
7532 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45
7533 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
7534 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7535 ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
7536 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
7537 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
7538 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
7539 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
7540 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43]
7541 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB10_1
7542 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
7543 ; GFX7LESS-DPP-NEXT: s_endpgm
7545 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7546 ; GFX9-DPP: ; %bb.0:
7547 ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
7548 ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
7549 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1
7550 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
7551 ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
7552 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
7553 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
7554 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8
7555 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
7556 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
7557 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
7558 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
7559 ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7560 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7561 ; GFX9-DPP-NEXT: s_mov_b32 s40, s7
7562 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6
7563 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7564 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7565 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7566 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
7567 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
7568 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7569 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7570 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
7571 ; GFX9-DPP-NEXT: s_mov_b32 s12, s41
7572 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40
7573 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33
7574 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
7575 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7576 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
7577 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
7578 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7579 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7580 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
7581 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
7582 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
7583 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
7584 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
7585 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
7586 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
7587 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
7588 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
7589 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7590 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
7591 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
7592 ; GFX9-DPP-NEXT: s_nop 0
7593 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
7594 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
7595 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7596 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
7597 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
7598 ; GFX9-DPP-NEXT: s_nop 0
7599 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
7600 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
7601 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7602 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
7603 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
7604 ; GFX9-DPP-NEXT: s_nop 0
7605 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
7606 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
7607 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7608 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
7609 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
7610 ; GFX9-DPP-NEXT: s_nop 0
7611 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
7612 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
7613 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7614 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
7615 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
7616 ; GFX9-DPP-NEXT: s_nop 0
7617 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
7618 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
7619 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7620 ; GFX9-DPP-NEXT: s_nop 1
7621 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
7622 ; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
7623 ; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
7624 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
7625 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7626 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7627 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7628 ; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63
7629 ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63
7630 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
7631 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
7632 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
7633 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
7634 ; GFX9-DPP-NEXT: ; %bb.1:
7635 ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24
7636 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
7637 ; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
7638 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
7639 ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
7640 ; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
7641 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
7642 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
7643 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43]
7644 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
7645 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
7646 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
7647 ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7648 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7649 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7650 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7651 ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
7652 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
7653 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7654 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
7655 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
7656 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
7657 ; GFX9-DPP-NEXT: s_mov_b32 s12, s41
7658 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40
7659 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33
7660 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
7661 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7662 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
7663 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
7664 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44
7665 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45
7666 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
7667 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
7668 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
7669 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
7670 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
7671 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7672 ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
7673 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
7674 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
7675 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
7676 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
7677 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47]
7678 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
7679 ; GFX9-DPP-NEXT: .LBB10_3:
7680 ; GFX9-DPP-NEXT: s_endpgm
7682 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7683 ; GFX1064-DPP: ; %bb.0:
7684 ; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
7685 ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
7686 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
7687 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
7688 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
7689 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
7690 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
7691 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
7692 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
7693 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
7694 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
7695 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
7696 ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7697 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7698 ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
7699 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
7700 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7701 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7702 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7703 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
7704 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7705 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7706 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
7707 ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
7708 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
7709 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
7710 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
7711 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7712 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
7713 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
7714 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
7715 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7716 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7717 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
7718 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
7719 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
7720 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
7721 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
7722 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
7723 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
7724 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
7725 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
7726 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7727 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
7728 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
7729 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
7730 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
7731 ; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7732 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
7733 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
7734 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
7735 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
7736 ; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7737 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
7738 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
7739 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
7740 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
7741 ; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7742 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
7743 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
7744 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
7745 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
7746 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
7747 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
7748 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
7749 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
7750 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0
7751 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0
7752 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32
7753 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32
7754 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5]
7755 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
7756 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7757 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8
7758 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9
7759 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
7760 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
7761 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
7762 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
7763 ; GFX1064-DPP-NEXT: ; %bb.1:
7764 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
7765 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
7766 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
7767 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
7768 ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
7769 ; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
7770 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
7771 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
7772 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7773 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
7774 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
7775 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
7776 ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7777 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7778 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
7779 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
7780 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7781 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
7782 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
7783 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
7784 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
7785 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
7786 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
7787 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
7788 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7789 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7790 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
7791 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
7792 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
7793 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
7794 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7795 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
7796 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
7797 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
7798 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
7799 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
7800 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7801 ; GFX1064-DPP-NEXT: s_clause 0x1
7802 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
7803 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
7804 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
7805 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
7806 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
7807 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
7808 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
7809 ; GFX1064-DPP-NEXT: .LBB10_3:
7810 ; GFX1064-DPP-NEXT: s_endpgm
7812 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7813 ; GFX1032-DPP: ; %bb.0:
7814 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
7815 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
7816 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
7817 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
7818 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
7819 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
7820 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
7821 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
7822 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
7823 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
7824 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
7825 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
7826 ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7827 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7828 ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
7829 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
7830 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7831 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
7832 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
7833 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
7834 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7835 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7836 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
7837 ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
7838 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
7839 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
7840 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
7841 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7842 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
7843 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
7844 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
7845 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7846 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
7847 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
7848 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
7849 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
7850 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
7851 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
7852 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
7853 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
7854 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
7855 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
7856 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
7857 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
7858 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
7859 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
7860 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
7861 ; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7862 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
7863 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
7864 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
7865 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
7866 ; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7867 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
7868 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
7869 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
7870 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
7871 ; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7872 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
7873 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
7874 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
7875 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
7876 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
7877 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
7878 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
7879 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
7880 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
7881 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
7882 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
7883 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
7884 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
7885 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
7886 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
7887 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
7888 ; GFX1032-DPP-NEXT: ; %bb.1:
7889 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
7890 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
7891 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
7892 ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
7893 ; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
7894 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
7895 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
7896 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
7897 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
7898 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
7899 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
7900 ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
7901 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
7902 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
7903 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
7904 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
7905 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
7906 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
7907 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
7908 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
7909 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
7910 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
7911 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
7912 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
7913 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7914 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
7915 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
7916 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
7917 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
7918 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
7919 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
7920 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
7921 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
7922 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
7923 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
7924 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
7925 ; GFX1032-DPP-NEXT: s_clause 0x1
7926 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
7927 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
7928 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
7929 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
7930 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
7931 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
7932 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
7933 ; GFX1032-DPP-NEXT: .LBB10_3:
7934 ; GFX1032-DPP-NEXT: s_endpgm
7936 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
7937 ; GFX1164-DPP: ; %bb.0:
7938 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
7939 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
7940 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
7941 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
7942 ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
7943 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
7944 ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
7945 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
7946 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
7947 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
7948 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
7949 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
7950 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
7951 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
7952 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
7953 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
7954 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
7955 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
7956 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
7957 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
7958 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
7959 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
7960 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7961 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
7962 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
7963 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
7964 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
7965 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
7966 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
7967 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
7968 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
7969 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
7970 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
7971 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
7972 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
7973 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
7974 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
7975 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
7976 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
7977 ; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7978 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
7979 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
7980 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
7981 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
7982 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
7983 ; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7984 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
7985 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
7986 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7987 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
7988 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
7989 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
7990 ; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
7991 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
7992 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
7993 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
7994 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
7995 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
7996 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
7997 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
7998 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
7999 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
8000 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
8001 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8002 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
8003 ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9
8004 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
8005 ; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8
8006 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
8007 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
8008 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
8009 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8010 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
8011 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
8012 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
8013 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
8014 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8015 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
8016 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
8017 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
8018 ; GFX1164-DPP-NEXT: ; %bb.1:
8019 ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
8020 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
8021 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
8022 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
8023 ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43]
8024 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
8025 ; GFX1164-DPP-NEXT: .p2align 6
8026 ; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
8027 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8028 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
8029 ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
8030 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
8031 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
8032 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
8033 ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
8034 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
8035 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
8036 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
8037 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
8038 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
8039 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
8040 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
8041 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
8042 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
8043 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
8044 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
8045 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
8046 ; GFX1164-DPP-NEXT: s_clause 0x1
8047 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
8048 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
8049 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
8050 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
8051 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
8052 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
8053 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
8054 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
8055 ; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
8056 ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
8057 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
8058 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
8059 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
8060 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
8061 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
8062 ; GFX1164-DPP-NEXT: .LBB10_3:
8063 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
8064 ; GFX1164-DPP-NEXT: s_endpgm
8066 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
8067 ; GFX1132-DPP: ; %bb.0:
8068 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
8069 ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
8070 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
8071 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
8072 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
8073 ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
8074 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
8075 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
8076 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
8077 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
8078 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
8079 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
8080 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
8081 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
8082 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
8083 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
8084 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
8085 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
8086 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
8087 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
8088 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
8089 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
8090 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
8091 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
8092 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
8093 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
8094 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
8095 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
8096 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
8097 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
8098 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
8099 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
8100 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
8101 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
8102 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
8103 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
8104 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
8105 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
8106 ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
8107 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
8108 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
8109 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
8110 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
8111 ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
8112 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
8113 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
8114 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
8115 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
8116 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8117 ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
8118 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
8119 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
8120 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
8121 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
8122 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
8123 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
8124 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
8125 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
8126 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
8127 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
8128 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
8129 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
8130 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
8131 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8132 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
8133 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
8134 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
8135 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
8136 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
8137 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
8138 ; GFX1132-DPP-NEXT: ; %bb.1:
8139 ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
8140 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
8141 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
8142 ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43]
8143 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
8144 ; GFX1132-DPP-NEXT: .p2align 6
8145 ; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
8146 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8147 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
8148 ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
8149 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
8150 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
8151 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
8152 ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
8153 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
8154 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
8155 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
8156 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
8157 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
8158 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
8159 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
8160 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
8161 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
8162 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
8163 ; GFX1132-DPP-NEXT: s_clause 0x1
8164 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
8165 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
8166 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
8167 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
8168 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
8169 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
8170 ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
8171 ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
8172 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
8173 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
8174 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
8175 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
8176 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
8177 ; GFX1132-DPP-NEXT: .LBB10_3:
8178 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
8179 ; GFX1132-DPP-NEXT: s_endpgm
8180 %divValue = call double @div.float.value()
8181 %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
8185 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
8186 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8187 ; GFX7LESS: ; %bb.0:
8188 ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8189 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8190 ; GFX7LESS-NEXT: s_mov_b32 s14, -1
8191 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
8192 ; GFX7LESS-NEXT: s_add_u32 s12, s12, s9
8193 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
8194 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
8195 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
8196 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
8197 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8198 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
8199 ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3
8200 ; GFX7LESS-NEXT: ; %bb.1:
8201 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
8202 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
8203 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
8204 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
8205 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
8206 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
8207 ; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
8208 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
8209 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
8210 ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
8211 ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8212 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
8213 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
8214 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
8215 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
8216 ; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
8217 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
8218 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8219 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
8220 ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
8221 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
8222 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
8223 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
8224 ; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
8225 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
8226 ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8227 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8228 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
8229 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
8230 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
8231 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
8232 ; GFX7LESS-NEXT: .LBB11_3:
8233 ; GFX7LESS-NEXT: s_endpgm
8235 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8237 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8238 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8239 ; GFX9-NEXT: s_mov_b32 s14, -1
8240 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
8241 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000
8242 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
8243 ; GFX9-NEXT: s_add_u32 s12, s12, s9
8244 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
8245 ; GFX9-NEXT: s_addc_u32 s13, s13, 0
8246 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8247 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
8248 ; GFX9-NEXT: s_cbranch_execz .LBB11_3
8249 ; GFX9-NEXT: ; %bb.1:
8250 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
8251 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
8252 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
8253 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000
8254 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
8255 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8256 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
8257 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
8258 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8259 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
8260 ; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8261 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8262 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
8263 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
8264 ; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
8265 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8266 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8267 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8269 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8270 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
8271 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8272 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
8273 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
8274 ; GFX9-NEXT: s_cbranch_execnz .LBB11_2
8275 ; GFX9-NEXT: .LBB11_3:
8276 ; GFX9-NEXT: s_endpgm
8278 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8280 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
8281 ; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8282 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
8283 ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8284 ; GFX1064-NEXT: s_mov_b32 s14, -1
8285 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
8286 ; GFX1064-NEXT: s_add_u32 s12, s12, s9
8287 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
8288 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0
8289 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8290 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
8291 ; GFX1064-NEXT: s_cbranch_execz .LBB11_3
8292 ; GFX1064-NEXT: ; %bb.1:
8293 ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1]
8294 ; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
8295 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8296 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
8297 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
8298 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
8299 ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
8300 ; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8301 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
8302 ; GFX1064-NEXT: v_mov_b32_e32 v2, s2
8303 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3
8304 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
8305 ; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
8306 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
8307 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8308 ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8309 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
8310 ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8311 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1
8312 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0
8313 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8314 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
8315 ; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
8316 ; GFX1064-NEXT: .LBB11_3:
8317 ; GFX1064-NEXT: s_endpgm
8319 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8321 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
8322 ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8323 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
8324 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8325 ; GFX1032-NEXT: s_mov_b32 s14, -1
8326 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
8327 ; GFX1032-NEXT: s_add_u32 s12, s12, s9
8328 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
8329 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0
8330 ; GFX1032-NEXT: s_mov_b32 s4, 0
8331 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
8332 ; GFX1032-NEXT: s_cbranch_execz .LBB11_3
8333 ; GFX1032-NEXT: ; %bb.1:
8334 ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0
8335 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
8336 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8337 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
8338 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
8339 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
8340 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
8341 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8342 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
8343 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2
8344 ; GFX1032-NEXT: v_mov_b32_e32 v3, s3
8345 ; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
8346 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
8347 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8348 ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8349 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
8350 ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
8351 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1
8352 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0
8353 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
8354 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
8355 ; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
8356 ; GFX1032-NEXT: .LBB11_3:
8357 ; GFX1032-NEXT: s_endpgm
8359 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8361 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
8362 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
8363 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
8364 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
8365 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
8366 ; GFX1164-NEXT: s_clause 0x1
8367 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
8368 ; GFX1164-NEXT: scratch_store_b32 off, v1, off
8369 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
8370 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
8371 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
8372 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
8373 ; GFX1164-NEXT: s_cbranch_execz .LBB11_3
8374 ; GFX1164-NEXT: ; %bb.1:
8375 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
8376 ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
8377 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
8378 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
8379 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
8380 ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
8381 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
8382 ; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8383 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
8384 ; GFX1164-NEXT: v_mov_b32_e32 v2, s2
8385 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3
8386 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
8387 ; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
8388 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
8389 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
8390 ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8391 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
8392 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
8393 ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8394 ; GFX1164-NEXT: v_mov_b32_e32 v3, v1
8395 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0
8396 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8397 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8398 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
8399 ; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
8400 ; GFX1164-NEXT: .LBB11_3:
8401 ; GFX1164-NEXT: s_endpgm
8403 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8405 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
8406 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8407 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
8408 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
8409 ; GFX1132-NEXT: s_mov_b32 s4, 0
8410 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
8411 ; GFX1132-NEXT: s_clause 0x1
8412 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
8413 ; GFX1132-NEXT: scratch_store_b32 off, v1, off
8414 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
8415 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
8416 ; GFX1132-NEXT: s_cbranch_execz .LBB11_3
8417 ; GFX1132-NEXT: ; %bb.1:
8418 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
8419 ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
8420 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
8421 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0
8422 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
8423 ; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
8424 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
8425 ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8426 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
8427 ; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
8428 ; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
8429 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
8430 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
8431 ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8432 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
8433 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
8434 ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
8435 ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
8436 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
8437 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8438 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
8439 ; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
8440 ; GFX1132-NEXT: .LBB11_3:
8441 ; GFX1132-NEXT: s_endpgm
8443 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8444 ; GFX7LESS-DPP: ; %bb.0:
8445 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8446 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8447 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
8448 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
8449 ; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9
8450 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
8451 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec
8452 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
8453 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
8454 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8455 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
8456 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB11_3
8457 ; GFX7LESS-DPP-NEXT: ; %bb.1:
8458 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
8459 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
8460 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
8461 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
8462 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
8463 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
8464 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
8465 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
8466 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
8467 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
8468 ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8469 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
8470 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
8471 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
8472 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
8473 ; GFX7LESS-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
8474 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8475 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8476 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
8477 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
8478 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
8479 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
8480 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
8481 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
8482 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
8483 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8484 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8485 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
8486 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
8487 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
8488 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB11_2
8489 ; GFX7LESS-DPP-NEXT: .LBB11_3:
8490 ; GFX7LESS-DPP-NEXT: s_endpgm
8492 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8493 ; GFX9-DPP: ; %bb.0:
8494 ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8495 ; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8496 ; GFX9-DPP-NEXT: s_mov_b32 s14, -1
8497 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
8498 ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000
8499 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
8500 ; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9
8501 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
8502 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0
8503 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8504 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
8505 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
8506 ; GFX9-DPP-NEXT: ; %bb.1:
8507 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
8508 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
8509 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
8510 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
8511 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
8512 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8513 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
8514 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
8515 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
8516 ; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
8517 ; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8518 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
8519 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
8520 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
8521 ; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
8522 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8523 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8524 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8525 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
8526 ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8527 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
8528 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8529 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
8530 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
8531 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
8532 ; GFX9-DPP-NEXT: .LBB11_3:
8533 ; GFX9-DPP-NEXT: s_endpgm
8535 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8536 ; GFX1064-DPP: ; %bb.0:
8537 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec
8538 ; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8539 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
8540 ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8541 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1
8542 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000
8543 ; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9
8544 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
8545 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0
8546 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8547 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
8548 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
8549 ; GFX1064-DPP-NEXT: ; %bb.1:
8550 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1]
8551 ; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
8552 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8553 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
8554 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
8555 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
8556 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
8557 ; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8558 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
8559 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
8560 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
8561 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
8562 ; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
8563 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8564 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8565 ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8566 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
8567 ; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8568 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
8569 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
8570 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8571 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
8572 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
8573 ; GFX1064-DPP-NEXT: .LBB11_3:
8574 ; GFX1064-DPP-NEXT: s_endpgm
8576 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8577 ; GFX1032-DPP: ; %bb.0:
8578 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
8579 ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
8580 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
8581 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
8582 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
8583 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
8584 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
8585 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
8586 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
8587 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
8588 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
8589 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
8590 ; GFX1032-DPP-NEXT: ; %bb.1:
8591 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0
8592 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
8593 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
8594 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
8595 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
8596 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
8597 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
8598 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8599 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
8600 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2
8601 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3
8602 ; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
8603 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8604 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8605 ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8606 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
8607 ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
8608 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
8609 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
8610 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
8611 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
8612 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
8613 ; GFX1032-DPP-NEXT: .LBB11_3:
8614 ; GFX1032-DPP-NEXT: s_endpgm
8616 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8617 ; GFX1164-DPP: ; %bb.0:
8618 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
8619 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
8620 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
8621 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
8622 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
8623 ; GFX1164-DPP-NEXT: s_clause 0x1
8624 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
8625 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
8626 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
8627 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
8628 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
8629 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
8630 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
8631 ; GFX1164-DPP-NEXT: ; %bb.1:
8632 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
8633 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
8634 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
8635 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
8636 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
8637 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
8638 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
8639 ; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8640 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
8641 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
8642 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
8643 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
8644 ; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
8645 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8646 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
8647 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8648 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
8649 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
8650 ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8651 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
8652 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
8653 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8654 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8655 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
8656 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
8657 ; GFX1164-DPP-NEXT: .LBB11_3:
8658 ; GFX1164-DPP-NEXT: s_endpgm
8660 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
8661 ; GFX1132-DPP: ; %bb.0:
8662 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
8663 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8664 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
8665 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
8666 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
8667 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
8668 ; GFX1132-DPP-NEXT: s_clause 0x1
8669 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
8670 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
8671 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
8672 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
8673 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
8674 ; GFX1132-DPP-NEXT: ; %bb.1:
8675 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
8676 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
8677 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
8678 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
8679 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
8680 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
8681 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
8682 ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
8683 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
8684 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
8685 ; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
8686 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
8687 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
8688 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8689 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
8690 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
8691 ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
8692 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
8693 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
8694 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8695 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
8696 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
8697 ; GFX1132-DPP-NEXT: .LBB11_3:
8698 ; GFX1132-DPP-NEXT: s_endpgm
8699 %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
8703 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
8704 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
8705 ; GFX7LESS: ; %bb.0:
8706 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
8707 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
8708 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
8709 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
8710 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
8711 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
8712 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
8713 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
8714 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
8715 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
8716 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
8717 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
8718 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
8719 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
8720 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
8721 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
8722 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
8723 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
8724 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
8725 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
8726 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
8727 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
8728 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
8729 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
8730 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
8731 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
8732 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
8733 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
8734 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
8735 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
8736 ; GFX7LESS-NEXT: .LBB12_1: ; %ComputeLoop
8737 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
8738 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
8739 ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
8740 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
8741 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
8742 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
8743 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
8744 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
8745 ; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
8746 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB12_1
8747 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
8748 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
8749 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
8750 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8751 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
8752 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
8753 ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_5
8754 ; GFX7LESS-NEXT: ; %bb.3:
8755 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
8756 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
8757 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
8758 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
8759 ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
8760 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
8761 ; GFX7LESS-NEXT: .LBB12_4: ; %atomicrmw.start
8762 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
8763 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
8764 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8765 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
8766 ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
8767 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
8768 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
8769 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
8770 ; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
8771 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
8772 ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
8773 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
8774 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
8775 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
8776 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
8777 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_4
8778 ; GFX7LESS-NEXT: .LBB12_5:
8779 ; GFX7LESS-NEXT: s_endpgm
8781 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
8783 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
8784 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
8785 ; GFX9-NEXT: s_mov_b32 s38, -1
8786 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
8787 ; GFX9-NEXT: s_add_u32 s36, s36, s9
8788 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
8789 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
8790 ; GFX9-NEXT: s_mov_b32 s14, s8
8791 ; GFX9-NEXT: s_add_u32 s8, s34, 44
8792 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
8793 ; GFX9-NEXT: s_getpc_b64 s[2:3]
8794 ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
8795 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
8796 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
8797 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
8798 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
8799 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
8800 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
8801 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
8802 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
8803 ; GFX9-NEXT: s_mov_b32 s12, s6
8804 ; GFX9-NEXT: s_mov_b32 s13, s7
8805 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
8806 ; GFX9-NEXT: s_mov_b32 s32, 0
8807 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8808 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
8809 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
8810 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
8811 ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
8812 ; GFX9-NEXT: .LBB12_1: ; %ComputeLoop
8813 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8814 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
8815 ; GFX9-NEXT: v_readlane_b32 s3, v1, s4
8816 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4
8817 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
8818 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
8819 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
8820 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
8821 ; GFX9-NEXT: s_cbranch_scc1 .LBB12_1
8822 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
8823 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8824 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8825 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8826 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
8827 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
8828 ; GFX9-NEXT: s_cbranch_execz .LBB12_5
8829 ; GFX9-NEXT: ; %bb.3:
8830 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
8831 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
8832 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
8833 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
8834 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
8835 ; GFX9-NEXT: .LBB12_4: ; %atomicrmw.start
8836 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
8837 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8838 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8839 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8840 ; GFX9-NEXT: s_waitcnt vmcnt(0)
8841 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8842 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
8843 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8844 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
8845 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
8846 ; GFX9-NEXT: s_cbranch_execnz .LBB12_4
8847 ; GFX9-NEXT: .LBB12_5:
8848 ; GFX9-NEXT: s_endpgm
8850 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
8852 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
8853 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
8854 ; GFX1064-NEXT: s_mov_b32 s38, -1
8855 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
8856 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
8857 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
8858 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
8859 ; GFX1064-NEXT: s_mov_b32 s14, s8
8860 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
8861 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
8862 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
8863 ; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
8864 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
8865 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
8866 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
8867 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
8868 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
8869 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
8870 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
8871 ; GFX1064-NEXT: s_mov_b32 s12, s6
8872 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
8873 ; GFX1064-NEXT: s_mov_b32 s13, s7
8874 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
8875 ; GFX1064-NEXT: s_mov_b32 s32, 0
8876 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
8877 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
8878 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
8879 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
8880 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
8881 ; GFX1064-NEXT: .LBB12_1: ; %ComputeLoop
8882 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
8883 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
8884 ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
8885 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
8886 ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
8887 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
8888 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
8889 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
8890 ; GFX1064-NEXT: s_cbranch_scc1 .LBB12_1
8891 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
8892 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8893 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
8894 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
8895 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
8896 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
8897 ; GFX1064-NEXT: s_cbranch_execz .LBB12_5
8898 ; GFX1064-NEXT: ; %bb.3:
8899 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
8900 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
8901 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
8902 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
8903 ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
8904 ; GFX1064-NEXT: .LBB12_4: ; %atomicrmw.start
8905 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
8906 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
8907 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8908 ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8909 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
8910 ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
8911 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1
8912 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0
8913 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
8914 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
8915 ; GFX1064-NEXT: s_cbranch_execnz .LBB12_4
8916 ; GFX1064-NEXT: .LBB12_5:
8917 ; GFX1064-NEXT: s_endpgm
8919 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
8921 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
8922 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
8923 ; GFX1032-NEXT: s_mov_b32 s38, -1
8924 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
8925 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
8926 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
8927 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
8928 ; GFX1032-NEXT: s_mov_b32 s14, s8
8929 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
8930 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
8931 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
8932 ; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
8933 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
8934 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
8935 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
8936 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
8937 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
8938 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
8939 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
8940 ; GFX1032-NEXT: s_mov_b32 s12, s6
8941 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
8942 ; GFX1032-NEXT: s_mov_b32 s13, s7
8943 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
8944 ; GFX1032-NEXT: s_mov_b32 s32, 0
8945 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
8946 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
8947 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
8948 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
8949 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
8950 ; GFX1032-NEXT: .LBB12_1: ; %ComputeLoop
8951 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
8952 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
8953 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
8954 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
8955 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
8956 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
8957 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
8958 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
8959 ; GFX1032-NEXT: s_cbranch_scc1 .LBB12_1
8960 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
8961 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
8962 ; GFX1032-NEXT: s_mov_b32 s2, 0
8963 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
8964 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
8965 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
8966 ; GFX1032-NEXT: s_cbranch_execz .LBB12_5
8967 ; GFX1032-NEXT: ; %bb.3:
8968 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
8969 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
8970 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
8971 ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
8972 ; GFX1032-NEXT: .LBB12_4: ; %atomicrmw.start
8973 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
8974 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
8975 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
8976 ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
8977 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
8978 ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
8979 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1
8980 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0
8981 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
8982 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
8983 ; GFX1032-NEXT: s_cbranch_execnz .LBB12_4
8984 ; GFX1032-NEXT: .LBB12_5:
8985 ; GFX1032-NEXT: s_endpgm
8987 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
8989 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
8990 ; GFX1164-NEXT: s_mov_b32 s14, s8
8991 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
8992 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
8993 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
8994 ; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
8995 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
8996 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
8997 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
8998 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
8999 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
9000 ; GFX1164-NEXT: s_mov_b32 s12, s6
9001 ; GFX1164-NEXT: s_mov_b32 s13, s7
9002 ; GFX1164-NEXT: s_mov_b32 s32, 0
9003 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
9004 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
9005 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
9006 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
9007 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
9008 ; GFX1164-NEXT: .LBB12_1: ; %ComputeLoop
9009 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
9010 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
9011 ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
9012 ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
9013 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
9014 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
9015 ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
9016 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
9017 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
9018 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9019 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
9020 ; GFX1164-NEXT: s_cbranch_scc1 .LBB12_1
9021 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
9022 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9023 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
9024 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9025 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9026 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
9027 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
9028 ; GFX1164-NEXT: s_cbranch_execz .LBB12_5
9029 ; GFX1164-NEXT: ; %bb.3:
9030 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
9031 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
9032 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
9033 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
9034 ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
9035 ; GFX1164-NEXT: .LBB12_4: ; %atomicrmw.start
9036 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
9037 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
9038 ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9039 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
9040 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
9041 ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
9042 ; GFX1164-NEXT: v_mov_b32_e32 v3, v1
9043 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0
9044 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9045 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9046 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
9047 ; GFX1164-NEXT: s_cbranch_execnz .LBB12_4
9048 ; GFX1164-NEXT: .LBB12_5:
9049 ; GFX1164-NEXT: s_endpgm
9051 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9053 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
9054 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
9055 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
9056 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
9057 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
9058 ; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9059 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9060 ; GFX1132-NEXT: s_mov_b32 s12, s13
9061 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
9062 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
9063 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
9064 ; GFX1132-NEXT: s_mov_b32 s13, s14
9065 ; GFX1132-NEXT: s_mov_b32 s14, s15
9066 ; GFX1132-NEXT: s_mov_b32 s32, 0
9067 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
9068 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
9069 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0
9070 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
9071 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
9072 ; GFX1132-NEXT: .LBB12_1: ; %ComputeLoop
9073 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
9074 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
9075 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
9076 ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
9077 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
9078 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
9079 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9080 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
9081 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
9082 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
9083 ; GFX1132-NEXT: s_cbranch_scc1 .LBB12_1
9084 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
9085 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9086 ; GFX1132-NEXT: s_mov_b32 s2, 0
9087 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
9088 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
9089 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
9090 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
9091 ; GFX1132-NEXT: s_cbranch_execz .LBB12_5
9092 ; GFX1132-NEXT: ; %bb.3:
9093 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
9094 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0
9095 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
9096 ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
9097 ; GFX1132-NEXT: .LBB12_4: ; %atomicrmw.start
9098 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
9099 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
9100 ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9101 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
9102 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
9103 ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
9104 ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
9105 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
9106 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9107 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
9108 ; GFX1132-NEXT: s_cbranch_execnz .LBB12_4
9109 ; GFX1132-NEXT: .LBB12_5:
9110 ; GFX1132-NEXT: s_endpgm
9112 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9113 ; GFX7LESS-DPP: ; %bb.0:
9114 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
9115 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
9116 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
9117 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
9118 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
9119 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
9120 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
9121 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
9122 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
9123 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
9124 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
9125 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
9126 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
9127 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
9128 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
9129 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9130 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9131 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
9132 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
9133 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
9134 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
9135 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
9136 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
9137 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
9138 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
9139 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
9140 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
9141 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
9142 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
9143 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
9144 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
9145 ; GFX7LESS-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
9146 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9147 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
9148 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
9149 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
9150 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
9151 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
9152 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
9153 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
9154 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
9155 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
9156 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
9157 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
9158 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
9159 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
9160 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
9161 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB12_1
9162 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
9163 ; GFX7LESS-DPP-NEXT: s_endpgm
9165 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9166 ; GFX9-DPP: ; %bb.0:
9167 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
9168 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
9169 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
9170 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
9171 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
9172 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
9173 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
9174 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
9175 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
9176 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
9177 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
9178 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9179 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9180 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
9181 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
9182 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
9183 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
9184 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
9185 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
9186 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
9187 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
9188 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
9189 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
9190 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
9191 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
9192 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
9193 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9194 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
9195 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
9196 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
9197 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
9198 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
9199 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
9200 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
9201 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
9202 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
9203 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9204 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
9205 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
9206 ; GFX9-DPP-NEXT: s_nop 0
9207 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
9208 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
9209 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9210 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
9211 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
9212 ; GFX9-DPP-NEXT: s_nop 0
9213 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
9214 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
9215 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9216 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
9217 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
9218 ; GFX9-DPP-NEXT: s_nop 0
9219 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
9220 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
9221 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9222 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
9223 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
9224 ; GFX9-DPP-NEXT: s_nop 0
9225 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
9226 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
9227 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9228 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
9229 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
9230 ; GFX9-DPP-NEXT: s_nop 0
9231 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
9232 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
9233 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9234 ; GFX9-DPP-NEXT: s_nop 1
9235 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
9236 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
9237 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
9238 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
9239 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9240 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
9241 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9242 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
9243 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
9244 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
9245 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
9246 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9247 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
9248 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB12_3
9249 ; GFX9-DPP-NEXT: ; %bb.1:
9250 ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
9251 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
9252 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
9253 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
9254 ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
9255 ; GFX9-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
9256 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9257 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
9258 ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
9259 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
9260 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
9261 ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
9262 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
9263 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
9264 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
9265 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
9266 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_2
9267 ; GFX9-DPP-NEXT: .LBB12_3:
9268 ; GFX9-DPP-NEXT: s_endpgm
9270 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9271 ; GFX1064-DPP: ; %bb.0:
9272 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
9273 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
9274 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
9275 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
9276 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
9277 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
9278 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
9279 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
9280 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
9281 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
9282 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
9283 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9284 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9285 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
9286 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
9287 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
9288 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
9289 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
9290 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
9291 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
9292 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
9293 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
9294 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
9295 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
9296 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
9297 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
9298 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9299 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
9300 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
9301 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
9302 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
9303 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
9304 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
9305 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
9306 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
9307 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
9308 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9309 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
9310 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
9311 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
9312 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
9313 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9314 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
9315 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
9316 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
9317 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
9318 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9319 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
9320 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
9321 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
9322 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
9323 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9324 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
9325 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
9326 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
9327 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
9328 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
9329 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
9330 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
9331 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
9332 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
9333 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
9334 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
9335 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
9336 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
9337 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
9338 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9339 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
9340 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
9341 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
9342 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
9343 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
9344 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB12_3
9345 ; GFX1064-DPP-NEXT: ; %bb.1:
9346 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
9347 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
9348 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
9349 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
9350 ; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
9351 ; GFX1064-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
9352 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9353 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
9354 ; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
9355 ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
9356 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
9357 ; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
9358 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
9359 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
9360 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9361 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
9362 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_2
9363 ; GFX1064-DPP-NEXT: .LBB12_3:
9364 ; GFX1064-DPP-NEXT: s_endpgm
9366 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9367 ; GFX1032-DPP: ; %bb.0:
9368 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
9369 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
9370 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
9371 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
9372 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
9373 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
9374 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
9375 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
9376 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
9377 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
9378 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
9379 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9380 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9381 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
9382 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
9383 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
9384 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
9385 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
9386 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
9387 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
9388 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
9389 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
9390 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
9391 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
9392 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
9393 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
9394 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
9395 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
9396 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
9397 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
9398 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
9399 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
9400 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
9401 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
9402 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
9403 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
9404 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
9405 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
9406 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
9407 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
9408 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
9409 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9410 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
9411 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
9412 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
9413 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
9414 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9415 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
9416 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
9417 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
9418 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
9419 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
9420 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
9421 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
9422 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
9423 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
9424 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
9425 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
9426 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
9427 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
9428 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
9429 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
9430 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
9431 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
9432 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
9433 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
9434 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
9435 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB12_3
9436 ; GFX1032-DPP-NEXT: ; %bb.1:
9437 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
9438 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
9439 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
9440 ; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
9441 ; GFX1032-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
9442 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9443 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
9444 ; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
9445 ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
9446 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
9447 ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
9448 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
9449 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
9450 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
9451 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
9452 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_2
9453 ; GFX1032-DPP-NEXT: .LBB12_3:
9454 ; GFX1032-DPP-NEXT: s_endpgm
9456 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9457 ; GFX1164-DPP: ; %bb.0:
9458 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
9459 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
9460 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
9461 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
9462 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
9463 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9464 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9465 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
9466 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
9467 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
9468 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
9469 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
9470 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
9471 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
9472 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
9473 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
9474 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9475 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
9476 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
9477 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
9478 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
9479 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
9480 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
9481 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
9482 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
9483 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
9484 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
9485 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
9486 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
9487 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
9488 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
9489 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
9490 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
9491 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
9492 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
9493 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
9494 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
9495 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9496 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
9497 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
9498 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
9499 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
9500 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9501 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
9502 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
9503 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9504 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
9505 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
9506 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9507 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
9508 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
9509 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
9510 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
9511 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
9512 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
9513 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
9514 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
9515 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9516 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
9517 ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
9518 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9519 ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
9520 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
9521 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
9522 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
9523 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
9524 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
9525 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
9526 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
9527 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
9528 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
9529 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
9530 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
9531 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB12_3
9532 ; GFX1164-DPP-NEXT: ; %bb.1:
9533 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
9534 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
9535 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
9536 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
9537 ; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
9538 ; GFX1164-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
9539 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9540 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
9541 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
9542 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
9543 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
9544 ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
9545 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
9546 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
9547 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9548 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9549 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
9550 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_2
9551 ; GFX1164-DPP-NEXT: .LBB12_3:
9552 ; GFX1164-DPP-NEXT: s_endpgm
9554 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
9555 ; GFX1132-DPP: ; %bb.0:
9556 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
9557 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
9558 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
9559 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
9560 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
9561 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
9562 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
9563 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
9564 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
9565 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
9566 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
9567 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
9568 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
9569 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
9570 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
9571 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
9572 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
9573 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
9574 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
9575 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
9576 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
9577 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
9578 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
9579 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
9580 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
9581 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
9582 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
9583 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
9584 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
9585 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
9586 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
9587 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
9588 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
9589 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
9590 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
9591 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9592 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
9593 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
9594 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
9595 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
9596 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
9597 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
9598 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9599 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
9600 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
9601 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9602 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
9603 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
9604 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9605 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
9606 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
9607 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
9608 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
9609 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
9610 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
9611 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
9612 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
9613 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
9614 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
9615 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
9616 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
9617 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
9618 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
9619 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB12_3
9620 ; GFX1132-DPP-NEXT: ; %bb.1:
9621 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
9622 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
9623 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
9624 ; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
9625 ; GFX1132-DPP-NEXT: .LBB12_2: ; %atomicrmw.start
9626 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9627 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
9628 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
9629 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
9630 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
9631 ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
9632 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
9633 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
9634 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9635 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
9636 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_2
9637 ; GFX1132-DPP-NEXT: .LBB12_3:
9638 ; GFX1132-DPP-NEXT: s_endpgm
9639 %divValue = call double @div.double.value() strictfp
9640 %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
; Test kernel: one `atomicrmw fadd` of the constant double 4.0 to the global
; (addrspace(1)) kernel argument %ptr, syncscope("agent"), monotonic ordering.
; Attribute group #2 is defined elsewhere in the file (the kernel name suggests
; strictfp; confirm against the attribute definition).
;
; What the autogenerated expectations below show for every run line:
;   * Only the lane whose mbcnt of the exec mask equals 0 enters the atomic
;     path; all other lanes branch straight to .LBB13_3 (s_endpgm).
;   * That lane scales the addend by the active-lane count: s_bcnt1 of the exec
;     mask supplies the low dword of a double whose high dword is 0x43300000
;     (2^52 + count); adding the double with high dword 0xc3300000 (-2^52)
;     leaves the count as a double, which is multiplied by 4.0.
;     On the gfx1100 run lines the two dwords go through a scratch store/load
;     pair before the add.
;   * The combined value is applied with a compare-and-swap retry loop
;     (.LBB13_2) that repeats until the returned value equals the expected one.
;   * For this uniform-value case the Iterative and DPP strategy run lines
;     expect the same instruction sequence per target.
;
; Do not hand-edit the check lines; regenerate them with
; utils/update_llc_test_checks.py.
9644 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
9645 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9646 ; GFX7LESS: ; %bb.0:
9647 ; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9648 ; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
9649 ; GFX7LESS-NEXT: s_mov_b32 s14, -1
9650 ; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
9651 ; GFX7LESS-NEXT: s_add_u32 s12, s12, s9
9652 ; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
9653 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
9654 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
9655 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
9656 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9657 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
9658 ; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3
9659 ; GFX7LESS-NEXT: ; %bb.1:
9660 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
9661 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
9662 ; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
9663 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
9664 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
9665 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
9666 ; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
9667 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
9668 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
9669 ; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
9670 ; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9671 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
9672 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
9673 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
9674 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
9675 ; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
9676 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
9677 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9678 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
9679 ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
9680 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
9681 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
9682 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
9683 ; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
9684 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
9685 ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
9686 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
9687 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
9688 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
9689 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
9690 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
9691 ; GFX7LESS-NEXT: .LBB13_3:
9692 ; GFX7LESS-NEXT: s_endpgm
9694 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9696 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9697 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
9698 ; GFX9-NEXT: s_mov_b32 s14, -1
9699 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
9700 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000
9701 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
9702 ; GFX9-NEXT: s_add_u32 s12, s12, s9
9703 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
9704 ; GFX9-NEXT: s_addc_u32 s13, s13, 0
9705 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9706 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
9707 ; GFX9-NEXT: s_cbranch_execz .LBB13_3
9708 ; GFX9-NEXT: ; %bb.1:
9709 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
9710 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
9711 ; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
9712 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000
9713 ; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
9714 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
9715 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
9716 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
9717 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
9718 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
9719 ; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9720 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
9721 ; GFX9-NEXT: v_mov_b32_e32 v2, s4
9722 ; GFX9-NEXT: v_mov_b32_e32 v3, s5
9723 ; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
9724 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
9725 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9726 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
9727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
9728 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
9729 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
9730 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9731 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
9732 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
9733 ; GFX9-NEXT: s_cbranch_execnz .LBB13_2
9734 ; GFX9-NEXT: .LBB13_3:
9735 ; GFX9-NEXT: s_endpgm
9737 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9739 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
9740 ; GFX1064-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9741 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
9742 ; GFX1064-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
9743 ; GFX1064-NEXT: s_mov_b32 s14, -1
9744 ; GFX1064-NEXT: s_mov_b32 s15, 0x31e16000
9745 ; GFX1064-NEXT: s_add_u32 s12, s12, s9
9746 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
9747 ; GFX1064-NEXT: s_addc_u32 s13, s13, 0
9748 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9749 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
9750 ; GFX1064-NEXT: s_cbranch_execz .LBB13_3
9751 ; GFX1064-NEXT: ; %bb.1:
9752 ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[0:1]
9753 ; GFX1064-NEXT: s_mov_b32 s5, 0x43300000
9754 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
9755 ; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
9756 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
9757 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
9758 ; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
9759 ; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9760 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
9761 ; GFX1064-NEXT: v_mov_b32_e32 v2, s2
9762 ; GFX1064-NEXT: v_mov_b32_e32 v3, s3
9763 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
9764 ; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
9765 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
9766 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9767 ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
9768 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
9769 ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
9770 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1
9771 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0
9772 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9773 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
9774 ; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
9775 ; GFX1064-NEXT: .LBB13_3:
9776 ; GFX1064-NEXT: s_endpgm
9778 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9780 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
9781 ; GFX1032-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9782 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
9783 ; GFX1032-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
9784 ; GFX1032-NEXT: s_mov_b32 s14, -1
9785 ; GFX1032-NEXT: s_mov_b32 s15, 0x31c16000
9786 ; GFX1032-NEXT: s_add_u32 s12, s12, s9
9787 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
9788 ; GFX1032-NEXT: s_addc_u32 s13, s13, 0
9789 ; GFX1032-NEXT: s_mov_b32 s4, 0
9790 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo
9791 ; GFX1032-NEXT: s_cbranch_execz .LBB13_3
9792 ; GFX1032-NEXT: ; %bb.1:
9793 ; GFX1032-NEXT: s_bcnt1_i32_b32 s6, s0
9794 ; GFX1032-NEXT: s_mov_b32 s7, 0x43300000
9795 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
9796 ; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
9797 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
9798 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
9799 ; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
9800 ; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9801 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
9802 ; GFX1032-NEXT: v_mov_b32_e32 v2, s2
9803 ; GFX1032-NEXT: v_mov_b32_e32 v3, s3
9804 ; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
9805 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
9806 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9807 ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
9808 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
9809 ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
9810 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1
9811 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0
9812 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
9813 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
9814 ; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
9815 ; GFX1032-NEXT: .LBB13_3:
9816 ; GFX1032-NEXT: s_endpgm
9818 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9820 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
9821 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
9822 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
9823 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
9824 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
9825 ; GFX1164-NEXT: s_clause 0x1
9826 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
9827 ; GFX1164-NEXT: scratch_store_b32 off, v1, off
9828 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
9829 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
9830 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
9831 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
9832 ; GFX1164-NEXT: s_cbranch_execz .LBB13_3
9833 ; GFX1164-NEXT: ; %bb.1:
9834 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
9835 ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
9836 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
9837 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
9838 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
9839 ; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
9840 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
9841 ; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9842 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
9843 ; GFX1164-NEXT: v_mov_b32_e32 v2, s2
9844 ; GFX1164-NEXT: v_mov_b32_e32 v3, s3
9845 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
9846 ; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
9847 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
9848 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
9849 ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9850 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
9851 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
9852 ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
9853 ; GFX1164-NEXT: v_mov_b32_e32 v3, v1
9854 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0
9855 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9856 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9857 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
9858 ; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
9859 ; GFX1164-NEXT: .LBB13_3:
9860 ; GFX1164-NEXT: s_endpgm
9862 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9864 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
9865 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9866 ; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
9867 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
9868 ; GFX1132-NEXT: s_mov_b32 s4, 0
9869 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
9870 ; GFX1132-NEXT: s_clause 0x1
9871 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
9872 ; GFX1132-NEXT: scratch_store_b32 off, v1, off
9873 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
9874 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
9875 ; GFX1132-NEXT: s_cbranch_execz .LBB13_3
9876 ; GFX1132-NEXT: ; %bb.1:
9877 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
9878 ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
9879 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
9880 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0
9881 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
9882 ; GFX1132-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
9883 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
9884 ; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9885 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
9886 ; GFX1132-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
9887 ; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
9888 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
9889 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
9890 ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9891 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
9892 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
9893 ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
9894 ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
9895 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
9896 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
9897 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
9898 ; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
9899 ; GFX1132-NEXT: .LBB13_3:
9900 ; GFX1132-NEXT: s_endpgm
9902 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9903 ; GFX7LESS-DPP: ; %bb.0:
9904 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9905 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
9906 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, -1
9907 ; GFX7LESS-DPP-NEXT: s_mov_b32 s15, 0xe8f000
9908 ; GFX7LESS-DPP-NEXT: s_add_u32 s12, s12, s9
9909 ; GFX7LESS-DPP-NEXT: s_addc_u32 s13, s13, 0
9910 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec
9911 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
9912 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
9913 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9914 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
9915 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB13_3
9916 ; GFX7LESS-DPP-NEXT: ; %bb.1:
9917 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
9918 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s6, s[4:5]
9919 ; GFX7LESS-DPP-NEXT: s_mov_b32 s7, 0x43300000
9920 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 0
9921 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
9922 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
9923 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
9924 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
9925 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
9926 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
9927 ; GFX7LESS-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9928 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
9929 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s8
9930 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s9
9931 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
9932 ; GFX7LESS-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
9933 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9934 ; GFX7LESS-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9935 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
9936 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v3
9937 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v2
9938 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v1
9939 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v0
9940 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
9941 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
9942 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
9943 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
9944 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, v6
9945 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v7
9946 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
9947 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB13_2
9948 ; GFX7LESS-DPP-NEXT: .LBB13_3:
9949 ; GFX7LESS-DPP-NEXT: s_endpgm
9951 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9952 ; GFX9-DPP: ; %bb.0:
9953 ; GFX9-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9954 ; GFX9-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
9955 ; GFX9-DPP-NEXT: s_mov_b32 s14, -1
9956 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
9957 ; GFX9-DPP-NEXT: s_mov_b32 s15, 0xe00000
9958 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
9959 ; GFX9-DPP-NEXT: s_add_u32 s12, s12, s9
9960 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
9961 ; GFX9-DPP-NEXT: s_addc_u32 s13, s13, 0
9962 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
9963 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
9964 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
9965 ; GFX9-DPP-NEXT: ; %bb.1:
9966 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
9967 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
9968 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
9969 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
9970 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
9971 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
9972 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
9973 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
9974 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
9975 ; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
9976 ; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
9977 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
9978 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
9979 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
9980 ; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
9981 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
9982 ; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
9983 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
9984 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
9985 ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
9986 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
9987 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
9988 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
9989 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
9990 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
9991 ; GFX9-DPP-NEXT: .LBB13_3:
9992 ; GFX9-DPP-NEXT: s_endpgm
9994 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
9995 ; GFX1064-DPP: ; %bb.0:
9996 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], exec
9997 ; GFX1064-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
9998 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
9999 ; GFX1064-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
10000 ; GFX1064-DPP-NEXT: s_mov_b32 s14, -1
10001 ; GFX1064-DPP-NEXT: s_mov_b32 s15, 0x31e16000
10002 ; GFX1064-DPP-NEXT: s_add_u32 s12, s12, s9
10003 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
10004 ; GFX1064-DPP-NEXT: s_addc_u32 s13, s13, 0
10005 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
10006 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
10007 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
10008 ; GFX1064-DPP-NEXT: ; %bb.1:
10009 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s4, s[0:1]
10010 ; GFX1064-DPP-NEXT: s_mov_b32 s5, 0x43300000
10011 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
10012 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
10013 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
10014 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
10015 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
10016 ; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
10017 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
10018 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
10019 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
10020 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
10021 ; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
10022 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10023 ; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10024 ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
10025 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
10026 ; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
10027 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
10028 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
10029 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
10030 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
10031 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
10032 ; GFX1064-DPP-NEXT: .LBB13_3:
10033 ; GFX1064-DPP-NEXT: s_endpgm
10035 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
10036 ; GFX1032-DPP: ; %bb.0:
10037 ; GFX1032-DPP-NEXT: s_mov_b32 s0, exec_lo
10038 ; GFX1032-DPP-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
10039 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
10040 ; GFX1032-DPP-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
10041 ; GFX1032-DPP-NEXT: s_mov_b32 s14, -1
10042 ; GFX1032-DPP-NEXT: s_mov_b32 s15, 0x31c16000
10043 ; GFX1032-DPP-NEXT: s_add_u32 s12, s12, s9
10044 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
10045 ; GFX1032-DPP-NEXT: s_addc_u32 s13, s13, 0
10046 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
10047 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s1, vcc_lo
10048 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
10049 ; GFX1032-DPP-NEXT: ; %bb.1:
10050 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s6, s0
10051 ; GFX1032-DPP-NEXT: s_mov_b32 s7, 0x43300000
10052 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
10053 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[6:7]
10054 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
10055 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
10056 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
10057 ; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
10058 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
10059 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s2
10060 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s3
10061 ; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
10062 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10063 ; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10064 ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
10065 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
10066 ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
10067 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
10068 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
10069 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
10070 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
10071 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
10072 ; GFX1032-DPP-NEXT: .LBB13_3:
10073 ; GFX1032-DPP-NEXT: s_endpgm
10075 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
10076 ; GFX1164-DPP: ; %bb.0:
10077 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
10078 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
10079 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
10080 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
10081 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
10082 ; GFX1164-DPP-NEXT: s_clause 0x1
10083 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
10084 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
10085 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
10086 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
10087 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
10088 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
10089 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
10090 ; GFX1164-DPP-NEXT: ; %bb.1:
10091 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
10092 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
10093 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
10094 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
10095 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
10096 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
10097 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
10098 ; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
10099 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
10100 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
10101 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
10102 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
10103 ; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
10104 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10105 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
10106 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10107 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
10108 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
10109 ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
10110 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
10111 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
10112 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
10113 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10114 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
10115 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
10116 ; GFX1164-DPP-NEXT: .LBB13_3:
10117 ; GFX1164-DPP-NEXT: s_endpgm
10119 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
10120 ; GFX1132-DPP: ; %bb.0:
10121 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
10122 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10123 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s0
10124 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
10125 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
10126 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
10127 ; GFX1132-DPP-NEXT: s_clause 0x1
10128 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
10129 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
10130 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
10131 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
10132 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
10133 ; GFX1132-DPP-NEXT: ; %bb.1:
10134 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
10135 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
10136 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
10137 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
10138 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
10139 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
10140 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
10141 ; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
10142 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
10143 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
10144 ; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
10145 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10146 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
10147 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10148 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
10149 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
10150 ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
10151 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
10152 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
10153 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10154 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
10155 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
10156 ; GFX1132-DPP-NEXT: .LBB13_3:
10157 ; GFX1132-DPP-NEXT: s_endpgm
; IR under test: the addend is the literal 4.0 (the same for every lane) and
; the address is the kernel argument %ptr.
10158 %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic
10162 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
10163 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10164 ; GFX7LESS: ; %bb.0:
10165 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
10166 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10167 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10168 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
10169 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
10170 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
10171 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
10172 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
10173 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
10174 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
10175 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
10176 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
10177 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
10178 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10179 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10180 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10181 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10182 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10183 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
10184 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
10185 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
10186 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
10187 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
10188 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
10189 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
10190 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
10191 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
10192 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
10193 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
10194 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
10195 ; GFX7LESS-NEXT: .LBB14_1: ; %ComputeLoop
10196 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
10197 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
10198 ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
10199 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
10200 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
10201 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
10202 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
10203 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
10204 ; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
10205 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB14_1
10206 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
10207 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
10208 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
10209 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
10210 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
10211 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
10212 ; GFX7LESS-NEXT: s_cbranch_execz .LBB14_5
10213 ; GFX7LESS-NEXT: ; %bb.3:
10214 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
10215 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
10216 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
10217 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
10218 ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
10219 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
10220 ; GFX7LESS-NEXT: .LBB14_4: ; %atomicrmw.start
10221 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
10222 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
10223 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10224 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
10225 ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
10226 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
10227 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
10228 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
10229 ; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
10230 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
10231 ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
10232 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
10233 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
10234 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
10235 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
10236 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_4
10237 ; GFX7LESS-NEXT: .LBB14_5:
10238 ; GFX7LESS-NEXT: s_endpgm
10240 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10242 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10243 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10244 ; GFX9-NEXT: s_mov_b32 s38, -1
10245 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
10246 ; GFX9-NEXT: s_add_u32 s36, s36, s9
10247 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
10248 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
10249 ; GFX9-NEXT: s_mov_b32 s14, s8
10250 ; GFX9-NEXT: s_add_u32 s8, s34, 44
10251 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
10252 ; GFX9-NEXT: s_getpc_b64 s[2:3]
10253 ; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10254 ; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10255 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10256 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
10257 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10258 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10259 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
10260 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
10261 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
10262 ; GFX9-NEXT: s_mov_b32 s12, s6
10263 ; GFX9-NEXT: s_mov_b32 s13, s7
10264 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
10265 ; GFX9-NEXT: s_mov_b32 s32, 0
10266 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
10267 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
10268 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
10269 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
10270 ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
10271 ; GFX9-NEXT: .LBB14_1: ; %ComputeLoop
10272 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
10273 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
10274 ; GFX9-NEXT: v_readlane_b32 s3, v1, s4
10275 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4
10276 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
10277 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
10278 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
10279 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
10280 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1
10281 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
10282 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10283 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10284 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
10285 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
10286 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
10287 ; GFX9-NEXT: s_cbranch_execz .LBB14_5
10288 ; GFX9-NEXT: ; %bb.3:
10289 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
10290 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
10291 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
10292 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
10293 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
10294 ; GFX9-NEXT: .LBB14_4: ; %atomicrmw.start
10295 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
10296 ; GFX9-NEXT: s_waitcnt vmcnt(0)
10297 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10298 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
10299 ; GFX9-NEXT: s_waitcnt vmcnt(0)
10300 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
10301 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
10302 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
10303 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
10304 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
10305 ; GFX9-NEXT: s_cbranch_execnz .LBB14_4
10306 ; GFX9-NEXT: .LBB14_5:
10307 ; GFX9-NEXT: s_endpgm
10309 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10310 ; GFX1064: ; %bb.0:
10311 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10312 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10313 ; GFX1064-NEXT: s_mov_b32 s38, -1
10314 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
10315 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
10316 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
10317 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
10318 ; GFX1064-NEXT: s_mov_b32 s14, s8
10319 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
10320 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
10321 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
10322 ; GFX1064-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10323 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10324 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10325 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10326 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10327 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
10328 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
10329 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
10330 ; GFX1064-NEXT: s_mov_b32 s12, s6
10331 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
10332 ; GFX1064-NEXT: s_mov_b32 s13, s7
10333 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
10334 ; GFX1064-NEXT: s_mov_b32 s32, 0
10335 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
10336 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
10337 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
10338 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
10339 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
10340 ; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop
10341 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
10342 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
10343 ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
10344 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
10345 ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
10346 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
10347 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
10348 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
10349 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1
10350 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
10351 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10352 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10353 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
10354 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
10355 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
10356 ; GFX1064-NEXT: s_cbranch_execz .LBB14_5
10357 ; GFX1064-NEXT: ; %bb.3:
10358 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
10359 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
10360 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
10361 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
10362 ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
10363 ; GFX1064-NEXT: .LBB14_4: ; %atomicrmw.start
10364 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
10365 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
10366 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10367 ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
10368 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
10369 ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
10370 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1
10371 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0
10372 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
10373 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
10374 ; GFX1064-NEXT: s_cbranch_execnz .LBB14_4
10375 ; GFX1064-NEXT: .LBB14_5:
10376 ; GFX1064-NEXT: s_endpgm
10378 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10379 ; GFX1032: ; %bb.0:
10380 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10381 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10382 ; GFX1032-NEXT: s_mov_b32 s38, -1
10383 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
10384 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
10385 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
10386 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
10387 ; GFX1032-NEXT: s_mov_b32 s14, s8
10388 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
10389 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
10390 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
10391 ; GFX1032-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10392 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10393 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10394 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10395 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10396 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
10397 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
10398 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
10399 ; GFX1032-NEXT: s_mov_b32 s12, s6
10400 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
10401 ; GFX1032-NEXT: s_mov_b32 s13, s7
10402 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
10403 ; GFX1032-NEXT: s_mov_b32 s32, 0
10404 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
10405 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
10406 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
10407 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
10408 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
10409 ; GFX1032-NEXT: .LBB14_1: ; %ComputeLoop
10410 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
10411 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
10412 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
10413 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
10414 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
10415 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
10416 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
10417 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
10418 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1
10419 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
10420 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10421 ; GFX1032-NEXT: s_mov_b32 s2, 0
10422 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
10423 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
10424 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
10425 ; GFX1032-NEXT: s_cbranch_execz .LBB14_5
10426 ; GFX1032-NEXT: ; %bb.3:
10427 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
10428 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
10429 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
10430 ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
10431 ; GFX1032-NEXT: .LBB14_4: ; %atomicrmw.start
10432 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
10433 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
10434 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10435 ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
10436 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
10437 ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
10438 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1
10439 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0
10440 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
10441 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
10442 ; GFX1032-NEXT: s_cbranch_execnz .LBB14_4
10443 ; GFX1032-NEXT: .LBB14_5:
10444 ; GFX1032-NEXT: s_endpgm
10446 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10447 ; GFX1164: ; %bb.0:
10448 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
10449 ; GFX1164-NEXT: s_mov_b32 s14, s8
10450 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
10451 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
10452 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
10453 ; GFX1164-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10454 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10455 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
10456 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
10457 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
10458 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
10459 ; GFX1164-NEXT: s_mov_b32 s12, s6
10460 ; GFX1164-NEXT: s_mov_b32 s13, s7
10461 ; GFX1164-NEXT: s_mov_b32 s32, 0
10462 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
10463 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
10464 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
10465 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
10466 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
10467 ; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop
10468 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
10469 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
10470 ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
10471 ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
10472 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
10473 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
10474 ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
10475 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
10476 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
10477 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10478 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
10479 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1
10480 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
10481 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10482 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
10483 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10484 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10485 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
10486 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
10487 ; GFX1164-NEXT: s_cbranch_execz .LBB14_5
10488 ; GFX1164-NEXT: ; %bb.3:
10489 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
10490 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
10491 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
10492 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
10493 ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
10494 ; GFX1164-NEXT: .LBB14_4: ; %atomicrmw.start
10495 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
10496 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
10497 ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10498 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
10499 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
10500 ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
10501 ; GFX1164-NEXT: v_mov_b32_e32 v3, v1
10502 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0
10503 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
10504 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10505 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
10506 ; GFX1164-NEXT: s_cbranch_execnz .LBB14_4
10507 ; GFX1164-NEXT: .LBB14_5:
10508 ; GFX1164-NEXT: s_endpgm
10510 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10511 ; GFX1132: ; %bb.0:
10512 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
10513 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
10514 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
10515 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
10516 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
10517 ; GFX1132-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10518 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10519 ; GFX1132-NEXT: s_mov_b32 s12, s13
10520 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
10521 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
10522 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
10523 ; GFX1132-NEXT: s_mov_b32 s13, s14
10524 ; GFX1132-NEXT: s_mov_b32 s14, s15
10525 ; GFX1132-NEXT: s_mov_b32 s32, 0
10526 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
10527 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
10528 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0
10529 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
10530 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
10531 ; GFX1132-NEXT: .LBB14_1: ; %ComputeLoop
10532 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
10533 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
10534 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
10535 ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
10536 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
10537 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
10538 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10539 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
10540 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
10541 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
10542 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1
10543 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
10544 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10545 ; GFX1132-NEXT: s_mov_b32 s2, 0
10546 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
10547 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
10548 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
10549 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
10550 ; GFX1132-NEXT: s_cbranch_execz .LBB14_5
10551 ; GFX1132-NEXT: ; %bb.3:
10552 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
10553 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0
10554 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
10555 ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
10556 ; GFX1132-NEXT: .LBB14_4: ; %atomicrmw.start
10557 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
10558 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
10559 ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
10560 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
10561 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
10562 ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
10563 ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
10564 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
10565 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
10566 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
10567 ; GFX1132-NEXT: s_cbranch_execnz .LBB14_4
10568 ; GFX1132-NEXT: .LBB14_5:
10569 ; GFX1132-NEXT: s_endpgm
10571 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10572 ; GFX7LESS-DPP: ; %bb.0:
10573 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
10574 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
10575 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
10576 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
10577 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
10578 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
10579 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
10580 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
10581 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
10582 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
10583 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
10584 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
10585 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
10586 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
10587 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
10588 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10589 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10590 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10591 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10592 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10593 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
10594 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
10595 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
10596 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
10597 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
10598 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
10599 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
10600 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
10601 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
10602 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
10603 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
10604 ; GFX7LESS-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
10605 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10606 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
10607 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
10608 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
10609 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
10610 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
10611 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
10612 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
10613 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
10614 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
10615 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
10616 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
10617 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
10618 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
10619 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
10620 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB14_1
10621 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
10622 ; GFX7LESS-DPP-NEXT: s_endpgm
10624 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10625 ; GFX9-DPP: ; %bb.0:
10626 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10627 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10628 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
10629 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
10630 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
10631 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
10632 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
10633 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
10634 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
10635 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
10636 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
10637 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10638 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10639 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10640 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
10641 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10642 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10643 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
10644 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
10645 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
10646 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
10647 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
10648 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
10649 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
10650 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
10651 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
10652 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10653 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
10654 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
10655 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
10656 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
10657 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
10658 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
10659 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
10660 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
10661 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
10662 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10663 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
10664 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
10665 ; GFX9-DPP-NEXT: s_nop 0
10666 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
10667 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
10668 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10669 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
10670 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
10671 ; GFX9-DPP-NEXT: s_nop 0
10672 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
10673 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
10674 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10675 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
10676 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
10677 ; GFX9-DPP-NEXT: s_nop 0
10678 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
10679 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
10680 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10681 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
10682 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
10683 ; GFX9-DPP-NEXT: s_nop 0
10684 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
10685 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
10686 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10687 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
10688 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
10689 ; GFX9-DPP-NEXT: s_nop 0
10690 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
10691 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
10692 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10693 ; GFX9-DPP-NEXT: s_nop 1
10694 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
10695 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
10696 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
10697 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
10698 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10699 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
10700 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10701 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
10702 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
10703 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
10704 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
10705 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
10706 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
10707 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB14_3
10708 ; GFX9-DPP-NEXT: ; %bb.1:
10709 ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
10710 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
10711 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
10712 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
10713 ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
10714 ; GFX9-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
10715 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10716 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
10717 ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
10718 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
10719 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
10720 ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
10721 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
10722 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
10723 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
10724 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
10725 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_2
10726 ; GFX9-DPP-NEXT: .LBB14_3:
10727 ; GFX9-DPP-NEXT: s_endpgm
10729 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10730 ; GFX1064-DPP: ; %bb.0:
10731 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10732 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10733 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
10734 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
10735 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
10736 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
10737 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
10738 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
10739 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
10740 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
10741 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
10742 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10743 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10744 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10745 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10746 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10747 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
10748 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
10749 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
10750 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
10751 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
10752 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
10753 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
10754 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
10755 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
10756 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
10757 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10758 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
10759 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
10760 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
10761 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
10762 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
10763 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
10764 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
10765 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
10766 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
10767 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10768 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
10769 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
10770 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
10771 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
10772 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10773 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
10774 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
10775 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
10776 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
10777 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10778 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
10779 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
10780 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
10781 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
10782 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10783 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
10784 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
10785 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
10786 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
10787 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
10788 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
10789 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
10790 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
10791 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
10792 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
10793 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
10794 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
10795 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
10796 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
10797 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10798 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
10799 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
10800 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
10801 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
10802 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
10803 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB14_3
10804 ; GFX1064-DPP-NEXT: ; %bb.1:
10805 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
10806 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
10807 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
10808 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
10809 ; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
10810 ; GFX1064-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
10811 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10812 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
10813 ; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
10814 ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
10815 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
10816 ; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
10817 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
10818 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
10819 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
10820 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
10821 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_2
10822 ; GFX1064-DPP-NEXT: .LBB14_3:
10823 ; GFX1064-DPP-NEXT: s_endpgm
10825 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10826 ; GFX1032-DPP: ; %bb.0:
10827 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
10828 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
10829 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
10830 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
10831 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
10832 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
10833 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
10834 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
10835 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
10836 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
10837 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
10838 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10839 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10840 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
10841 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
10842 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
10843 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
10844 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
10845 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
10846 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
10847 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
10848 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
10849 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
10850 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
10851 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
10852 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
10853 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
10854 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
10855 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
10856 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
10857 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
10858 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
10859 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
10860 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
10861 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
10862 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
10863 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
10864 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
10865 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
10866 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
10867 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
10868 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10869 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
10870 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
10871 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
10872 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
10873 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10874 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
10875 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
10876 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
10877 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
10878 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
10879 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
10880 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
10881 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
10882 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
10883 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
10884 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
10885 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
10886 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
10887 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
10888 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
10889 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
10890 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
10891 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
10892 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
10893 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
10894 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB14_3
10895 ; GFX1032-DPP-NEXT: ; %bb.1:
10896 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
10897 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
10898 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
10899 ; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
10900 ; GFX1032-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
10901 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10902 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
10903 ; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
10904 ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
10905 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
10906 ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
10907 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
10908 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
10909 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
10910 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
10911 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_2
10912 ; GFX1032-DPP-NEXT: .LBB14_3:
10913 ; GFX1032-DPP-NEXT: s_endpgm
10915 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
10916 ; GFX1164-DPP: ; %bb.0:
10917 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
10918 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
10919 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
10920 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
10921 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
10922 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
10923 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
10924 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
10925 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
10926 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
10927 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
10928 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
10929 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
10930 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
10931 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
10932 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
10933 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10934 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
10935 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
10936 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
10937 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
10938 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
10939 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
10940 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
10941 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
10942 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
10943 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
10944 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
10945 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
10946 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
10947 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
10948 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
10949 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
10950 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
10951 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
10952 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
10953 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
10954 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
10955 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
10956 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
10957 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
10958 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
10959 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10960 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
10961 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
10962 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10963 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
10964 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
10965 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
10966 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
10967 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
10968 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
10969 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
10970 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
10971 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
10972 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
10973 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
10974 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
10975 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
10976 ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
10977 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
10978 ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
10979 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
10980 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
10981 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
10982 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
10983 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
10984 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
10985 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
10986 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
10987 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
10988 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
10989 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
10990 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB14_3
10991 ; GFX1164-DPP-NEXT: ; %bb.1:
10992 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
10993 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
10994 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
10995 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
10996 ; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
10997 ; GFX1164-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
10998 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
10999 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
11000 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
11001 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
11002 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
11003 ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
11004 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
11005 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
11006 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
11007 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11008 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
11009 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_2
11010 ; GFX1164-DPP-NEXT: .LBB14_3:
11011 ; GFX1164-DPP-NEXT: s_endpgm
11013 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
11014 ; GFX1132-DPP: ; %bb.0:
11015 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
11016 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
11017 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
11018 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
11019 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
11020 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
11021 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
11022 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
11023 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
11024 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11025 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11026 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
11027 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
11028 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
11029 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
11030 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
11031 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
11032 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
11033 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
11034 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
11035 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
11036 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
11037 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
11038 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
11039 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
11040 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
11041 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
11042 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
11043 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
11044 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
11045 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
11046 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
11047 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11048 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
11049 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
11050 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11051 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
11052 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11053 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
11054 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
11055 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
11056 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
11057 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11058 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11059 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
11060 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11061 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
11062 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
11063 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11064 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
11065 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
11066 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11067 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
11068 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
11069 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
11070 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
11071 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
11072 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
11073 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
11074 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
11075 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
11076 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
11077 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
11078 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB14_3
11079 ; GFX1132-DPP-NEXT: ; %bb.1:
11080 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
11081 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
11082 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
11083 ; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
11084 ; GFX1132-DPP-NEXT: .LBB14_2: ; %atomicrmw.start
11085 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
11086 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
11087 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
11088 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
11089 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
11090 ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
11091 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
11092 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
11093 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11094 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
11095 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_2
11096 ; GFX1132-DPP-NEXT: .LBB14_3:
11097 ; GFX1132-DPP-NEXT: s_endpgm
11098 %divValue = call double @div.double.value()
11099 %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
11103 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
11104 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11105 ; GFX7LESS: ; %bb.0:
11106 ; GFX7LESS-NEXT: s_mov_b32 s32, 0
11107 ; GFX7LESS-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11108 ; GFX7LESS-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11109 ; GFX7LESS-NEXT: s_mov_b32 s38, -1
11110 ; GFX7LESS-NEXT: s_mov_b32 s39, 0xe8f000
11111 ; GFX7LESS-NEXT: s_add_u32 s36, s36, s9
11112 ; GFX7LESS-NEXT: s_addc_u32 s37, s37, 0
11113 ; GFX7LESS-NEXT: s_mov_b32 s14, s8
11114 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
11115 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[2:3]
11116 ; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
11117 ; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
11118 ; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
11119 ; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11120 ; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11121 ; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11122 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11123 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11124 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
11125 ; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
11126 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
11127 ; GFX7LESS-NEXT: s_mov_b32 s12, s6
11128 ; GFX7LESS-NEXT: s_mov_b32 s13, s7
11129 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[36:37]
11130 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[38:39]
11131 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
11132 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
11133 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
11134 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
11135 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v5, 1
11136 ; GFX7LESS-NEXT: .LBB15_1: ; %ComputeLoop
11137 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
11138 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
11139 ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
11140 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
11141 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
11142 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
11143 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
11144 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
11145 ; GFX7LESS-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
11146 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB15_1
11147 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
11148 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
11149 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
11150 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
11151 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
11152 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
11153 ; GFX7LESS-NEXT: s_cbranch_execz .LBB15_5
11154 ; GFX7LESS-NEXT: ; %bb.3:
11155 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x9
11156 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
11157 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
11158 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
11159 ; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[0:3], 0
11160 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
11161 ; GFX7LESS-NEXT: .LBB15_4: ; %atomicrmw.start
11162 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
11163 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
11164 ; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
11165 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
11166 ; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
11167 ; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
11168 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
11169 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
11170 ; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
11171 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
11172 ; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
11173 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
11174 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
11175 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
11176 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
11177 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_4
11178 ; GFX7LESS-NEXT: .LBB15_5:
11179 ; GFX7LESS-NEXT: s_endpgm
11181 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11183 ; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11184 ; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11185 ; GFX9-NEXT: s_mov_b32 s38, -1
11186 ; GFX9-NEXT: s_mov_b32 s39, 0xe00000
11187 ; GFX9-NEXT: s_add_u32 s36, s36, s9
11188 ; GFX9-NEXT: s_addc_u32 s37, s37, 0
11189 ; GFX9-NEXT: s_mov_b64 s[34:35], s[2:3]
11190 ; GFX9-NEXT: s_mov_b32 s14, s8
11191 ; GFX9-NEXT: s_add_u32 s8, s34, 44
11192 ; GFX9-NEXT: s_addc_u32 s9, s35, 0
11193 ; GFX9-NEXT: s_getpc_b64 s[2:3]
11194 ; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11195 ; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11196 ; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11197 ; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
11198 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11199 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11200 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
11201 ; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
11202 ; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
11203 ; GFX9-NEXT: s_mov_b32 s12, s6
11204 ; GFX9-NEXT: s_mov_b32 s13, s7
11205 ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
11206 ; GFX9-NEXT: s_mov_b32 s32, 0
11207 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
11208 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
11209 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
11210 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
11211 ; GFX9-NEXT: v_bfrev_b32_e32 v5, 1
11212 ; GFX9-NEXT: .LBB15_1: ; %ComputeLoop
11213 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
11214 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
11215 ; GFX9-NEXT: v_readlane_b32 s3, v1, s4
11216 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4
11217 ; GFX9-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
11218 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
11219 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
11220 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
11221 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1
11222 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
11223 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11224 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11225 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
11226 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
11227 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
11228 ; GFX9-NEXT: s_cbranch_execz .LBB15_5
11229 ; GFX9-NEXT: ; %bb.3:
11230 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
11231 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
11232 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
11233 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
11234 ; GFX9-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
11235 ; GFX9-NEXT: .LBB15_4: ; %atomicrmw.start
11236 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
11237 ; GFX9-NEXT: s_waitcnt vmcnt(0)
11238 ; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
11239 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
11240 ; GFX9-NEXT: s_waitcnt vmcnt(0)
11241 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11242 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
11243 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
11244 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
11245 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
11246 ; GFX9-NEXT: s_cbranch_execnz .LBB15_4
11247 ; GFX9-NEXT: .LBB15_5:
11248 ; GFX9-NEXT: s_endpgm
11250 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11251 ; GFX1064: ; %bb.0:
11252 ; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11253 ; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11254 ; GFX1064-NEXT: s_mov_b32 s38, -1
11255 ; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
11256 ; GFX1064-NEXT: s_add_u32 s36, s36, s9
11257 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
11258 ; GFX1064-NEXT: s_addc_u32 s37, s37, 0
11259 ; GFX1064-NEXT: s_mov_b32 s14, s8
11260 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
11261 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
11262 ; GFX1064-NEXT: s_getpc_b64 s[2:3]
11263 ; GFX1064-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11264 ; GFX1064-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11265 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11266 ; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11267 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11268 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
11269 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
11270 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
11271 ; GFX1064-NEXT: s_mov_b32 s12, s6
11272 ; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
11273 ; GFX1064-NEXT: s_mov_b32 s13, s7
11274 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
11275 ; GFX1064-NEXT: s_mov_b32 s32, 0
11276 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
11277 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
11278 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
11279 ; GFX1064-NEXT: v_bfrev_b32_e32 v5, 1
11280 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
11281 ; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop
11282 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
11283 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
11284 ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
11285 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
11286 ; GFX1064-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
11287 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
11288 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
11289 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
11290 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1
11291 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
11292 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11293 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11294 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
11295 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
11296 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
11297 ; GFX1064-NEXT: s_cbranch_execz .LBB15_5
11298 ; GFX1064-NEXT: ; %bb.3:
11299 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
11300 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
11301 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
11302 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
11303 ; GFX1064-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
11304 ; GFX1064-NEXT: .LBB15_4: ; %atomicrmw.start
11305 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
11306 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
11307 ; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
11308 ; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
11309 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
11310 ; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11311 ; GFX1064-NEXT: v_mov_b32_e32 v3, v1
11312 ; GFX1064-NEXT: v_mov_b32_e32 v2, v0
11313 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
11314 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
11315 ; GFX1064-NEXT: s_cbranch_execnz .LBB15_4
11316 ; GFX1064-NEXT: .LBB15_5:
11317 ; GFX1064-NEXT: s_endpgm
11319 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11320 ; GFX1032: ; %bb.0:
11321 ; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11322 ; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11323 ; GFX1032-NEXT: s_mov_b32 s38, -1
11324 ; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
11325 ; GFX1032-NEXT: s_add_u32 s36, s36, s9
11326 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
11327 ; GFX1032-NEXT: s_addc_u32 s37, s37, 0
11328 ; GFX1032-NEXT: s_mov_b32 s14, s8
11329 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
11330 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
11331 ; GFX1032-NEXT: s_getpc_b64 s[2:3]
11332 ; GFX1032-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11333 ; GFX1032-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11334 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11335 ; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11336 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11337 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
11338 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
11339 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
11340 ; GFX1032-NEXT: s_mov_b32 s12, s6
11341 ; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
11342 ; GFX1032-NEXT: s_mov_b32 s13, s7
11343 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
11344 ; GFX1032-NEXT: s_mov_b32 s32, 0
11345 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
11346 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
11347 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
11348 ; GFX1032-NEXT: v_bfrev_b32_e32 v5, 1
11349 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
11350 ; GFX1032-NEXT: .LBB15_1: ; %ComputeLoop
11351 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
11352 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
11353 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
11354 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
11355 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
11356 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
11357 ; GFX1032-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
11358 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
11359 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1
11360 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
11361 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11362 ; GFX1032-NEXT: s_mov_b32 s2, 0
11363 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
11364 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
11365 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
11366 ; GFX1032-NEXT: s_cbranch_execz .LBB15_5
11367 ; GFX1032-NEXT: ; %bb.3:
11368 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
11369 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
11370 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
11371 ; GFX1032-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1]
11372 ; GFX1032-NEXT: .LBB15_4: ; %atomicrmw.start
11373 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
11374 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
11375 ; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
11376 ; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
11377 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
11378 ; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
11379 ; GFX1032-NEXT: v_mov_b32_e32 v3, v1
11380 ; GFX1032-NEXT: v_mov_b32_e32 v2, v0
11381 ; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
11382 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
11383 ; GFX1032-NEXT: s_cbranch_execnz .LBB15_4
11384 ; GFX1032-NEXT: .LBB15_5:
11385 ; GFX1032-NEXT: s_endpgm
11387 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11388 ; GFX1164: ; %bb.0:
11389 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
11390 ; GFX1164-NEXT: s_mov_b32 s14, s8
11391 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
11392 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
11393 ; GFX1164-NEXT: s_getpc_b64 s[2:3]
11394 ; GFX1164-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11395 ; GFX1164-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11396 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
11397 ; GFX1164-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
11398 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
11399 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
11400 ; GFX1164-NEXT: s_mov_b32 s12, s6
11401 ; GFX1164-NEXT: s_mov_b32 s13, s7
11402 ; GFX1164-NEXT: s_mov_b32 s32, 0
11403 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
11404 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[2:3]
11405 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
11406 ; GFX1164-NEXT: v_bfrev_b32_e32 v5, 1
11407 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
11408 ; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop
11409 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
11410 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
11411 ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
11412 ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
11413 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
11414 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
11415 ; GFX1164-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
11416 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
11417 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
11418 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11419 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
11420 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1
11421 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
11422 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11423 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
11424 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11425 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11426 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
11427 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
11428 ; GFX1164-NEXT: s_cbranch_execz .LBB15_5
11429 ; GFX1164-NEXT: ; %bb.3:
11430 ; GFX1164-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
11431 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
11432 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
11433 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
11434 ; GFX1164-NEXT: global_load_b64 v[2:3], v6, s[0:1]
11435 ; GFX1164-NEXT: .LBB15_4: ; %atomicrmw.start
11436 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
11437 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
11438 ; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
11439 ; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
11440 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
11441 ; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
11442 ; GFX1164-NEXT: v_mov_b32_e32 v3, v1
11443 ; GFX1164-NEXT: v_mov_b32_e32 v2, v0
11444 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
11445 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11446 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
11447 ; GFX1164-NEXT: s_cbranch_execnz .LBB15_4
11448 ; GFX1164-NEXT: .LBB15_5:
11449 ; GFX1164-NEXT: s_endpgm
11451 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11452 ; GFX1132: ; %bb.0:
11453 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
11454 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
11455 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
11456 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
11457 ; GFX1132-NEXT: s_getpc_b64 s[2:3]
11458 ; GFX1132-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11459 ; GFX1132-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11460 ; GFX1132-NEXT: s_mov_b32 s12, s13
11461 ; GFX1132-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
11462 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
11463 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
11464 ; GFX1132-NEXT: s_mov_b32 s13, s14
11465 ; GFX1132-NEXT: s_mov_b32 s14, s15
11466 ; GFX1132-NEXT: s_mov_b32 s32, 0
11467 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
11468 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[2:3]
11469 ; GFX1132-NEXT: v_mov_b32_e32 v4, 0
11470 ; GFX1132-NEXT: v_bfrev_b32_e32 v5, 1
11471 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
11472 ; GFX1132-NEXT: .LBB15_1: ; %ComputeLoop
11473 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
11474 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
11475 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
11476 ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
11477 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
11478 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
11479 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11480 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
11481 ; GFX1132-NEXT: v_add_f64 v[4:5], v[4:5], s[2:3]
11482 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
11483 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1
11484 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
11485 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11486 ; GFX1132-NEXT: s_mov_b32 s2, 0
11487 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
11488 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
11489 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
11490 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
11491 ; GFX1132-NEXT: s_cbranch_execz .LBB15_5
11492 ; GFX1132-NEXT: ; %bb.3:
11493 ; GFX1132-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
11494 ; GFX1132-NEXT: v_mov_b32_e32 v6, 0
11495 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
11496 ; GFX1132-NEXT: global_load_b64 v[2:3], v6, s[0:1]
11497 ; GFX1132-NEXT: .LBB15_4: ; %atomicrmw.start
11498 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
11499 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
11500 ; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
11501 ; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
11502 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
11503 ; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
11504 ; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
11505 ; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
11506 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11507 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
11508 ; GFX1132-NEXT: s_cbranch_execnz .LBB15_4
11509 ; GFX1132-NEXT: .LBB15_5:
11510 ; GFX1132-NEXT: s_endpgm
11512 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11513 ; GFX7LESS-DPP: ; %bb.0:
11514 ; GFX7LESS-DPP-NEXT: s_mov_b32 s32, 0
11515 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
11516 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
11517 ; GFX7LESS-DPP-NEXT: s_mov_b32 s42, -1
11518 ; GFX7LESS-DPP-NEXT: s_mov_b32 s43, 0xe8f000
11519 ; GFX7LESS-DPP-NEXT: s_add_u32 s40, s40, s9
11520 ; GFX7LESS-DPP-NEXT: s_addc_u32 s41, s41, 0
11521 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s8
11522 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11523 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
11524 ; GFX7LESS-DPP-NEXT: s_mov_b32 s39, 0xf000
11525 ; GFX7LESS-DPP-NEXT: s_mov_b32 s38, -1
11526 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s2, 44
11527 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s3, 0
11528 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[2:3]
11529 ; GFX7LESS-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11530 ; GFX7LESS-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11531 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11532 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11533 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11534 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
11535 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v31, v0, v2
11536 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11537 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s6
11538 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s7
11539 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
11540 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
11541 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
11542 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
11543 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
11544 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], 0
11545 ; GFX7LESS-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
11546 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
11547 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
11548 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
11549 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
11550 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v9, v5
11551 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v8, v4
11552 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, v3
11553 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, v2
11554 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
11555 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
11556 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
11557 ; GFX7LESS-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
11558 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v6
11559 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, v7
11560 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
11561 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB15_1
11562 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
11563 ; GFX7LESS-DPP-NEXT: s_endpgm
11565 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11566 ; GFX9-DPP: ; %bb.0:
11567 ; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11568 ; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11569 ; GFX9-DPP-NEXT: s_mov_b32 s38, -1
11570 ; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
11571 ; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
11572 ; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
11573 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
11574 ; GFX9-DPP-NEXT: s_mov_b32 s14, s8
11575 ; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
11576 ; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
11577 ; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
11578 ; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11579 ; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11580 ; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11581 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11582 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11583 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11584 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11585 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
11586 ; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
11587 ; GFX9-DPP-NEXT: s_mov_b32 s12, s6
11588 ; GFX9-DPP-NEXT: s_mov_b32 s13, s7
11589 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
11590 ; GFX9-DPP-NEXT: s_mov_b32 s32, 0
11591 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
11592 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
11593 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11594 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
11595 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v4, 1
11596 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
11597 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v0
11598 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v1
11599 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
11600 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
11601 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, v4
11602 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
11603 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11604 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
11605 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
11606 ; GFX9-DPP-NEXT: s_nop 0
11607 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
11608 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:1 row_mask:0xf bank_mask:0xf
11609 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11610 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
11611 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
11612 ; GFX9-DPP-NEXT: s_nop 0
11613 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
11614 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:2 row_mask:0xf bank_mask:0xf
11615 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11616 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
11617 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
11618 ; GFX9-DPP-NEXT: s_nop 0
11619 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
11620 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:4 row_mask:0xf bank_mask:0xf
11621 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11622 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
11623 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
11624 ; GFX9-DPP-NEXT: s_nop 0
11625 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
11626 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_shr:8 row_mask:0xf bank_mask:0xf
11627 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11628 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, v3
11629 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, v4
11630 ; GFX9-DPP-NEXT: s_nop 0
11631 ; GFX9-DPP-NEXT: v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
11632 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v6 row_bcast:15 row_mask:0xa bank_mask:0xf
11633 ; GFX9-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11634 ; GFX9-DPP-NEXT: s_nop 1
11635 ; GFX9-DPP-NEXT: v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
11636 ; GFX9-DPP-NEXT: v_mov_b32_dpp v4, v6 row_bcast:31 row_mask:0xc bank_mask:0xf
11637 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
11638 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
11639 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11640 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
11641 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11642 ; GFX9-DPP-NEXT: v_readlane_b32 s3, v4, 63
11643 ; GFX9-DPP-NEXT: v_readlane_b32 s2, v3, 63
11644 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
11645 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[2:3]
11646 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
11647 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
11648 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB15_3
11649 ; GFX9-DPP-NEXT: ; %bb.1:
11650 ; GFX9-DPP-NEXT: s_load_dwordx2 s[2:3], s[34:35], 0x24
11651 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
11652 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], 0
11653 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
11654 ; GFX9-DPP-NEXT: global_load_dwordx2 v[11:12], v0, s[2:3]
11655 ; GFX9-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
11656 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
11657 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
11658 ; GFX9-DPP-NEXT: v_add_f64 v[9:10], v[11:12], s[0:1]
11659 ; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[1:2], v0, v[9:12], s[2:3] glc
11660 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
11661 ; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[1:2], v[11:12]
11662 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v2
11663 ; GFX9-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
11664 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
11665 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
11666 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_2
11667 ; GFX9-DPP-NEXT: .LBB15_3:
11668 ; GFX9-DPP-NEXT: s_endpgm
11670 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11671 ; GFX1064-DPP: ; %bb.0:
11672 ; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11673 ; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11674 ; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
11675 ; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
11676 ; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
11677 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
11678 ; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
11679 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
11680 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
11681 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
11682 ; GFX1064-DPP-NEXT: s_getpc_b64 s[2:3]
11683 ; GFX1064-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11684 ; GFX1064-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11685 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11686 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11687 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11688 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11689 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11690 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
11691 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
11692 ; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
11693 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
11694 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
11695 ; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
11696 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
11697 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
11698 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11699 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
11700 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v4, 1
11701 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
11702 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v0
11703 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v1
11704 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
11705 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
11706 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
11707 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
11708 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11709 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
11710 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
11711 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
11712 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
11713 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11714 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
11715 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
11716 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
11717 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
11718 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11719 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, v3
11720 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, v4
11721 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
11722 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
11723 ; GFX1064-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11724 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
11725 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
11726 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
11727 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, v4
11728 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
11729 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
11730 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
11731 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
11732 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v4, 0
11733 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v3, 0
11734 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v4, 32
11735 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v3, 32
11736 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], s[2:3], s[4:5]
11737 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
11738 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11739 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v0
11740 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, v3
11741 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v4
11742 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
11743 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
11744 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB15_3
11745 ; GFX1064-DPP-NEXT: ; %bb.1:
11746 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
11747 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, 0
11748 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
11749 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
11750 ; GFX1064-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
11751 ; GFX1064-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
11752 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
11753 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
11754 ; GFX1064-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
11755 ; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
11756 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
11757 ; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[9:10], v[11:12]
11758 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v10
11759 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
11760 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
11761 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
11762 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_2
11763 ; GFX1064-DPP-NEXT: .LBB15_3:
11764 ; GFX1064-DPP-NEXT: s_endpgm
11766 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11767 ; GFX1032-DPP: ; %bb.0:
11768 ; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
11769 ; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
11770 ; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
11771 ; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
11772 ; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
11773 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
11774 ; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
11775 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
11776 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
11777 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
11778 ; GFX1032-DPP-NEXT: s_getpc_b64 s[2:3]
11779 ; GFX1032-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11780 ; GFX1032-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11781 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
11782 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
11783 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
11784 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11785 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11786 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
11787 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
11788 ; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
11789 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
11790 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
11791 ; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
11792 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
11793 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
11794 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
11795 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
11796 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v4, 1
11797 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
11798 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v0
11799 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v1
11800 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
11801 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
11802 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
11803 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
11804 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
11805 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
11806 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
11807 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
11808 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:1 row_mask:0xf bank_mask:0xf
11809 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11810 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
11811 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
11812 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
11813 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:2 row_mask:0xf bank_mask:0xf
11814 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11815 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, v3
11816 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, v4
11817 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
11818 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v6 row_xmask:4 row_mask:0xf bank_mask:0xf
11819 ; GFX1032-DPP-NEXT: v_add_f64 v[5:6], v[5:6], v[7:8]
11820 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
11821 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v4, v6 row_xmask:8 row_mask:0xf bank_mask:0xf
11822 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[5:6], v[3:4]
11823 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, v4
11824 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
11825 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v6, v6, -1, -1
11826 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
11827 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[3:4], v[5:6]
11828 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
11829 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
11830 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, v3
11831 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v4
11832 ; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
11833 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
11834 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
11835 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB15_3
11836 ; GFX1032-DPP-NEXT: ; %bb.1:
11837 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[34:35], 0x24
11838 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, 0
11839 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
11840 ; GFX1032-DPP-NEXT: global_load_dwordx2 v[11:12], v2, s[0:1]
11841 ; GFX1032-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
11842 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
11843 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
11844 ; GFX1032-DPP-NEXT: v_add_f64 v[9:10], v[11:12], v[0:1]
11845 ; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[9:10], v2, v[9:12], s[0:1] glc
11846 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
11847 ; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[9:10], v[11:12]
11848 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v10
11849 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
11850 ; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
11851 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
11852 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_2
11853 ; GFX1032-DPP-NEXT: .LBB15_3:
11854 ; GFX1032-DPP-NEXT: s_endpgm
11856 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11857 ; GFX1164-DPP: ; %bb.0:
11858 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
11859 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
11860 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
11861 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
11862 ; GFX1164-DPP-NEXT: s_getpc_b64 s[2:3]
11863 ; GFX1164-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11864 ; GFX1164-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11865 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
11866 ; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
11867 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11868 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11869 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
11870 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
11871 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
11872 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
11873 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
11874 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11875 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, 0
11876 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v3, 1
11877 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
11878 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v0
11879 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v1
11880 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
11881 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
11882 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
11883 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
11884 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
11885 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
11886 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
11887 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
11888 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
11889 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
11890 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
11891 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11892 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
11893 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
11894 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
11895 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11896 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
11897 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11898 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, v2
11899 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, v3
11900 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
11901 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
11902 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
11903 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11904 ; GFX1164-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11905 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
11906 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11907 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
11908 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
11909 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
11910 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
11911 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
11912 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
11913 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
11914 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
11915 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11916 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
11917 ; GFX1164-DPP-NEXT: v_permlane64_b32 v5, v3
11918 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11919 ; GFX1164-DPP-NEXT: v_permlane64_b32 v4, v2
11920 ; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
11921 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
11922 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
11923 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
11924 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
11925 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v8, exec_hi, v0
11926 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
11927 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, v2
11928 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v3
11929 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3)
11930 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
11931 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB15_3
11932 ; GFX1164-DPP-NEXT: ; %bb.1:
11933 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
11934 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, 0
11935 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
11936 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
11937 ; GFX1164-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
11938 ; GFX1164-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
11939 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
11940 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
11941 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
11942 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
11943 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
11944 ; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[8:9], v[10:11]
11945 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
11946 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
11947 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
11948 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
11949 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
11950 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_2
11951 ; GFX1164-DPP-NEXT: .LBB15_3:
11952 ; GFX1164-DPP-NEXT: s_endpgm
11954 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
11955 ; GFX1132-DPP: ; %bb.0:
11956 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
11957 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
11958 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
11959 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
11960 ; GFX1132-DPP-NEXT: s_getpc_b64 s[2:3]
11961 ; GFX1132-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
11962 ; GFX1132-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
11963 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
11964 ; GFX1132-DPP-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
11965 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
11966 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
11967 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
11968 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
11969 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
11970 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
11971 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[2:3]
11972 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
11973 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v2, 0
11974 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v3, 1
11975 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
11976 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v0
11977 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v1
11978 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
11979 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, v2
11980 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v5, v3
11981 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
11982 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
11983 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
11984 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
11985 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:1 row_mask:0xf bank_mask:0xf
11986 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:1 row_mask:0xf bank_mask:0xf
11987 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
11988 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11989 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
11990 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:2 row_mask:0xf bank_mask:0xf
11991 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
11992 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:2 row_mask:0xf bank_mask:0xf
11993 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
11994 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v6, v2 :: v_dual_mov_b32 v7, v3
11995 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
11996 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v6, v4 row_xmask:4 row_mask:0xf bank_mask:0xf
11997 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v7, v5 row_xmask:4 row_mask:0xf bank_mask:0xf
11998 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
11999 ; GFX1132-DPP-NEXT: v_add_f64 v[4:5], v[4:5], v[6:7]
12000 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v2, v4 row_xmask:8 row_mask:0xf bank_mask:0xf
12001 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12002 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v3, v5 row_xmask:8 row_mask:0xf bank_mask:0xf
12003 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[2:3]
12004 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
12005 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
12006 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v5, v5, -1, -1
12007 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
12008 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v4, v4, -1, -1
12009 ; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[2:3], v[4:5]
12010 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
12011 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
12012 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, v2
12013 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v8, exec_lo, 0
12014 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v3
12015 ; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
12016 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
12017 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
12018 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v8
12019 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB15_3
12020 ; GFX1132-DPP-NEXT: ; %bb.1:
12021 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[34:35], 0x24
12022 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v12, 0
12023 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
12024 ; GFX1132-DPP-NEXT: global_load_b64 v[10:11], v12, s[0:1]
12025 ; GFX1132-DPP-NEXT: .LBB15_2: ; %atomicrmw.start
12026 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12027 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
12028 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[0:1]
12029 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[8:9], v12, v[8:11], s[0:1] glc
12030 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
12031 ; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[8:9], v[10:11]
12032 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
12033 ; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
12034 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
12035 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
12036 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_2
12037 ; GFX1132-DPP-NEXT: .LBB15_3:
12038 ; GFX1132-DPP-NEXT: s_endpgm
12039 %divValue = call double @div.float.value() strictfp
12040 %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
12044 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
12045 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12046 ; GFX7LESS: ; %bb.0:
12047 ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
12048 ; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12049 ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12050 ; GFX7LESS-NEXT: s_mov_b32 s50, -1
12051 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
12052 ; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
12053 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
12054 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
12055 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0
12056 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, exec
12057 ; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000
12058 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0
12059 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0xc3300000
12060 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5
12061 ; GFX7LESS-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4]
12062 ; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12063 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
12064 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
12065 ; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3
12066 ; GFX7LESS-NEXT: ; %bb.1:
12067 ; GFX7LESS-NEXT: s_mov_b32 s33, s8
12068 ; GFX7LESS-NEXT: s_mov_b32 s40, s7
12069 ; GFX7LESS-NEXT: s_mov_b32 s41, s6
12070 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
12071 ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
12072 ; GFX7LESS-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9
12073 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
12074 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12075 ; GFX7LESS-NEXT: s_mov_b64 s[44:45], 0
12076 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
12077 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
12078 ; GFX7LESS-NEXT: v_or_b32_e32 v3, v0, v1
12079 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
12080 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
12081 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
12082 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v3, v2
12083 ; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start
12084 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
12085 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
12086 ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
12087 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
12088 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
12089 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
12090 ; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
12091 ; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
12092 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
12093 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
12094 ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12095 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12096 ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12097 ; GFX7LESS-NEXT: s_waitcnt expcnt(2)
12098 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
12099 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
12100 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
12101 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
12102 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
12103 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
12104 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
12105 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
12106 ; GFX7LESS-NEXT: s_mov_b32 s12, s41
12107 ; GFX7LESS-NEXT: s_mov_b32 s13, s40
12108 ; GFX7LESS-NEXT: s_mov_b32 s14, s33
12109 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
12110 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
12111 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
12112 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
12113 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s42
12114 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s43
12115 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
12116 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
12117 ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
12118 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
12119 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
12120 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
12121 ; GFX7LESS-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12122 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[44:45]
12123 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2
12124 ; GFX7LESS-NEXT: .LBB16_3:
12125 ; GFX7LESS-NEXT: s_endpgm
12127 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12128 ; GFX9: ; %bb.0:
12129 ; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12130 ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12131 ; GFX9-NEXT: s_mov_b32 s50, -1
12132 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000
12133 ; GFX9-NEXT: s_add_u32 s48, s48, s9
12134 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
12135 ; GFX9-NEXT: s_addc_u32 s49, s49, 0
12136 ; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
12137 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, exec
12138 ; GFX9-NEXT: v_mov_b32_e32 v4, 0xc3300000
12139 ; GFX9-NEXT: s_mov_b32 s1, 0x43300000
12140 ; GFX9-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4]
12141 ; GFX9-NEXT: s_movk_i32 s32, 0x800
12142 ; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12143 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12144 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
12145 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
12146 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
12147 ; GFX9-NEXT: s_cbranch_execz .LBB16_3
12148 ; GFX9-NEXT: ; %bb.1:
12149 ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
12150 ; GFX9-NEXT: v_lshlrev_b32_e32 v3, 20, v2
12151 ; GFX9-NEXT: v_lshlrev_b32_e32 v4, 10, v1
12152 ; GFX9-NEXT: s_mov_b32 s33, s8
12153 ; GFX9-NEXT: s_mov_b32 s40, s7
12154 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12155 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12156 ; GFX9-NEXT: s_mov_b32 s41, s6
12157 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
12158 ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
12159 ; GFX9-NEXT: s_mov_b64 s[44:45], 0
12160 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12161 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
12162 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
12163 ; GFX9-NEXT: v_or3_b32 v40, v0, v4, v3
12164 ; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
12165 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
12166 ; GFX9-NEXT: s_waitcnt vmcnt(0)
12167 ; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12168 ; GFX9-NEXT: s_add_u32 s8, s36, 44
12169 ; GFX9-NEXT: s_addc_u32 s9, s37, 0
12170 ; GFX9-NEXT: s_getpc_b64 s[0:1]
12171 ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12172 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12173 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12174 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
12175 ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
12176 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
12177 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
12178 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
12179 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
12180 ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
12181 ; GFX9-NEXT: s_mov_b32 s12, s41
12182 ; GFX9-NEXT: s_mov_b32 s13, s40
12183 ; GFX9-NEXT: s_mov_b32 s14, s33
12184 ; GFX9-NEXT: v_mov_b32_e32 v31, v40
12185 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
12186 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
12187 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
12188 ; GFX9-NEXT: v_mov_b32_e32 v2, s42
12189 ; GFX9-NEXT: v_mov_b32_e32 v3, s43
12190 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
12191 ; GFX9-NEXT: v_mov_b32_e32 v5, 8
12192 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
12193 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
12194 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
12195 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
12196 ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
12197 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
12198 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
12199 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12200 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12201 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
12202 ; GFX9-NEXT: s_cbranch_execnz .LBB16_2
12203 ; GFX9-NEXT: .LBB16_3:
12204 ; GFX9-NEXT: s_endpgm
12206 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12207 ; GFX1064: ; %bb.0:
12208 ; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12209 ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12210 ; GFX1064-NEXT: s_mov_b32 s50, -1
12211 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
12212 ; GFX1064-NEXT: s_add_u32 s48, s48, s9
12213 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0
12214 ; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
12215 ; GFX1064-NEXT: s_bcnt1_i32_b64 s0, exec
12216 ; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
12217 ; GFX1064-NEXT: s_movk_i32 s32, 0x800
12218 ; GFX1064-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1]
12219 ; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12220 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12221 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
12222 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
12223 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
12224 ; GFX1064-NEXT: s_cbranch_execz .LBB16_3
12225 ; GFX1064-NEXT: ; %bb.1:
12226 ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
12227 ; GFX1064-NEXT: v_lshlrev_b32_e32 v3, 20, v2
12228 ; GFX1064-NEXT: v_lshlrev_b32_e32 v4, 10, v1
12229 ; GFX1064-NEXT: s_mov_b32 s33, s8
12230 ; GFX1064-NEXT: s_mov_b32 s40, s7
12231 ; GFX1064-NEXT: s_mov_b32 s41, s6
12232 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[4:5]
12233 ; GFX1064-NEXT: v_or3_b32 v40, v0, v4, v3
12234 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[2:3]
12235 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0
12236 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
12237 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12238 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
12239 ; GFX1064-NEXT: v_mov_b32_e32 v2, s1
12240 ; GFX1064-NEXT: v_mov_b32_e32 v1, s0
12241 ; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
12242 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
12243 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
12244 ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12245 ; GFX1064-NEXT: s_add_u32 s8, s36, 44
12246 ; GFX1064-NEXT: s_addc_u32 s9, s37, 0
12247 ; GFX1064-NEXT: s_getpc_b64 s[0:1]
12248 ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12249 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12250 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
12251 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
12252 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12253 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40
12254 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8
12255 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
12256 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42
12257 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8
12258 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
12259 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0
12260 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
12261 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
12262 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[34:35]
12263 ; GFX1064-NEXT: s_mov_b32 s12, s41
12264 ; GFX1064-NEXT: s_mov_b32 s13, s40
12265 ; GFX1064-NEXT: s_mov_b32 s14, s33
12266 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
12267 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
12268 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
12269 ; GFX1064-NEXT: v_mov_b32_e32 v3, s43
12270 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
12271 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
12272 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
12273 ; GFX1064-NEXT: s_clause 0x1
12274 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
12275 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
12276 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
12277 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12278 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12279 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
12280 ; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
12281 ; GFX1064-NEXT: .LBB16_3:
12282 ; GFX1064-NEXT: s_endpgm
12284 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12285 ; GFX1032: ; %bb.0:
12286 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12287 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12288 ; GFX1032-NEXT: s_mov_b32 s50, -1
12289 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
12290 ; GFX1032-NEXT: s_add_u32 s48, s48, s9
12291 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0
12292 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
12293 ; GFX1032-NEXT: s_bcnt1_i32_b32 s0, exec_lo
12294 ; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
12295 ; GFX1032-NEXT: s_mov_b32 s44, 0
12296 ; GFX1032-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1]
12297 ; GFX1032-NEXT: s_movk_i32 s32, 0x400
12298 ; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12299 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12300 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
12301 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
12302 ; GFX1032-NEXT: s_cbranch_execz .LBB16_3
12303 ; GFX1032-NEXT: ; %bb.1:
12304 ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
12305 ; GFX1032-NEXT: v_lshlrev_b32_e32 v3, 20, v2
12306 ; GFX1032-NEXT: v_lshlrev_b32_e32 v4, 10, v1
12307 ; GFX1032-NEXT: s_mov_b32 s33, s8
12308 ; GFX1032-NEXT: s_mov_b32 s40, s7
12309 ; GFX1032-NEXT: s_mov_b32 s41, s6
12310 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[4:5]
12311 ; GFX1032-NEXT: v_or3_b32 v40, v0, v4, v3
12312 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[2:3]
12313 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
12314 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12315 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
12316 ; GFX1032-NEXT: v_mov_b32_e32 v2, s1
12317 ; GFX1032-NEXT: v_mov_b32_e32 v1, s0
12318 ; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
12319 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
12320 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
12321 ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12322 ; GFX1032-NEXT: s_add_u32 s8, s36, 44
12323 ; GFX1032-NEXT: s_addc_u32 s9, s37, 0
12324 ; GFX1032-NEXT: s_getpc_b64 s[0:1]
12325 ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12326 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12327 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
12328 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
12329 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12330 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40
12331 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8
12332 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
12333 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42
12334 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8
12335 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
12336 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0
12337 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
12338 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
12339 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[34:35]
12340 ; GFX1032-NEXT: s_mov_b32 s12, s41
12341 ; GFX1032-NEXT: s_mov_b32 s13, s40
12342 ; GFX1032-NEXT: s_mov_b32 s14, s33
12343 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
12344 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
12345 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
12346 ; GFX1032-NEXT: v_mov_b32_e32 v3, s43
12347 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
12348 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
12349 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
12350 ; GFX1032-NEXT: s_clause 0x1
12351 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
12352 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
12353 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
12354 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
12355 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
12356 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
12357 ; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
12358 ; GFX1032-NEXT: .LBB16_3:
12359 ; GFX1032-NEXT: s_endpgm
12361 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12362 ; GFX1164: ; %bb.0:
12363 ; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
12364 ; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
12365 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0
12366 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
12367 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
12368 ; GFX1164-NEXT: s_mov_b32 s32, 32
12369 ; GFX1164-NEXT: s_clause 0x1
12370 ; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20
12371 ; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16
12372 ; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16
12373 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
12374 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
12375 ; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
12376 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12377 ; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
12378 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12379 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12380 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
12381 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
12382 ; GFX1164-NEXT: s_cbranch_execz .LBB16_3
12383 ; GFX1164-NEXT: ; %bb.1:
12384 ; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
12385 ; GFX1164-NEXT: s_mov_b32 s33, s8
12386 ; GFX1164-NEXT: s_mov_b32 s40, s7
12387 ; GFX1164-NEXT: s_mov_b32 s41, s6
12388 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[4:5]
12389 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[2:3]
12390 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0
12391 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
12392 ; GFX1164-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
12393 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
12394 ; GFX1164-NEXT: v_mov_b32_e32 v2, s1
12395 ; GFX1164-NEXT: v_mov_b32_e32 v1, s0
12396 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
12397 ; GFX1164-NEXT: .p2align 6
12398 ; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
12399 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
12400 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
12401 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
12402 ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12403 ; GFX1164-NEXT: s_add_u32 s8, s36, 44
12404 ; GFX1164-NEXT: s_addc_u32 s9, s37, 0
12405 ; GFX1164-NEXT: s_getpc_b64 s[0:1]
12406 ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12407 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12408 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40
12409 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
12410 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8
12411 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8
12412 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
12413 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0
12414 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
12415 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[34:35]
12416 ; GFX1164-NEXT: s_mov_b32 s12, s41
12417 ; GFX1164-NEXT: s_mov_b32 s13, s40
12418 ; GFX1164-NEXT: s_mov_b32 s14, s33
12419 ; GFX1164-NEXT: s_clause 0x1
12420 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
12421 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
12422 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0
12423 ; GFX1164-NEXT: v_mov_b32_e32 v2, s42
12424 ; GFX1164-NEXT: v_mov_b32_e32 v3, s43
12425 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
12426 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
12427 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
12428 ; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
12429 ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
12430 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12431 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12432 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12433 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
12434 ; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
12435 ; GFX1164-NEXT: .LBB16_3:
12436 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
12437 ; GFX1164-NEXT: s_endpgm
12439 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12440 ; GFX1132: ; %bb.0:
12441 ; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
12442 ; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
12443 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
12444 ; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
12445 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000
12446 ; GFX1132-NEXT: s_mov_b32 s44, 0
12447 ; GFX1132-NEXT: s_clause 0x1
12448 ; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20
12449 ; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16
12450 ; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16
12451 ; GFX1132-NEXT: s_mov_b32 s32, 32
12452 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
12453 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
12454 ; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
12455 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12456 ; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
12457 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12458 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
12459 ; GFX1132-NEXT: s_cbranch_execz .LBB16_3
12460 ; GFX1132-NEXT: ; %bb.1:
12461 ; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
12462 ; GFX1132-NEXT: s_mov_b32 s33, s15
12463 ; GFX1132-NEXT: s_mov_b32 s40, s14
12464 ; GFX1132-NEXT: s_mov_b32 s41, s13
12465 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[4:5]
12466 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[2:3]
12467 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
12468 ; GFX1132-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
12469 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
12470 ; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
12471 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
12472 ; GFX1132-NEXT: .p2align 6
12473 ; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
12474 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
12475 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
12476 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
12477 ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12478 ; GFX1132-NEXT: s_add_u32 s8, s36, 44
12479 ; GFX1132-NEXT: s_addc_u32 s9, s37, 0
12480 ; GFX1132-NEXT: s_getpc_b64 s[0:1]
12481 ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12482 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12483 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
12484 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
12485 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
12486 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0
12487 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
12488 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[34:35]
12489 ; GFX1132-NEXT: s_mov_b32 s12, s41
12490 ; GFX1132-NEXT: s_mov_b32 s13, s40
12491 ; GFX1132-NEXT: s_mov_b32 s14, s33
12492 ; GFX1132-NEXT: s_clause 0x1
12493 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
12494 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
12495 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
12496 ; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
12497 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
12498 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
12499 ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
12500 ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
12501 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12502 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
12503 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
12504 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
12505 ; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
12506 ; GFX1132-NEXT: .LBB16_3:
12507 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
12508 ; GFX1132-NEXT: s_endpgm
12510 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12511 ; GFX7LESS-DPP: ; %bb.0:
12512 ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800
12513 ; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12514 ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12515 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1
12516 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000
12517 ; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9
12518 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0
12519 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
12520 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v5, exec_lo, 0
12521 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
12522 ; GFX7LESS-DPP-NEXT: s_mov_b32 s1, 0x43300000
12523 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, 0
12524 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000
12525 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v5, exec_hi, v5
12526 ; GFX7LESS-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4]
12527 ; GFX7LESS-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12528 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
12529 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
12530 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB16_3
12531 ; GFX7LESS-DPP-NEXT: ; %bb.1:
12532 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8
12533 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7
12534 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6
12535 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
12536 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
12537 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x9
12538 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
12539 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12540 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[44:45], 0
12541 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
12542 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
12543 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v3, v0, v1
12544 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
12545 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, s0
12546 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s1
12547 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v40, v3, v2
12548 ; GFX7LESS-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
12549 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12550 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
12551 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
12552 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
12553 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
12554 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
12555 ; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
12556 ; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
12557 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
12558 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
12559 ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12560 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12561 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12562 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2)
12563 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8
12564 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0
12565 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0
12566 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8
12567 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0
12568 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0
12569 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
12570 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
12571 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41
12572 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40
12573 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
12574 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v40
12575 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
12576 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
12577 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
12578 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s42
12579 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s43
12580 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
12581 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
12582 ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
12583 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
12584 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
12585 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
12586 ; GFX7LESS-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12587 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
12588 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB16_2
12589 ; GFX7LESS-DPP-NEXT: .LBB16_3:
12590 ; GFX7LESS-DPP-NEXT: s_endpgm
12592 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12593 ; GFX9-DPP: ; %bb.0:
12594 ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12595 ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12596 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1
12597 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
12598 ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
12599 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
12600 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
12601 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
12602 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
12603 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0xc3300000
12604 ; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
12605 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], s[0:1], v[3:4]
12606 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
12607 ; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12608 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12609 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
12610 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
12611 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
12612 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3
12613 ; GFX9-DPP-NEXT: ; %bb.1:
12614 ; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
12615 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2
12616 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1
12617 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8
12618 ; GFX9-DPP-NEXT: s_mov_b32 s40, s7
12619 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
12620 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12621 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6
12622 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
12623 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
12624 ; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
12625 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
12626 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
12627 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
12628 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
12629 ; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
12630 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12631 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
12632 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12633 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
12634 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
12635 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
12636 ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12637 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12638 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12639 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
12640 ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
12641 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
12642 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
12643 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
12644 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
12645 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
12646 ; GFX9-DPP-NEXT: s_mov_b32 s12, s41
12647 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40
12648 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33
12649 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
12650 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
12651 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
12652 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
12653 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
12654 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
12655 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
12656 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
12657 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
12658 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
12659 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
12660 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
12661 ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
12662 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
12663 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
12664 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12665 ; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12666 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
12667 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
12668 ; GFX9-DPP-NEXT: .LBB16_3:
12669 ; GFX9-DPP-NEXT: s_endpgm
12671 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12672 ; GFX1064-DPP: ; %bb.0:
12673 ; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12674 ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12675 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
12676 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
12677 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
12678 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
12679 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
12680 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
12681 ; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
12682 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
12683 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1]
12684 ; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12685 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12686 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v3, exec_hi, v3
12687 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
12688 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
12689 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
12690 ; GFX1064-DPP-NEXT: ; %bb.1:
12691 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
12692 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2
12693 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1
12694 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
12695 ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
12696 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
12697 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
12698 ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
12699 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
12700 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
12701 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
12702 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12703 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
12704 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
12705 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
12706 ; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
12707 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12708 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
12709 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12710 ; GFX1064-DPP-NEXT: s_add_u32 s8, s36, 44
12711 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s37, 0
12712 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
12713 ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12714 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12715 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
12716 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
12717 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12718 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
12719 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
12720 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
12721 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
12722 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
12723 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
12724 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
12725 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
12726 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
12727 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
12728 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
12729 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
12730 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
12731 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
12732 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
12733 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
12734 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
12735 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
12736 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
12737 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
12738 ; GFX1064-DPP-NEXT: s_clause 0x1
12739 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
12740 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
12741 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
12742 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12743 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12744 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
12745 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
12746 ; GFX1064-DPP-NEXT: .LBB16_3:
12747 ; GFX1064-DPP-NEXT: s_endpgm
12749 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12750 ; GFX1032-DPP: ; %bb.0:
12751 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12752 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12753 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
12754 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
12755 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
12756 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
12757 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
12758 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
12759 ; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
12760 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
12761 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], 0xc3300000, s[0:1]
12762 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
12763 ; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[3:4]
12764 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v3, exec_lo, 0
12765 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v3
12766 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
12767 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
12768 ; GFX1032-DPP-NEXT: ; %bb.1:
12769 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
12770 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v3, 20, v2
12771 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v4, 10, v1
12772 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
12773 ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
12774 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
12775 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
12776 ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v4, v3
12777 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
12778 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
12779 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[42:43], 0x0
12780 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
12781 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
12782 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
12783 ; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
12784 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12785 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
12786 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12787 ; GFX1032-DPP-NEXT: s_add_u32 s8, s36, 44
12788 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s37, 0
12789 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
12790 ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12791 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12792 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
12793 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
12794 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
12795 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
12796 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
12797 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
12798 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
12799 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
12800 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
12801 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
12802 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
12803 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
12804 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
12805 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
12806 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
12807 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
12808 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
12809 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
12810 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
12811 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
12812 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
12813 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
12814 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
12815 ; GFX1032-DPP-NEXT: s_clause 0x1
12816 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
12817 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
12818 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
12819 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
12820 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
12821 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
12822 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
12823 ; GFX1032-DPP-NEXT: .LBB16_3:
12824 ; GFX1032-DPP-NEXT: s_endpgm
12826 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12827 ; GFX1164-DPP: ; %bb.0:
12828 ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
12829 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
12830 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
12831 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
12832 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
12833 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
12834 ; GFX1164-DPP-NEXT: s_clause 0x1
12835 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
12836 ; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
12837 ; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
12838 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
12839 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
12840 ; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
12841 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12842 ; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
12843 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12844 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
12845 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
12846 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
12847 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3
12848 ; GFX1164-DPP-NEXT: ; %bb.1:
12849 ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
12850 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
12851 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
12852 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
12853 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
12854 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
12855 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
12856 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
12857 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
12858 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
12859 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
12860 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
12861 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
12862 ; GFX1164-DPP-NEXT: .p2align 6
12863 ; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
12864 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12865 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
12866 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
12867 ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12868 ; GFX1164-DPP-NEXT: s_add_u32 s8, s36, 44
12869 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s37, 0
12870 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
12871 ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12872 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12873 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
12874 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
12875 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
12876 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
12877 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
12878 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
12879 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
12880 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
12881 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
12882 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
12883 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
12884 ; GFX1164-DPP-NEXT: s_clause 0x1
12885 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
12886 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
12887 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
12888 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
12889 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
12890 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
12891 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
12892 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
12893 ; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
12894 ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
12895 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12896 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
12897 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
12898 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
12899 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
12900 ; GFX1164-DPP-NEXT: .LBB16_3:
12901 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
12902 ; GFX1164-DPP-NEXT: s_endpgm
12904 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
12905 ; GFX1132-DPP: ; %bb.0:
12906 ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
12907 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
12908 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
12909 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
12910 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
12911 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
12912 ; GFX1132-DPP-NEXT: s_clause 0x1
12913 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
12914 ; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
12915 ; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
12916 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
12917 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
12918 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
12919 ; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
12920 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
12921 ; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
12922 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
12923 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
12924 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3
12925 ; GFX1132-DPP-NEXT: ; %bb.1:
12926 ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
12927 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
12928 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
12929 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
12930 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
12931 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
12932 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
12933 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[42:43], 0x0
12934 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
12935 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
12936 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
12937 ; GFX1132-DPP-NEXT: .p2align 6
12938 ; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
12939 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
12940 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
12941 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
12942 ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
12943 ; GFX1132-DPP-NEXT: s_add_u32 s8, s36, 44
12944 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s37, 0
12945 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
12946 ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
12947 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
12948 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
12949 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
12950 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
12951 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
12952 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
12953 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
12954 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
12955 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
12956 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
12957 ; GFX1132-DPP-NEXT: s_clause 0x1
12958 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
12959 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
12960 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
12961 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
12962 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
12963 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
12964 ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
12965 ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
12966 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
12967 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
12968 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
12969 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
12970 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
12971 ; GFX1132-DPP-NEXT: .LBB16_3:
12972 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
12973 ; GFX1132-DPP-NEXT: s_endpgm
12974 %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
12978 define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp(ptr addrspace(1) %ptr) #2 {
12979 ; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
12980 ; GFX7LESS: ; %bb.0:
12981 ; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
12982 ; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
12983 ; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
12984 ; GFX7LESS-NEXT: s_mov_b32 s50, -1
12985 ; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
12986 ; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
12987 ; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
12988 ; GFX7LESS-NEXT: s_mov_b32 s33, s8
12989 ; GFX7LESS-NEXT: s_mov_b32 s40, s7
12990 ; GFX7LESS-NEXT: s_mov_b32 s41, s6
12991 ; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
12992 ; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
12993 ; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
12994 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
12995 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
12996 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
12997 ; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
12998 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
12999 ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13000 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13001 ; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13002 ; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
13003 ; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
13004 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
13005 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
13006 ; GFX7LESS-NEXT: s_mov_b32 s12, s41
13007 ; GFX7LESS-NEXT: s_mov_b32 s13, s40
13008 ; GFX7LESS-NEXT: s_mov_b32 s14, s33
13009 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
13010 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
13011 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
13012 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
13013 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
13014 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
13015 ; GFX7LESS-NEXT: v_mov_b32_e32 v41, 0
13016 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v42, 1
13017 ; GFX7LESS-NEXT: .LBB17_1: ; %ComputeLoop
13018 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
13019 ; GFX7LESS-NEXT: s_ff1_i32_b64 s4, s[0:1]
13020 ; GFX7LESS-NEXT: v_readlane_b32 s3, v1, s4
13021 ; GFX7LESS-NEXT: v_readlane_b32 s2, v0, s4
13022 ; GFX7LESS-NEXT: s_lshl_b64 s[4:5], 1, s4
13023 ; GFX7LESS-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5]
13024 ; GFX7LESS-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0
13025 ; GFX7LESS-NEXT: s_and_b64 vcc, exec, s[4:5]
13026 ; GFX7LESS-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
13027 ; GFX7LESS-NEXT: s_cbranch_vccnz .LBB17_1
13028 ; GFX7LESS-NEXT: ; %bb.2: ; %ComputeEnd
13029 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
13030 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
13031 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
13032 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
13033 ; GFX7LESS-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
13034 ; GFX7LESS-NEXT: s_cbranch_execz .LBB17_5
13035 ; GFX7LESS-NEXT: ; %bb.3:
13036 ; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x9
13037 ; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
13038 ; GFX7LESS-NEXT: s_mov_b32 s46, -1
13039 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
13040 ; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
13041 ; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
13042 ; GFX7LESS-NEXT: .LBB17_4: ; %atomicrmw.start
13043 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
13044 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
13045 ; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
13046 ; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
13047 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
13048 ; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
13049 ; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
13050 ; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
13051 ; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
13052 ; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
13053 ; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13054 ; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13055 ; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13056 ; GFX7LESS-NEXT: s_waitcnt expcnt(2)
13057 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
13058 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
13059 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
13060 ; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
13061 ; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
13062 ; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
13063 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
13064 ; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
13065 ; GFX7LESS-NEXT: s_mov_b32 s12, s41
13066 ; GFX7LESS-NEXT: s_mov_b32 s13, s40
13067 ; GFX7LESS-NEXT: s_mov_b32 s14, s33
13068 ; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
13069 ; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
13070 ; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
13071 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
13072 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
13073 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
13074 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
13075 ; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
13076 ; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
13077 ; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
13078 ; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
13079 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
13080 ; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
13081 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
13082 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_4
13083 ; GFX7LESS-NEXT: .LBB17_5:
13084 ; GFX7LESS-NEXT: s_endpgm
13086 ; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13088 ; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13089 ; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13090 ; GFX9-NEXT: s_mov_b32 s50, -1
13091 ; GFX9-NEXT: s_mov_b32 s51, 0xe00000
13092 ; GFX9-NEXT: s_add_u32 s48, s48, s9
13093 ; GFX9-NEXT: s_addc_u32 s49, s49, 0
13094 ; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
13095 ; GFX9-NEXT: s_mov_b32 s33, s8
13096 ; GFX9-NEXT: s_add_u32 s8, s36, 44
13097 ; GFX9-NEXT: s_addc_u32 s9, s37, 0
13098 ; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
13099 ; GFX9-NEXT: s_getpc_b64 s[0:1]
13100 ; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13101 ; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13102 ; GFX9-NEXT: s_mov_b32 s40, s7
13103 ; GFX9-NEXT: s_mov_b32 s41, s6
13104 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13105 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13106 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13107 ; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
13108 ; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
13109 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
13110 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
13111 ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
13112 ; GFX9-NEXT: s_mov_b32 s12, s41
13113 ; GFX9-NEXT: s_mov_b32 s13, s40
13114 ; GFX9-NEXT: s_mov_b32 s14, s33
13115 ; GFX9-NEXT: v_mov_b32_e32 v31, v40
13116 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
13117 ; GFX9-NEXT: s_movk_i32 s32, 0x800
13118 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13119 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
13120 ; GFX9-NEXT: v_mov_b32_e32 v41, 0
13121 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
13122 ; GFX9-NEXT: v_bfrev_b32_e32 v42, 1
13123 ; GFX9-NEXT: .LBB17_1: ; %ComputeLoop
13124 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
13125 ; GFX9-NEXT: s_ff1_i32_b64 s4, s[0:1]
13126 ; GFX9-NEXT: v_readlane_b32 s3, v1, s4
13127 ; GFX9-NEXT: v_readlane_b32 s2, v0, s4
13128 ; GFX9-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
13129 ; GFX9-NEXT: s_lshl_b64 s[2:3], 1, s4
13130 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
13131 ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0
13132 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1
13133 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd
13134 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13135 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13136 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
13137 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
13138 ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
13139 ; GFX9-NEXT: s_cbranch_execz .LBB17_5
13140 ; GFX9-NEXT: ; %bb.3:
13141 ; GFX9-NEXT: s_load_dwordx2 s[42:43], s[36:37], 0x24
13142 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
13143 ; GFX9-NEXT: s_mov_b64 s[44:45], 0
13144 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13145 ; GFX9-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
13146 ; GFX9-NEXT: .LBB17_4: ; %atomicrmw.start
13147 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
13148 ; GFX9-NEXT: s_waitcnt vmcnt(0)
13149 ; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
13150 ; GFX9-NEXT: s_add_u32 s8, s36, 44
13151 ; GFX9-NEXT: s_addc_u32 s9, s37, 0
13152 ; GFX9-NEXT: s_getpc_b64 s[0:1]
13153 ; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13154 ; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13155 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13156 ; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
13157 ; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
13158 ; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
13159 ; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
13160 ; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
13161 ; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
13162 ; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
13163 ; GFX9-NEXT: s_mov_b32 s12, s41
13164 ; GFX9-NEXT: s_mov_b32 s13, s40
13165 ; GFX9-NEXT: s_mov_b32 s14, s33
13166 ; GFX9-NEXT: v_mov_b32_e32 v31, v40
13167 ; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
13168 ; GFX9-NEXT: v_mov_b32_e32 v0, 8
13169 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
13170 ; GFX9-NEXT: v_mov_b32_e32 v2, s42
13171 ; GFX9-NEXT: v_mov_b32_e32 v3, s43
13172 ; GFX9-NEXT: v_mov_b32_e32 v4, 0
13173 ; GFX9-NEXT: v_mov_b32_e32 v5, 8
13174 ; GFX9-NEXT: v_mov_b32_e32 v6, 0
13175 ; GFX9-NEXT: v_mov_b32_e32 v7, 0
13176 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
13177 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
13178 ; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
13179 ; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
13180 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
13181 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
13182 ; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
13183 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
13184 ; GFX9-NEXT: s_cbranch_execnz .LBB17_4
13185 ; GFX9-NEXT: .LBB17_5:
13186 ; GFX9-NEXT: s_endpgm
13188 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13189 ; GFX1064: ; %bb.0:
13190 ; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13191 ; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13192 ; GFX1064-NEXT: s_mov_b32 s50, -1
13193 ; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
13194 ; GFX1064-NEXT: s_add_u32 s48, s48, s9
13195 ; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
13196 ; GFX1064-NEXT: s_addc_u32 s49, s49, 0
13197 ; GFX1064-NEXT: s_mov_b32 s33, s8
13198 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
13199 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
13200 ; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
13201 ; GFX1064-NEXT: s_getpc_b64 s[0:1]
13202 ; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13203 ; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13204 ; GFX1064-NEXT: s_mov_b32 s40, s7
13205 ; GFX1064-NEXT: s_mov_b32 s41, s6
13206 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13207 ; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13208 ; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13209 ; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
13210 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
13211 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
13212 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
13213 ; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
13214 ; GFX1064-NEXT: s_mov_b32 s12, s41
13215 ; GFX1064-NEXT: s_mov_b32 s13, s40
13216 ; GFX1064-NEXT: s_mov_b32 s14, s33
13217 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
13218 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40
13219 ; GFX1064-NEXT: s_movk_i32 s32, 0x800
13220 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
13221 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
13222 ; GFX1064-NEXT: v_mov_b32_e32 v41, 0
13223 ; GFX1064-NEXT: v_bfrev_b32_e32 v42, 1
13224 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec
13225 ; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop
13226 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
13227 ; GFX1064-NEXT: s_ff1_i32_b64 s4, s[0:1]
13228 ; GFX1064-NEXT: v_readlane_b32 s3, v1, s4
13229 ; GFX1064-NEXT: v_readlane_b32 s2, v0, s4
13230 ; GFX1064-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
13231 ; GFX1064-NEXT: s_lshl_b64 s[2:3], 1, s4
13232 ; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3]
13233 ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0
13234 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1
13235 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd
13236 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13237 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13238 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
13239 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
13240 ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
13241 ; GFX1064-NEXT: s_cbranch_execz .LBB17_5
13242 ; GFX1064-NEXT: ; %bb.3:
13243 ; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
13244 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0
13245 ; GFX1064-NEXT: s_mov_b64 s[44:45], 0
13246 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
13247 ; GFX1064-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
13248 ; GFX1064-NEXT: .LBB17_4: ; %atomicrmw.start
13249 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
13250 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
13251 ; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
13252 ; GFX1064-NEXT: s_add_u32 s8, s34, 44
13253 ; GFX1064-NEXT: s_addc_u32 s9, s35, 0
13254 ; GFX1064-NEXT: s_getpc_b64 s[0:1]
13255 ; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13256 ; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13257 ; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
13258 ; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
13259 ; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13260 ; GFX1064-NEXT: v_mov_b32_e32 v31, v40
13261 ; GFX1064-NEXT: v_mov_b32_e32 v0, 8
13262 ; GFX1064-NEXT: v_mov_b32_e32 v1, 0
13263 ; GFX1064-NEXT: v_mov_b32_e32 v2, s42
13264 ; GFX1064-NEXT: v_mov_b32_e32 v5, 8
13265 ; GFX1064-NEXT: v_mov_b32_e32 v6, 0
13266 ; GFX1064-NEXT: v_mov_b32_e32 v7, 0
13267 ; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
13268 ; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
13269 ; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
13270 ; GFX1064-NEXT: s_mov_b32 s12, s41
13271 ; GFX1064-NEXT: s_mov_b32 s13, s40
13272 ; GFX1064-NEXT: s_mov_b32 s14, s33
13273 ; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
13274 ; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
13275 ; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
13276 ; GFX1064-NEXT: v_mov_b32_e32 v3, s43
13277 ; GFX1064-NEXT: v_mov_b32_e32 v4, 0
13278 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
13279 ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
13280 ; GFX1064-NEXT: s_clause 0x1
13281 ; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
13282 ; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
13283 ; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
13284 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
13285 ; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
13286 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
13287 ; GFX1064-NEXT: s_cbranch_execnz .LBB17_4
13288 ; GFX1064-NEXT: .LBB17_5:
13289 ; GFX1064-NEXT: s_endpgm
13291 ; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13292 ; GFX1032: ; %bb.0:
13293 ; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13294 ; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13295 ; GFX1032-NEXT: s_mov_b32 s50, -1
13296 ; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
13297 ; GFX1032-NEXT: s_add_u32 s48, s48, s9
13298 ; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
13299 ; GFX1032-NEXT: s_addc_u32 s49, s49, 0
13300 ; GFX1032-NEXT: s_mov_b32 s33, s8
13301 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
13302 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
13303 ; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
13304 ; GFX1032-NEXT: s_getpc_b64 s[0:1]
13305 ; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13306 ; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13307 ; GFX1032-NEXT: s_mov_b32 s40, s7
13308 ; GFX1032-NEXT: s_mov_b32 s41, s6
13309 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13310 ; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13311 ; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13312 ; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
13313 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
13314 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
13315 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
13316 ; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
13317 ; GFX1032-NEXT: s_mov_b32 s12, s41
13318 ; GFX1032-NEXT: s_mov_b32 s13, s40
13319 ; GFX1032-NEXT: s_mov_b32 s14, s33
13320 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
13321 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40
13322 ; GFX1032-NEXT: s_movk_i32 s32, 0x400
13323 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
13324 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
13325 ; GFX1032-NEXT: v_mov_b32_e32 v41, 0
13326 ; GFX1032-NEXT: v_bfrev_b32_e32 v42, 1
13327 ; GFX1032-NEXT: s_mov_b32 s0, exec_lo
13328 ; GFX1032-NEXT: .LBB17_1: ; %ComputeLoop
13329 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
13330 ; GFX1032-NEXT: s_ff1_i32_b32 s1, s0
13331 ; GFX1032-NEXT: v_readlane_b32 s3, v1, s1
13332 ; GFX1032-NEXT: v_readlane_b32 s2, v0, s1
13333 ; GFX1032-NEXT: s_lshl_b32 s1, 1, s1
13334 ; GFX1032-NEXT: s_andn2_b32 s0, s0, s1
13335 ; GFX1032-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
13336 ; GFX1032-NEXT: s_cmp_lg_u32 s0, 0
13337 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1
13338 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd
13339 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13340 ; GFX1032-NEXT: s_mov_b32 s44, 0
13341 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
13342 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
13343 ; GFX1032-NEXT: s_xor_b32 s0, exec_lo, s0
13344 ; GFX1032-NEXT: s_cbranch_execz .LBB17_5
13345 ; GFX1032-NEXT: ; %bb.3:
13346 ; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
13347 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0
13348 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
13349 ; GFX1032-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
13350 ; GFX1032-NEXT: .LBB17_4: ; %atomicrmw.start
13351 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
13352 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
13353 ; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
13354 ; GFX1032-NEXT: s_add_u32 s8, s34, 44
13355 ; GFX1032-NEXT: s_addc_u32 s9, s35, 0
13356 ; GFX1032-NEXT: s_getpc_b64 s[0:1]
13357 ; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13358 ; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13359 ; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
13360 ; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
13361 ; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13362 ; GFX1032-NEXT: v_mov_b32_e32 v31, v40
13363 ; GFX1032-NEXT: v_mov_b32_e32 v0, 8
13364 ; GFX1032-NEXT: v_mov_b32_e32 v1, 0
13365 ; GFX1032-NEXT: v_mov_b32_e32 v2, s42
13366 ; GFX1032-NEXT: v_mov_b32_e32 v5, 8
13367 ; GFX1032-NEXT: v_mov_b32_e32 v6, 0
13368 ; GFX1032-NEXT: v_mov_b32_e32 v7, 0
13369 ; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
13370 ; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
13371 ; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
13372 ; GFX1032-NEXT: s_mov_b32 s12, s41
13373 ; GFX1032-NEXT: s_mov_b32 s13, s40
13374 ; GFX1032-NEXT: s_mov_b32 s14, s33
13375 ; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
13376 ; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
13377 ; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
13378 ; GFX1032-NEXT: v_mov_b32_e32 v3, s43
13379 ; GFX1032-NEXT: v_mov_b32_e32 v4, 0
13380 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
13381 ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
13382 ; GFX1032-NEXT: s_clause 0x1
13383 ; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
13384 ; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
13385 ; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
13386 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
13387 ; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
13388 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
13389 ; GFX1032-NEXT: s_cbranch_execnz .LBB17_4
13390 ; GFX1032-NEXT: .LBB17_5:
13391 ; GFX1032-NEXT: s_endpgm
13393 ; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13394 ; GFX1164: ; %bb.0:
13395 ; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
13396 ; GFX1164-NEXT: s_mov_b32 s33, s8
13397 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
13398 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
13399 ; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
13400 ; GFX1164-NEXT: s_getpc_b64 s[0:1]
13401 ; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13402 ; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13403 ; GFX1164-NEXT: v_mov_b32_e32 v31, v0
13404 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
13405 ; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
13406 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
13407 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
13408 ; GFX1164-NEXT: s_mov_b32 s12, s6
13409 ; GFX1164-NEXT: s_mov_b32 s13, s7
13410 ; GFX1164-NEXT: s_mov_b32 s14, s33
13411 ; GFX1164-NEXT: s_mov_b32 s32, 32
13412 ; GFX1164-NEXT: v_mov_b32_e32 v40, v0
13413 ; GFX1164-NEXT: s_mov_b32 s40, s7
13414 ; GFX1164-NEXT: s_mov_b32 s41, s6
13415 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
13416 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
13417 ; GFX1164-NEXT: v_mov_b32_e32 v41, 0
13418 ; GFX1164-NEXT: v_bfrev_b32_e32 v42, 1
13419 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
13420 ; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop
13421 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
13422 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
13423 ; GFX1164-NEXT: s_ctz_i32_b64 s4, s[0:1]
13424 ; GFX1164-NEXT: v_readlane_b32 s3, v1, s4
13425 ; GFX1164-NEXT: v_readlane_b32 s2, v0, s4
13426 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
13427 ; GFX1164-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
13428 ; GFX1164-NEXT: s_lshl_b64 s[2:3], 1, s4
13429 ; GFX1164-NEXT: s_and_not1_b64 s[0:1], s[0:1], s[2:3]
13430 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
13431 ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0
13432 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1
13433 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd
13434 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13435 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
13436 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13437 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13438 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
13439 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
13440 ; GFX1164-NEXT: s_cbranch_execz .LBB17_5
13441 ; GFX1164-NEXT: ; %bb.3:
13442 ; GFX1164-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
13443 ; GFX1164-NEXT: v_mov_b32_e32 v0, 0
13444 ; GFX1164-NEXT: s_mov_b64 s[44:45], 0
13445 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
13446 ; GFX1164-NEXT: global_load_b64 v[1:2], v0, s[42:43]
13447 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
13448 ; GFX1164-NEXT: .p2align 6
13449 ; GFX1164-NEXT: .LBB17_4: ; %atomicrmw.start
13450 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
13451 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
13452 ; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
13453 ; GFX1164-NEXT: s_add_u32 s8, s34, 44
13454 ; GFX1164-NEXT: s_addc_u32 s9, s35, 0
13455 ; GFX1164-NEXT: s_getpc_b64 s[0:1]
13456 ; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13457 ; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13458 ; GFX1164-NEXT: v_mov_b32_e32 v31, v40
13459 ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
13460 ; GFX1164-NEXT: v_mov_b32_e32 v0, 8
13461 ; GFX1164-NEXT: v_mov_b32_e32 v5, 8
13462 ; GFX1164-NEXT: v_mov_b32_e32 v6, 0
13463 ; GFX1164-NEXT: v_mov_b32_e32 v7, 0
13464 ; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
13465 ; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
13466 ; GFX1164-NEXT: s_mov_b32 s12, s41
13467 ; GFX1164-NEXT: s_mov_b32 s13, s40
13468 ; GFX1164-NEXT: s_mov_b32 s14, s33
13469 ; GFX1164-NEXT: s_clause 0x1
13470 ; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
13471 ; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
13472 ; GFX1164-NEXT: v_mov_b32_e32 v1, 0
13473 ; GFX1164-NEXT: v_mov_b32_e32 v2, s42
13474 ; GFX1164-NEXT: v_mov_b32_e32 v3, s43
13475 ; GFX1164-NEXT: v_mov_b32_e32 v4, 0
13476 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
13477 ; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
13478 ; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
13479 ; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
13480 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
13481 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
13482 ; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
13483 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
13484 ; GFX1164-NEXT: s_cbranch_execnz .LBB17_4
13485 ; GFX1164-NEXT: .LBB17_5:
13486 ; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
13487 ; GFX1164-NEXT: s_endpgm
13489 ; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13490 ; GFX1132: ; %bb.0:
13491 ; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
13492 ; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
13493 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
13494 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
13495 ; GFX1132-NEXT: s_getpc_b64 s[0:1]
13496 ; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13497 ; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13498 ; GFX1132-NEXT: v_mov_b32_e32 v31, v0
13499 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
13500 ; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
13501 ; GFX1132-NEXT: s_mov_b32 s40, s14
13502 ; GFX1132-NEXT: s_mov_b32 s41, s13
13503 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
13504 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
13505 ; GFX1132-NEXT: s_mov_b32 s12, s13
13506 ; GFX1132-NEXT: s_mov_b32 s13, s14
13507 ; GFX1132-NEXT: s_mov_b32 s14, s15
13508 ; GFX1132-NEXT: s_mov_b32 s32, 32
13509 ; GFX1132-NEXT: s_mov_b32 s33, s15
13510 ; GFX1132-NEXT: v_mov_b32_e32 v40, v0
13511 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
13512 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
13513 ; GFX1132-NEXT: v_mov_b32_e32 v41, 0
13514 ; GFX1132-NEXT: v_bfrev_b32_e32 v42, 1
13515 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
13516 ; GFX1132-NEXT: .LBB17_1: ; %ComputeLoop
13517 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
13518 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
13519 ; GFX1132-NEXT: s_ctz_i32_b32 s1, s0
13520 ; GFX1132-NEXT: v_readlane_b32 s3, v1, s1
13521 ; GFX1132-NEXT: v_readlane_b32 s2, v0, s1
13522 ; GFX1132-NEXT: s_lshl_b32 s1, 1, s1
13523 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
13524 ; GFX1132-NEXT: s_and_not1_b32 s0, s0, s1
13525 ; GFX1132-NEXT: v_add_f64 v[41:42], v[41:42], s[2:3]
13526 ; GFX1132-NEXT: s_cmp_lg_u32 s0, 0
13527 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1
13528 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd
13529 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13530 ; GFX1132-NEXT: s_mov_b32 s44, 0
13531 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
13532 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
13533 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
13534 ; GFX1132-NEXT: s_xor_b32 s0, exec_lo, s0
13535 ; GFX1132-NEXT: s_cbranch_execz .LBB17_5
13536 ; GFX1132-NEXT: ; %bb.3:
13537 ; GFX1132-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
13538 ; GFX1132-NEXT: v_mov_b32_e32 v0, 0
13539 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
13540 ; GFX1132-NEXT: global_load_b64 v[1:2], v0, s[42:43]
13541 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
13542 ; GFX1132-NEXT: .p2align 6
13543 ; GFX1132-NEXT: .LBB17_4: ; %atomicrmw.start
13544 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
13545 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
13546 ; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
13547 ; GFX1132-NEXT: s_add_u32 s8, s34, 44
13548 ; GFX1132-NEXT: s_addc_u32 s9, s35, 0
13549 ; GFX1132-NEXT: s_getpc_b64 s[0:1]
13550 ; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13551 ; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13552 ; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
13553 ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
13554 ; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
13555 ; GFX1132-NEXT: v_mov_b32_e32 v7, 0
13556 ; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
13557 ; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
13558 ; GFX1132-NEXT: s_mov_b32 s12, s41
13559 ; GFX1132-NEXT: s_mov_b32 s13, s40
13560 ; GFX1132-NEXT: s_mov_b32 s14, s33
13561 ; GFX1132-NEXT: s_clause 0x1
13562 ; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
13563 ; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
13564 ; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
13565 ; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
13566 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
13567 ; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
13568 ; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
13569 ; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
13570 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
13571 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
13572 ; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
13573 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
13574 ; GFX1132-NEXT: s_cbranch_execnz .LBB17_4
13575 ; GFX1132-NEXT: .LBB17_5:
13576 ; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
13577 ; GFX1132-NEXT: s_endpgm
13579 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13580 ; GFX7LESS-DPP: ; %bb.0:
13581 ; GFX7LESS-DPP-NEXT: s_movk_i32 s32, 0x800
13582 ; GFX7LESS-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13583 ; GFX7LESS-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13584 ; GFX7LESS-DPP-NEXT: s_mov_b32 s50, -1
13585 ; GFX7LESS-DPP-NEXT: s_mov_b32 s51, 0xe8f000
13586 ; GFX7LESS-DPP-NEXT: s_add_u32 s48, s48, s9
13587 ; GFX7LESS-DPP-NEXT: s_addc_u32 s49, s49, 0
13588 ; GFX7LESS-DPP-NEXT: s_mov_b32 s33, s8
13589 ; GFX7LESS-DPP-NEXT: s_mov_b32 s40, s7
13590 ; GFX7LESS-DPP-NEXT: s_mov_b32 s41, s6
13591 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
13592 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
13593 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
13594 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
13595 ; GFX7LESS-DPP-NEXT: s_mov_b32 s47, 0xf000
13596 ; GFX7LESS-DPP-NEXT: s_mov_b32 s46, -1
13597 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
13598 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
13599 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
13600 ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13601 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13602 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13603 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13604 ; GFX7LESS-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13605 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v0, v0, v1
13606 ; GFX7LESS-DPP-NEXT: v_or_b32_e32 v42, v0, v2
13607 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13608 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
13609 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41
13610 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40
13611 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
13612 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
13613 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13614 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13615 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
13616 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13617 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v40, v0
13618 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v41, v1
13619 ; GFX7LESS-DPP-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
13620 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[42:43], 0
13621 ; GFX7LESS-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
13622 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
13623 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
13624 ; GFX7LESS-DPP-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41]
13625 ; GFX7LESS-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
13626 ; GFX7LESS-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0
13627 ; GFX7LESS-DPP-NEXT: s_add_u32 s8, s36, 44
13628 ; GFX7LESS-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
13629 ; GFX7LESS-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
13630 ; GFX7LESS-DPP-NEXT: s_addc_u32 s9, s37, 0
13631 ; GFX7LESS-DPP-NEXT: s_getpc_b64 s[0:1]
13632 ; GFX7LESS-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13633 ; GFX7LESS-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13634 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13635 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(2)
13636 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v0, 8
13637 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, 0
13638 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, 0
13639 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v5, 8
13640 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v6, 0
13641 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v7, 0
13642 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13643 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
13644 ; GFX7LESS-DPP-NEXT: s_mov_b32 s12, s41
13645 ; GFX7LESS-DPP-NEXT: s_mov_b32 s13, s40
13646 ; GFX7LESS-DPP-NEXT: s_mov_b32 s14, s33
13647 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v31, v42
13648 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13649 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13650 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
13651 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v2, s44
13652 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, s45
13653 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
13654 ; GFX7LESS-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13655 ; GFX7LESS-DPP-NEXT: v_and_b32_e32 v2, 1, v0
13656 ; GFX7LESS-DPP-NEXT: buffer_load_dword v0, off, s[48:51], 0
13657 ; GFX7LESS-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
13658 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
13659 ; GFX7LESS-DPP-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
13660 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[42:43]
13661 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB17_1
13662 ; GFX7LESS-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
13663 ; GFX7LESS-DPP-NEXT: s_endpgm
13665 ; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13666 ; GFX9-DPP: ; %bb.0:
13667 ; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13668 ; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13669 ; GFX9-DPP-NEXT: s_mov_b32 s50, -1
13670 ; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
13671 ; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
13672 ; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
13673 ; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
13674 ; GFX9-DPP-NEXT: s_mov_b32 s33, s8
13675 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
13676 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
13677 ; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
13678 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
13679 ; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13680 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13681 ; GFX9-DPP-NEXT: s_mov_b32 s40, s7
13682 ; GFX9-DPP-NEXT: s_mov_b32 s41, s6
13683 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13684 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13685 ; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13686 ; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
13687 ; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
13688 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13689 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13690 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
13691 ; GFX9-DPP-NEXT: s_mov_b32 s12, s41
13692 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40
13693 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33
13694 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
13695 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13696 ; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
13697 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
13698 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13699 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
13700 ; GFX9-DPP-NEXT: v_mov_b32_e32 v8, 0
13701 ; GFX9-DPP-NEXT: v_bfrev_b32_e32 v9, 1
13702 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
13703 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v0
13704 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v1
13705 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
13706 ; GFX9-DPP-NEXT: v_mov_b32_e32 v10, v8
13707 ; GFX9-DPP-NEXT: v_mov_b32_e32 v11, v9
13708 ; GFX9-DPP-NEXT: s_not_b64 exec, exec
13709 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
13710 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
13711 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
13712 ; GFX9-DPP-NEXT: s_nop 0
13713 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:1 row_mask:0xf bank_mask:0xf
13714 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:1 row_mask:0xf bank_mask:0xf
13715 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13716 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
13717 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
13718 ; GFX9-DPP-NEXT: s_nop 0
13719 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:2 row_mask:0xf bank_mask:0xf
13720 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:2 row_mask:0xf bank_mask:0xf
13721 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13722 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
13723 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
13724 ; GFX9-DPP-NEXT: s_nop 0
13725 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:4 row_mask:0xf bank_mask:0xf
13726 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:4 row_mask:0xf bank_mask:0xf
13727 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13728 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
13729 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
13730 ; GFX9-DPP-NEXT: s_nop 0
13731 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_shr:8 row_mask:0xf bank_mask:0xf
13732 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_shr:8 row_mask:0xf bank_mask:0xf
13733 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13734 ; GFX9-DPP-NEXT: v_mov_b32_e32 v12, v8
13735 ; GFX9-DPP-NEXT: v_mov_b32_e32 v13, v9
13736 ; GFX9-DPP-NEXT: s_nop 0
13737 ; GFX9-DPP-NEXT: v_mov_b32_dpp v12, v10 row_bcast:15 row_mask:0xa bank_mask:0xf
13738 ; GFX9-DPP-NEXT: v_mov_b32_dpp v13, v11 row_bcast:15 row_mask:0xa bank_mask:0xf
13739 ; GFX9-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13740 ; GFX9-DPP-NEXT: s_nop 1
13741 ; GFX9-DPP-NEXT: v_mov_b32_dpp v8, v10 row_bcast:31 row_mask:0xc bank_mask:0xf
13742 ; GFX9-DPP-NEXT: v_mov_b32_dpp v9, v11 row_bcast:31 row_mask:0xc bank_mask:0xf
13743 ; GFX9-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
13744 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
13745 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13746 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13747 ; GFX9-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
13748 ; GFX9-DPP-NEXT: v_readlane_b32 s43, v9, 63
13749 ; GFX9-DPP-NEXT: v_readlane_b32 s42, v8, 63
13750 ; GFX9-DPP-NEXT: s_mov_b64 exec, s[0:1]
13751 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
13752 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
13753 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB17_3
13754 ; GFX9-DPP-NEXT: ; %bb.1:
13755 ; GFX9-DPP-NEXT: s_load_dwordx2 s[44:45], s[36:37], 0x24
13756 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
13757 ; GFX9-DPP-NEXT: s_mov_b64 s[46:47], 0
13758 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
13759 ; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[44:45]
13760 ; GFX9-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
13761 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
13762 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
13763 ; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], s[42:43]
13764 ; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
13765 ; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
13766 ; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
13767 ; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13768 ; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13769 ; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13770 ; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13771 ; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
13772 ; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
13773 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13774 ; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
13775 ; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
13776 ; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
13777 ; GFX9-DPP-NEXT: s_mov_b32 s12, s41
13778 ; GFX9-DPP-NEXT: s_mov_b32 s13, s40
13779 ; GFX9-DPP-NEXT: s_mov_b32 s14, s33
13780 ; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
13781 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13782 ; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
13783 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
13784 ; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s44
13785 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s45
13786 ; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
13787 ; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
13788 ; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
13789 ; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
13790 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
13791 ; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13792 ; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
13793 ; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
13794 ; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
13795 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
13796 ; GFX9-DPP-NEXT: s_or_b64 s[46:47], vcc, s[46:47]
13797 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[46:47]
13798 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_2
13799 ; GFX9-DPP-NEXT: .LBB17_3:
13800 ; GFX9-DPP-NEXT: s_endpgm
13802 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13803 ; GFX1064-DPP: ; %bb.0:
13804 ; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13805 ; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13806 ; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
13807 ; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
13808 ; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
13809 ; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
13810 ; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
13811 ; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
13812 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
13813 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
13814 ; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
13815 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
13816 ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13817 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13818 ; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
13819 ; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
13820 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13821 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13822 ; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13823 ; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
13824 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13825 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13826 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
13827 ; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
13828 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
13829 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
13830 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
13831 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13832 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
13833 ; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
13834 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
13835 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13836 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
13837 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v8, 0
13838 ; GFX1064-DPP-NEXT: v_bfrev_b32_e32 v9, 1
13839 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
13840 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v0
13841 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v1
13842 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
13843 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
13844 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
13845 ; GFX1064-DPP-NEXT: s_not_b64 exec, exec
13846 ; GFX1064-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
13847 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
13848 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
13849 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
13850 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
13851 ; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13852 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
13853 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
13854 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
13855 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
13856 ; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13857 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v12, v8
13858 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v13, v9
13859 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
13860 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
13861 ; GFX1064-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13862 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
13863 ; GFX1064-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
13864 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
13865 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v11, v9
13866 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v10, v8
13867 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
13868 ; GFX1064-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
13869 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
13870 ; GFX1064-DPP-NEXT: v_readlane_b32 s3, v9, 0
13871 ; GFX1064-DPP-NEXT: v_readlane_b32 s2, v8, 0
13872 ; GFX1064-DPP-NEXT: v_readlane_b32 s5, v9, 32
13873 ; GFX1064-DPP-NEXT: v_readlane_b32 s4, v8, 32
13874 ; GFX1064-DPP-NEXT: v_add_f64 v[8:9], s[2:3], s[4:5]
13875 ; GFX1064-DPP-NEXT: s_mov_b64 exec, s[0:1]
13876 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
13877 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v8
13878 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v42, v9
13879 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
13880 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
13881 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
13882 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB17_3
13883 ; GFX1064-DPP-NEXT: ; %bb.1:
13884 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
13885 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 0
13886 ; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
13887 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
13888 ; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
13889 ; GFX1064-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
13890 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
13891 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
13892 ; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
13893 ; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
13894 ; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
13895 ; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
13896 ; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
13897 ; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
13898 ; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
13899 ; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
13900 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13901 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
13902 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
13903 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
13904 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
13905 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
13906 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
13907 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
13908 ; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13909 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13910 ; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
13911 ; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
13912 ; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
13913 ; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
13914 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13915 ; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
13916 ; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
13917 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
13918 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
13919 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
13920 ; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13921 ; GFX1064-DPP-NEXT: s_clause 0x1
13922 ; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
13923 ; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
13924 ; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
13925 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
13926 ; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
13927 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
13928 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_2
13929 ; GFX1064-DPP-NEXT: .LBB17_3:
13930 ; GFX1064-DPP-NEXT: s_endpgm
13932 ; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
13933 ; GFX1032-DPP: ; %bb.0:
13934 ; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
13935 ; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
13936 ; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
13937 ; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
13938 ; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
13939 ; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
13940 ; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
13941 ; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
13942 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
13943 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
13944 ; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
13945 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
13946 ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
13947 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
13948 ; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
13949 ; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
13950 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
13951 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
13952 ; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
13953 ; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
13954 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
13955 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
13956 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
13957 ; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
13958 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
13959 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
13960 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
13961 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
13962 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
13963 ; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
13964 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
13965 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
13966 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
13967 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v8, 0
13968 ; GFX1032-DPP-NEXT: v_bfrev_b32_e32 v9, 1
13969 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
13970 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v0
13971 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v1
13972 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
13973 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
13974 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
13975 ; GFX1032-DPP-NEXT: s_not_b32 exec_lo, exec_lo
13976 ; GFX1032-DPP-NEXT: s_or_saveexec_b32 s0, -1
13977 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
13978 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
13979 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
13980 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
13981 ; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13982 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
13983 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
13984 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
13985 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
13986 ; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13987 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v12, v8
13988 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v13, v9
13989 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
13990 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
13991 ; GFX1032-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
13992 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
13993 ; GFX1032-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
13994 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
13995 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v11, v9
13996 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v10, v8
13997 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
13998 ; GFX1032-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
13999 ; GFX1032-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
14000 ; GFX1032-DPP-NEXT: s_mov_b32 exec_lo, s0
14001 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14002 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v8
14003 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v42, v9
14004 ; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
14005 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
14006 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
14007 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB17_3
14008 ; GFX1032-DPP-NEXT: ; %bb.1:
14009 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[34:35], 0x24
14010 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 0
14011 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
14012 ; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v0, s[42:43]
14013 ; GFX1032-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
14014 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14015 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
14016 ; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
14017 ; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
14018 ; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
14019 ; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
14020 ; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
14021 ; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
14022 ; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
14023 ; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
14024 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
14025 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
14026 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
14027 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
14028 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
14029 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
14030 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
14031 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
14032 ; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
14033 ; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
14034 ; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
14035 ; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
14036 ; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
14037 ; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
14038 ; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
14039 ; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
14040 ; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
14041 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
14042 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
14043 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
14044 ; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
14045 ; GFX1032-DPP-NEXT: s_clause 0x1
14046 ; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
14047 ; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
14048 ; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
14049 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
14050 ; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
14051 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
14052 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_2
14053 ; GFX1032-DPP-NEXT: .LBB17_3:
14054 ; GFX1032-DPP-NEXT: s_endpgm
14056 ; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
14057 ; GFX1164-DPP: ; %bb.0:
14058 ; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
14059 ; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
14060 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
14061 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
14062 ; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
14063 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
14064 ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
14065 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
14066 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
14067 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
14068 ; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
14069 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
14070 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
14071 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
14072 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
14073 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
14074 ; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
14075 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
14076 ; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
14077 ; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
14078 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
14079 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
14080 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
14081 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v8, 0
14082 ; GFX1164-DPP-NEXT: v_bfrev_b32_e32 v9, 1
14083 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
14084 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v0
14085 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v1
14086 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
14087 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
14088 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
14089 ; GFX1164-DPP-NEXT: s_not_b64 exec, exec
14090 ; GFX1164-DPP-NEXT: s_or_saveexec_b64 s[0:1], -1
14091 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
14092 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
14093 ; GFX1164-DPP-NEXT: s_waitcnt_depctr 0xfff
14094 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
14095 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
14096 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
14097 ; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
14098 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
14099 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
14100 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
14101 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14102 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
14103 ; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
14104 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v12, v8
14105 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v13, v9
14106 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14107 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
14108 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
14109 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14110 ; GFX1164-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
14111 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
14112 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14113 ; GFX1164-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
14114 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
14115 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
14116 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v11, v9
14117 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v10, v8
14118 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
14119 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
14120 ; GFX1164-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
14121 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14122 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
14123 ; GFX1164-DPP-NEXT: v_permlane64_b32 v11, v9
14124 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14125 ; GFX1164-DPP-NEXT: v_permlane64_b32 v10, v8
14126 ; GFX1164-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
14127 ; GFX1164-DPP-NEXT: s_mov_b64 exec, s[0:1]
14128 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
14129 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14130 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v8
14131 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
14132 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v9
14133 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
14134 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
14135 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
14136 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
14137 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB17_3
14138 ; GFX1164-DPP-NEXT: ; %bb.1:
14139 ; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
14140 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0
14141 ; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
14142 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
14143 ; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43]
14144 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
14145 ; GFX1164-DPP-NEXT: .p2align 6
14146 ; GFX1164-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
14147 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14148 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
14149 ; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
14150 ; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
14151 ; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
14152 ; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
14153 ; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
14154 ; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
14155 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
14156 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
14157 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
14158 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
14159 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
14160 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
14161 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
14162 ; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
14163 ; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
14164 ; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
14165 ; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
14166 ; GFX1164-DPP-NEXT: s_clause 0x1
14167 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
14168 ; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
14169 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
14170 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
14171 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
14172 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
14173 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
14174 ; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
14175 ; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
14176 ; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
14177 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
14178 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
14179 ; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
14180 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
14181 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_2
14182 ; GFX1164-DPP-NEXT: .LBB17_3:
14183 ; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
14184 ; GFX1164-DPP-NEXT: s_endpgm
14186 ; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
14187 ; GFX1132-DPP: ; %bb.0:
14188 ; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
14189 ; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
14190 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
14191 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
14192 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
14193 ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
14194 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
14195 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
14196 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
14197 ; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
14198 ; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
14199 ; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
14200 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
14201 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
14202 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
14203 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
14204 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
14205 ; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
14206 ; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
14207 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
14208 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
14209 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
14210 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
14211 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v8, 0
14212 ; GFX1132-DPP-NEXT: v_bfrev_b32_e32 v9, 1
14213 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
14214 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v0
14215 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v1
14216 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
14217 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v10, v8
14218 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v11, v9
14219 ; GFX1132-DPP-NEXT: s_not_b32 exec_lo, exec_lo
14220 ; GFX1132-DPP-NEXT: s_or_saveexec_b32 s0, -1
14221 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
14222 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
14223 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:1 row_mask:0xf bank_mask:0xf
14224 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:1 row_mask:0xf bank_mask:0xf
14225 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
14226 ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
14227 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
14228 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:2 row_mask:0xf bank_mask:0xf
14229 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14230 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:2 row_mask:0xf bank_mask:0xf
14231 ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
14232 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v12, v8 :: v_dual_mov_b32 v13, v9
14233 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
14234 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v12, v10 row_xmask:4 row_mask:0xf bank_mask:0xf
14235 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v13, v11 row_xmask:4 row_mask:0xf bank_mask:0xf
14236 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14237 ; GFX1132-DPP-NEXT: v_add_f64 v[10:11], v[10:11], v[12:13]
14238 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v8, v10 row_xmask:8 row_mask:0xf bank_mask:0xf
14239 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14240 ; GFX1132-DPP-NEXT: v_mov_b32_dpp v9, v11 row_xmask:8 row_mask:0xf bank_mask:0xf
14241 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[10:11], v[8:9]
14242 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14243 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v11, v9 :: v_dual_mov_b32 v10, v8
14244 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v11, v11, -1, -1
14245 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
14246 ; GFX1132-DPP-NEXT: v_permlanex16_b32 v10, v10, -1, -1
14247 ; GFX1132-DPP-NEXT: v_add_f64 v[8:9], v[8:9], v[10:11]
14248 ; GFX1132-DPP-NEXT: s_mov_b32 exec_lo, s0
14249 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
14250 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v41, v8
14251 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
14252 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v42, v9
14253 ; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
14254 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
14255 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
14256 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
14257 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB17_3
14258 ; GFX1132-DPP-NEXT: ; %bb.1:
14259 ; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[34:35], 0x24
14260 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0
14261 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
14262 ; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v0, s[42:43]
14263 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
14264 ; GFX1132-DPP-NEXT: .p2align 6
14265 ; GFX1132-DPP-NEXT: .LBB17_2: ; %atomicrmw.start
14266 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14267 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
14268 ; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
14269 ; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
14270 ; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
14271 ; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
14272 ; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
14273 ; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
14274 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
14275 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
14276 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
14277 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
14278 ; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
14279 ; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
14280 ; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
14281 ; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
14282 ; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
14283 ; GFX1132-DPP-NEXT: s_clause 0x1
14284 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
14285 ; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
14286 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
14287 ; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
14288 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
14289 ; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
14290 ; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
14291 ; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
14292 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
14293 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
14294 ; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
14295 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
14296 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_2
14297 ; GFX1132-DPP-NEXT: .LBB17_3:
14298 ; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
14299 ; GFX1132-DPP-NEXT: s_endpgm
14300 %divValue = call double @div.float.value() strictfp
14301 %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4
14305 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
14306 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14307 ; GFX7LESS: ; %bb.0:
14308 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
14309 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
14310 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
14311 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14312 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
14313 ; GFX7LESS-NEXT: s_cbranch_execz .LBB18_3
14314 ; GFX7LESS-NEXT: ; %bb.1:
14315 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
14316 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5]
14317 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
14318 ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
14319 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
14320 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
14321 ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
14322 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
14323 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
14324 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
14325 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
14326 ; GFX7LESS-NEXT: .LBB18_2: ; %atomicrmw.start
14327 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
14328 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
14329 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
14330 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
14331 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
14332 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
14333 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
14334 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
14335 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14336 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
14337 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
14338 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB18_2
14339 ; GFX7LESS-NEXT: .LBB18_3:
14340 ; GFX7LESS-NEXT: s_endpgm
14342 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14344 ; GFX9-NEXT: s_mov_b64 s[4:5], exec
14345 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14346 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14347 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14348 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
14349 ; GFX9-NEXT: s_cbranch_execz .LBB18_3
14350 ; GFX9-NEXT: ; %bb.1:
14351 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14352 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
14353 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
14354 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
14355 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
14356 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14357 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0
14358 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
14359 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14360 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
14361 ; GFX9-NEXT: .LBB18_2: ; %atomicrmw.start
14362 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
14363 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
14364 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14365 ; GFX9-NEXT: s_waitcnt vmcnt(0)
14366 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14367 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14368 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
14369 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
14370 ; GFX9-NEXT: s_cbranch_execnz .LBB18_2
14371 ; GFX9-NEXT: .LBB18_3:
14372 ; GFX9-NEXT: s_endpgm
14374 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14375 ; GFX1064: ; %bb.0:
14376 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec
14377 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14378 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14379 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14380 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
14381 ; GFX1064-NEXT: s_cbranch_execz .LBB18_3
14382 ; GFX1064-NEXT: ; %bb.1:
14383 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14384 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14385 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
14386 ; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14387 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
14388 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
14389 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
14390 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
14391 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
14392 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
14393 ; GFX1064-NEXT: .LBB18_2: ; %atomicrmw.start
14394 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
14395 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
14396 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14397 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
14398 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14399 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
14400 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14401 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
14402 ; GFX1064-NEXT: s_cbranch_execnz .LBB18_2
14403 ; GFX1064-NEXT: .LBB18_3:
14404 ; GFX1064-NEXT: s_endpgm
14406 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14407 ; GFX1032: ; %bb.0:
14408 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo
14409 ; GFX1032-NEXT: s_mov_b32 s4, 0
14410 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
14411 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
14412 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
14413 ; GFX1032-NEXT: s_cbranch_execz .LBB18_3
14414 ; GFX1032-NEXT: ; %bb.1:
14415 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14416 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5
14417 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
14418 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14419 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
14420 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
14421 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
14422 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
14423 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2
14424 ; GFX1032-NEXT: .LBB18_2: ; %atomicrmw.start
14425 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
14426 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
14427 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14428 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
14429 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
14430 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
14431 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
14432 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
14433 ; GFX1032-NEXT: s_cbranch_execnz .LBB18_2
14434 ; GFX1032-NEXT: .LBB18_3:
14435 ; GFX1032-NEXT: s_endpgm
14437 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14438 ; GFX1164: ; %bb.0:
14439 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec
14440 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
14441 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14442 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14443 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14444 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
14445 ; GFX1164-NEXT: s_cbranch_execz .LBB18_3
14446 ; GFX1164-NEXT: ; %bb.1:
14447 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
14448 ; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14449 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
14450 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14451 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
14452 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
14453 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
14454 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
14455 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
14456 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
14457 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
14458 ; GFX1164-NEXT: .LBB18_2: ; %atomicrmw.start
14459 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
14460 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
14461 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
14462 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
14463 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
14464 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14465 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
14466 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14467 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
14468 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
14469 ; GFX1164-NEXT: s_cbranch_execnz .LBB18_2
14470 ; GFX1164-NEXT: .LBB18_3:
14471 ; GFX1164-NEXT: s_endpgm
14473 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14474 ; GFX1132: ; %bb.0:
14475 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo
14476 ; GFX1132-NEXT: s_mov_b32 s4, 0
14477 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
14478 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
14479 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
14480 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
14481 ; GFX1132-NEXT: s_cbranch_execz .LBB18_3
14482 ; GFX1132-NEXT: ; %bb.1:
14483 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
14484 ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5
14485 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
14486 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14487 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
14488 ; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
14489 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
14490 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
14491 ; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2
14492 ; GFX1132-NEXT: .LBB18_2: ; %atomicrmw.start
14493 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
14494 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
14495 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
14496 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
14497 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
14498 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
14499 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
14500 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
14501 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
14502 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
14503 ; GFX1132-NEXT: s_cbranch_execnz .LBB18_2
14504 ; GFX1132-NEXT: .LBB18_3:
14505 ; GFX1132-NEXT: s_endpgm
14507 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14508 ; GFX7LESS-DPP: ; %bb.0:
14509 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec
14510 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
14511 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
14512 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14513 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
14514 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB18_3
14515 ; GFX7LESS-DPP-NEXT: ; %bb.1:
14516 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
14517 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5]
14518 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
14519 ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
14520 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
14521 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
14522 ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
14523 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14524 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
14525 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
14526 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
14527 ; GFX7LESS-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
14528 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14529 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14530 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
14531 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
14532 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
14533 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
14534 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
14535 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
14536 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14537 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
14538 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
14539 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB18_2
14540 ; GFX7LESS-DPP-NEXT: .LBB18_3:
14541 ; GFX7LESS-DPP-NEXT: s_endpgm
14543 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14544 ; GFX9-DPP: ; %bb.0:
14545 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
14546 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14547 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14548 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14549 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
14550 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB18_3
14551 ; GFX9-DPP-NEXT: ; %bb.1:
14552 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14553 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
14554 ; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
14555 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
14556 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14557 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
14558 ; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
14559 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
14560 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
14561 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
14562 ; GFX9-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
14563 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14564 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14565 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14566 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
14567 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14568 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14569 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
14570 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
14571 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB18_2
14572 ; GFX9-DPP-NEXT: .LBB18_3:
14573 ; GFX9-DPP-NEXT: s_endpgm
14575 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14576 ; GFX1064-DPP: ; %bb.0:
14577 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
14578 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14579 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14580 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14581 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
14582 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB18_3
14583 ; GFX1064-DPP-NEXT: ; %bb.1:
14584 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14585 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14586 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
14587 ; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14588 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14589 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
14590 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
14591 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
14592 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
14593 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
14594 ; GFX1064-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
14595 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14596 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14597 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14598 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
14599 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14600 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
14601 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14602 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
14603 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB18_2
14604 ; GFX1064-DPP-NEXT: .LBB18_3:
14605 ; GFX1064-DPP-NEXT: s_endpgm
14607 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14608 ; GFX1032-DPP: ; %bb.0:
14609 ; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
14610 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
14611 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
14612 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
14613 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
14614 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB18_3
14615 ; GFX1032-DPP-NEXT: ; %bb.1:
14616 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14617 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5
14618 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
14619 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14620 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14621 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
14622 ; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
14623 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
14624 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
14625 ; GFX1032-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
14626 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14627 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14628 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14629 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
14630 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
14631 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
14632 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
14633 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
14634 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB18_2
14635 ; GFX1032-DPP-NEXT: .LBB18_3:
14636 ; GFX1032-DPP-NEXT: s_endpgm
14638 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14639 ; GFX1164-DPP: ; %bb.0:
14640 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
14641 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
14642 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14643 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14644 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14645 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
14646 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB18_3
14647 ; GFX1164-DPP-NEXT: ; %bb.1:
14648 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
14649 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14650 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
14651 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14652 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
14653 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14654 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
14655 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
14656 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
14657 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
14658 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
14659 ; GFX1164-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
14660 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14661 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
14662 ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14663 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
14664 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
14665 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14666 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
14667 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14668 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
14669 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
14670 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB18_2
14671 ; GFX1164-DPP-NEXT: .LBB18_3:
14672 ; GFX1164-DPP-NEXT: s_endpgm
14674 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14675 ; GFX1132-DPP: ; %bb.0:
14676 ; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
14677 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
14678 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
14679 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
14680 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
14681 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
14682 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB18_3
14683 ; GFX1132-DPP-NEXT: ; %bb.1:
14684 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
14685 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5
14686 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
14687 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14688 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
14689 ; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
14690 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
14691 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
14692 ; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2
14693 ; GFX1132-DPP-NEXT: .LBB18_2: ; %atomicrmw.start
14694 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14695 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
14696 ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14697 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
14698 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
14699 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
14700 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
14701 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
14702 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
14703 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
14704 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB18_2
14705 ; GFX1132-DPP-NEXT: .LBB18_3:
14706 ; GFX1132-DPP-NEXT: s_endpgm
14707 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1, !amdgpu.ignore.denormal.mode !1
; Test kernel: atomicrmw fadd of the constant 4.0 on a float in global memory
; (ptr addrspace(1)), monotonic ordering, align 4. The instruction is tagged
; with !amdgpu.no.fine.grained.memory and !amdgpu.no.remote.memory but, unlike
; the preceding test, not with !amdgpu.ignore.denormal.mode. The kernel is
; declared without an attribute group.
;
; Every check block below expects the same overall shape of code:
;   1. v_mbcnt_* followed by a compare against 0 limits execution to the lanes
;      whose mbcnt result is 0; all other lanes branch to the final s_endpgm.
;   2. s_bcnt1 of the saved exec mask is converted to float (v_cvt_f32_ubyte0)
;      and multiplied by 4.0, producing one combined addend.
;   3. After an initial scalar load of the memory value, a loop adds the
;      addend with v_add_f32 and issues an atomic cmpswap, repeating with the
;      returned value until it equals the expected one.
14711 define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr) {
14712 ; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14713 ; GFX7LESS: ; %bb.0:
14714 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec
14715 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
14716 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
14717 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14718 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
14719 ; GFX7LESS-NEXT: s_cbranch_execz .LBB19_3
14720 ; GFX7LESS-NEXT: ; %bb.1:
14721 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
14722 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[4:5]
14723 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
14724 ; GFX7LESS-NEXT: s_load_dword s6, s[0:1], 0x0
14725 ; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
14726 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
14727 ; GFX7LESS-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
14728 ; GFX7LESS-NEXT: v_mul_f32_e32 v2, 4.0, v0
14729 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
14730 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6
14731 ; GFX7LESS-NEXT: s_mov_b32 s2, -1
14732 ; GFX7LESS-NEXT: .LBB19_2: ; %atomicrmw.start
14733 ; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
14734 ; GFX7LESS-NEXT: v_add_f32_e32 v0, v1, v2
14735 ; GFX7LESS-NEXT: s_waitcnt expcnt(0)
14736 ; GFX7LESS-NEXT: v_mov_b32_e32 v4, v1
14737 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, v0
14738 ; GFX7LESS-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
14739 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
14740 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
14741 ; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14742 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, v3
14743 ; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
14744 ; GFX7LESS-NEXT: s_cbranch_execnz .LBB19_2
14745 ; GFX7LESS-NEXT: .LBB19_3:
14746 ; GFX7LESS-NEXT: s_endpgm
; NOTE(review): unlike the other prefixes in this function, the GFX9 block
; below shows no "%bb.0" check line between its label and its first -NEXT
; line; confirm by regenerating with utils/update_llc_test_checks.py.
14748 ; GFX9-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14750 ; GFX9-NEXT: s_mov_b64 s[4:5], exec
14751 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14752 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14753 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14754 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
14755 ; GFX9-NEXT: s_cbranch_execz .LBB19_3
14756 ; GFX9-NEXT: ; %bb.1:
14757 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14758 ; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
14759 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
14760 ; GFX9-NEXT: s_mov_b64 s[2:3], 0
14761 ; GFX9-NEXT: v_mul_f32_e32 v2, 4.0, v0
14762 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14763 ; GFX9-NEXT: s_load_dword s6, s[0:1], 0x0
14764 ; GFX9-NEXT: v_mov_b32_e32 v3, 0
14765 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
14766 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
14767 ; GFX9-NEXT: .LBB19_2: ; %atomicrmw.start
14768 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
14769 ; GFX9-NEXT: v_add_f32_e32 v0, v1, v2
14770 ; GFX9-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14771 ; GFX9-NEXT: s_waitcnt vmcnt(0)
14772 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14773 ; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14774 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
14775 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
14776 ; GFX9-NEXT: s_cbranch_execnz .LBB19_2
14777 ; GFX9-NEXT: .LBB19_3:
14778 ; GFX9-NEXT: s_endpgm
14780 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14781 ; GFX1064: ; %bb.0:
14782 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec
14783 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14784 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14785 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14786 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
14787 ; GFX1064-NEXT: s_cbranch_execz .LBB19_3
14788 ; GFX1064-NEXT: ; %bb.1:
14789 ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14790 ; GFX1064-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14791 ; GFX1064-NEXT: v_mov_b32_e32 v3, 0
14792 ; GFX1064-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14793 ; GFX1064-NEXT: v_mul_f32_e32 v2, 4.0, v0
14794 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
14795 ; GFX1064-NEXT: s_load_dword s2, s[0:1], 0x0
14796 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
14797 ; GFX1064-NEXT: v_mov_b32_e32 v1, s2
14798 ; GFX1064-NEXT: s_mov_b64 s[2:3], 0
14799 ; GFX1064-NEXT: .LBB19_2: ; %atomicrmw.start
14800 ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
14801 ; GFX1064-NEXT: v_add_f32_e32 v0, v1, v2
14802 ; GFX1064-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14803 ; GFX1064-NEXT: s_waitcnt vmcnt(0)
14804 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14805 ; GFX1064-NEXT: v_mov_b32_e32 v1, v0
14806 ; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14807 ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
14808 ; GFX1064-NEXT: s_cbranch_execnz .LBB19_2
14809 ; GFX1064-NEXT: .LBB19_3:
14810 ; GFX1064-NEXT: s_endpgm
14812 ; GFX1032-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14813 ; GFX1032: ; %bb.0:
14814 ; GFX1032-NEXT: s_mov_b32 s5, exec_lo
14815 ; GFX1032-NEXT: s_mov_b32 s4, 0
14816 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
14817 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
14818 ; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
14819 ; GFX1032-NEXT: s_cbranch_execz .LBB19_3
14820 ; GFX1032-NEXT: ; %bb.1:
14821 ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14822 ; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s5
14823 ; GFX1032-NEXT: v_mov_b32_e32 v3, 0
14824 ; GFX1032-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14825 ; GFX1032-NEXT: v_mul_f32_e32 v2, 4.0, v0
14826 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
14827 ; GFX1032-NEXT: s_load_dword s2, s[0:1], 0x0
14828 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
14829 ; GFX1032-NEXT: v_mov_b32_e32 v1, s2
14830 ; GFX1032-NEXT: .LBB19_2: ; %atomicrmw.start
14831 ; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
14832 ; GFX1032-NEXT: v_add_f32_e32 v0, v1, v2
14833 ; GFX1032-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14834 ; GFX1032-NEXT: s_waitcnt vmcnt(0)
14835 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
14836 ; GFX1032-NEXT: v_mov_b32_e32 v1, v0
14837 ; GFX1032-NEXT: s_or_b32 s4, vcc_lo, s4
14838 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
14839 ; GFX1032-NEXT: s_cbranch_execnz .LBB19_2
14840 ; GFX1032-NEXT: .LBB19_3:
14841 ; GFX1032-NEXT: s_endpgm
14843 ; GFX1164-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14844 ; GFX1164: ; %bb.0:
14845 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec
14846 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec
14847 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14848 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
14849 ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14850 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
14851 ; GFX1164-NEXT: s_cbranch_execz .LBB19_3
14852 ; GFX1164-NEXT: ; %bb.1:
14853 ; GFX1164-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
14854 ; GFX1164-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14855 ; GFX1164-NEXT: v_mov_b32_e32 v3, 0
14856 ; GFX1164-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14857 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
14858 ; GFX1164-NEXT: v_mul_f32_e32 v2, 4.0, v0
14859 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
14860 ; GFX1164-NEXT: s_load_b32 s2, s[0:1], 0x0
14861 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
14862 ; GFX1164-NEXT: v_mov_b32_e32 v1, s2
14863 ; GFX1164-NEXT: s_mov_b64 s[2:3], 0
14864 ; GFX1164-NEXT: .LBB19_2: ; %atomicrmw.start
14865 ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
14866 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
14867 ; GFX1164-NEXT: v_add_f32_e32 v0, v1, v2
14868 ; GFX1164-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
14869 ; GFX1164-NEXT: s_waitcnt vmcnt(0)
14870 ; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14871 ; GFX1164-NEXT: v_mov_b32_e32 v1, v0
14872 ; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14873 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
14874 ; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
14875 ; GFX1164-NEXT: s_cbranch_execnz .LBB19_2
14876 ; GFX1164-NEXT: .LBB19_3:
14877 ; GFX1164-NEXT: s_endpgm
14879 ; GFX1132-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14880 ; GFX1132: ; %bb.0:
14881 ; GFX1132-NEXT: s_mov_b32 s5, exec_lo
14882 ; GFX1132-NEXT: s_mov_b32 s4, 0
14883 ; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
14884 ; GFX1132-NEXT: s_mov_b32 s0, exec_lo
14885 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
14886 ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
14887 ; GFX1132-NEXT: s_cbranch_execz .LBB19_3
14888 ; GFX1132-NEXT: ; %bb.1:
14889 ; GFX1132-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
14890 ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s5
14891 ; GFX1132-NEXT: v_mov_b32_e32 v3, 0
14892 ; GFX1132-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14893 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
14894 ; GFX1132-NEXT: s_load_b32 s2, s[0:1], 0x0
14895 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
14896 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
14897 ; GFX1132-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2
14898 ; GFX1132-NEXT: .LBB19_2: ; %atomicrmw.start
14899 ; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
14900 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
14901 ; GFX1132-NEXT: v_add_f32_e32 v0, v1, v2
14902 ; GFX1132-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
14903 ; GFX1132-NEXT: s_waitcnt vmcnt(0)
14904 ; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
14905 ; GFX1132-NEXT: v_mov_b32_e32 v1, v0
14906 ; GFX1132-NEXT: s_or_b32 s4, vcc_lo, s4
14907 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
14908 ; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
14909 ; GFX1132-NEXT: s_cbranch_execnz .LBB19_2
14910 ; GFX1132-NEXT: .LBB19_3:
14911 ; GFX1132-NEXT: s_endpgm
14913 ; GFX7LESS-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14914 ; GFX7LESS-DPP: ; %bb.0:
14915 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], exec
14916 ; GFX7LESS-DPP-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0
14917 ; GFX7LESS-DPP-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0
14918 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14919 ; GFX7LESS-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
14920 ; GFX7LESS-DPP-NEXT: s_cbranch_execz .LBB19_3
14921 ; GFX7LESS-DPP-NEXT: ; %bb.1:
14922 ; GFX7LESS-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
14923 ; GFX7LESS-DPP-NEXT: s_bcnt1_i32_b64 s2, s[4:5]
14924 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
14925 ; GFX7LESS-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
14926 ; GFX7LESS-DPP-NEXT: s_mov_b64 s[4:5], 0
14927 ; GFX7LESS-DPP-NEXT: s_mov_b32 s3, 0xf000
14928 ; GFX7LESS-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
14929 ; GFX7LESS-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14930 ; GFX7LESS-DPP-NEXT: s_waitcnt lgkmcnt(0)
14931 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, s6
14932 ; GFX7LESS-DPP-NEXT: s_mov_b32 s2, -1
14933 ; GFX7LESS-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
14934 ; GFX7LESS-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14935 ; GFX7LESS-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14936 ; GFX7LESS-DPP-NEXT: s_waitcnt expcnt(0)
14937 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v4, v1
14938 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v3, v0
14939 ; GFX7LESS-DPP-NEXT: buffer_atomic_cmpswap v[3:4], off, s[0:3], 0 glc
14940 ; GFX7LESS-DPP-NEXT: s_waitcnt vmcnt(0)
14941 ; GFX7LESS-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v3, v1
14942 ; GFX7LESS-DPP-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
14943 ; GFX7LESS-DPP-NEXT: v_mov_b32_e32 v1, v3
14944 ; GFX7LESS-DPP-NEXT: s_andn2_b64 exec, exec, s[4:5]
14945 ; GFX7LESS-DPP-NEXT: s_cbranch_execnz .LBB19_2
14946 ; GFX7LESS-DPP-NEXT: .LBB19_3:
14947 ; GFX7LESS-DPP-NEXT: s_endpgm
14949 ; GFX9-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14950 ; GFX9-DPP: ; %bb.0:
14951 ; GFX9-DPP-NEXT: s_mov_b64 s[4:5], exec
14952 ; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14953 ; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14954 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14955 ; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
14956 ; GFX9-DPP-NEXT: s_cbranch_execz .LBB19_3
14957 ; GFX9-DPP-NEXT: ; %bb.1:
14958 ; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14959 ; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s4, s[4:5]
14960 ; GFX9-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s4
14961 ; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
14962 ; GFX9-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14963 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
14964 ; GFX9-DPP-NEXT: s_load_dword s6, s[0:1], 0x0
14965 ; GFX9-DPP-NEXT: v_mov_b32_e32 v3, 0
14966 ; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
14967 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s6
14968 ; GFX9-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
14969 ; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
14970 ; GFX9-DPP-NEXT: v_add_f32_e32 v0, v1, v2
14971 ; GFX9-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
14972 ; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
14973 ; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
14974 ; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
14975 ; GFX9-DPP-NEXT: v_mov_b32_e32 v1, v0
14976 ; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
14977 ; GFX9-DPP-NEXT: s_cbranch_execnz .LBB19_2
14978 ; GFX9-DPP-NEXT: .LBB19_3:
14979 ; GFX9-DPP-NEXT: s_endpgm
14981 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
14982 ; GFX1064-DPP: ; %bb.0:
14983 ; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], exec
14984 ; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
14985 ; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
14986 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
14987 ; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
14988 ; GFX1064-DPP-NEXT: s_cbranch_execz .LBB19_3
14989 ; GFX1064-DPP-NEXT: ; %bb.1:
14990 ; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
14991 ; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
14992 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, 0
14993 ; GFX1064-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
14994 ; GFX1064-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
14995 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
14996 ; GFX1064-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
14997 ; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
14998 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s2
14999 ; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
15000 ; GFX1064-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
15001 ; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
15002 ; GFX1064-DPP-NEXT: v_add_f32_e32 v0, v1, v2
15003 ; GFX1064-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
15004 ; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
15005 ; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
15006 ; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, v0
15007 ; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15008 ; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
15009 ; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB19_2
15010 ; GFX1064-DPP-NEXT: .LBB19_3:
15011 ; GFX1064-DPP-NEXT: s_endpgm
15013 ; GFX1032-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
15014 ; GFX1032-DPP: ; %bb.0:
15015 ; GFX1032-DPP-NEXT: s_mov_b32 s5, exec_lo
15016 ; GFX1032-DPP-NEXT: s_mov_b32 s4, 0
15017 ; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
15018 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
15019 ; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
15020 ; GFX1032-DPP-NEXT: s_cbranch_execz .LBB19_3
15021 ; GFX1032-DPP-NEXT: ; %bb.1:
15022 ; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x24
15023 ; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s3, s5
15024 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, 0
15025 ; GFX1032-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
15026 ; GFX1032-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
15027 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
15028 ; GFX1032-DPP-NEXT: s_load_dword s2, s[0:1], 0x0
15029 ; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
15030 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s2
15031 ; GFX1032-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
15032 ; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
15033 ; GFX1032-DPP-NEXT: v_add_f32_e32 v0, v1, v2
15034 ; GFX1032-DPP-NEXT: global_atomic_cmpswap v0, v3, v[0:1], s[0:1] glc
15035 ; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
15036 ; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
15037 ; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, v0
15038 ; GFX1032-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
15039 ; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
15040 ; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB19_2
15041 ; GFX1032-DPP-NEXT: .LBB19_3:
15042 ; GFX1032-DPP-NEXT: s_endpgm
15044 ; GFX1164-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
15045 ; GFX1164-DPP: ; %bb.0:
15046 ; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], exec
15047 ; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
15048 ; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0
15049 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
15050 ; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0
15051 ; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
15052 ; GFX1164-DPP-NEXT: s_cbranch_execz .LBB19_3
15053 ; GFX1164-DPP-NEXT: ; %bb.1:
15054 ; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
15055 ; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s3, s[4:5]
15056 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, 0
15057 ; GFX1164-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
15058 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
15059 ; GFX1164-DPP-NEXT: v_mul_f32_e32 v2, 4.0, v0
15060 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
15061 ; GFX1164-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
15062 ; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
15063 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
15064 ; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
15065 ; GFX1164-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
15066 ; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
15067 ; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
15068 ; GFX1164-DPP-NEXT: v_add_f32_e32 v0, v1, v2
15069 ; GFX1164-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
15070 ; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
15071 ; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
15072 ; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, v0
15073 ; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
15074 ; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
15075 ; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
15076 ; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB19_2
15077 ; GFX1164-DPP-NEXT: .LBB19_3:
15078 ; GFX1164-DPP-NEXT: s_endpgm
15080 ; GFX1132-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
15081 ; GFX1132-DPP: ; %bb.0:
15082 ; GFX1132-DPP-NEXT: s_mov_b32 s5, exec_lo
15083 ; GFX1132-DPP-NEXT: s_mov_b32 s4, 0
15084 ; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s5, 0
15085 ; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
15086 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
15087 ; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
15088 ; GFX1132-DPP-NEXT: s_cbranch_execz .LBB19_3
15089 ; GFX1132-DPP-NEXT: ; %bb.1:
15090 ; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
15091 ; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s3, s5
15092 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v3, 0
15093 ; GFX1132-DPP-NEXT: v_cvt_f32_ubyte0_e32 v0, s3
15094 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
15095 ; GFX1132-DPP-NEXT: s_load_b32 s2, s[0:1], 0x0
15096 ; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
15097 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
15098 ; GFX1132-DPP-NEXT: v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s2
15099 ; GFX1132-DPP-NEXT: .LBB19_2: ; %atomicrmw.start
15100 ; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
15101 ; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
15102 ; GFX1132-DPP-NEXT: v_add_f32_e32 v0, v1, v2
15103 ; GFX1132-DPP-NEXT: global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] glc
15104 ; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
15105 ; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
15106 ; GFX1132-DPP-NEXT: v_mov_b32_e32 v1, v0
15107 ; GFX1132-DPP-NEXT: s_or_b32 s4, vcc_lo, s4
15108 ; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
15109 ; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
15110 ; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB19_2
15111 ; GFX1132-DPP-NEXT: .LBB19_3:
15112 ; GFX1132-DPP-NEXT: s_endpgm
; Operation under test: uniform pointer, uniform constant operand 4.0.
15113 %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 monotonic, align 4, !amdgpu.no.fine.grained.memory !1, !amdgpu.no.remote.memory !1
; Attribute groups available to the functions in this file:
;   #0 sets "denormal-fp-math-f32" to "preserve-sign,preserve-sign" and
;      "amdgpu-unsafe-fp-atomics" to "true".
;   #1 is #0 plus strictfp.
;   #2 is strictfp only.
15117 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
15118 attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
15119 attributes #2 = { strictfp }
; Module flag !0 records amdhsa_code_object_version = 500 (merge behavior 1).
15121 !llvm.module.flags = !{!0}
15122 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}