1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX10 %s
4 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX11 %s
5 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12 %s
7 ; Test using saddr addressing mode of global_* flat atomic instructions.
9 ; --------------------------------------------------------------------------------
11 ; --------------------------------------------------------------------------------
; global_max_saddr_i32_rtn: seq_cst `atomicrmw max` on an i32 in addrspace(1)
; at %sbase (inreg) + zext(%voffset); the old value is bitcast to float.
; On all four targets the checks show a compare-and-swap retry loop
; (%atomicrmw.start): the initial load uses the saddr form (v0, s[2:3]), while
; the cmpswap addresses memory through a 64-bit VGPR pair (v[2:3]) with `off`.
13 define amdgpu_ps float @global_max_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
14 ; GFX9-LABEL: global_max_saddr_i32_rtn:
16 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
17 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
18 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
19 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
20 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
21 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
22 ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start
23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
24 ; GFX9-NEXT: s_waitcnt vmcnt(0)
25 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
26 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
27 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
28 ; GFX9-NEXT: s_waitcnt vmcnt(0)
29 ; GFX9-NEXT: buffer_wbinvl1
30 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
31 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
32 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
33 ; GFX9-NEXT: s_cbranch_execnz .LBB0_1
34 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
35 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
36 ; GFX9-NEXT: ; return to shader part epilog
38 ; GFX10-LABEL: global_max_saddr_i32_rtn:
40 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
41 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
42 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
43 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
44 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
45 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
46 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
47 ; GFX10-NEXT: s_waitcnt vmcnt(0)
48 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
49 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
50 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
51 ; GFX10-NEXT: s_waitcnt vmcnt(0)
52 ; GFX10-NEXT: buffer_gl1_inv
53 ; GFX10-NEXT: buffer_gl0_inv
54 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
55 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
56 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
57 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1
58 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
59 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
60 ; GFX10-NEXT: ; return to shader part epilog
62 ; GFX11-LABEL: global_max_saddr_i32_rtn:
64 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
65 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
66 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
67 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
68 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
69 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
70 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
71 ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
72 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
73 ; GFX11-NEXT: s_waitcnt vmcnt(0)
74 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
75 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
76 ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
77 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
78 ; GFX11-NEXT: s_waitcnt vmcnt(0)
79 ; GFX11-NEXT: buffer_gl1_inv
80 ; GFX11-NEXT: buffer_gl0_inv
81 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
82 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
83 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
84 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
85 ; GFX11-NEXT: s_cbranch_execnz .LBB0_1
86 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
87 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
88 ; GFX11-NEXT: ; return to shader part epilog
90 ; GFX12-LABEL: global_max_saddr_i32_rtn:
92 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
93 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
94 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
95 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
96 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
97 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
98 ; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start
99 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
100 ; GFX12-NEXT: s_wait_loadcnt 0x0
101 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
102 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
103 ; GFX12-NEXT: v_max_i32_e32 v4, v5, v1
104 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
105 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
106 ; GFX12-NEXT: s_wait_loadcnt 0x0
107 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
108 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
109 ; GFX12-NEXT: s_wait_alu 0xfffe
110 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
111 ; GFX12-NEXT: s_wait_alu 0xfffe
112 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
113 ; GFX12-NEXT: s_cbranch_execnz .LBB0_1
114 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
115 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
116 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: byte-GEP of the zero-extended 32-bit offset onto the scalar
; base, then the atomic max; the returned old value is reinterpreted as float.
117 %zext.offset = zext i32 %voffset to i64
118 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
119 %rtn = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
120 %cast.rtn = bitcast i32 %rtn to float
; global_max_saddr_i32_rtn_neg128: same as the returning i32 max test, but the
; address has an additional constant byte offset of -128 (%gep1).
; The checks show that constant folded into the instruction immediate
; (`offset:-128`) on both the initial load and the cmpswap inside the
; %atomicrmw.start retry loop, for every target.
124 define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
125 ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
127 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
128 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
129 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
130 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
131 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
132 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
133 ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
134 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
135 ; GFX9-NEXT: s_waitcnt vmcnt(0)
136 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
137 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
138 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
139 ; GFX9-NEXT: s_waitcnt vmcnt(0)
140 ; GFX9-NEXT: buffer_wbinvl1
141 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
142 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
143 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
144 ; GFX9-NEXT: s_cbranch_execnz .LBB1_1
145 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
146 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
147 ; GFX9-NEXT: ; return to shader part epilog
149 ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
151 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
152 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
153 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
154 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
155 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
156 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
157 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
158 ; GFX10-NEXT: s_waitcnt vmcnt(0)
159 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
160 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
161 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
162 ; GFX10-NEXT: s_waitcnt vmcnt(0)
163 ; GFX10-NEXT: buffer_gl1_inv
164 ; GFX10-NEXT: buffer_gl0_inv
165 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
166 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
167 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
168 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1
169 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
170 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
171 ; GFX10-NEXT: ; return to shader part epilog
173 ; GFX11-LABEL: global_max_saddr_i32_rtn_neg128:
175 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
176 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
177 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
178 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
179 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
180 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
181 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
182 ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
183 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
184 ; GFX11-NEXT: s_waitcnt vmcnt(0)
185 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
186 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
187 ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
188 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
189 ; GFX11-NEXT: s_waitcnt vmcnt(0)
190 ; GFX11-NEXT: buffer_gl1_inv
191 ; GFX11-NEXT: buffer_gl0_inv
192 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
193 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
194 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
195 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
196 ; GFX11-NEXT: s_cbranch_execnz .LBB1_1
197 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
198 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
199 ; GFX11-NEXT: ; return to shader part epilog
201 ; GFX12-LABEL: global_max_saddr_i32_rtn_neg128:
203 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
204 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
205 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
206 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
207 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
208 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
209 ; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
210 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
211 ; GFX12-NEXT: s_wait_loadcnt 0x0
212 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
213 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
214 ; GFX12-NEXT: v_max_i32_e32 v4, v5, v1
215 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
216 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
217 ; GFX12-NEXT: s_wait_loadcnt 0x0
218 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
219 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
220 ; GFX12-NEXT: s_wait_alu 0xfffe
221 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
222 ; GFX12-NEXT: s_wait_alu 0xfffe
223 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
224 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1
225 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
226 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
227 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: %gep1 applies the constant -128 byte offset on top of the
; base + zext(voffset) address before the atomic max.
228 %zext.offset = zext i32 %voffset to i64
229 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
230 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
231 %rtn = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
232 %cast.rtn = bitcast i32 %rtn to float
; global_max_saddr_i32_nortn: seq_cst `atomicrmw max` on an i32 at
; %sbase (inreg) + zext(%voffset) whose result (%unused) is not used; the
; function returns void and every target's checks end in s_endpgm.
; The expansion is still a cmpswap retry loop: the cmpswap result in v0 is
; compared with the expected value in v5 to decide whether to loop again, and
; is copied back into v5 for the next iteration.
236 define amdgpu_ps void @global_max_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
237 ; GFX9-LABEL: global_max_saddr_i32_nortn:
239 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
240 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
241 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
242 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
243 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
244 ; GFX9-NEXT: .LBB2_1: ; %atomicrmw.start
245 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
246 ; GFX9-NEXT: s_waitcnt vmcnt(0)
247 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
248 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
249 ; GFX9-NEXT: s_waitcnt vmcnt(0)
250 ; GFX9-NEXT: buffer_wbinvl1
251 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
252 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
253 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
254 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
255 ; GFX9-NEXT: s_cbranch_execnz .LBB2_1
256 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
257 ; GFX9-NEXT: s_endpgm
259 ; GFX10-LABEL: global_max_saddr_i32_nortn:
261 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
262 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
263 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
264 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
265 ; GFX10-NEXT: .LBB2_1: ; %atomicrmw.start
266 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
267 ; GFX10-NEXT: s_waitcnt vmcnt(0)
268 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
269 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
270 ; GFX10-NEXT: s_waitcnt vmcnt(0)
271 ; GFX10-NEXT: buffer_gl1_inv
272 ; GFX10-NEXT: buffer_gl0_inv
273 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
274 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
275 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
276 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
277 ; GFX10-NEXT: s_cbranch_execnz .LBB2_1
278 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
279 ; GFX10-NEXT: s_endpgm
281 ; GFX11-LABEL: global_max_saddr_i32_nortn:
283 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
284 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
285 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
286 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
287 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
288 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
289 ; GFX11-NEXT: .LBB2_1: ; %atomicrmw.start
290 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
291 ; GFX11-NEXT: s_waitcnt vmcnt(0)
292 ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
293 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
294 ; GFX11-NEXT: s_waitcnt vmcnt(0)
295 ; GFX11-NEXT: buffer_gl1_inv
296 ; GFX11-NEXT: buffer_gl0_inv
297 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
298 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
299 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
300 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
301 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
302 ; GFX11-NEXT: s_cbranch_execnz .LBB2_1
303 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
304 ; GFX11-NEXT: s_endpgm
306 ; GFX12-LABEL: global_max_saddr_i32_nortn:
308 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3]
309 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
310 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
311 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
312 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
313 ; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start
314 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
315 ; GFX12-NEXT: s_wait_loadcnt 0x0
316 ; GFX12-NEXT: v_max_i32_e32 v4, v5, v1
317 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
318 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
319 ; GFX12-NEXT: s_wait_loadcnt 0x0
320 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
321 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
322 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
323 ; GFX12-NEXT: s_wait_alu 0xfffe
324 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
325 ; GFX12-NEXT: s_wait_alu 0xfffe
326 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
327 ; GFX12-NEXT: s_cbranch_execnz .LBB2_1
328 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
329 ; GFX12-NEXT: s_endpgm
; IR under test: the atomic's result is bound to %unused and never consumed.
330 %zext.offset = zext i32 %voffset to i64
331 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
332 %unused = atomicrmw max ptr addrspace(1) %gep0, i32 %data seq_cst
; global_max_saddr_i32_nortn_neg128: non-returning i32 `atomicrmw max`
; (result bound to %unused, function returns void) at
; %sbase + zext(%voffset) - 128.
; The checks show the -128 folded into `offset:-128` on both the initial load
; and the cmpswap of the %atomicrmw.start retry loop, ending in s_endpgm.
336 define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
337 ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
339 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
340 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
341 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
342 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
343 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
344 ; GFX9-NEXT: .LBB3_1: ; %atomicrmw.start
345 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
346 ; GFX9-NEXT: s_waitcnt vmcnt(0)
347 ; GFX9-NEXT: v_max_i32_e32 v4, v5, v1
348 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
349 ; GFX9-NEXT: s_waitcnt vmcnt(0)
350 ; GFX9-NEXT: buffer_wbinvl1
351 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
352 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
353 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
354 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
355 ; GFX9-NEXT: s_cbranch_execnz .LBB3_1
356 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
357 ; GFX9-NEXT: s_endpgm
359 ; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
361 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
362 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
363 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
364 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
365 ; GFX10-NEXT: .LBB3_1: ; %atomicrmw.start
366 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
367 ; GFX10-NEXT: s_waitcnt vmcnt(0)
368 ; GFX10-NEXT: v_max_i32_e32 v4, v5, v1
369 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
370 ; GFX10-NEXT: s_waitcnt vmcnt(0)
371 ; GFX10-NEXT: buffer_gl1_inv
372 ; GFX10-NEXT: buffer_gl0_inv
373 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
374 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
375 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
376 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
377 ; GFX10-NEXT: s_cbranch_execnz .LBB3_1
378 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
379 ; GFX10-NEXT: s_endpgm
381 ; GFX11-LABEL: global_max_saddr_i32_nortn_neg128:
383 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
384 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
385 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
386 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
387 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
388 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
389 ; GFX11-NEXT: .LBB3_1: ; %atomicrmw.start
390 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
391 ; GFX11-NEXT: s_waitcnt vmcnt(0)
392 ; GFX11-NEXT: v_max_i32_e32 v4, v5, v1
393 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
394 ; GFX11-NEXT: s_waitcnt vmcnt(0)
395 ; GFX11-NEXT: buffer_gl1_inv
396 ; GFX11-NEXT: buffer_gl0_inv
397 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
398 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
399 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
401 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
402 ; GFX11-NEXT: s_cbranch_execnz .LBB3_1
403 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
404 ; GFX11-NEXT: s_endpgm
406 ; GFX12-LABEL: global_max_saddr_i32_nortn_neg128:
408 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
409 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
410 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
411 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
412 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
413 ; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start
414 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
415 ; GFX12-NEXT: s_wait_loadcnt 0x0
416 ; GFX12-NEXT: v_max_i32_e32 v4, v5, v1
417 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
418 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
419 ; GFX12-NEXT: s_wait_loadcnt 0x0
420 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
421 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
422 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
423 ; GFX12-NEXT: s_wait_alu 0xfffe
424 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
425 ; GFX12-NEXT: s_wait_alu 0xfffe
426 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
427 ; GFX12-NEXT: s_cbranch_execnz .LBB3_1
428 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
429 ; GFX12-NEXT: s_endpgm
; IR under test: constant -128 byte GEP on top of base + zext(voffset); the
; atomic's result is bound to %unused and never consumed.
430 %zext.offset = zext i32 %voffset to i64
431 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
432 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
433 %unused = atomicrmw max ptr addrspace(1) %gep1, i32 %data seq_cst
; global_max_saddr_i64_rtn: seq_cst `atomicrmw max` on an i64 in addrspace(1)
; at %sbase (inreg) + zext(%voffset); the old value is returned bitcast to
; <2 x float>.
; The checks show a 64-bit cmpswap retry loop (%atomicrmw.start) in which the
; max is formed with v_cmp_gt_i64 plus two v_cndmask_b32, the swap is
; global_atomic_cmpswap_x2 (GFX9/GFX10) or global_atomic_cmpswap_b64
; (GFX11/GFX12), and the result in v[3:4] is copied to v0/v1 after the loop.
437 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
438 ; GFX9-LABEL: global_max_saddr_i64_rtn:
440 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
441 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
442 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
443 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
444 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
445 ; GFX9-NEXT: .LBB4_1: ; %atomicrmw.start
446 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
447 ; GFX9-NEXT: s_waitcnt vmcnt(0)
448 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
449 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
450 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
451 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
452 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
453 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
455 ; GFX9-NEXT: buffer_wbinvl1
456 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
457 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
458 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
459 ; GFX9-NEXT: s_cbranch_execnz .LBB4_1
460 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
461 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
462 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
463 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
464 ; GFX9-NEXT: ; return to shader part epilog
466 ; GFX10-LABEL: global_max_saddr_i64_rtn:
468 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
469 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
470 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
471 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
472 ; GFX10-NEXT: .LBB4_1: ; %atomicrmw.start
473 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
474 ; GFX10-NEXT: s_waitcnt vmcnt(0)
475 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
476 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
477 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
478 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
479 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
480 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
481 ; GFX10-NEXT: s_waitcnt vmcnt(0)
482 ; GFX10-NEXT: buffer_gl1_inv
483 ; GFX10-NEXT: buffer_gl0_inv
484 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
485 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
486 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
487 ; GFX10-NEXT: s_cbranch_execnz .LBB4_1
488 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
489 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
490 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
491 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
492 ; GFX10-NEXT: ; return to shader part epilog
494 ; GFX11-LABEL: global_max_saddr_i64_rtn:
496 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
497 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
498 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
499 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
500 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
501 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
502 ; GFX11-NEXT: .LBB4_1: ; %atomicrmw.start
503 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
504 ; GFX11-NEXT: s_waitcnt vmcnt(0)
505 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
506 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
507 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
508 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
509 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
510 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
511 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
512 ; GFX11-NEXT: s_waitcnt vmcnt(0)
513 ; GFX11-NEXT: buffer_gl1_inv
514 ; GFX11-NEXT: buffer_gl0_inv
515 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
516 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
517 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
518 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
519 ; GFX11-NEXT: s_cbranch_execnz .LBB4_1
520 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
521 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
522 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
523 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
524 ; GFX11-NEXT: ; return to shader part epilog
526 ; GFX12-LABEL: global_max_saddr_i64_rtn:
528 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3]
529 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
530 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
531 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
532 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
533 ; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start
534 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
535 ; GFX12-NEXT: s_wait_loadcnt 0x0
536 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
537 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
538 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
539 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
540 ; GFX12-NEXT: s_wait_alu 0xfffd
541 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
542 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
543 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
544 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
545 ; GFX12-NEXT: s_wait_loadcnt 0x0
546 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
547 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
548 ; GFX12-NEXT: s_wait_alu 0xfffe
549 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
550 ; GFX12-NEXT: s_wait_alu 0xfffe
551 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
552 ; GFX12-NEXT: s_cbranch_execnz .LBB4_1
553 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
554 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
555 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
556 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
557 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: 64-bit atomic max at base + zext(voffset); the i64 old value
; is reinterpreted as <2 x float> for the shader return.
558 %zext.offset = zext i32 %voffset to i64
559 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
560 %rtn = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
561 %cast.rtn = bitcast i64 %rtn to <2 x float>
562 ret <2 x float> %cast.rtn
; global_max_saddr_i64_rtn_neg128: returning i64 `atomicrmw max` at
; %sbase (inreg) + zext(%voffset) - 128; the old value is returned bitcast to
; <2 x float>.
; The checks show the -128 folded into `offset:-128` on both the initial
; 64-bit load and the 64-bit cmpswap of the %atomicrmw.start retry loop.
565 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
566 ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
568 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
569 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
570 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
571 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
572 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
573 ; GFX9-NEXT: .LBB5_1: ; %atomicrmw.start
574 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
575 ; GFX9-NEXT: s_waitcnt vmcnt(0)
576 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
577 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
578 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
579 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
580 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
581 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
582 ; GFX9-NEXT: s_waitcnt vmcnt(0)
583 ; GFX9-NEXT: buffer_wbinvl1
584 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
585 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
586 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
587 ; GFX9-NEXT: s_cbranch_execnz .LBB5_1
588 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
589 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
590 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
591 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
592 ; GFX9-NEXT: ; return to shader part epilog
594 ; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
596 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
597 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
598 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
599 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
600 ; GFX10-NEXT: .LBB5_1: ; %atomicrmw.start
601 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
602 ; GFX10-NEXT: s_waitcnt vmcnt(0)
603 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
604 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
605 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
606 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
607 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
608 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
609 ; GFX10-NEXT: s_waitcnt vmcnt(0)
610 ; GFX10-NEXT: buffer_gl1_inv
611 ; GFX10-NEXT: buffer_gl0_inv
612 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
613 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
614 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
615 ; GFX10-NEXT: s_cbranch_execnz .LBB5_1
616 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
617 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
618 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
619 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
620 ; GFX10-NEXT: ; return to shader part epilog
622 ; GFX11-LABEL: global_max_saddr_i64_rtn_neg128:
624 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
625 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
626 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
627 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
628 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
629 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
630 ; GFX11-NEXT: .LBB5_1: ; %atomicrmw.start
631 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
632 ; GFX11-NEXT: s_waitcnt vmcnt(0)
633 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
634 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
635 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
636 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
637 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
638 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
639 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
640 ; GFX11-NEXT: s_waitcnt vmcnt(0)
641 ; GFX11-NEXT: buffer_gl1_inv
642 ; GFX11-NEXT: buffer_gl0_inv
643 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
644 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
645 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
646 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
647 ; GFX11-NEXT: s_cbranch_execnz .LBB5_1
648 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
649 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
650 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
651 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
652 ; GFX11-NEXT: ; return to shader part epilog
654 ; GFX12-LABEL: global_max_saddr_i64_rtn_neg128:
656 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
657 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
658 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
659 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
660 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
661 ; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start
662 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
663 ; GFX12-NEXT: s_wait_loadcnt 0x0
664 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
665 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
666 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
667 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[9:10], v[1:2]
668 ; GFX12-NEXT: s_wait_alu 0xfffd
669 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
670 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
671 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
672 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
673 ; GFX12-NEXT: s_wait_loadcnt 0x0
674 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
675 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
676 ; GFX12-NEXT: s_wait_alu 0xfffe
677 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
678 ; GFX12-NEXT: s_wait_alu 0xfffe
679 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
680 ; GFX12-NEXT: s_cbranch_execnz .LBB5_1
681 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
682 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
683 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
684 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
685 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: constant -128 byte GEP on top of base + zext(voffset), then
; the 64-bit atomic max; the i64 old value is reinterpreted as <2 x float>.
686 %zext.offset = zext i32 %voffset to i64
687 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
688 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
689 %rtn = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
690 %cast.rtn = bitcast i64 %rtn to <2 x float>
691 ret <2 x float> %cast.rtn
; global_max_saddr_i64_nortn: seq_cst signed 64-bit `atomicrmw max` whose
; result is discarded. The address is the inreg base %sbase plus the
; zero-extended 32-bit %voffset.
; The assertions below show all four targets expanding the operation into a
; compare-and-swap loop (%atomicrmw.start): the max is formed with
; v_cmp_gt_i64 + v_cndmask, and only the initial load uses the saddr form
; (v0 + s[2:3]); the cmpswap addresses memory through the VGPR pair v[7:8]
; with `off`.
694 define amdgpu_ps void @global_max_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
695 ; GFX9-LABEL: global_max_saddr_i64_nortn:
697 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
698 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
699 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
700 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
701 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
702 ; GFX9-NEXT: .LBB6_1: ; %atomicrmw.start
703 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
704 ; GFX9-NEXT: s_waitcnt vmcnt(0)
705 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
706 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
707 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
708 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
709 ; GFX9-NEXT: s_waitcnt vmcnt(0)
710 ; GFX9-NEXT: buffer_wbinvl1
711 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
712 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
713 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
714 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
715 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
716 ; GFX9-NEXT: s_cbranch_execnz .LBB6_1
717 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
718 ; GFX9-NEXT: s_endpgm
720 ; GFX10-LABEL: global_max_saddr_i64_nortn:
722 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
723 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
724 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
725 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
726 ; GFX10-NEXT: .LBB6_1: ; %atomicrmw.start
727 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
728 ; GFX10-NEXT: s_waitcnt vmcnt(0)
729 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
730 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
731 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
732 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
733 ; GFX10-NEXT: s_waitcnt vmcnt(0)
734 ; GFX10-NEXT: buffer_gl1_inv
735 ; GFX10-NEXT: buffer_gl0_inv
736 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
737 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
738 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
739 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
740 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
741 ; GFX10-NEXT: s_cbranch_execnz .LBB6_1
742 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
743 ; GFX10-NEXT: s_endpgm
745 ; GFX11-LABEL: global_max_saddr_i64_nortn:
747 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
748 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
749 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
750 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
751 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
752 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
753 ; GFX11-NEXT: .LBB6_1: ; %atomicrmw.start
754 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
755 ; GFX11-NEXT: s_waitcnt vmcnt(0)
756 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
757 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
758 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
759 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
760 ; GFX11-NEXT: s_waitcnt vmcnt(0)
761 ; GFX11-NEXT: buffer_gl1_inv
762 ; GFX11-NEXT: buffer_gl0_inv
763 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
764 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
765 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
766 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
767 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
768 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
769 ; GFX11-NEXT: s_cbranch_execnz .LBB6_1
770 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
771 ; GFX11-NEXT: s_endpgm
773 ; GFX12-LABEL: global_max_saddr_i64_nortn:
775 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3]
776 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
777 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
778 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
779 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
780 ; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
781 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
782 ; GFX12-NEXT: s_wait_loadcnt 0x0
783 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
784 ; GFX12-NEXT: s_wait_alu 0xfffd
785 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
786 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
787 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
788 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
789 ; GFX12-NEXT: s_wait_loadcnt 0x0
790 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
791 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
792 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
793 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
794 ; GFX12-NEXT: s_wait_alu 0xfffe
795 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
796 ; GFX12-NEXT: s_wait_alu 0xfffe
797 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
798 ; GFX12-NEXT: s_cbranch_execnz .LBB6_1
799 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
800 ; GFX12-NEXT: s_endpgm
801 %zext.offset = zext i32 %voffset to i64
802 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
803 %unused = atomicrmw max ptr addrspace(1) %gep0, i64 %data seq_cst
; global_max_saddr_i64_nortn_neg128: same as the no-return i64 max case, but
; the address gets an extra constant -128 byte GEP on top of
; %sbase + zext(%voffset). The result of the seq_cst `atomicrmw max` is unused.
; The assertions below show the constant folded into the `offset:-128`
; immediate of both the initial saddr load (v0 + s[2:3]) and the cmpswap in
; the %atomicrmw.start loop, which addresses memory through v[7:8] with `off`.
807 define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
808 ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
810 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
811 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
812 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
813 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
814 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
815 ; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
816 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
817 ; GFX9-NEXT: s_waitcnt vmcnt(0)
818 ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
819 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
820 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
821 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
822 ; GFX9-NEXT: s_waitcnt vmcnt(0)
823 ; GFX9-NEXT: buffer_wbinvl1
824 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
825 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
826 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
827 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
828 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
829 ; GFX9-NEXT: s_cbranch_execnz .LBB7_1
830 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
831 ; GFX9-NEXT: s_endpgm
833 ; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
835 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
836 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
837 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
838 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
839 ; GFX10-NEXT: .LBB7_1: ; %atomicrmw.start
840 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
841 ; GFX10-NEXT: s_waitcnt vmcnt(0)
842 ; GFX10-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
843 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
844 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
845 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
846 ; GFX10-NEXT: s_waitcnt vmcnt(0)
847 ; GFX10-NEXT: buffer_gl1_inv
848 ; GFX10-NEXT: buffer_gl0_inv
849 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
850 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
851 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
852 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
853 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
854 ; GFX10-NEXT: s_cbranch_execnz .LBB7_1
855 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
856 ; GFX10-NEXT: s_endpgm
858 ; GFX11-LABEL: global_max_saddr_i64_nortn_neg128:
860 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
861 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
862 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
863 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
864 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
865 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
866 ; GFX11-NEXT: .LBB7_1: ; %atomicrmw.start
867 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
868 ; GFX11-NEXT: s_waitcnt vmcnt(0)
869 ; GFX11-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
870 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
871 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
872 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
873 ; GFX11-NEXT: s_waitcnt vmcnt(0)
874 ; GFX11-NEXT: buffer_gl1_inv
875 ; GFX11-NEXT: buffer_gl0_inv
876 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
877 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
878 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
879 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
880 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
881 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
882 ; GFX11-NEXT: s_cbranch_execnz .LBB7_1
883 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
884 ; GFX11-NEXT: s_endpgm
886 ; GFX12-LABEL: global_max_saddr_i64_nortn_neg128:
888 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
889 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
890 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
891 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
892 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
893 ; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
894 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
895 ; GFX12-NEXT: s_wait_loadcnt 0x0
896 ; GFX12-NEXT: v_cmp_gt_i64_e32 vcc, v[5:6], v[1:2]
897 ; GFX12-NEXT: s_wait_alu 0xfffd
898 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
899 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
900 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
901 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
902 ; GFX12-NEXT: s_wait_loadcnt 0x0
903 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
904 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
905 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
906 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
907 ; GFX12-NEXT: s_wait_alu 0xfffe
908 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
909 ; GFX12-NEXT: s_wait_alu 0xfffe
910 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
911 ; GFX12-NEXT: s_cbranch_execnz .LBB7_1
912 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
913 ; GFX12-NEXT: s_endpgm
914 %zext.offset = zext i32 %voffset to i64
915 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
916 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
917 %unused = atomicrmw max ptr addrspace(1) %gep1, i64 %data seq_cst
921 ; --------------------------------------------------------------------------------
923 ; --------------------------------------------------------------------------------
; global_min_saddr_i32_rtn: seq_cst signed 32-bit `atomicrmw min` at
; %sbase + zext(%voffset); the old value is bitcast to float to match the
; function's float return type.
; The assertions below show all four targets expanding the operation into a
; compare-and-swap loop (%atomicrmw.start) built around v_min_i32. Only the
; initial load uses the saddr form (v0 + s[2:3]); the cmpswap addresses memory
; through the VGPR pair v[2:3] with `off` and leaves the old value in v0.
925 define amdgpu_ps float @global_min_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
926 ; GFX9-LABEL: global_min_saddr_i32_rtn:
928 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
929 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
930 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
931 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
932 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
933 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
934 ; GFX9-NEXT: .LBB8_1: ; %atomicrmw.start
935 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
936 ; GFX9-NEXT: s_waitcnt vmcnt(0)
937 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
938 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
939 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
940 ; GFX9-NEXT: s_waitcnt vmcnt(0)
941 ; GFX9-NEXT: buffer_wbinvl1
942 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
943 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
944 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
945 ; GFX9-NEXT: s_cbranch_execnz .LBB8_1
946 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
947 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
948 ; GFX9-NEXT: ; return to shader part epilog
950 ; GFX10-LABEL: global_min_saddr_i32_rtn:
952 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
953 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
954 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
955 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
956 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
957 ; GFX10-NEXT: .LBB8_1: ; %atomicrmw.start
958 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
959 ; GFX10-NEXT: s_waitcnt vmcnt(0)
960 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
961 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
962 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
963 ; GFX10-NEXT: s_waitcnt vmcnt(0)
964 ; GFX10-NEXT: buffer_gl1_inv
965 ; GFX10-NEXT: buffer_gl0_inv
966 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
967 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
968 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
969 ; GFX10-NEXT: s_cbranch_execnz .LBB8_1
970 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
971 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
972 ; GFX10-NEXT: ; return to shader part epilog
974 ; GFX11-LABEL: global_min_saddr_i32_rtn:
976 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
977 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
978 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
979 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
980 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
981 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
982 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
983 ; GFX11-NEXT: .LBB8_1: ; %atomicrmw.start
984 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
985 ; GFX11-NEXT: s_waitcnt vmcnt(0)
986 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
987 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
988 ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
989 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
990 ; GFX11-NEXT: s_waitcnt vmcnt(0)
991 ; GFX11-NEXT: buffer_gl1_inv
992 ; GFX11-NEXT: buffer_gl0_inv
993 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
994 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
995 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
996 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
997 ; GFX11-NEXT: s_cbranch_execnz .LBB8_1
998 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
999 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
1000 ; GFX11-NEXT: ; return to shader part epilog
1002 ; GFX12-LABEL: global_min_saddr_i32_rtn:
1004 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
1005 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
1006 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1007 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1008 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1009 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1010 ; GFX12-NEXT: .LBB8_1: ; %atomicrmw.start
1011 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1012 ; GFX12-NEXT: s_wait_loadcnt 0x0
1013 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
1014 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1015 ; GFX12-NEXT: v_min_i32_e32 v4, v5, v1
1016 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1017 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1018 ; GFX12-NEXT: s_wait_loadcnt 0x0
1019 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1020 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1021 ; GFX12-NEXT: s_wait_alu 0xfffe
1022 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1023 ; GFX12-NEXT: s_wait_alu 0xfffe
1024 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1025 ; GFX12-NEXT: s_cbranch_execnz .LBB8_1
1026 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1027 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
1028 ; GFX12-NEXT: ; return to shader part epilog
1029 %zext.offset = zext i32 %voffset to i64
1030 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1031 %rtn = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
1032 %cast.rtn = bitcast i32 %rtn to float
; global_min_saddr_i32_rtn_neg128: returning i32 min case with an extra
; constant -128 byte GEP on top of %sbase + zext(%voffset). The old value of
; the seq_cst `atomicrmw min` is bitcast to float to match the float return
; type.
; The assertions below show the constant folded into the `offset:-128`
; immediate of both the initial saddr load (v0 + s[2:3]) and the cmpswap in
; the %atomicrmw.start loop, which addresses memory through v[2:3] with `off`.
1036 define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1037 ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
1039 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1040 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1041 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1042 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
1043 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1044 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1045 ; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
1046 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1047 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1048 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1049 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
1050 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1052 ; GFX9-NEXT: buffer_wbinvl1
1053 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1054 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1055 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1056 ; GFX9-NEXT: s_cbranch_execnz .LBB9_1
1057 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1058 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
1059 ; GFX9-NEXT: ; return to shader part epilog
1061 ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
1063 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1064 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1065 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1066 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1067 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1068 ; GFX10-NEXT: .LBB9_1: ; %atomicrmw.start
1069 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1070 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1071 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1072 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
1073 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1074 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1075 ; GFX10-NEXT: buffer_gl1_inv
1076 ; GFX10-NEXT: buffer_gl0_inv
1077 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1078 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1079 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1080 ; GFX10-NEXT: s_cbranch_execnz .LBB9_1
1081 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1082 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
1083 ; GFX10-NEXT: ; return to shader part epilog
1085 ; GFX11-LABEL: global_min_saddr_i32_rtn_neg128:
1087 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1088 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1089 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1090 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1091 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1092 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1093 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1094 ; GFX11-NEXT: .LBB9_1: ; %atomicrmw.start
1095 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1096 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1097 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1098 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1099 ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
1100 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
1101 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1102 ; GFX11-NEXT: buffer_gl1_inv
1103 ; GFX11-NEXT: buffer_gl0_inv
1104 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1105 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1106 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1107 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1108 ; GFX11-NEXT: s_cbranch_execnz .LBB9_1
1109 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1110 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
1111 ; GFX11-NEXT: ; return to shader part epilog
1113 ; GFX12-LABEL: global_min_saddr_i32_rtn_neg128:
1115 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
1116 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
1117 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1118 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1119 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1120 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1121 ; GFX12-NEXT: .LBB9_1: ; %atomicrmw.start
1122 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1123 ; GFX12-NEXT: s_wait_loadcnt 0x0
1124 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
1125 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1126 ; GFX12-NEXT: v_min_i32_e32 v4, v5, v1
1127 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1128 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1129 ; GFX12-NEXT: s_wait_loadcnt 0x0
1130 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1131 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1132 ; GFX12-NEXT: s_wait_alu 0xfffe
1133 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1134 ; GFX12-NEXT: s_wait_alu 0xfffe
1135 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1136 ; GFX12-NEXT: s_cbranch_execnz .LBB9_1
1137 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1138 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
1139 ; GFX12-NEXT: ; return to shader part epilog
1140 %zext.offset = zext i32 %voffset to i64
1141 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1142 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1143 %rtn = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
1144 %cast.rtn = bitcast i32 %rtn to float
; global_min_saddr_i32_nortn: seq_cst signed 32-bit `atomicrmw min` at
; %sbase + zext(%voffset) whose result is discarded.
; The assertions below show all four targets expanding the operation into a
; compare-and-swap loop (%atomicrmw.start) built around v_min_i32. Only the
; initial load uses the saddr form (v0 + s[2:3]); the cmpswap addresses memory
; through the VGPR pair v[2:3] with `off`, and each variant ends in s_endpgm.
1148 define amdgpu_ps void @global_min_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1149 ; GFX9-LABEL: global_min_saddr_i32_nortn:
1151 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
1152 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1153 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
1154 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1155 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1156 ; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
1157 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1158 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1159 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
1160 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1161 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1162 ; GFX9-NEXT: buffer_wbinvl1
1163 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1164 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1165 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1166 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1167 ; GFX9-NEXT: s_cbranch_execnz .LBB10_1
1168 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1169 ; GFX9-NEXT: s_endpgm
1171 ; GFX10-LABEL: global_min_saddr_i32_nortn:
1173 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
1174 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
1175 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1176 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1177 ; GFX10-NEXT: .LBB10_1: ; %atomicrmw.start
1178 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1179 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1180 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
1181 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1182 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1183 ; GFX10-NEXT: buffer_gl1_inv
1184 ; GFX10-NEXT: buffer_gl0_inv
1185 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1186 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1187 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1188 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1189 ; GFX10-NEXT: s_cbranch_execnz .LBB10_1
1190 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1191 ; GFX10-NEXT: s_endpgm
1193 ; GFX11-LABEL: global_min_saddr_i32_nortn:
1195 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
1196 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
1197 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1198 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1199 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1200 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1201 ; GFX11-NEXT: .LBB10_1: ; %atomicrmw.start
1202 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1203 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1204 ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
1205 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
1206 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1207 ; GFX11-NEXT: buffer_gl1_inv
1208 ; GFX11-NEXT: buffer_gl0_inv
1209 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1210 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1211 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1212 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1213 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1214 ; GFX11-NEXT: s_cbranch_execnz .LBB10_1
1215 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1216 ; GFX11-NEXT: s_endpgm
1218 ; GFX12-LABEL: global_min_saddr_i32_nortn:
1220 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3]
1221 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
1222 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1223 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1224 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1225 ; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
1226 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1227 ; GFX12-NEXT: s_wait_loadcnt 0x0
1228 ; GFX12-NEXT: v_min_i32_e32 v4, v5, v1
1229 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1230 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1231 ; GFX12-NEXT: s_wait_loadcnt 0x0
1232 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1233 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1234 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
1235 ; GFX12-NEXT: s_wait_alu 0xfffe
1236 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1237 ; GFX12-NEXT: s_wait_alu 0xfffe
1238 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1239 ; GFX12-NEXT: s_cbranch_execnz .LBB10_1
1240 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1241 ; GFX12-NEXT: s_endpgm
1242 %zext.offset = zext i32 %voffset to i64
1243 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1244 %unused = atomicrmw min ptr addrspace(1) %gep0, i32 %data seq_cst
; global_min_saddr_i32_nortn_neg128: no-return i32 min case with an extra
; constant -128 byte GEP on top of %sbase + zext(%voffset). The result of the
; seq_cst `atomicrmw min` is unused.
; The assertions below show the constant folded into the `offset:-128`
; immediate of both the initial saddr load (v0 + s[2:3]) and the cmpswap in
; the %atomicrmw.start loop, which addresses memory through v[2:3] with `off`.
1248 define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1249 ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
1251 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
1252 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1253 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
1254 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1255 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1256 ; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
1257 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1258 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1259 ; GFX9-NEXT: v_min_i32_e32 v4, v5, v1
1260 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1261 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1262 ; GFX9-NEXT: buffer_wbinvl1
1263 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1264 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1265 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1266 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1267 ; GFX9-NEXT: s_cbranch_execnz .LBB11_1
1268 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1269 ; GFX9-NEXT: s_endpgm
1271 ; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
1273 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
1274 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
1275 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1276 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1277 ; GFX10-NEXT: .LBB11_1: ; %atomicrmw.start
1278 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1279 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1280 ; GFX10-NEXT: v_min_i32_e32 v4, v5, v1
1281 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1282 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1283 ; GFX10-NEXT: buffer_gl1_inv
1284 ; GFX10-NEXT: buffer_gl0_inv
1285 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1286 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1287 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1288 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1289 ; GFX10-NEXT: s_cbranch_execnz .LBB11_1
1290 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1291 ; GFX10-NEXT: s_endpgm
1293 ; GFX11-LABEL: global_min_saddr_i32_nortn_neg128:
1295 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
1296 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
1297 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1298 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1299 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1300 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1301 ; GFX11-NEXT: .LBB11_1: ; %atomicrmw.start
1302 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1303 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1304 ; GFX11-NEXT: v_min_i32_e32 v4, v5, v1
1305 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
1306 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1307 ; GFX11-NEXT: buffer_gl1_inv
1308 ; GFX11-NEXT: buffer_gl0_inv
1309 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1310 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1311 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1312 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1313 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1314 ; GFX11-NEXT: s_cbranch_execnz .LBB11_1
1315 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1316 ; GFX11-NEXT: s_endpgm
1318 ; GFX12-LABEL: global_min_saddr_i32_nortn_neg128:
1320 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
1321 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
1322 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1323 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1324 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1325 ; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
1326 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1327 ; GFX12-NEXT: s_wait_loadcnt 0x0
1328 ; GFX12-NEXT: v_min_i32_e32 v4, v5, v1
1329 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1330 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1331 ; GFX12-NEXT: s_wait_loadcnt 0x0
1332 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1333 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1334 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
1335 ; GFX12-NEXT: s_wait_alu 0xfffe
1336 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1337 ; GFX12-NEXT: s_wait_alu 0xfffe
1338 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1339 ; GFX12-NEXT: s_cbranch_execnz .LBB11_1
1340 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1341 ; GFX12-NEXT: s_endpgm
1342 %zext.offset = zext i32 %voffset to i64
1343 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1344 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1345 %unused = atomicrmw min ptr addrspace(1) %gep1, i32 %data seq_cst
; global_min_saddr_i64_rtn: seq_cst signed 64-bit `atomicrmw min` at
; %sbase + zext(%voffset); the old value is bitcast to <2 x float> and
; returned.
; The assertions below show all four targets expanding the operation into a
; compare-and-swap loop (%atomicrmw.start): the min is formed with
; v_cmp_le_i64 + v_cndmask, and only the initial load uses the saddr form
; (v0 + s[2:3]); the cmpswap addresses memory through the VGPR pair v[5:6]
; with `off`. After the loop the old value in v[3:4] is copied to v0/v1.
1349 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1350 ; GFX9-LABEL: global_min_saddr_i64_rtn:
1352 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
1353 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
1354 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
1355 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
1356 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1357 ; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
1358 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1359 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1360 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
1361 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
1362 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1363 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1364 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1365 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
1366 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1367 ; GFX9-NEXT: buffer_wbinvl1
1368 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1369 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1370 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1371 ; GFX9-NEXT: s_cbranch_execnz .LBB12_1
1372 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1373 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
1374 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
1375 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
1376 ; GFX9-NEXT: ; return to shader part epilog
1378 ; GFX10-LABEL: global_min_saddr_i64_rtn:
1380 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
1381 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
1382 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
1383 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1384 ; GFX10-NEXT: .LBB12_1: ; %atomicrmw.start
1385 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1386 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1387 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
1388 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
1389 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1390 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1391 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1392 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
1393 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1394 ; GFX10-NEXT: buffer_gl1_inv
1395 ; GFX10-NEXT: buffer_gl0_inv
1396 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1397 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1398 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1399 ; GFX10-NEXT: s_cbranch_execnz .LBB12_1
1400 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1401 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
1402 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
1403 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
1404 ; GFX10-NEXT: ; return to shader part epilog
1406 ; GFX11-LABEL: global_min_saddr_i64_rtn:
1408 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
1409 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
1410 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1411 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1412 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1413 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1414 ; GFX11-NEXT: .LBB12_1: ; %atomicrmw.start
1415 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1416 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1417 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
1418 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
1419 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1420 ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1421 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1422 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1423 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
1424 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1425 ; GFX11-NEXT: buffer_gl1_inv
1426 ; GFX11-NEXT: buffer_gl0_inv
1427 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1428 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1429 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1430 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1431 ; GFX11-NEXT: s_cbranch_execnz .LBB12_1
1432 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1433 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
1434 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
1435 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
1436 ; GFX11-NEXT: ; return to shader part epilog
1438 ; GFX12-LABEL: global_min_saddr_i64_rtn:
1440 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3]
1441 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
1442 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1443 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1444 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1445 ; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start
1446 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1447 ; GFX12-NEXT: s_wait_loadcnt 0x0
1448 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
1449 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
1450 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1451 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1452 ; GFX12-NEXT: s_wait_alu 0xfffd
1453 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1454 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1455 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1456 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1457 ; GFX12-NEXT: s_wait_loadcnt 0x0
1458 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1459 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1460 ; GFX12-NEXT: s_wait_alu 0xfffe
1461 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1462 ; GFX12-NEXT: s_wait_alu 0xfffe
1463 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1464 ; GFX12-NEXT: s_cbranch_execnz .LBB12_1
1465 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1466 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
1467 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
1468 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
1469 ; GFX12-NEXT: ; return to shader part epilog
1470 %zext.offset = zext i32 %voffset to i64
1471 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1472 %rtn = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
1473 %cast.rtn = bitcast i64 %rtn to <2 x float>
1474 ret <2 x float> %cast.rtn
; Returning signed 64-bit atomic min (seq_cst) at %sbase + zext(%voffset) - 128;
; the old value is bitcast to <2 x float> and returned.
; The expected code for every RUN line is a compare-and-swap loop
; (%atomicrmw.start): the initial value is loaded through the SGPR base with
; offset:-128, the candidate value is selected with v_cmp_le_i64 + v_cndmask,
; and the cmpswap addresses memory through a VGPR pair ("off") with the same
; -128 immediate. The loop exits once the value returned by the cmpswap equals
; the value it was compared against; the result is then moved into v0/v1.
1477 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1478 ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
1480 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
1481 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
1482 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
1483 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
1484 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1485 ; GFX9-NEXT: .LBB13_1: ; %atomicrmw.start
1486 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1487 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1488 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
1489 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
1490 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1491 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1492 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1493 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
1494 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1495 ; GFX9-NEXT: buffer_wbinvl1
1496 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1497 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1498 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1499 ; GFX9-NEXT: s_cbranch_execnz .LBB13_1
1500 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1501 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
1502 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
1503 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
1504 ; GFX9-NEXT: ; return to shader part epilog
1506 ; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
1508 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
1509 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
1510 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
1511 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1512 ; GFX10-NEXT: .LBB13_1: ; %atomicrmw.start
1513 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1514 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1515 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
1516 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
1517 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1518 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1519 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1520 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
1521 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1522 ; GFX10-NEXT: buffer_gl1_inv
1523 ; GFX10-NEXT: buffer_gl0_inv
1524 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1525 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1526 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1527 ; GFX10-NEXT: s_cbranch_execnz .LBB13_1
1528 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1529 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
1530 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
1531 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
1532 ; GFX10-NEXT: ; return to shader part epilog
1534 ; GFX11-LABEL: global_min_saddr_i64_rtn_neg128:
1536 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
1537 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
1538 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1539 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1540 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1541 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1542 ; GFX11-NEXT: .LBB13_1: ; %atomicrmw.start
1543 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1544 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1545 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
1546 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
1547 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1548 ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1549 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1550 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1551 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
1552 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1553 ; GFX11-NEXT: buffer_gl1_inv
1554 ; GFX11-NEXT: buffer_gl0_inv
1555 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1556 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1557 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1558 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1559 ; GFX11-NEXT: s_cbranch_execnz .LBB13_1
1560 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1561 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
1562 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
1563 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
1564 ; GFX11-NEXT: ; return to shader part epilog
1566 ; GFX12-LABEL: global_min_saddr_i64_rtn_neg128:
1568 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
1569 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
1570 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1571 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
1572 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1573 ; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start
1574 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1575 ; GFX12-NEXT: s_wait_loadcnt 0x0
1576 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
1577 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
1578 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1579 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[9:10], v[1:2]
1580 ; GFX12-NEXT: s_wait_alu 0xfffd
1581 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
1582 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
1583 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1584 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1585 ; GFX12-NEXT: s_wait_loadcnt 0x0
1586 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1587 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
1588 ; GFX12-NEXT: s_wait_alu 0xfffe
1589 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1590 ; GFX12-NEXT: s_wait_alu 0xfffe
1591 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1592 ; GFX12-NEXT: s_cbranch_execnz .LBB13_1
1593 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1594 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
1595 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
1596 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
1597 ; GFX12-NEXT: ; return to shader part epilog
1598 %zext.offset = zext i32 %voffset to i64
1599 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1600 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1601 %rtn = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
1602 %cast.rtn = bitcast i64 %rtn to <2 x float>
1603 ret <2 x float> %cast.rtn
; Signed 64-bit atomic min (seq_cst) at %sbase + zext(%voffset) in a void
; shader; the IR names the atomicrmw result %unused.
; The expected code is still a compare-and-swap loop (%atomicrmw.start): the
; initial value is loaded through the SGPR base, the candidate is selected with
; v_cmp_le_i64 + v_cndmask, and the cmpswap uses its returning form because the
; returned value feeds the loop-exit compare and becomes the expected value of
; the next iteration (copied back into v[5:6]). The program ends with s_endpgm.
1606 define amdgpu_ps void @global_min_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1607 ; GFX9-LABEL: global_min_saddr_i64_nortn:
1609 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
1610 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1611 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
1612 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
1613 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1614 ; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
1615 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1616 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1617 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1618 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1619 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1620 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
1621 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1622 ; GFX9-NEXT: buffer_wbinvl1
1623 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1624 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
1625 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1626 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
1627 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1628 ; GFX9-NEXT: s_cbranch_execnz .LBB14_1
1629 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1630 ; GFX9-NEXT: s_endpgm
1632 ; GFX10-LABEL: global_min_saddr_i64_nortn:
1634 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
1635 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
1636 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
1637 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1638 ; GFX10-NEXT: .LBB14_1: ; %atomicrmw.start
1639 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1640 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1641 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1642 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1643 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1644 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
1645 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1646 ; GFX10-NEXT: buffer_gl1_inv
1647 ; GFX10-NEXT: buffer_gl0_inv
1648 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1649 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
1650 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
1651 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1652 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1653 ; GFX10-NEXT: s_cbranch_execnz .LBB14_1
1654 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1655 ; GFX10-NEXT: s_endpgm
1657 ; GFX11-LABEL: global_min_saddr_i64_nortn:
1659 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
1660 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
1661 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1662 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1663 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1664 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1665 ; GFX11-NEXT: .LBB14_1: ; %atomicrmw.start
1666 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1667 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1668 ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1669 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1670 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1671 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
1672 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1673 ; GFX11-NEXT: buffer_gl1_inv
1674 ; GFX11-NEXT: buffer_gl0_inv
1675 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1676 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
1677 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
1678 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1679 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1680 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1681 ; GFX11-NEXT: s_cbranch_execnz .LBB14_1
1682 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1683 ; GFX11-NEXT: s_endpgm
1685 ; GFX12-LABEL: global_min_saddr_i64_nortn:
1687 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3]
1688 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
1689 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1690 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1691 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1692 ; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
1693 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1694 ; GFX12-NEXT: s_wait_loadcnt 0x0
1695 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1696 ; GFX12-NEXT: s_wait_alu 0xfffd
1697 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1698 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1699 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1700 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1701 ; GFX12-NEXT: s_wait_loadcnt 0x0
1702 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1703 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1704 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
1705 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
1706 ; GFX12-NEXT: s_wait_alu 0xfffe
1707 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1708 ; GFX12-NEXT: s_wait_alu 0xfffe
1709 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1710 ; GFX12-NEXT: s_cbranch_execnz .LBB14_1
1711 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1712 ; GFX12-NEXT: s_endpgm
1713 %zext.offset = zext i32 %voffset to i64
1714 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1715 %unused = atomicrmw min ptr addrspace(1) %gep0, i64 %data seq_cst
; Signed 64-bit atomic min (seq_cst) at %sbase + zext(%voffset) - 128 in a void
; shader; the IR names the atomicrmw result %unused.
; The expected code is a compare-and-swap loop (%atomicrmw.start) in which the
; -128 byte displacement appears as offset:-128 on both the initial load
; (SGPR base) and the cmpswap (VGPR pair address, "off"). The value returned by
; the cmpswap feeds the loop-exit compare and is copied back into v[5:6] as the
; expected value for the next iteration. The program ends with s_endpgm.
1719 define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
1720 ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
1722 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
1723 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1724 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
1725 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
1726 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1727 ; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
1728 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1729 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1730 ; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1731 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1732 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1733 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
1734 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1735 ; GFX9-NEXT: buffer_wbinvl1
1736 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1737 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
1738 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1739 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
1740 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1741 ; GFX9-NEXT: s_cbranch_execnz .LBB15_1
1742 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1743 ; GFX9-NEXT: s_endpgm
1745 ; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
1747 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
1748 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
1749 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
1750 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1751 ; GFX10-NEXT: .LBB15_1: ; %atomicrmw.start
1752 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1753 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1754 ; GFX10-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1755 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1756 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1757 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
1758 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1759 ; GFX10-NEXT: buffer_gl1_inv
1760 ; GFX10-NEXT: buffer_gl0_inv
1761 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1762 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
1763 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
1764 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1765 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1766 ; GFX10-NEXT: s_cbranch_execnz .LBB15_1
1767 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1768 ; GFX10-NEXT: s_endpgm
1770 ; GFX11-LABEL: global_min_saddr_i64_nortn_neg128:
1772 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
1773 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
1774 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1775 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1776 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1777 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1778 ; GFX11-NEXT: .LBB15_1: ; %atomicrmw.start
1779 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1780 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1781 ; GFX11-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1782 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1783 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1784 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
1785 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1786 ; GFX11-NEXT: buffer_gl1_inv
1787 ; GFX11-NEXT: buffer_gl0_inv
1788 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1789 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
1790 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
1791 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1792 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1793 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1794 ; GFX11-NEXT: s_cbranch_execnz .LBB15_1
1795 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1796 ; GFX11-NEXT: s_endpgm
1798 ; GFX12-LABEL: global_min_saddr_i64_nortn_neg128:
1800 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
1801 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
1802 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1803 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
1804 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1805 ; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
1806 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1807 ; GFX12-NEXT: s_wait_loadcnt 0x0
1808 ; GFX12-NEXT: v_cmp_le_i64_e32 vcc, v[5:6], v[1:2]
1809 ; GFX12-NEXT: s_wait_alu 0xfffd
1810 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
1811 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
1812 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1813 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1814 ; GFX12-NEXT: s_wait_loadcnt 0x0
1815 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1816 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
1817 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
1818 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
1819 ; GFX12-NEXT: s_wait_alu 0xfffe
1820 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1821 ; GFX12-NEXT: s_wait_alu 0xfffe
1822 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1823 ; GFX12-NEXT: s_cbranch_execnz .LBB15_1
1824 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1825 ; GFX12-NEXT: s_endpgm
1826 %zext.offset = zext i32 %voffset to i64
1827 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1828 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
1829 %unused = atomicrmw min ptr addrspace(1) %gep1, i64 %data seq_cst
1833 ; --------------------------------------------------------------------------------
1835 ; --------------------------------------------------------------------------------
; Returning unsigned 32-bit atomic max (seq_cst) at %sbase + zext(%voffset);
; the old value is bitcast to float.
; The expected code for every RUN line is a compare-and-swap loop
; (%atomicrmw.start): the initial value is loaded through the SGPR base, the
; candidate is computed with v_max_u32, and the 32-bit cmpswap addresses memory
; through a VGPR pair ("off"). The loop exits once the value returned by the
; cmpswap equals the value it was compared against; the result stays in v0.
1837 define amdgpu_ps float @global_umax_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1838 ; GFX9-LABEL: global_umax_saddr_i32_rtn:
1840 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1841 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
1842 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1843 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
1844 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1845 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1846 ; GFX9-NEXT: .LBB16_1: ; %atomicrmw.start
1847 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1848 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1849 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1850 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
1851 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1852 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1853 ; GFX9-NEXT: buffer_wbinvl1
1854 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1855 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1856 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1857 ; GFX9-NEXT: s_cbranch_execnz .LBB16_1
1858 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1859 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
1860 ; GFX9-NEXT: ; return to shader part epilog
1862 ; GFX10-LABEL: global_umax_saddr_i32_rtn:
1864 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1865 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
1866 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1867 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1868 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1869 ; GFX10-NEXT: .LBB16_1: ; %atomicrmw.start
1870 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1871 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1872 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1873 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
1874 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
1875 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1876 ; GFX10-NEXT: buffer_gl1_inv
1877 ; GFX10-NEXT: buffer_gl0_inv
1878 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1879 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1880 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1881 ; GFX10-NEXT: s_cbranch_execnz .LBB16_1
1882 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1883 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
1884 ; GFX10-NEXT: ; return to shader part epilog
1886 ; GFX11-LABEL: global_umax_saddr_i32_rtn:
1888 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
1889 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
1890 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1891 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1892 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1893 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
1894 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
1895 ; GFX11-NEXT: .LBB16_1: ; %atomicrmw.start
1896 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
1897 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1898 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
1899 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1900 ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
1901 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
1902 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1903 ; GFX11-NEXT: buffer_gl1_inv
1904 ; GFX11-NEXT: buffer_gl0_inv
1905 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1906 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1907 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1908 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1909 ; GFX11-NEXT: s_cbranch_execnz .LBB16_1
1910 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
1911 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
1912 ; GFX11-NEXT: ; return to shader part epilog
1914 ; GFX12-LABEL: global_umax_saddr_i32_rtn:
1916 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
1917 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
1918 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1919 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1920 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
1921 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
1922 ; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start
1923 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1924 ; GFX12-NEXT: s_wait_loadcnt 0x0
1925 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
1926 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1927 ; GFX12-NEXT: v_max_u32_e32 v4, v5, v1
1928 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
1929 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
1930 ; GFX12-NEXT: s_wait_loadcnt 0x0
1931 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
1932 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1933 ; GFX12-NEXT: s_wait_alu 0xfffe
1934 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1935 ; GFX12-NEXT: s_wait_alu 0xfffe
1936 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
1937 ; GFX12-NEXT: s_cbranch_execnz .LBB16_1
1938 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
1939 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
1940 ; GFX12-NEXT: ; return to shader part epilog
1941 %zext.offset = zext i32 %voffset to i64
1942 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
1943 %rtn = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
1944 %cast.rtn = bitcast i32 %rtn to float
; Returning unsigned 32-bit atomic max (seq_cst) at
; %sbase + zext(%voffset) - 128; the old value is bitcast to float.
; The expected code is a compare-and-swap loop (%atomicrmw.start) in which the
; -128 byte displacement appears as offset:-128 on both the initial load
; (SGPR base) and the 32-bit cmpswap (VGPR pair address, "off"). The candidate
; is computed with v_max_u32 and the loop exits once the value returned by the
; cmpswap equals the value it was compared against; the result stays in v0.
1948 define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
1949 ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
1951 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
1952 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1953 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1954 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
1955 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
1956 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
1957 ; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
1958 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
1959 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1960 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
1961 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
1962 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1963 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1964 ; GFX9-NEXT: buffer_wbinvl1
1965 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1966 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1967 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
1968 ; GFX9-NEXT: s_cbranch_execnz .LBB17_1
1969 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
1970 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
1971 ; GFX9-NEXT: ; return to shader part epilog
1973 ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
1975 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
1976 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
1977 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
1978 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
1979 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
1980 ; GFX10-NEXT: .LBB17_1: ; %atomicrmw.start
1981 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
1982 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1983 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
1984 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
1985 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
1986 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1987 ; GFX10-NEXT: buffer_gl1_inv
1988 ; GFX10-NEXT: buffer_gl0_inv
1989 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
1990 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
1991 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
1992 ; GFX10-NEXT: s_cbranch_execnz .LBB17_1
1993 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
1994 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
1995 ; GFX10-NEXT: ; return to shader part epilog
1997 ; GFX11-LABEL: global_umax_saddr_i32_rtn_neg128:
1999 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
2000 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
2001 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2002 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2003 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2004 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2005 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2006 ; GFX11-NEXT: .LBB17_1: ; %atomicrmw.start
2007 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2008 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2009 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
2010 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2011 ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
2012 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
2013 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2014 ; GFX11-NEXT: buffer_gl1_inv
2015 ; GFX11-NEXT: buffer_gl0_inv
2016 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2017 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2018 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2019 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2020 ; GFX11-NEXT: s_cbranch_execnz .LBB17_1
2021 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2022 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
2023 ; GFX11-NEXT: ; return to shader part epilog
2025 ; GFX12-LABEL: global_umax_saddr_i32_rtn_neg128:
2027 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
2028 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
2029 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2030 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2031 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2032 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2033 ; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start
2034 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2035 ; GFX12-NEXT: s_wait_loadcnt 0x0
2036 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
2037 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2038 ; GFX12-NEXT: v_max_u32_e32 v4, v5, v1
2039 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2040 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2041 ; GFX12-NEXT: s_wait_loadcnt 0x0
2042 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2043 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2044 ; GFX12-NEXT: s_wait_alu 0xfffe
2045 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2046 ; GFX12-NEXT: s_wait_alu 0xfffe
2047 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2048 ; GFX12-NEXT: s_cbranch_execnz .LBB17_1
2049 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2050 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
2051 ; GFX12-NEXT: ; return to shader part epilog
2052 %zext.offset = zext i32 %voffset to i64
2053 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2054 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2055 %rtn = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
2056 %cast.rtn = bitcast i32 %rtn to float
; Unsigned 32-bit atomic max (seq_cst) at %sbase + zext(%voffset) in a void
; shader; the IR names the atomicrmw result %unused.
; The expected code is still a compare-and-swap loop (%atomicrmw.start): the
; initial value is loaded through the SGPR base, the candidate is computed with
; v_max_u32, and the cmpswap uses its returning form because the returned value
; feeds the loop-exit compare and is copied into v5 as the expected value of
; the next iteration. The program ends with s_endpgm.
2060 define amdgpu_ps void @global_umax_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2061 ; GFX9-LABEL: global_umax_saddr_i32_nortn:
2063 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
2064 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2065 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
2066 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2067 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2068 ; GFX9-NEXT: .LBB18_1: ; %atomicrmw.start
2069 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2070 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2071 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
2072 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2073 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2074 ; GFX9-NEXT: buffer_wbinvl1
2075 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2076 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2077 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
2078 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2079 ; GFX9-NEXT: s_cbranch_execnz .LBB18_1
2080 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2081 ; GFX9-NEXT: s_endpgm
2083 ; GFX10-LABEL: global_umax_saddr_i32_nortn:
2085 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
2086 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2087 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2088 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2089 ; GFX10-NEXT: .LBB18_1: ; %atomicrmw.start
2090 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2091 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2092 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
2093 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2094 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2095 ; GFX10-NEXT: buffer_gl1_inv
2096 ; GFX10-NEXT: buffer_gl0_inv
2097 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2098 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
2099 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2100 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2101 ; GFX10-NEXT: s_cbranch_execnz .LBB18_1
2102 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2103 ; GFX10-NEXT: s_endpgm
2105 ; GFX11-LABEL: global_umax_saddr_i32_nortn:
2107 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
2108 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2109 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2110 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2111 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2112 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2113 ; GFX11-NEXT: .LBB18_1: ; %atomicrmw.start
2114 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2115 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2116 ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
2117 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
2118 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2119 ; GFX11-NEXT: buffer_gl1_inv
2120 ; GFX11-NEXT: buffer_gl0_inv
2121 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2122 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
2123 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2124 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2125 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2126 ; GFX11-NEXT: s_cbranch_execnz .LBB18_1
2127 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2128 ; GFX11-NEXT: s_endpgm
2130 ; GFX12-LABEL: global_umax_saddr_i32_nortn:
2132 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3]
2133 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2134 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2135 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2136 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2137 ; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start
2138 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2139 ; GFX12-NEXT: s_wait_loadcnt 0x0
2140 ; GFX12-NEXT: v_max_u32_e32 v4, v5, v1
2141 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2142 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2143 ; GFX12-NEXT: s_wait_loadcnt 0x0
2144 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2145 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2146 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
2147 ; GFX12-NEXT: s_wait_alu 0xfffe
2148 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2149 ; GFX12-NEXT: s_wait_alu 0xfffe
2150 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2151 ; GFX12-NEXT: s_cbranch_execnz .LBB18_1
2152 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2153 ; GFX12-NEXT: s_endpgm
2154 %zext.offset = zext i32 %voffset to i64
2155 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2156 %unused = atomicrmw umax ptr addrspace(1) %gep0, i32 %data seq_cst
; 32-bit seq_cst `atomicrmw umax` whose result is not used. The address is the
; inreg base %sbase plus the zero-extended 32-bit %voffset, minus 128 bytes.
; The expected code for every target is a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end): an initial load seeds the old value,
; v_max_u32 computes the new one, and global_atomic_cmpswap is retried until
; the returned value equals the expected one. The -128 is expected to appear
; as `offset:-128` on both the initial load and the cmpswap. Note that only
; the initial load uses the s[2:3] base + v0 offset form; the cmpswap uses a
; 64-bit VGPR address with `off`.
2160 define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2161 ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
2163 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
2164 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2165 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
2166 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2167 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2168 ; GFX9-NEXT: .LBB19_1: ; %atomicrmw.start
2169 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2170 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2171 ; GFX9-NEXT: v_max_u32_e32 v4, v5, v1
2172 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2173 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2174 ; GFX9-NEXT: buffer_wbinvl1
2175 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2176 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2177 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
2178 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2179 ; GFX9-NEXT: s_cbranch_execnz .LBB19_1
2180 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2181 ; GFX9-NEXT: s_endpgm
2183 ; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
2185 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
2186 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2187 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2188 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2189 ; GFX10-NEXT: .LBB19_1: ; %atomicrmw.start
2190 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2191 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2192 ; GFX10-NEXT: v_max_u32_e32 v4, v5, v1
2193 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2194 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2195 ; GFX10-NEXT: buffer_gl1_inv
2196 ; GFX10-NEXT: buffer_gl0_inv
2197 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2198 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
2199 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2200 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2201 ; GFX10-NEXT: s_cbranch_execnz .LBB19_1
2202 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2203 ; GFX10-NEXT: s_endpgm
2205 ; GFX11-LABEL: global_umax_saddr_i32_nortn_neg128:
2207 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
2208 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2209 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2210 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2211 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2212 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2213 ; GFX11-NEXT: .LBB19_1: ; %atomicrmw.start
2214 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2215 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2216 ; GFX11-NEXT: v_max_u32_e32 v4, v5, v1
2217 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
2218 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2219 ; GFX11-NEXT: buffer_gl1_inv
2220 ; GFX11-NEXT: buffer_gl0_inv
2221 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2222 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
2223 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2224 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2225 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2226 ; GFX11-NEXT: s_cbranch_execnz .LBB19_1
2227 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2228 ; GFX11-NEXT: s_endpgm
2230 ; GFX12-LABEL: global_umax_saddr_i32_nortn_neg128:
2232 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
2233 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2234 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2235 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2236 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2237 ; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start
2238 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2239 ; GFX12-NEXT: s_wait_loadcnt 0x0
2240 ; GFX12-NEXT: v_max_u32_e32 v4, v5, v1
2241 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2242 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2243 ; GFX12-NEXT: s_wait_loadcnt 0x0
2244 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2245 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2246 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
2247 ; GFX12-NEXT: s_wait_alu 0xfffe
2248 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2249 ; GFX12-NEXT: s_wait_alu 0xfffe
2250 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2251 ; GFX12-NEXT: s_cbranch_execnz .LBB19_1
2252 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2253 ; GFX12-NEXT: s_endpgm
2254 %zext.offset = zext i32 %voffset to i64
2255 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2256 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2257 %unused = atomicrmw umax ptr addrspace(1) %gep1, i32 %data seq_cst
; 64-bit seq_cst `atomicrmw umax` at %sbase + zext(%voffset) whose old value is
; returned, bitcast to <2 x float>. The expected code for every target is a
; compare-and-swap retry loop (%atomicrmw.start .. %atomicrmw.end): the
; unsigned 64-bit max is formed with v_cmp_gt_u64 plus two v_cndmask selects,
; then fed to the 64-bit global_atomic_cmpswap. After the loop the checks
; restore exec from the accumulated mask and copy the result pair v[3:4]
; into v0/v1.
2261 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2262 ; GFX9-LABEL: global_umax_saddr_i64_rtn:
2264 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
2265 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
2266 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
2267 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
2268 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2269 ; GFX9-NEXT: .LBB20_1: ; %atomicrmw.start
2270 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2271 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2272 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
2273 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
2274 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2275 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2276 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2277 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
2278 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2279 ; GFX9-NEXT: buffer_wbinvl1
2280 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2281 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2282 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2283 ; GFX9-NEXT: s_cbranch_execnz .LBB20_1
2284 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2285 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
2286 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
2287 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
2288 ; GFX9-NEXT: ; return to shader part epilog
2290 ; GFX10-LABEL: global_umax_saddr_i64_rtn:
2292 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
2293 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
2294 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
2295 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2296 ; GFX10-NEXT: .LBB20_1: ; %atomicrmw.start
2297 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2298 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2299 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
2300 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
2301 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2302 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2303 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2304 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
2305 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2306 ; GFX10-NEXT: buffer_gl1_inv
2307 ; GFX10-NEXT: buffer_gl0_inv
2308 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2309 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2310 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2311 ; GFX10-NEXT: s_cbranch_execnz .LBB20_1
2312 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2313 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
2314 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
2315 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
2316 ; GFX10-NEXT: ; return to shader part epilog
2318 ; GFX11-LABEL: global_umax_saddr_i64_rtn:
2320 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
2321 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
2322 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2323 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2324 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2325 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2326 ; GFX11-NEXT: .LBB20_1: ; %atomicrmw.start
2327 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2328 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2329 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
2330 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
2331 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2332 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2333 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2334 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2335 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
2336 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2337 ; GFX11-NEXT: buffer_gl1_inv
2338 ; GFX11-NEXT: buffer_gl0_inv
2339 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2340 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2341 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2342 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2343 ; GFX11-NEXT: s_cbranch_execnz .LBB20_1
2344 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2345 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
2346 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
2347 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
2348 ; GFX11-NEXT: ; return to shader part epilog
2350 ; GFX12-LABEL: global_umax_saddr_i64_rtn:
2352 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3]
2353 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
2354 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2355 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2356 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2357 ; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start
2358 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2359 ; GFX12-NEXT: s_wait_loadcnt 0x0
2360 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
2361 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
2362 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2363 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2364 ; GFX12-NEXT: s_wait_alu 0xfffd
2365 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2366 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2367 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2368 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2369 ; GFX12-NEXT: s_wait_loadcnt 0x0
2370 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2371 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2372 ; GFX12-NEXT: s_wait_alu 0xfffe
2373 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2374 ; GFX12-NEXT: s_wait_alu 0xfffe
2375 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2376 ; GFX12-NEXT: s_cbranch_execnz .LBB20_1
2377 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2378 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
2379 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
2380 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
2381 ; GFX12-NEXT: ; return to shader part epilog
2382 %zext.offset = zext i32 %voffset to i64
2383 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2384 %rtn = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
2385 %cast.rtn = bitcast i64 %rtn to <2 x float>
2386 ret <2 x float> %cast.rtn
; Same as the returning 64-bit umax case, but the address is additionally
; displaced by -128 bytes (%gep1). The old value is returned, bitcast to
; <2 x float>. The expected compare-and-swap loop is unchanged in shape; the
; displacement is expected to appear as `offset:-128` on both the initial
; 64-bit load and the 64-bit global_atomic_cmpswap for every target.
2389 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2390 ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
2392 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
2393 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
2394 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
2395 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
2396 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2397 ; GFX9-NEXT: .LBB21_1: ; %atomicrmw.start
2398 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2399 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2400 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
2401 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
2402 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2403 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2404 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2405 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
2406 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2407 ; GFX9-NEXT: buffer_wbinvl1
2408 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2409 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2410 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2411 ; GFX9-NEXT: s_cbranch_execnz .LBB21_1
2412 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2413 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
2414 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
2415 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
2416 ; GFX9-NEXT: ; return to shader part epilog
2418 ; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
2420 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
2421 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
2422 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
2423 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2424 ; GFX10-NEXT: .LBB21_1: ; %atomicrmw.start
2425 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2426 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2427 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
2428 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
2429 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2430 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2431 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2432 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
2433 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2434 ; GFX10-NEXT: buffer_gl1_inv
2435 ; GFX10-NEXT: buffer_gl0_inv
2436 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2437 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2438 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2439 ; GFX10-NEXT: s_cbranch_execnz .LBB21_1
2440 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2441 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
2442 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
2443 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
2444 ; GFX10-NEXT: ; return to shader part epilog
2446 ; GFX11-LABEL: global_umax_saddr_i64_rtn_neg128:
2448 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
2449 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
2450 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2451 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2452 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2453 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2454 ; GFX11-NEXT: .LBB21_1: ; %atomicrmw.start
2455 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2456 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2457 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
2458 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
2459 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2460 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2461 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2462 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2463 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
2464 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2465 ; GFX11-NEXT: buffer_gl1_inv
2466 ; GFX11-NEXT: buffer_gl0_inv
2467 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2468 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2469 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2470 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2471 ; GFX11-NEXT: s_cbranch_execnz .LBB21_1
2472 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2473 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
2474 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
2475 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
2476 ; GFX11-NEXT: ; return to shader part epilog
2478 ; GFX12-LABEL: global_umax_saddr_i64_rtn_neg128:
2480 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
2481 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
2482 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2483 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
2484 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2485 ; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start
2486 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2487 ; GFX12-NEXT: s_wait_loadcnt 0x0
2488 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
2489 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
2490 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2491 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[9:10], v[1:2]
2492 ; GFX12-NEXT: s_wait_alu 0xfffd
2493 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
2494 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
2495 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2496 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2497 ; GFX12-NEXT: s_wait_loadcnt 0x0
2498 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2499 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
2500 ; GFX12-NEXT: s_wait_alu 0xfffe
2501 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2502 ; GFX12-NEXT: s_wait_alu 0xfffe
2503 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2504 ; GFX12-NEXT: s_cbranch_execnz .LBB21_1
2505 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2506 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
2507 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
2508 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
2509 ; GFX12-NEXT: ; return to shader part epilog
2510 %zext.offset = zext i32 %voffset to i64
2511 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2512 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2513 %rtn = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
2514 %cast.rtn = bitcast i64 %rtn to <2 x float>
2515 ret <2 x float> %cast.rtn
; 64-bit seq_cst `atomicrmw umax` at %sbase + zext(%voffset) whose result is
; not used. The expected code is still a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end) using the returning form of the 64-bit
; cmpswap, because the returned value is compared against the expected one to
; decide whether to retry. Compared with the returning variant, the checks
; expect the old value to be carried in v[5:6] and refreshed at the bottom of
; the loop, and the function to end with s_endpgm right after the loop.
2518 define amdgpu_ps void @global_umax_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2519 ; GFX9-LABEL: global_umax_saddr_i64_nortn:
2521 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
2522 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2523 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
2524 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
2525 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2526 ; GFX9-NEXT: .LBB22_1: ; %atomicrmw.start
2527 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2528 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2529 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2530 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2531 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2532 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
2533 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2534 ; GFX9-NEXT: buffer_wbinvl1
2535 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2536 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
2537 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2538 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
2539 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2540 ; GFX9-NEXT: s_cbranch_execnz .LBB22_1
2541 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2542 ; GFX9-NEXT: s_endpgm
2544 ; GFX10-LABEL: global_umax_saddr_i64_nortn:
2546 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
2547 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
2548 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
2549 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2550 ; GFX10-NEXT: .LBB22_1: ; %atomicrmw.start
2551 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2552 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2553 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2554 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2555 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2556 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
2557 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2558 ; GFX10-NEXT: buffer_gl1_inv
2559 ; GFX10-NEXT: buffer_gl0_inv
2560 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2561 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
2562 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
2563 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2564 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2565 ; GFX10-NEXT: s_cbranch_execnz .LBB22_1
2566 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2567 ; GFX10-NEXT: s_endpgm
2569 ; GFX11-LABEL: global_umax_saddr_i64_nortn:
2571 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
2572 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
2573 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2574 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2575 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2576 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2577 ; GFX11-NEXT: .LBB22_1: ; %atomicrmw.start
2578 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2579 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2580 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2581 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2582 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2583 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
2584 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2585 ; GFX11-NEXT: buffer_gl1_inv
2586 ; GFX11-NEXT: buffer_gl0_inv
2587 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2588 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
2589 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
2590 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2591 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2592 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2593 ; GFX11-NEXT: s_cbranch_execnz .LBB22_1
2594 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2595 ; GFX11-NEXT: s_endpgm
2597 ; GFX12-LABEL: global_umax_saddr_i64_nortn:
2599 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3]
2600 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
2601 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2602 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2603 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2604 ; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start
2605 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2606 ; GFX12-NEXT: s_wait_loadcnt 0x0
2607 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2608 ; GFX12-NEXT: s_wait_alu 0xfffd
2609 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2610 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2611 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2612 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2613 ; GFX12-NEXT: s_wait_loadcnt 0x0
2614 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2615 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2616 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
2617 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
2618 ; GFX12-NEXT: s_wait_alu 0xfffe
2619 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2620 ; GFX12-NEXT: s_wait_alu 0xfffe
2621 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2622 ; GFX12-NEXT: s_cbranch_execnz .LBB22_1
2623 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2624 ; GFX12-NEXT: s_endpgm
2625 %zext.offset = zext i32 %voffset to i64
2626 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2627 %unused = atomicrmw umax ptr addrspace(1) %gep0, i64 %data seq_cst
; Same as the non-returning 64-bit umax case, but the address is additionally
; displaced by -128 bytes (%gep1). The expected compare-and-swap loop keeps
; the same shape; the displacement is expected to appear as `offset:-128` on
; both the initial 64-bit load and the 64-bit global_atomic_cmpswap for every
; target.
2631 define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
2632 ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
2634 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
2635 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2636 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
2637 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
2638 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2639 ; GFX9-NEXT: .LBB23_1: ; %atomicrmw.start
2640 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2641 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2642 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2643 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2644 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2645 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
2646 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2647 ; GFX9-NEXT: buffer_wbinvl1
2648 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2649 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
2650 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2651 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
2652 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2653 ; GFX9-NEXT: s_cbranch_execnz .LBB23_1
2654 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2655 ; GFX9-NEXT: s_endpgm
2657 ; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
2659 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
2660 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
2661 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
2662 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2663 ; GFX10-NEXT: .LBB23_1: ; %atomicrmw.start
2664 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2665 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2666 ; GFX10-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2667 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2668 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2669 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
2670 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2671 ; GFX10-NEXT: buffer_gl1_inv
2672 ; GFX10-NEXT: buffer_gl0_inv
2673 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2674 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
2675 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
2676 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2677 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2678 ; GFX10-NEXT: s_cbranch_execnz .LBB23_1
2679 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2680 ; GFX10-NEXT: s_endpgm
2682 ; GFX11-LABEL: global_umax_saddr_i64_nortn_neg128:
2684 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
2685 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
2686 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2687 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2688 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2689 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2690 ; GFX11-NEXT: .LBB23_1: ; %atomicrmw.start
2691 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2692 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2693 ; GFX11-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2694 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2695 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2696 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
2697 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2698 ; GFX11-NEXT: buffer_gl1_inv
2699 ; GFX11-NEXT: buffer_gl0_inv
2700 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2701 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
2702 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
2703 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2704 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2705 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2706 ; GFX11-NEXT: s_cbranch_execnz .LBB23_1
2707 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2708 ; GFX11-NEXT: s_endpgm
2710 ; GFX12-LABEL: global_umax_saddr_i64_nortn_neg128:
2712 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
2713 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
2714 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2715 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
2716 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2717 ; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start
2718 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2719 ; GFX12-NEXT: s_wait_loadcnt 0x0
2720 ; GFX12-NEXT: v_cmp_gt_u64_e32 vcc, v[5:6], v[1:2]
2721 ; GFX12-NEXT: s_wait_alu 0xfffd
2722 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
2723 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
2724 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2725 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2726 ; GFX12-NEXT: s_wait_loadcnt 0x0
2727 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2728 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
2729 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
2730 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
2731 ; GFX12-NEXT: s_wait_alu 0xfffe
2732 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2733 ; GFX12-NEXT: s_wait_alu 0xfffe
2734 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2735 ; GFX12-NEXT: s_cbranch_execnz .LBB23_1
2736 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2737 ; GFX12-NEXT: s_endpgm
2738 %zext.offset = zext i32 %voffset to i64
2739 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2740 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2741 %unused = atomicrmw umax ptr addrspace(1) %gep1, i64 %data seq_cst
2745 ; --------------------------------------------------------------------------------
2747 ; --------------------------------------------------------------------------------
; 32-bit seq_cst `atomicrmw umin` at %sbase + zext(%voffset) whose old value
; is bitcast to float (the function's return type). The expected code for
; every target is a compare-and-swap retry loop
; (%atomicrmw.start .. %atomicrmw.end): v_min_u32 computes the new value from
; the last observed one and global_atomic_cmpswap is retried until the
; returned value matches the expected one. Because %voffset arrives in v0 and
; the result is also produced in v0, the checks expect v0 to be copied to v2
; before the initial load overwrites it.
2749 define amdgpu_ps float @global_umin_saddr_i32_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2750 ; GFX9-LABEL: global_umin_saddr_i32_rtn:
2752 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
2753 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3]
2754 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2755 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
2756 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2757 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2758 ; GFX9-NEXT: .LBB24_1: ; %atomicrmw.start
2759 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2760 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2761 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
2762 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
2763 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2764 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2765 ; GFX9-NEXT: buffer_wbinvl1
2766 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2767 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2768 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2769 ; GFX9-NEXT: s_cbranch_execnz .LBB24_1
2770 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2771 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
2772 ; GFX9-NEXT: ; return to shader part epilog
2774 ; GFX10-LABEL: global_umin_saddr_i32_rtn:
2776 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
2777 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3]
2778 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2779 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2780 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2781 ; GFX10-NEXT: .LBB24_1: ; %atomicrmw.start
2782 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2783 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2784 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
2785 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
2786 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2787 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2788 ; GFX10-NEXT: buffer_gl1_inv
2789 ; GFX10-NEXT: buffer_gl0_inv
2790 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2791 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2792 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2793 ; GFX10-NEXT: s_cbranch_execnz .LBB24_1
2794 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2795 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
2796 ; GFX10-NEXT: ; return to shader part epilog
2798 ; GFX11-LABEL: global_umin_saddr_i32_rtn:
2800 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
2801 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3]
2802 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2803 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2804 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2805 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2806 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2807 ; GFX11-NEXT: .LBB24_1: ; %atomicrmw.start
2808 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2809 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2810 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
2811 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2812 ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
2813 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
2814 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2815 ; GFX11-NEXT: buffer_gl1_inv
2816 ; GFX11-NEXT: buffer_gl0_inv
2817 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2818 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2819 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2820 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2821 ; GFX11-NEXT: s_cbranch_execnz .LBB24_1
2822 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2823 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
2824 ; GFX11-NEXT: ; return to shader part epilog
2826 ; GFX12-LABEL: global_umin_saddr_i32_rtn:
2828 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
2829 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3]
2830 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2831 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2832 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2833 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2834 ; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start
2835 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2836 ; GFX12-NEXT: s_wait_loadcnt 0x0
2837 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
2838 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2839 ; GFX12-NEXT: v_min_u32_e32 v4, v5, v1
2840 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2841 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2842 ; GFX12-NEXT: s_wait_loadcnt 0x0
2843 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2844 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2845 ; GFX12-NEXT: s_wait_alu 0xfffe
2846 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2847 ; GFX12-NEXT: s_wait_alu 0xfffe
2848 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2849 ; GFX12-NEXT: s_cbranch_execnz .LBB24_1
2850 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2851 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
2852 ; GFX12-NEXT: ; return to shader part epilog
2853 %zext.offset = zext i32 %voffset to i64
2854 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2855 %rtn = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
2856 %cast.rtn = bitcast i32 %rtn to float
; Returning i32 `atomicrmw umin` (seq_cst) on a global address formed from the
; inreg base %sbase plus the zero-extended %voffset, with an extra constant
; byte offset of -128. The expected code for every checked target is a
; compare-and-swap retry loop: load the current value, take the unsigned min
; with %data (v_min_u32), then issue global_atomic_cmpswap and repeat until
; the returned value equals the expected one. The -128 shows up as the
; immediate `offset:-128` on both the initial load and the cmpswap.
2860 define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2861 ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
2863 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
2864 ; GFX9-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
2865 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2866 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v2
2867 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2868 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2869 ; GFX9-NEXT: .LBB25_1: ; %atomicrmw.start
2870 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2871 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2872 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
2873 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
2874 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2875 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2876 ; GFX9-NEXT: buffer_wbinvl1
2877 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2878 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2879 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2880 ; GFX9-NEXT: s_cbranch_execnz .LBB25_1
2881 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2882 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
2883 ; GFX9-NEXT: ; return to shader part epilog
2885 ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
2887 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
2888 ; GFX10-NEXT: global_load_dword v0, v0, s[2:3] offset:-128
2889 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2890 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
2891 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
2892 ; GFX10-NEXT: .LBB25_1: ; %atomicrmw.start
2893 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
2894 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2895 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
2896 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
2897 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
2898 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2899 ; GFX10-NEXT: buffer_gl1_inv
2900 ; GFX10-NEXT: buffer_gl0_inv
2901 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2902 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2903 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
2904 ; GFX10-NEXT: s_cbranch_execnz .LBB25_1
2905 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
2906 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
2907 ; GFX10-NEXT: ; return to shader part epilog
2909 ; GFX11-LABEL: global_umin_saddr_i32_rtn_neg128:
2911 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
2912 ; GFX11-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
2913 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2914 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2915 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2916 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
2917 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
2918 ; GFX11-NEXT: .LBB25_1: ; %atomicrmw.start
2919 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
2920 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2921 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
2922 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2923 ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
2924 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
2925 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2926 ; GFX11-NEXT: buffer_gl1_inv
2927 ; GFX11-NEXT: buffer_gl0_inv
2928 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2929 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2930 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2931 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2932 ; GFX11-NEXT: s_cbranch_execnz .LBB25_1
2933 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
2934 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
2935 ; GFX11-NEXT: ; return to shader part epilog
2937 ; GFX12-LABEL: global_umin_saddr_i32_rtn_neg128:
2939 ; GFX12-NEXT: v_mov_b32_e32 v2, v0
2940 ; GFX12-NEXT: global_load_b32 v0, v0, s[2:3] offset:-128
2941 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v2
2942 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2943 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
2944 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
2945 ; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start
2946 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2947 ; GFX12-NEXT: s_wait_loadcnt 0x0
2948 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
2949 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2950 ; GFX12-NEXT: v_min_u32_e32 v4, v5, v1
2951 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
2952 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
2953 ; GFX12-NEXT: s_wait_loadcnt 0x0
2954 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
2955 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2956 ; GFX12-NEXT: s_wait_alu 0xfffe
2957 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2958 ; GFX12-NEXT: s_wait_alu 0xfffe
2959 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
2960 ; GFX12-NEXT: s_cbranch_execnz .LBB25_1
2961 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
2962 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
2963 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: address = %sbase + zext(%voffset) - 128 (byte GEPs); the
; atomicrmw result is bitcast to float.
2964 %zext.offset = zext i32 %voffset to i64
2965 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
2966 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
2967 %rtn = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
2968 %cast.rtn = bitcast i32 %rtn to float
; Non-returning i32 `atomicrmw umin` (seq_cst): the function returns void and
; the atomicrmw result (%unused) has no users. The expected code is still a
; compare-and-swap retry loop, so the cmpswap keeps a destination register
; (v0): the loop compares the returned value against the expected one in v5
; and copies it back into v5 for the next iteration. Each expected sequence
; ends with s_endpgm.
2972 define amdgpu_ps void @global_umin_saddr_i32_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
2973 ; GFX9-LABEL: global_umin_saddr_i32_nortn:
2975 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3]
2976 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2977 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
2978 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
2979 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
2980 ; GFX9-NEXT: .LBB26_1: ; %atomicrmw.start
2981 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
2982 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2983 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
2984 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
2985 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2986 ; GFX9-NEXT: buffer_wbinvl1
2987 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
2988 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
2989 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
2990 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
2991 ; GFX9-NEXT: s_cbranch_execnz .LBB26_1
2992 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
2993 ; GFX9-NEXT: s_endpgm
2995 ; GFX10-LABEL: global_umin_saddr_i32_nortn:
2997 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3]
2998 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
2999 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
3000 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3001 ; GFX10-NEXT: .LBB26_1: ; %atomicrmw.start
3002 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3003 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3004 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
3005 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off glc
3006 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3007 ; GFX10-NEXT: buffer_gl1_inv
3008 ; GFX10-NEXT: buffer_gl0_inv
3009 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3010 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
3011 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3012 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
3013 ; GFX10-NEXT: s_cbranch_execnz .LBB26_1
3014 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3015 ; GFX10-NEXT: s_endpgm
3017 ; GFX11-LABEL: global_umin_saddr_i32_nortn:
3019 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3]
3020 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
3021 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3022 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3023 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3024 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
3025 ; GFX11-NEXT: .LBB26_1: ; %atomicrmw.start
3026 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3027 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3028 ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
3029 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off glc
3030 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3031 ; GFX11-NEXT: buffer_gl1_inv
3032 ; GFX11-NEXT: buffer_gl0_inv
3033 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3034 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
3035 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3036 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3037 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3038 ; GFX11-NEXT: s_cbranch_execnz .LBB26_1
3039 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3040 ; GFX11-NEXT: s_endpgm
3042 ; GFX12-LABEL: global_umin_saddr_i32_nortn:
3044 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3]
3045 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
3046 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3047 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3048 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
3049 ; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start
3050 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3051 ; GFX12-NEXT: s_wait_loadcnt 0x0
3052 ; GFX12-NEXT: v_min_u32_e32 v4, v5, v1
3053 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3054 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3055 ; GFX12-NEXT: s_wait_loadcnt 0x0
3056 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3057 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3058 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
3059 ; GFX12-NEXT: s_wait_alu 0xfffe
3060 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3061 ; GFX12-NEXT: s_wait_alu 0xfffe
3062 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3063 ; GFX12-NEXT: s_cbranch_execnz .LBB26_1
3064 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3065 ; GFX12-NEXT: s_endpgm
; IR under test: address = %sbase + zext(%voffset) (byte GEP); the atomicrmw
; result is deliberately left unused.
3066 %zext.offset = zext i32 %voffset to i64
3067 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3068 %unused = atomicrmw umin ptr addrspace(1) %gep0, i32 %data seq_cst
; Non-returning i32 `atomicrmw umin` (seq_cst) with an extra constant byte
; offset of -128 on top of %sbase + zext(%voffset). The function returns void
; and the atomicrmw result (%unused) has no users. The expected code is a
; compare-and-swap retry loop in which `offset:-128` appears on both the
; initial load and the cmpswap; the value returned by the cmpswap in v0 is
; compared with, and then copied into, v5 for the next iteration.
3072 define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i32 %data) {
3073 ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
3075 ; GFX9-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
3076 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3077 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
3078 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
3079 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3080 ; GFX9-NEXT: .LBB27_1: ; %atomicrmw.start
3081 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3082 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3083 ; GFX9-NEXT: v_min_u32_e32 v4, v5, v1
3084 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
3085 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3086 ; GFX9-NEXT: buffer_wbinvl1
3087 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3088 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3089 ; GFX9-NEXT: v_mov_b32_e32 v5, v0
3090 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
3091 ; GFX9-NEXT: s_cbranch_execnz .LBB27_1
3092 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3093 ; GFX9-NEXT: s_endpgm
3095 ; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
3097 ; GFX10-NEXT: global_load_dword v5, v0, s[2:3] offset:-128
3098 ; GFX10-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
3099 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
3100 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3101 ; GFX10-NEXT: .LBB27_1: ; %atomicrmw.start
3102 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3103 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3104 ; GFX10-NEXT: v_min_u32_e32 v4, v5, v1
3105 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], v[4:5], off offset:-128 glc
3106 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3107 ; GFX10-NEXT: buffer_gl1_inv
3108 ; GFX10-NEXT: buffer_gl0_inv
3109 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3110 ; GFX10-NEXT: v_mov_b32_e32 v5, v0
3111 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3112 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
3113 ; GFX10-NEXT: s_cbranch_execnz .LBB27_1
3114 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3115 ; GFX10-NEXT: s_endpgm
3117 ; GFX11-LABEL: global_umin_saddr_i32_nortn_neg128:
3119 ; GFX11-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
3120 ; GFX11-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
3121 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3122 ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3123 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3124 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
3125 ; GFX11-NEXT: .LBB27_1: ; %atomicrmw.start
3126 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3127 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3128 ; GFX11-NEXT: v_min_u32_e32 v4, v5, v1
3129 ; GFX11-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 glc
3130 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3131 ; GFX11-NEXT: buffer_gl1_inv
3132 ; GFX11-NEXT: buffer_gl0_inv
3133 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3134 ; GFX11-NEXT: v_mov_b32_e32 v5, v0
3135 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3136 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3137 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3138 ; GFX11-NEXT: s_cbranch_execnz .LBB27_1
3139 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3140 ; GFX11-NEXT: s_endpgm
3142 ; GFX12-LABEL: global_umin_saddr_i32_nortn_neg128:
3144 ; GFX12-NEXT: global_load_b32 v5, v0, s[2:3] offset:-128
3145 ; GFX12-NEXT: v_add_co_u32 v2, s[0:1], s2, v0
3146 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3147 ; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s3, 0, s[0:1]
3148 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
3149 ; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start
3150 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3151 ; GFX12-NEXT: s_wait_loadcnt 0x0
3152 ; GFX12-NEXT: v_min_u32_e32 v4, v5, v1
3153 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3154 ; GFX12-NEXT: global_atomic_cmpswap_b32 v0, v[2:3], v[4:5], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3155 ; GFX12-NEXT: s_wait_loadcnt 0x0
3156 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3157 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
3158 ; GFX12-NEXT: v_mov_b32_e32 v5, v0
3159 ; GFX12-NEXT: s_wait_alu 0xfffe
3160 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3161 ; GFX12-NEXT: s_wait_alu 0xfffe
3162 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3163 ; GFX12-NEXT: s_cbranch_execnz .LBB27_1
3164 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3165 ; GFX12-NEXT: s_endpgm
; IR under test: address = %sbase + zext(%voffset) - 128 (byte GEPs); the
; atomicrmw result is deliberately left unused.
3166 %zext.offset = zext i32 %voffset to i64
3167 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3168 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3169 %unused = atomicrmw umin ptr addrspace(1) %gep1, i32 %data seq_cst
; Returning i64 `atomicrmw umin` (seq_cst) on %sbase + zext(%voffset). The
; expected code is a compare-and-swap retry loop: the 64-bit unsigned min is
; formed with v_cmp_le_u64 followed by two v_cndmask_b32 selects, then a
; 64-bit cmpswap (global_atomic_cmpswap_x2 / global_atomic_cmpswap_b64) is
; issued with v[7:10] as its data operand. After the loop the result held in
; v[3:4] is copied into v0/v1.
3173 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3174 ; GFX9-LABEL: global_umin_saddr_i64_rtn:
3176 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
3177 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
3178 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
3179 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
3180 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3181 ; GFX9-NEXT: .LBB28_1: ; %atomicrmw.start
3182 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3183 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3184 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
3185 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
3186 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3187 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3188 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3189 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
3190 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3191 ; GFX9-NEXT: buffer_wbinvl1
3192 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3193 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3194 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
3195 ; GFX9-NEXT: s_cbranch_execnz .LBB28_1
3196 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3197 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
3198 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
3199 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
3200 ; GFX9-NEXT: ; return to shader part epilog
3202 ; GFX10-LABEL: global_umin_saddr_i64_rtn:
3204 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3]
3205 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
3206 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
3207 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3208 ; GFX10-NEXT: .LBB28_1: ; %atomicrmw.start
3209 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3210 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3211 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
3212 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
3213 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3214 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3215 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3216 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off glc
3217 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3218 ; GFX10-NEXT: buffer_gl1_inv
3219 ; GFX10-NEXT: buffer_gl0_inv
3220 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3221 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3222 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
3223 ; GFX10-NEXT: s_cbranch_execnz .LBB28_1
3224 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3225 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
3226 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
3227 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
3228 ; GFX10-NEXT: ; return to shader part epilog
3230 ; GFX11-LABEL: global_umin_saddr_i64_rtn:
3232 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3]
3233 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
3234 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3235 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3236 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3237 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
3238 ; GFX11-NEXT: .LBB28_1: ; %atomicrmw.start
3239 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3240 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3241 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
3242 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
3243 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3244 ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3245 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3246 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3247 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off glc
3248 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3249 ; GFX11-NEXT: buffer_gl1_inv
3250 ; GFX11-NEXT: buffer_gl0_inv
3251 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3252 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3253 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3254 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3255 ; GFX11-NEXT: s_cbranch_execnz .LBB28_1
3256 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3257 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
3258 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
3259 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
3260 ; GFX11-NEXT: ; return to shader part epilog
3262 ; GFX12-LABEL: global_umin_saddr_i64_rtn:
3264 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3]
3265 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
3266 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3267 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3268 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
3269 ; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start
3270 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3271 ; GFX12-NEXT: s_wait_loadcnt 0x0
3272 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
3273 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
3274 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3275 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3276 ; GFX12-NEXT: s_wait_alu 0xfffd
3277 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3278 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3279 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3280 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3281 ; GFX12-NEXT: s_wait_loadcnt 0x0
3282 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3283 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3284 ; GFX12-NEXT: s_wait_alu 0xfffe
3285 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3286 ; GFX12-NEXT: s_wait_alu 0xfffe
3287 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3288 ; GFX12-NEXT: s_cbranch_execnz .LBB28_1
3289 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3290 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
3291 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
3292 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
3293 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: address = %sbase + zext(%voffset) (byte GEP); the i64
; atomicrmw result is bitcast to <2 x float> and returned.
3294 %zext.offset = zext i32 %voffset to i64
3295 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3296 %rtn = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
3297 %cast.rtn = bitcast i64 %rtn to <2 x float>
3298 ret <2 x float> %cast.rtn
; Returning i64 `atomicrmw umin` (seq_cst) with an extra constant byte offset
; of -128 on top of %sbase + zext(%voffset). The expected code is a
; compare-and-swap retry loop (v_cmp_le_u64 + two v_cndmask_b32 selects, then
; a 64-bit cmpswap) in which `offset:-128` appears on both the initial 64-bit
; load and the cmpswap. After the loop the result held in v[3:4] is copied
; into v0/v1.
3301 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3302 ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
3304 ; GFX9-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
3305 ; GFX9-NEXT: v_mov_b32_e32 v6, s3
3306 ; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, s2, v0
3307 ; GFX9-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
3308 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3309 ; GFX9-NEXT: .LBB29_1: ; %atomicrmw.start
3310 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3312 ; GFX9-NEXT: v_mov_b32_e32 v10, v4
3313 ; GFX9-NEXT: v_mov_b32_e32 v9, v3
3314 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3315 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3316 ; GFX9-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3317 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
3318 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3319 ; GFX9-NEXT: buffer_wbinvl1
3320 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3321 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3322 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
3323 ; GFX9-NEXT: s_cbranch_execnz .LBB29_1
3324 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3325 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
3326 ; GFX9-NEXT: v_mov_b32_e32 v0, v3
3327 ; GFX9-NEXT: v_mov_b32_e32 v1, v4
3328 ; GFX9-NEXT: ; return to shader part epilog
3330 ; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
3332 ; GFX10-NEXT: global_load_dwordx2 v[3:4], v0, s[2:3] offset:-128
3333 ; GFX10-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
3334 ; GFX10-NEXT: v_add_co_ci_u32_e64 v6, s[0:1], s3, 0, s[0:1]
3335 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3336 ; GFX10-NEXT: .LBB29_1: ; %atomicrmw.start
3337 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3338 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3339 ; GFX10-NEXT: v_mov_b32_e32 v10, v4
3340 ; GFX10-NEXT: v_mov_b32_e32 v9, v3
3341 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3342 ; GFX10-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3343 ; GFX10-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3344 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[5:6], v[7:10], off offset:-128 glc
3345 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3346 ; GFX10-NEXT: buffer_gl1_inv
3347 ; GFX10-NEXT: buffer_gl0_inv
3348 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3349 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3350 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
3351 ; GFX10-NEXT: s_cbranch_execnz .LBB29_1
3352 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3353 ; GFX10-NEXT: s_or_b64 exec, exec, s[0:1]
3354 ; GFX10-NEXT: v_mov_b32_e32 v0, v3
3355 ; GFX10-NEXT: v_mov_b32_e32 v1, v4
3356 ; GFX10-NEXT: ; return to shader part epilog
3358 ; GFX11-LABEL: global_umin_saddr_i64_rtn_neg128:
3360 ; GFX11-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
3361 ; GFX11-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
3362 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3363 ; GFX11-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3364 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3365 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
3366 ; GFX11-NEXT: .LBB29_1: ; %atomicrmw.start
3367 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3368 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3369 ; GFX11-NEXT: v_mov_b32_e32 v10, v4
3370 ; GFX11-NEXT: v_mov_b32_e32 v9, v3
3371 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3372 ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3373 ; GFX11-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3374 ; GFX11-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3375 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 glc
3376 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3377 ; GFX11-NEXT: buffer_gl1_inv
3378 ; GFX11-NEXT: buffer_gl0_inv
3379 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3380 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3381 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3382 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3383 ; GFX11-NEXT: s_cbranch_execnz .LBB29_1
3384 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3385 ; GFX11-NEXT: s_or_b64 exec, exec, s[0:1]
3386 ; GFX11-NEXT: v_mov_b32_e32 v0, v3
3387 ; GFX11-NEXT: v_mov_b32_e32 v1, v4
3388 ; GFX11-NEXT: ; return to shader part epilog
3390 ; GFX12-LABEL: global_umin_saddr_i64_rtn_neg128:
3392 ; GFX12-NEXT: global_load_b64 v[3:4], v0, s[2:3] offset:-128
3393 ; GFX12-NEXT: v_add_co_u32 v5, s[0:1], s2, v0
3394 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3395 ; GFX12-NEXT: v_add_co_ci_u32_e64 v6, null, s3, 0, s[0:1]
3396 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
3397 ; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start
3398 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3399 ; GFX12-NEXT: s_wait_loadcnt 0x0
3400 ; GFX12-NEXT: v_mov_b32_e32 v10, v4
3401 ; GFX12-NEXT: v_mov_b32_e32 v9, v3
3402 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3403 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[9:10], v[1:2]
3404 ; GFX12-NEXT: s_wait_alu 0xfffd
3405 ; GFX12-NEXT: v_cndmask_b32_e32 v8, v2, v10, vcc
3406 ; GFX12-NEXT: v_cndmask_b32_e32 v7, v1, v9, vcc
3407 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3408 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[5:6], v[7:10], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3409 ; GFX12-NEXT: s_wait_loadcnt 0x0
3410 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3411 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[9:10]
3412 ; GFX12-NEXT: s_wait_alu 0xfffe
3413 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3414 ; GFX12-NEXT: s_wait_alu 0xfffe
3415 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3416 ; GFX12-NEXT: s_cbranch_execnz .LBB29_1
3417 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3418 ; GFX12-NEXT: s_or_b64 exec, exec, s[0:1]
3419 ; GFX12-NEXT: v_mov_b32_e32 v0, v3
3420 ; GFX12-NEXT: v_mov_b32_e32 v1, v4
3421 ; GFX12-NEXT: ; return to shader part epilog
; IR under test: address = %sbase + zext(%voffset) - 128 (byte GEPs); the i64
; atomicrmw result is bitcast to <2 x float> and returned.
3422 %zext.offset = zext i32 %voffset to i64
3423 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3424 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3425 %rtn = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
3426 %cast.rtn = bitcast i64 %rtn to <2 x float>
3427 ret <2 x float> %cast.rtn
; Non-returning i64 `atomicrmw umin` (seq_cst) on %sbase + zext(%voffset): the
; function returns void and the atomicrmw result (%unused) has no users. The
; expected code is still a compare-and-swap retry loop (v_cmp_le_u64 + two
; v_cndmask_b32 selects, then a 64-bit cmpswap). The cmpswap writes its
; returned value to v[3:4], which is compared with the expected value in
; v[5:6] and then copied into v[5:6] for the next iteration. Each expected
; sequence ends with s_endpgm.
3430 define amdgpu_ps void @global_umin_saddr_i64_nortn(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3431 ; GFX9-LABEL: global_umin_saddr_i64_nortn:
3433 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
3434 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3435 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
3436 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
3437 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3438 ; GFX9-NEXT: .LBB30_1: ; %atomicrmw.start
3439 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3440 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3441 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3442 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3443 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3444 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
3445 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3446 ; GFX9-NEXT: buffer_wbinvl1
3447 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3448 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
3449 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3450 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
3451 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
3452 ; GFX9-NEXT: s_cbranch_execnz .LBB30_1
3453 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3454 ; GFX9-NEXT: s_endpgm
3456 ; GFX10-LABEL: global_umin_saddr_i64_nortn:
3458 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3]
3459 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
3460 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
3461 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3462 ; GFX10-NEXT: .LBB30_1: ; %atomicrmw.start
3463 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3464 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3465 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3466 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3467 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3468 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off glc
3469 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3470 ; GFX10-NEXT: buffer_gl1_inv
3471 ; GFX10-NEXT: buffer_gl0_inv
3472 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3473 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
3474 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
3475 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3476 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
3477 ; GFX10-NEXT: s_cbranch_execnz .LBB30_1
3478 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3479 ; GFX10-NEXT: s_endpgm
3481 ; GFX11-LABEL: global_umin_saddr_i64_nortn:
3483 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3]
3484 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
3485 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3486 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3487 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3488 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
3489 ; GFX11-NEXT: .LBB30_1: ; %atomicrmw.start
3490 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3491 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3492 ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3493 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3494 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3495 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off glc
3496 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3497 ; GFX11-NEXT: buffer_gl1_inv
3498 ; GFX11-NEXT: buffer_gl0_inv
3499 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3500 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
3501 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
3502 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3503 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3504 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3505 ; GFX11-NEXT: s_cbranch_execnz .LBB30_1
3506 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3507 ; GFX11-NEXT: s_endpgm
3509 ; GFX12-LABEL: global_umin_saddr_i64_nortn:
3511 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3]
3512 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
3513 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3514 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3515 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
3516 ; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start
3517 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3518 ; GFX12-NEXT: s_wait_loadcnt 0x0
3519 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3520 ; GFX12-NEXT: s_wait_alu 0xfffd
3521 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3522 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3523 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3524 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3525 ; GFX12-NEXT: s_wait_loadcnt 0x0
3526 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3527 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3528 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
3529 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
3530 ; GFX12-NEXT: s_wait_alu 0xfffe
3531 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3532 ; GFX12-NEXT: s_wait_alu 0xfffe
3533 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3534 ; GFX12-NEXT: s_cbranch_execnz .LBB30_1
3535 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3536 ; GFX12-NEXT: s_endpgm
; IR under test: address = %sbase + zext(%voffset) (byte GEP); the i64
; atomicrmw result is deliberately left unused.
3537 %zext.offset = zext i32 %voffset to i64
3538 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3539 %unused = atomicrmw umin ptr addrspace(1) %gep0, i64 %data seq_cst
; Non-returning i64 `atomicrmw umin` (seq_cst, no explicit syncscope) on a
; global (addrspace(1)) pointer formed from the inreg base %sbase plus the
; zero-extended 32-bit %voffset, with an extra constant byte offset of -128.
; The checks below show each tested target (GFX9/GFX10/GFX11/GFX12) expanding
; the operation into a compare-and-swap loop (%atomicrmw.start) and encoding
; the -128 as the immediate `offset:-128` on both the initial load and the
; cmpswap instruction.
; The CHECK lines are autogenerated (see the NOTE at the top of the file);
; regenerate them with the update script instead of editing them by hand.
3543 define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(ptr addrspace(1) inreg %sbase, i32 %voffset, i64 %data) {
3544 ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
3546 ; GFX9-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
3547 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
3548 ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, s2, v0
3549 ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, 0, v3, vcc
3550 ; GFX9-NEXT: s_mov_b64 s[0:1], 0
3551 ; GFX9-NEXT: .LBB31_1: ; %atomicrmw.start
3552 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
3553 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3554 ; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3555 ; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3556 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3557 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
3558 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3559 ; GFX9-NEXT: buffer_wbinvl1
3560 ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3561 ; GFX9-NEXT: v_mov_b32_e32 v6, v4
3562 ; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3563 ; GFX9-NEXT: v_mov_b32_e32 v5, v3
3564 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
3565 ; GFX9-NEXT: s_cbranch_execnz .LBB31_1
3566 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
3567 ; GFX9-NEXT: s_endpgm
3569 ; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
3571 ; GFX10-NEXT: global_load_dwordx2 v[5:6], v0, s[2:3] offset:-128
3572 ; GFX10-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
3573 ; GFX10-NEXT: v_add_co_ci_u32_e64 v8, s[0:1], s3, 0, s[0:1]
3574 ; GFX10-NEXT: s_mov_b64 s[0:1], 0
3575 ; GFX10-NEXT: .LBB31_1: ; %atomicrmw.start
3576 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
3577 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3578 ; GFX10-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3579 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3580 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3581 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[3:4], v[7:8], v[3:6], off offset:-128 glc
3582 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3583 ; GFX10-NEXT: buffer_gl1_inv
3584 ; GFX10-NEXT: buffer_gl0_inv
3585 ; GFX10-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3586 ; GFX10-NEXT: v_mov_b32_e32 v6, v4
3587 ; GFX10-NEXT: v_mov_b32_e32 v5, v3
3588 ; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3589 ; GFX10-NEXT: s_andn2_b64 exec, exec, s[0:1]
3590 ; GFX10-NEXT: s_cbranch_execnz .LBB31_1
3591 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
3592 ; GFX10-NEXT: s_endpgm
3594 ; GFX11-LABEL: global_umin_saddr_i64_nortn_neg128:
3596 ; GFX11-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
3597 ; GFX11-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
3598 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3599 ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3600 ; GFX11-NEXT: s_mov_b64 s[0:1], 0
3601 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe
3602 ; GFX11-NEXT: .LBB31_1: ; %atomicrmw.start
3603 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
3604 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3605 ; GFX11-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3606 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3607 ; GFX11-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3608 ; GFX11-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 glc
3609 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3610 ; GFX11-NEXT: buffer_gl1_inv
3611 ; GFX11-NEXT: buffer_gl0_inv
3612 ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3613 ; GFX11-NEXT: v_mov_b32_e32 v6, v4
3614 ; GFX11-NEXT: v_mov_b32_e32 v5, v3
3615 ; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3616 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3617 ; GFX11-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3618 ; GFX11-NEXT: s_cbranch_execnz .LBB31_1
3619 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
3620 ; GFX11-NEXT: s_endpgm
3622 ; GFX12-LABEL: global_umin_saddr_i64_nortn_neg128:
3624 ; GFX12-NEXT: global_load_b64 v[5:6], v0, s[2:3] offset:-128
3625 ; GFX12-NEXT: v_add_co_u32 v7, s[0:1], s2, v0
3626 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
3627 ; GFX12-NEXT: v_add_co_ci_u32_e64 v8, null, s3, 0, s[0:1]
3628 ; GFX12-NEXT: s_mov_b64 s[0:1], 0
3629 ; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start
3630 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
3631 ; GFX12-NEXT: s_wait_loadcnt 0x0
3632 ; GFX12-NEXT: v_cmp_le_u64_e32 vcc, v[5:6], v[1:2]
3633 ; GFX12-NEXT: s_wait_alu 0xfffd
3634 ; GFX12-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
3635 ; GFX12-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
3636 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
3637 ; GFX12-NEXT: global_atomic_cmpswap_b64 v[3:4], v[7:8], v[3:6], off offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
3638 ; GFX12-NEXT: s_wait_loadcnt 0x0
3639 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
3640 ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
3641 ; GFX12-NEXT: v_mov_b32_e32 v6, v4
3642 ; GFX12-NEXT: v_mov_b32_e32 v5, v3
3643 ; GFX12-NEXT: s_wait_alu 0xfffe
3644 ; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
3645 ; GFX12-NEXT: s_wait_alu 0xfffe
3646 ; GFX12-NEXT: s_and_not1_b64 exec, exec, s[0:1]
3647 ; GFX12-NEXT: s_cbranch_execnz .LBB31_1
3648 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
3649 ; GFX12-NEXT: s_endpgm
; Address = %sbase + zext(%voffset) - 128; the atomicrmw result (%unused) is
; never read, which is what makes this the "nortn" variant.
3650 %zext.offset = zext i32 %voffset to i64
3651 %gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
3652 %gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
3653 %unused = atomicrmw umin ptr addrspace(1) %gep1, i64 %data seq_cst
; Attribute group #0. None of the function definitions visible in this part of
; the file reference it.
; NOTE(review): `argmemonly` looks like the legacy spelling of what newer LLVM
; prints as `memory(argmem: readwrite)` - confirm against the LangRef, and
; check whether #0 is still used anywhere, before modernizing or removing it.
3657 attributes #0 = { argmemonly nounwind willreturn }