1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
3 ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck -check-prefixes=GCN,GFX10 %s
5 ; Test using saddr addressing mode of global_* flat atomic instructions.
; Uniform SGPR base + zero-extended VGPR offset, result unused: selects the
; saddr form of global_atomic_swap (no glc since the old value is dead).
7 define amdgpu_ps void @global_xchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
8 ; GFX9-LABEL: global_xchg_saddr_i32_nortn:
10 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
11 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3]
12 ; GFX9-NEXT: s_waitcnt vmcnt(0)
13 ; GFX9-NEXT: buffer_wbinvl1
16 ; GFX10-LABEL: global_xchg_saddr_i32_nortn:
18 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
19 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
20 ; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3]
21 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
22 ; GFX10-NEXT: buffer_gl0_inv
23 ; GFX10-NEXT: buffer_gl1_inv
24 ; GFX10-NEXT: s_endpgm
25 %zext.offset = zext i32 %voffset to i64
26 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
27 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
28 %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
32 ; Maximum positive offset on gfx10
; Same as the base case plus a +2047 byte GEP, which folds into the
; instruction's immediate offset field on both targets.
33 define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_2047(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
34 ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
36 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
37 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047
38 ; GFX9-NEXT: s_waitcnt vmcnt(0)
39 ; GFX9-NEXT: buffer_wbinvl1
42 ; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_2047:
44 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
45 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
46 ; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:2047
47 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
48 ; GFX10-NEXT: buffer_gl0_inv
49 ; GFX10-NEXT: buffer_gl1_inv
50 ; GFX10-NEXT: s_endpgm
51 %zext.offset = zext i32 %voffset to i64
52 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
53 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2047
54 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
55 %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
59 ; Maximum negative offset on gfx10
; -2048 byte GEP: the most negative immediate offset that still folds on gfx10.
60 define amdgpu_ps void @global_xchg_saddr_i32_nortn_offset_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
61 ; GFX9-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
63 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
64 ; GFX9-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048
65 ; GFX9-NEXT: s_waitcnt vmcnt(0)
66 ; GFX9-NEXT: buffer_wbinvl1
69 ; GFX10-LABEL: global_xchg_saddr_i32_nortn_offset_neg2048:
71 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
72 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
73 ; GFX10-NEXT: global_atomic_swap v0, v1, s[2:3] offset:-2048
74 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
75 ; GFX10-NEXT: buffer_gl0_inv
76 ; GFX10-NEXT: buffer_gl1_inv
77 ; GFX10-NEXT: s_endpgm
78 %zext.offset = zext i32 %voffset to i64
79 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
80 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
81 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
82 %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
; Result is used: the swap carries 'glc' and the old value comes back in v0,
; which the shader returns as a float via bitcast.
86 define amdgpu_ps float @global_xchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
87 ; GFX9-LABEL: global_xchg_saddr_i32_rtn:
89 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
90 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc
91 ; GFX9-NEXT: s_waitcnt vmcnt(0)
92 ; GFX9-NEXT: buffer_wbinvl1
93 ; GFX9-NEXT: ; return to shader part epilog
95 ; GFX10-LABEL: global_xchg_saddr_i32_rtn:
97 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
98 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
99 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] glc
100 ; GFX10-NEXT: s_waitcnt vmcnt(0)
101 ; GFX10-NEXT: buffer_gl0_inv
102 ; GFX10-NEXT: buffer_gl1_inv
103 ; GFX10-NEXT: ; return to shader part epilog
104 %zext.offset = zext i32 %voffset to i64
105 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
106 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
107 %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
108 %cast.rtn = bitcast i32 %rtn to float
; +2048 is one past gfx10's immediate offset range: gfx9 still folds it, but
; gfx10 must materialize the full 64-bit address in VGPRs (the v_add chain)
; and fall back to the 'off' (non-saddr) addressing form.
112 define amdgpu_ps float @global_xchg_saddr_i32_rtn_2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
113 ; GFX9-LABEL: global_xchg_saddr_i32_rtn_2048:
115 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
116 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:2048 glc
117 ; GFX9-NEXT: s_waitcnt vmcnt(0)
118 ; GFX9-NEXT: buffer_wbinvl1
119 ; GFX9-NEXT: ; return to shader part epilog
121 ; GFX10-LABEL: global_xchg_saddr_i32_rtn_2048:
123 ; GFX10-NEXT: v_add_co_u32 v0, s[0:1], s2, v0
124 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s[0:1], s3, 0, s[0:1]
125 ; GFX10-NEXT: v_add_co_u32 v2, vcc, 0x800, v0
126 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc, 0, v3, vcc
127 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
129 ; GFX10-NEXT: global_atomic_swap v0, v[2:3], v1, off glc
130 ; GFX10-NEXT: s_waitcnt vmcnt(0)
131 ; GFX10-NEXT: buffer_gl0_inv
132 ; GFX10-NEXT: buffer_gl1_inv
133 ; GFX10-NEXT: ; return to shader part epilog
134 %zext.offset = zext i32 %voffset to i64
135 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
136 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 2048
137 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
138 %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
139 %cast.rtn = bitcast i32 %rtn to float
; -2048 still fits the signed immediate offset field, so both targets keep the
; saddr form even with the returned value ('glc').
143 define amdgpu_ps float @global_xchg_saddr_i32_rtn_neg2048(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
144 ; GFX9-LABEL: global_xchg_saddr_i32_rtn_neg2048:
146 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
147 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc
148 ; GFX9-NEXT: s_waitcnt vmcnt(0)
149 ; GFX9-NEXT: buffer_wbinvl1
150 ; GFX9-NEXT: ; return to shader part epilog
152 ; GFX10-LABEL: global_xchg_saddr_i32_rtn_neg2048:
154 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
155 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
156 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[2:3] offset:-2048 glc
157 ; GFX10-NEXT: s_waitcnt vmcnt(0)
158 ; GFX10-NEXT: buffer_gl0_inv
159 ; GFX10-NEXT: buffer_gl1_inv
160 ; GFX10-NEXT: ; return to shader part epilog
161 %zext.offset = zext i32 %voffset to i64
162 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
163 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -2048
164 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
165 %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
166 %cast.rtn = bitcast i32 %rtn to float
170 ; --------------------------------------------------------------------------------
171 ; Uniformity edge cases
172 ; --------------------------------------------------------------------------------
; LDS slot holding a global pointer; loaded by the tests below to produce a
; base that is wave-uniform yet lives in VGPRs.
174 @ptr.in.lds = internal addrspace(3) global i8 addrspace(1)* undef
176 ; Base pointer is uniform, but also in VGPRs
; The base loaded from LDS is uniform despite arriving in VGPRs: the checks
; expect v_readfirstlane copies into s[0:1] so the saddr form can still be used.
177 define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
178 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
180 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
181 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
182 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
183 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
184 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
185 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
187 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc
188 ; GFX9-NEXT: s_waitcnt vmcnt(0)
189 ; GFX9-NEXT: buffer_wbinvl1
190 ; GFX9-NEXT: ; return to shader part epilog
192 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn:
194 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
195 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
196 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
197 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
198 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
199 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
200 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
201 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc
202 ; GFX10-NEXT: s_waitcnt vmcnt(0)
203 ; GFX10-NEXT: buffer_gl0_inv
204 ; GFX10-NEXT: buffer_gl1_inv
205 ; GFX10-NEXT: ; return to shader part epilog
206 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
207 %zext.offset = zext i32 %voffset to i64
208 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
209 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
210 %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
211 %cast.rtn = bitcast i32 %rtn to float
215 ; Base pointer is uniform, but also in VGPRs, with imm offset
; Same uniform-base-in-VGPRs case, plus a +42 byte GEP that should fold into
; the instruction's immediate offset.
216 define amdgpu_ps float @global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) {
217 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
219 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
220 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
221 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
222 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
223 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
224 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
226 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc
227 ; GFX9-NEXT: s_waitcnt vmcnt(0)
228 ; GFX9-NEXT: buffer_wbinvl1
229 ; GFX9-NEXT: ; return to shader part epilog
231 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
233 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
234 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
235 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
236 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
237 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
238 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
239 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
240 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc
241 ; GFX10-NEXT: s_waitcnt vmcnt(0)
242 ; GFX10-NEXT: buffer_gl0_inv
243 ; GFX10-NEXT: buffer_gl1_inv
244 ; GFX10-NEXT: ; return to shader part epilog
245 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
246 %zext.offset = zext i32 %voffset to i64
247 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
248 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
249 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
250 %rtn = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
251 %cast.rtn = bitcast i32 %rtn to float
255 ; Base pointer is uniform, but also in VGPRs
; No-return variant of the readfirstlane'd uniform-base case (no glc).
256 define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) {
257 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
259 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
260 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
261 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
262 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
263 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
264 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
266 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1]
267 ; GFX9-NEXT: s_waitcnt vmcnt(0)
268 ; GFX9-NEXT: buffer_wbinvl1
269 ; GFX9-NEXT: s_endpgm
271 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn:
273 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
274 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
275 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
276 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
277 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
278 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
279 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
280 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1]
281 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
282 ; GFX10-NEXT: buffer_gl0_inv
283 ; GFX10-NEXT: buffer_gl1_inv
284 ; GFX10-NEXT: s_endpgm
285 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
286 %zext.offset = zext i32 %voffset to i64
287 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
288 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
289 %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
293 ; Base pointer is uniform, but also in VGPRs, with imm offset
; No-return uniform-base case with a +42 immediate offset folded in.
294 define amdgpu_ps void @global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) {
295 ; GFX9-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
297 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
298 ; GFX9-NEXT: ds_read_b64 v[2:3], v2
299 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
300 ; GFX9-NEXT: v_readfirstlane_b32 s0, v2
301 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3
302 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
304 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42
305 ; GFX9-NEXT: s_waitcnt vmcnt(0)
306 ; GFX9-NEXT: buffer_wbinvl1
307 ; GFX9-NEXT: s_endpgm
309 ; GFX10-LABEL: global_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
311 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
312 ; GFX10-NEXT: ds_read_b64 v[2:3], v2
313 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
314 ; GFX10-NEXT: v_readfirstlane_b32 s0, v2
315 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3
316 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
317 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
318 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42
319 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
320 ; GFX10-NEXT: buffer_gl0_inv
321 ; GFX10-NEXT: buffer_gl1_inv
322 ; GFX10-NEXT: s_endpgm
323 %sbase = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(3)* @ptr.in.lds
324 %zext.offset = zext i32 %voffset to i64
325 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
326 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 42
327 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
328 %unused = atomicrmw xchg i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
332 ; --------------------------------------------------------------------------------
334 ; --------------------------------------------------------------------------------
336 ; --------------------------------------------------------------------------------
338 ; --------------------------------------------------------------------------------
; 64-bit xchg with used result: _x2 swap with glc, old value returned in
; v[0:1] and handed back as <2 x float>.
340 define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
341 ; GFX9-LABEL: global_xchg_saddr_i64_rtn:
343 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
344 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc
345 ; GFX9-NEXT: s_waitcnt vmcnt(0)
346 ; GFX9-NEXT: buffer_wbinvl1
347 ; GFX9-NEXT: ; return to shader part epilog
349 ; GFX10-LABEL: global_xchg_saddr_i64_rtn:
351 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
352 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
353 ; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] glc
354 ; GFX10-NEXT: s_waitcnt vmcnt(0)
355 ; GFX10-NEXT: buffer_gl0_inv
356 ; GFX10-NEXT: buffer_gl1_inv
357 ; GFX10-NEXT: ; return to shader part epilog
358 %zext.offset = zext i32 %voffset to i64
359 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
360 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
361 %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
362 %cast.rtn = bitcast i64 %rtn to <2 x float>
363 ret <2 x float> %cast.rtn
; 64-bit xchg rtn with a -128 byte immediate offset folded into the saddr form.
366 define amdgpu_ps <2 x float> @global_xchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
367 ; GFX9-LABEL: global_xchg_saddr_i64_rtn_neg128:
369 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
370 ; GFX9-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
371 ; GFX9-NEXT: s_waitcnt vmcnt(0)
372 ; GFX9-NEXT: buffer_wbinvl1
373 ; GFX9-NEXT: ; return to shader part epilog
375 ; GFX10-LABEL: global_xchg_saddr_i64_rtn_neg128:
377 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
378 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
379 ; GFX10-NEXT: global_atomic_swap_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
380 ; GFX10-NEXT: s_waitcnt vmcnt(0)
381 ; GFX10-NEXT: buffer_gl0_inv
382 ; GFX10-NEXT: buffer_gl1_inv
383 ; GFX10-NEXT: ; return to shader part epilog
384 %zext.offset = zext i32 %voffset to i64
385 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
386 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
387 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
388 %rtn = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
389 %cast.rtn = bitcast i64 %rtn to <2 x float>
390 ret <2 x float> %cast.rtn
; 64-bit xchg, result unused: _x2 swap without glc.
393 define amdgpu_ps void @global_xchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
394 ; GFX9-LABEL: global_xchg_saddr_i64_nortn:
396 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
397 ; GFX9-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3]
398 ; GFX9-NEXT: s_waitcnt vmcnt(0)
399 ; GFX9-NEXT: buffer_wbinvl1
400 ; GFX9-NEXT: s_endpgm
402 ; GFX10-LABEL: global_xchg_saddr_i64_nortn:
404 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
405 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
406 ; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3]
407 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
408 ; GFX10-NEXT: buffer_gl0_inv
409 ; GFX10-NEXT: buffer_gl1_inv
410 ; GFX10-NEXT: s_endpgm
411 %zext.offset = zext i32 %voffset to i64
412 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
413 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
414 %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
; 64-bit no-return xchg with a -128 immediate offset.
418 define amdgpu_ps void @global_xchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
419 ; GFX9-LABEL: global_xchg_saddr_i64_nortn_neg128:
421 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
422 ; GFX9-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128
423 ; GFX9-NEXT: s_waitcnt vmcnt(0)
424 ; GFX9-NEXT: buffer_wbinvl1
425 ; GFX9-NEXT: s_endpgm
427 ; GFX10-LABEL: global_xchg_saddr_i64_nortn_neg128:
429 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
430 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
431 ; GFX10-NEXT: global_atomic_swap_x2 v0, v[1:2], s[2:3] offset:-128
432 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
433 ; GFX10-NEXT: buffer_gl0_inv
434 ; GFX10-NEXT: buffer_gl1_inv
435 ; GFX10-NEXT: s_endpgm
436 %zext.offset = zext i32 %voffset to i64
437 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
438 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
439 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
440 %unused = atomicrmw xchg i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
444 ; --------------------------------------------------------------------------------
446 ; --------------------------------------------------------------------------------
; atomicrmw add with used result: global_atomic_add, glc set.
448 define amdgpu_ps float @global_add_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
449 ; GFX9-LABEL: global_add_saddr_i32_rtn:
451 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
452 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc
453 ; GFX9-NEXT: s_waitcnt vmcnt(0)
454 ; GFX9-NEXT: buffer_wbinvl1
455 ; GFX9-NEXT: ; return to shader part epilog
457 ; GFX10-LABEL: global_add_saddr_i32_rtn:
459 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
460 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
461 ; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] glc
462 ; GFX10-NEXT: s_waitcnt vmcnt(0)
463 ; GFX10-NEXT: buffer_gl0_inv
464 ; GFX10-NEXT: buffer_gl1_inv
465 ; GFX10-NEXT: ; return to shader part epilog
466 %zext.offset = zext i32 %voffset to i64
467 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
468 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
469 %rtn = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
470 %cast.rtn = bitcast i32 %rtn to float
; add rtn with a -128 immediate offset folded in.
474 define amdgpu_ps float @global_add_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
475 ; GFX9-LABEL: global_add_saddr_i32_rtn_neg128:
477 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
478 ; GFX9-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc
479 ; GFX9-NEXT: s_waitcnt vmcnt(0)
480 ; GFX9-NEXT: buffer_wbinvl1
481 ; GFX9-NEXT: ; return to shader part epilog
483 ; GFX10-LABEL: global_add_saddr_i32_rtn_neg128:
485 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
486 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
487 ; GFX10-NEXT: global_atomic_add v0, v0, v1, s[2:3] offset:-128 glc
488 ; GFX10-NEXT: s_waitcnt vmcnt(0)
489 ; GFX10-NEXT: buffer_gl0_inv
490 ; GFX10-NEXT: buffer_gl1_inv
491 ; GFX10-NEXT: ; return to shader part epilog
492 %zext.offset = zext i32 %voffset to i64
493 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
494 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
495 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
496 %rtn = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
497 %cast.rtn = bitcast i32 %rtn to float
; add with result unused: no glc on the selected instruction.
501 define amdgpu_ps void @global_add_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
502 ; GFX9-LABEL: global_add_saddr_i32_nortn:
504 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
505 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3]
506 ; GFX9-NEXT: s_waitcnt vmcnt(0)
507 ; GFX9-NEXT: buffer_wbinvl1
508 ; GFX9-NEXT: s_endpgm
510 ; GFX10-LABEL: global_add_saddr_i32_nortn:
512 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
513 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
514 ; GFX10-NEXT: global_atomic_add v0, v1, s[2:3]
515 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
516 ; GFX10-NEXT: buffer_gl0_inv
517 ; GFX10-NEXT: buffer_gl1_inv
518 ; GFX10-NEXT: s_endpgm
519 %zext.offset = zext i32 %voffset to i64
520 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
521 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
522 %unused = atomicrmw add i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
; no-return add with a -128 immediate offset.
526 define amdgpu_ps void @global_add_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
527 ; GFX9-LABEL: global_add_saddr_i32_nortn_neg128:
529 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
530 ; GFX9-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128
531 ; GFX9-NEXT: s_waitcnt vmcnt(0)
532 ; GFX9-NEXT: buffer_wbinvl1
533 ; GFX9-NEXT: s_endpgm
535 ; GFX10-LABEL: global_add_saddr_i32_nortn_neg128:
537 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
538 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
539 ; GFX10-NEXT: global_atomic_add v0, v1, s[2:3] offset:-128
540 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
541 ; GFX10-NEXT: buffer_gl0_inv
542 ; GFX10-NEXT: buffer_gl1_inv
543 ; GFX10-NEXT: s_endpgm
544 %zext.offset = zext i32 %voffset to i64
545 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
546 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
547 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
548 %unused = atomicrmw add i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
; 64-bit add rtn: _x2 form with glc, old value returned in v[0:1].
552 define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
553 ; GFX9-LABEL: global_add_saddr_i64_rtn:
555 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
556 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc
557 ; GFX9-NEXT: s_waitcnt vmcnt(0)
558 ; GFX9-NEXT: buffer_wbinvl1
559 ; GFX9-NEXT: ; return to shader part epilog
561 ; GFX10-LABEL: global_add_saddr_i64_rtn:
563 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
564 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
565 ; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] glc
566 ; GFX10-NEXT: s_waitcnt vmcnt(0)
567 ; GFX10-NEXT: buffer_gl0_inv
568 ; GFX10-NEXT: buffer_gl1_inv
569 ; GFX10-NEXT: ; return to shader part epilog
570 %zext.offset = zext i32 %voffset to i64
571 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
572 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
573 %rtn = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
574 %cast.rtn = bitcast i64 %rtn to <2 x float>
575 ret <2 x float> %cast.rtn
; 64-bit add rtn with a -128 immediate offset.
578 define amdgpu_ps <2 x float> @global_add_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
579 ; GFX9-LABEL: global_add_saddr_i64_rtn_neg128:
581 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
582 ; GFX9-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
583 ; GFX9-NEXT: s_waitcnt vmcnt(0)
584 ; GFX9-NEXT: buffer_wbinvl1
585 ; GFX9-NEXT: ; return to shader part epilog
587 ; GFX10-LABEL: global_add_saddr_i64_rtn_neg128:
589 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
590 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
591 ; GFX10-NEXT: global_atomic_add_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
592 ; GFX10-NEXT: s_waitcnt vmcnt(0)
593 ; GFX10-NEXT: buffer_gl0_inv
594 ; GFX10-NEXT: buffer_gl1_inv
595 ; GFX10-NEXT: ; return to shader part epilog
596 %zext.offset = zext i32 %voffset to i64
597 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
598 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
599 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
600 %rtn = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
601 %cast.rtn = bitcast i64 %rtn to <2 x float>
602 ret <2 x float> %cast.rtn
; 64-bit add, result unused: _x2 form, no glc.
605 define amdgpu_ps void @global_add_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
606 ; GFX9-LABEL: global_add_saddr_i64_nortn:
608 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
609 ; GFX9-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3]
610 ; GFX9-NEXT: s_waitcnt vmcnt(0)
611 ; GFX9-NEXT: buffer_wbinvl1
612 ; GFX9-NEXT: s_endpgm
614 ; GFX10-LABEL: global_add_saddr_i64_nortn:
616 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
617 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
618 ; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3]
619 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
620 ; GFX10-NEXT: buffer_gl0_inv
621 ; GFX10-NEXT: buffer_gl1_inv
622 ; GFX10-NEXT: s_endpgm
623 %zext.offset = zext i32 %voffset to i64
624 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
625 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
626 %unused = atomicrmw add i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
; 64-bit no-return add with a -128 immediate offset.
630 define amdgpu_ps void @global_add_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
631 ; GFX9-LABEL: global_add_saddr_i64_nortn_neg128:
633 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
634 ; GFX9-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128
635 ; GFX9-NEXT: s_waitcnt vmcnt(0)
636 ; GFX9-NEXT: buffer_wbinvl1
637 ; GFX9-NEXT: s_endpgm
639 ; GFX10-LABEL: global_add_saddr_i64_nortn_neg128:
641 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
642 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
643 ; GFX10-NEXT: global_atomic_add_x2 v0, v[1:2], s[2:3] offset:-128
644 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
645 ; GFX10-NEXT: buffer_gl0_inv
646 ; GFX10-NEXT: buffer_gl1_inv
647 ; GFX10-NEXT: s_endpgm
648 %zext.offset = zext i32 %voffset to i64
649 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
650 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
651 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
652 %unused = atomicrmw add i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
656 ; --------------------------------------------------------------------------------
658 ; --------------------------------------------------------------------------------
; atomicrmw sub with used result: global_atomic_sub, glc set.
660 define amdgpu_ps float @global_sub_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
661 ; GFX9-LABEL: global_sub_saddr_i32_rtn:
663 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
664 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc
665 ; GFX9-NEXT: s_waitcnt vmcnt(0)
666 ; GFX9-NEXT: buffer_wbinvl1
667 ; GFX9-NEXT: ; return to shader part epilog
669 ; GFX10-LABEL: global_sub_saddr_i32_rtn:
671 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
672 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
673 ; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] glc
674 ; GFX10-NEXT: s_waitcnt vmcnt(0)
675 ; GFX10-NEXT: buffer_gl0_inv
676 ; GFX10-NEXT: buffer_gl1_inv
677 ; GFX10-NEXT: ; return to shader part epilog
678 %zext.offset = zext i32 %voffset to i64
679 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
680 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
681 %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
682 %cast.rtn = bitcast i32 %rtn to float
; sub rtn with a -128 immediate offset folded in.
686 define amdgpu_ps float @global_sub_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
687 ; GFX9-LABEL: global_sub_saddr_i32_rtn_neg128:
689 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
690 ; GFX9-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc
691 ; GFX9-NEXT: s_waitcnt vmcnt(0)
692 ; GFX9-NEXT: buffer_wbinvl1
693 ; GFX9-NEXT: ; return to shader part epilog
695 ; GFX10-LABEL: global_sub_saddr_i32_rtn_neg128:
697 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
698 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
699 ; GFX10-NEXT: global_atomic_sub v0, v0, v1, s[2:3] offset:-128 glc
700 ; GFX10-NEXT: s_waitcnt vmcnt(0)
701 ; GFX10-NEXT: buffer_gl0_inv
702 ; GFX10-NEXT: buffer_gl1_inv
703 ; GFX10-NEXT: ; return to shader part epilog
704 %zext.offset = zext i32 %voffset to i64
705 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
706 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
707 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
708 %rtn = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
709 %cast.rtn = bitcast i32 %rtn to float
; sub with result unused: no glc.
713 define amdgpu_ps void @global_sub_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
714 ; GFX9-LABEL: global_sub_saddr_i32_nortn:
716 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
717 ; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3]
718 ; GFX9-NEXT: s_waitcnt vmcnt(0)
719 ; GFX9-NEXT: buffer_wbinvl1
720 ; GFX9-NEXT: s_endpgm
722 ; GFX10-LABEL: global_sub_saddr_i32_nortn:
724 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
725 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
726 ; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3]
727 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
728 ; GFX10-NEXT: buffer_gl0_inv
729 ; GFX10-NEXT: buffer_gl1_inv
730 ; GFX10-NEXT: s_endpgm
731 %zext.offset = zext i32 %voffset to i64
732 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
733 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
734 %unused = atomicrmw sub i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
; no-return sub with a -128 immediate offset.
738 define amdgpu_ps void @global_sub_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
739 ; GFX9-LABEL: global_sub_saddr_i32_nortn_neg128:
741 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
742 ; GFX9-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128
743 ; GFX9-NEXT: s_waitcnt vmcnt(0)
744 ; GFX9-NEXT: buffer_wbinvl1
745 ; GFX9-NEXT: s_endpgm
747 ; GFX10-LABEL: global_sub_saddr_i32_nortn_neg128:
749 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
750 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
751 ; GFX10-NEXT: global_atomic_sub v0, v1, s[2:3] offset:-128
752 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
753 ; GFX10-NEXT: buffer_gl0_inv
754 ; GFX10-NEXT: buffer_gl1_inv
755 ; GFX10-NEXT: s_endpgm
756 %zext.offset = zext i32 %voffset to i64
757 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
758 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
759 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
760 %unused = atomicrmw sub i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
; 64-bit sub rtn: _x2 form with glc, old value returned in v[0:1].
764 define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
765 ; GFX9-LABEL: global_sub_saddr_i64_rtn:
767 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
768 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc
769 ; GFX9-NEXT: s_waitcnt vmcnt(0)
770 ; GFX9-NEXT: buffer_wbinvl1
771 ; GFX9-NEXT: ; return to shader part epilog
773 ; GFX10-LABEL: global_sub_saddr_i64_rtn:
775 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
776 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
777 ; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] glc
778 ; GFX10-NEXT: s_waitcnt vmcnt(0)
779 ; GFX10-NEXT: buffer_gl0_inv
780 ; GFX10-NEXT: buffer_gl1_inv
781 ; GFX10-NEXT: ; return to shader part epilog
782 %zext.offset = zext i32 %voffset to i64
783 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
784 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
785 %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
786 %cast.rtn = bitcast i64 %rtn to <2 x float>
787 ret <2 x float> %cast.rtn
790 define amdgpu_ps <2 x float> @global_sub_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
791 ; GFX9-LABEL: global_sub_saddr_i64_rtn_neg128:
793 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
794 ; GFX9-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
795 ; GFX9-NEXT: s_waitcnt vmcnt(0)
796 ; GFX9-NEXT: buffer_wbinvl1
797 ; GFX9-NEXT: ; return to shader part epilog
799 ; GFX10-LABEL: global_sub_saddr_i64_rtn_neg128:
801 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
802 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
803 ; GFX10-NEXT: global_atomic_sub_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
804 ; GFX10-NEXT: s_waitcnt vmcnt(0)
805 ; GFX10-NEXT: buffer_gl0_inv
806 ; GFX10-NEXT: buffer_gl1_inv
807 ; GFX10-NEXT: ; return to shader part epilog
808 %zext.offset = zext i32 %voffset to i64
809 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
810 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
811 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
812 %rtn = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
813 %cast.rtn = bitcast i64 %rtn to <2 x float>
814 ret <2 x float> %cast.rtn
817 define amdgpu_ps void @global_sub_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
818 ; GFX9-LABEL: global_sub_saddr_i64_nortn:
820 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
821 ; GFX9-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3]
822 ; GFX9-NEXT: s_waitcnt vmcnt(0)
823 ; GFX9-NEXT: buffer_wbinvl1
824 ; GFX9-NEXT: s_endpgm
826 ; GFX10-LABEL: global_sub_saddr_i64_nortn:
828 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
829 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
830 ; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3]
831 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
832 ; GFX10-NEXT: buffer_gl0_inv
833 ; GFX10-NEXT: buffer_gl1_inv
834 ; GFX10-NEXT: s_endpgm
835 %zext.offset = zext i32 %voffset to i64
836 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
837 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
838 %unused = atomicrmw sub i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
842 define amdgpu_ps void @global_sub_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
843 ; GFX9-LABEL: global_sub_saddr_i64_nortn_neg128:
845 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
846 ; GFX9-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128
847 ; GFX9-NEXT: s_waitcnt vmcnt(0)
848 ; GFX9-NEXT: buffer_wbinvl1
849 ; GFX9-NEXT: s_endpgm
851 ; GFX10-LABEL: global_sub_saddr_i64_nortn_neg128:
853 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
854 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
855 ; GFX10-NEXT: global_atomic_sub_x2 v0, v[1:2], s[2:3] offset:-128
856 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
857 ; GFX10-NEXT: buffer_gl0_inv
858 ; GFX10-NEXT: buffer_gl1_inv
859 ; GFX10-NEXT: s_endpgm
860 %zext.offset = zext i32 %voffset to i64
861 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
862 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
863 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
864 %unused = atomicrmw sub i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
868 ; --------------------------------------------------------------------------------
870 ; --------------------------------------------------------------------------------
872 define amdgpu_ps float @global_and_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
873 ; GFX9-LABEL: global_and_saddr_i32_rtn:
875 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
876 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc
877 ; GFX9-NEXT: s_waitcnt vmcnt(0)
878 ; GFX9-NEXT: buffer_wbinvl1
879 ; GFX9-NEXT: ; return to shader part epilog
881 ; GFX10-LABEL: global_and_saddr_i32_rtn:
883 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
884 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
885 ; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] glc
886 ; GFX10-NEXT: s_waitcnt vmcnt(0)
887 ; GFX10-NEXT: buffer_gl0_inv
888 ; GFX10-NEXT: buffer_gl1_inv
889 ; GFX10-NEXT: ; return to shader part epilog
890 %zext.offset = zext i32 %voffset to i64
891 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
892 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
893 %rtn = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
894 %cast.rtn = bitcast i32 %rtn to float
898 define amdgpu_ps float @global_and_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
899 ; GFX9-LABEL: global_and_saddr_i32_rtn_neg128:
901 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
902 ; GFX9-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc
903 ; GFX9-NEXT: s_waitcnt vmcnt(0)
904 ; GFX9-NEXT: buffer_wbinvl1
905 ; GFX9-NEXT: ; return to shader part epilog
907 ; GFX10-LABEL: global_and_saddr_i32_rtn_neg128:
909 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
910 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
911 ; GFX10-NEXT: global_atomic_and v0, v0, v1, s[2:3] offset:-128 glc
912 ; GFX10-NEXT: s_waitcnt vmcnt(0)
913 ; GFX10-NEXT: buffer_gl0_inv
914 ; GFX10-NEXT: buffer_gl1_inv
915 ; GFX10-NEXT: ; return to shader part epilog
916 %zext.offset = zext i32 %voffset to i64
917 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
918 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
919 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
920 %rtn = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
921 %cast.rtn = bitcast i32 %rtn to float
925 define amdgpu_ps void @global_and_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
926 ; GFX9-LABEL: global_and_saddr_i32_nortn:
928 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
929 ; GFX9-NEXT: global_atomic_and v0, v1, s[2:3]
930 ; GFX9-NEXT: s_waitcnt vmcnt(0)
931 ; GFX9-NEXT: buffer_wbinvl1
932 ; GFX9-NEXT: s_endpgm
934 ; GFX10-LABEL: global_and_saddr_i32_nortn:
936 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
937 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
938 ; GFX10-NEXT: global_atomic_and v0, v1, s[2:3]
939 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
940 ; GFX10-NEXT: buffer_gl0_inv
941 ; GFX10-NEXT: buffer_gl1_inv
942 ; GFX10-NEXT: s_endpgm
943 %zext.offset = zext i32 %voffset to i64
944 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
945 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
946 %unused = atomicrmw and i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
950 define amdgpu_ps void @global_and_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
951 ; GFX9-LABEL: global_and_saddr_i32_nortn_neg128:
953 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
954 ; GFX9-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128
955 ; GFX9-NEXT: s_waitcnt vmcnt(0)
956 ; GFX9-NEXT: buffer_wbinvl1
957 ; GFX9-NEXT: s_endpgm
959 ; GFX10-LABEL: global_and_saddr_i32_nortn_neg128:
961 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
962 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
963 ; GFX10-NEXT: global_atomic_and v0, v1, s[2:3] offset:-128
964 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
965 ; GFX10-NEXT: buffer_gl0_inv
966 ; GFX10-NEXT: buffer_gl1_inv
967 ; GFX10-NEXT: s_endpgm
968 %zext.offset = zext i32 %voffset to i64
969 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
970 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
971 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
972 %unused = atomicrmw and i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
976 define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
977 ; GFX9-LABEL: global_and_saddr_i64_rtn:
979 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
980 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc
981 ; GFX9-NEXT: s_waitcnt vmcnt(0)
982 ; GFX9-NEXT: buffer_wbinvl1
983 ; GFX9-NEXT: ; return to shader part epilog
985 ; GFX10-LABEL: global_and_saddr_i64_rtn:
987 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
988 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
989 ; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] glc
990 ; GFX10-NEXT: s_waitcnt vmcnt(0)
991 ; GFX10-NEXT: buffer_gl0_inv
992 ; GFX10-NEXT: buffer_gl1_inv
993 ; GFX10-NEXT: ; return to shader part epilog
994 %zext.offset = zext i32 %voffset to i64
995 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
996 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
997 %rtn = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
998 %cast.rtn = bitcast i64 %rtn to <2 x float>
999 ret <2 x float> %cast.rtn
1002 define amdgpu_ps <2 x float> @global_and_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1003 ; GFX9-LABEL: global_and_saddr_i64_rtn_neg128:
1005 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1006 ; GFX9-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1007 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1008 ; GFX9-NEXT: buffer_wbinvl1
1009 ; GFX9-NEXT: ; return to shader part epilog
1011 ; GFX10-LABEL: global_and_saddr_i64_rtn_neg128:
1013 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1014 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1015 ; GFX10-NEXT: global_atomic_and_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1016 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1017 ; GFX10-NEXT: buffer_gl0_inv
1018 ; GFX10-NEXT: buffer_gl1_inv
1019 ; GFX10-NEXT: ; return to shader part epilog
1020 %zext.offset = zext i32 %voffset to i64
1021 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1022 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1023 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1024 %rtn = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1025 %cast.rtn = bitcast i64 %rtn to <2 x float>
1026 ret <2 x float> %cast.rtn
1029 define amdgpu_ps void @global_and_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1030 ; GFX9-LABEL: global_and_saddr_i64_nortn:
1032 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1033 ; GFX9-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3]
1034 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1035 ; GFX9-NEXT: buffer_wbinvl1
1036 ; GFX9-NEXT: s_endpgm
1038 ; GFX10-LABEL: global_and_saddr_i64_nortn:
1040 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1041 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1042 ; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3]
1043 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1044 ; GFX10-NEXT: buffer_gl0_inv
1045 ; GFX10-NEXT: buffer_gl1_inv
1046 ; GFX10-NEXT: s_endpgm
1047 %zext.offset = zext i32 %voffset to i64
1048 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1049 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1050 %unused = atomicrmw and i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1054 define amdgpu_ps void @global_and_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1055 ; GFX9-LABEL: global_and_saddr_i64_nortn_neg128:
1057 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1058 ; GFX9-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128
1059 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1060 ; GFX9-NEXT: buffer_wbinvl1
1061 ; GFX9-NEXT: s_endpgm
1063 ; GFX10-LABEL: global_and_saddr_i64_nortn_neg128:
1065 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1066 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1067 ; GFX10-NEXT: global_atomic_and_x2 v0, v[1:2], s[2:3] offset:-128
1068 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1069 ; GFX10-NEXT: buffer_gl0_inv
1070 ; GFX10-NEXT: buffer_gl1_inv
1071 ; GFX10-NEXT: s_endpgm
1072 %zext.offset = zext i32 %voffset to i64
1073 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1074 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1075 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1076 %unused = atomicrmw and i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1080 ; --------------------------------------------------------------------------------
1082 ; --------------------------------------------------------------------------------
1084 define amdgpu_ps float @global_or_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1085 ; GFX9-LABEL: global_or_saddr_i32_rtn:
1087 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1088 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc
1089 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1090 ; GFX9-NEXT: buffer_wbinvl1
1091 ; GFX9-NEXT: ; return to shader part epilog
1093 ; GFX10-LABEL: global_or_saddr_i32_rtn:
1095 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1096 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1097 ; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] glc
1098 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1099 ; GFX10-NEXT: buffer_gl0_inv
1100 ; GFX10-NEXT: buffer_gl1_inv
1101 ; GFX10-NEXT: ; return to shader part epilog
1102 %zext.offset = zext i32 %voffset to i64
1103 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1104 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1105 %rtn = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1106 %cast.rtn = bitcast i32 %rtn to float
1110 define amdgpu_ps float @global_or_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1111 ; GFX9-LABEL: global_or_saddr_i32_rtn_neg128:
1113 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1114 ; GFX9-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc
1115 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1116 ; GFX9-NEXT: buffer_wbinvl1
1117 ; GFX9-NEXT: ; return to shader part epilog
1119 ; GFX10-LABEL: global_or_saddr_i32_rtn_neg128:
1121 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1122 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1123 ; GFX10-NEXT: global_atomic_or v0, v0, v1, s[2:3] offset:-128 glc
1124 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1125 ; GFX10-NEXT: buffer_gl0_inv
1126 ; GFX10-NEXT: buffer_gl1_inv
1127 ; GFX10-NEXT: ; return to shader part epilog
1128 %zext.offset = zext i32 %voffset to i64
1129 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1130 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1131 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1132 %rtn = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1133 %cast.rtn = bitcast i32 %rtn to float
1137 define amdgpu_ps void @global_or_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1138 ; GFX9-LABEL: global_or_saddr_i32_nortn:
1140 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1141 ; GFX9-NEXT: global_atomic_or v0, v1, s[2:3]
1142 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1143 ; GFX9-NEXT: buffer_wbinvl1
1144 ; GFX9-NEXT: s_endpgm
1146 ; GFX10-LABEL: global_or_saddr_i32_nortn:
1148 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1149 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1150 ; GFX10-NEXT: global_atomic_or v0, v1, s[2:3]
1151 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1152 ; GFX10-NEXT: buffer_gl0_inv
1153 ; GFX10-NEXT: buffer_gl1_inv
1154 ; GFX10-NEXT: s_endpgm
1155 %zext.offset = zext i32 %voffset to i64
1156 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1157 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1158 %unused = atomicrmw or i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1162 define amdgpu_ps void @global_or_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1163 ; GFX9-LABEL: global_or_saddr_i32_nortn_neg128:
1165 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1166 ; GFX9-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128
1167 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1168 ; GFX9-NEXT: buffer_wbinvl1
1169 ; GFX9-NEXT: s_endpgm
1171 ; GFX10-LABEL: global_or_saddr_i32_nortn_neg128:
1173 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1174 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1175 ; GFX10-NEXT: global_atomic_or v0, v1, s[2:3] offset:-128
1176 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1177 ; GFX10-NEXT: buffer_gl0_inv
1178 ; GFX10-NEXT: buffer_gl1_inv
1179 ; GFX10-NEXT: s_endpgm
1180 %zext.offset = zext i32 %voffset to i64
1181 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1182 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1183 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1184 %unused = atomicrmw or i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1188 define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1189 ; GFX9-LABEL: global_or_saddr_i64_rtn:
1191 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1192 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc
1193 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1194 ; GFX9-NEXT: buffer_wbinvl1
1195 ; GFX9-NEXT: ; return to shader part epilog
1197 ; GFX10-LABEL: global_or_saddr_i64_rtn:
1199 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1200 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1201 ; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] glc
1202 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1203 ; GFX10-NEXT: buffer_gl0_inv
1204 ; GFX10-NEXT: buffer_gl1_inv
1205 ; GFX10-NEXT: ; return to shader part epilog
1206 %zext.offset = zext i32 %voffset to i64
1207 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1208 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1209 %rtn = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1210 %cast.rtn = bitcast i64 %rtn to <2 x float>
1211 ret <2 x float> %cast.rtn
1214 define amdgpu_ps <2 x float> @global_or_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1215 ; GFX9-LABEL: global_or_saddr_i64_rtn_neg128:
1217 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1218 ; GFX9-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1219 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1220 ; GFX9-NEXT: buffer_wbinvl1
1221 ; GFX9-NEXT: ; return to shader part epilog
1223 ; GFX10-LABEL: global_or_saddr_i64_rtn_neg128:
1225 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1226 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1227 ; GFX10-NEXT: global_atomic_or_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1228 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1229 ; GFX10-NEXT: buffer_gl0_inv
1230 ; GFX10-NEXT: buffer_gl1_inv
1231 ; GFX10-NEXT: ; return to shader part epilog
1232 %zext.offset = zext i32 %voffset to i64
1233 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1234 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1235 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1236 %rtn = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1237 %cast.rtn = bitcast i64 %rtn to <2 x float>
1238 ret <2 x float> %cast.rtn
1241 define amdgpu_ps void @global_or_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1242 ; GFX9-LABEL: global_or_saddr_i64_nortn:
1244 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1245 ; GFX9-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3]
1246 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1247 ; GFX9-NEXT: buffer_wbinvl1
1248 ; GFX9-NEXT: s_endpgm
1250 ; GFX10-LABEL: global_or_saddr_i64_nortn:
1252 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1253 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1254 ; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3]
1255 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1256 ; GFX10-NEXT: buffer_gl0_inv
1257 ; GFX10-NEXT: buffer_gl1_inv
1258 ; GFX10-NEXT: s_endpgm
1259 %zext.offset = zext i32 %voffset to i64
1260 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1261 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1262 %unused = atomicrmw or i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1266 define amdgpu_ps void @global_or_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1267 ; GFX9-LABEL: global_or_saddr_i64_nortn_neg128:
1269 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1270 ; GFX9-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128
1271 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1272 ; GFX9-NEXT: buffer_wbinvl1
1273 ; GFX9-NEXT: s_endpgm
1275 ; GFX10-LABEL: global_or_saddr_i64_nortn_neg128:
1277 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1278 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1279 ; GFX10-NEXT: global_atomic_or_x2 v0, v[1:2], s[2:3] offset:-128
1280 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1281 ; GFX10-NEXT: buffer_gl0_inv
1282 ; GFX10-NEXT: buffer_gl1_inv
1283 ; GFX10-NEXT: s_endpgm
1284 %zext.offset = zext i32 %voffset to i64
1285 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1286 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1287 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1288 %unused = atomicrmw or i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1292 ; --------------------------------------------------------------------------------
1294 ; --------------------------------------------------------------------------------
1296 define amdgpu_ps float @global_xor_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1297 ; GFX9-LABEL: global_xor_saddr_i32_rtn:
1299 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1300 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc
1301 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1302 ; GFX9-NEXT: buffer_wbinvl1
1303 ; GFX9-NEXT: ; return to shader part epilog
1305 ; GFX10-LABEL: global_xor_saddr_i32_rtn:
1307 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1308 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1309 ; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] glc
1310 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1311 ; GFX10-NEXT: buffer_gl0_inv
1312 ; GFX10-NEXT: buffer_gl1_inv
1313 ; GFX10-NEXT: ; return to shader part epilog
1314 %zext.offset = zext i32 %voffset to i64
1315 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1316 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1317 %rtn = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1318 %cast.rtn = bitcast i32 %rtn to float
1322 define amdgpu_ps float @global_xor_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1323 ; GFX9-LABEL: global_xor_saddr_i32_rtn_neg128:
1325 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1326 ; GFX9-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc
1327 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1328 ; GFX9-NEXT: buffer_wbinvl1
1329 ; GFX9-NEXT: ; return to shader part epilog
1331 ; GFX10-LABEL: global_xor_saddr_i32_rtn_neg128:
1333 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1334 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1335 ; GFX10-NEXT: global_atomic_xor v0, v0, v1, s[2:3] offset:-128 glc
1336 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1337 ; GFX10-NEXT: buffer_gl0_inv
1338 ; GFX10-NEXT: buffer_gl1_inv
1339 ; GFX10-NEXT: ; return to shader part epilog
1340 %zext.offset = zext i32 %voffset to i64
1341 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1342 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1343 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1344 %rtn = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1345 %cast.rtn = bitcast i32 %rtn to float
1349 define amdgpu_ps void @global_xor_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1350 ; GFX9-LABEL: global_xor_saddr_i32_nortn:
1352 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1353 ; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3]
1354 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1355 ; GFX9-NEXT: buffer_wbinvl1
1356 ; GFX9-NEXT: s_endpgm
1358 ; GFX10-LABEL: global_xor_saddr_i32_nortn:
1360 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1361 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1362 ; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3]
1363 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1364 ; GFX10-NEXT: buffer_gl0_inv
1365 ; GFX10-NEXT: buffer_gl1_inv
1366 ; GFX10-NEXT: s_endpgm
1367 %zext.offset = zext i32 %voffset to i64
1368 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1369 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1370 %unused = atomicrmw xor i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1374 define amdgpu_ps void @global_xor_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1375 ; GFX9-LABEL: global_xor_saddr_i32_nortn_neg128:
1377 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1378 ; GFX9-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128
1379 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1380 ; GFX9-NEXT: buffer_wbinvl1
1381 ; GFX9-NEXT: s_endpgm
1383 ; GFX10-LABEL: global_xor_saddr_i32_nortn_neg128:
1385 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1386 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1387 ; GFX10-NEXT: global_atomic_xor v0, v1, s[2:3] offset:-128
1388 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1389 ; GFX10-NEXT: buffer_gl0_inv
1390 ; GFX10-NEXT: buffer_gl1_inv
1391 ; GFX10-NEXT: s_endpgm
1392 %zext.offset = zext i32 %voffset to i64
1393 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1394 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1395 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1396 %unused = atomicrmw xor i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1400 define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1401 ; GFX9-LABEL: global_xor_saddr_i64_rtn:
1403 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1404 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc
1405 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1406 ; GFX9-NEXT: buffer_wbinvl1
1407 ; GFX9-NEXT: ; return to shader part epilog
1409 ; GFX10-LABEL: global_xor_saddr_i64_rtn:
1411 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1412 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1413 ; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] glc
1414 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1415 ; GFX10-NEXT: buffer_gl0_inv
1416 ; GFX10-NEXT: buffer_gl1_inv
1417 ; GFX10-NEXT: ; return to shader part epilog
1418 %zext.offset = zext i32 %voffset to i64
1419 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1420 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1421 %rtn = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1422 %cast.rtn = bitcast i64 %rtn to <2 x float>
1423 ret <2 x float> %cast.rtn
1426 define amdgpu_ps <2 x float> @global_xor_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1427 ; GFX9-LABEL: global_xor_saddr_i64_rtn_neg128:
1429 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1430 ; GFX9-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1431 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1432 ; GFX9-NEXT: buffer_wbinvl1
1433 ; GFX9-NEXT: ; return to shader part epilog
1435 ; GFX10-LABEL: global_xor_saddr_i64_rtn_neg128:
1437 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1438 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1439 ; GFX10-NEXT: global_atomic_xor_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1440 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1441 ; GFX10-NEXT: buffer_gl0_inv
1442 ; GFX10-NEXT: buffer_gl1_inv
1443 ; GFX10-NEXT: ; return to shader part epilog
1444 %zext.offset = zext i32 %voffset to i64
1445 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1446 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1447 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1448 %rtn = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1449 %cast.rtn = bitcast i64 %rtn to <2 x float>
1450 ret <2 x float> %cast.rtn
1453 define amdgpu_ps void @global_xor_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1454 ; GFX9-LABEL: global_xor_saddr_i64_nortn:
1456 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1457 ; GFX9-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3]
1458 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1459 ; GFX9-NEXT: buffer_wbinvl1
1460 ; GFX9-NEXT: s_endpgm
1462 ; GFX10-LABEL: global_xor_saddr_i64_nortn:
1464 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1465 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1466 ; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3]
1467 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1468 ; GFX10-NEXT: buffer_gl0_inv
1469 ; GFX10-NEXT: buffer_gl1_inv
1470 ; GFX10-NEXT: s_endpgm
1471 %zext.offset = zext i32 %voffset to i64
1472 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1473 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1474 %unused = atomicrmw xor i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1478 define amdgpu_ps void @global_xor_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1479 ; GFX9-LABEL: global_xor_saddr_i64_nortn_neg128:
1481 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1482 ; GFX9-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128
1483 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1484 ; GFX9-NEXT: buffer_wbinvl1
1485 ; GFX9-NEXT: s_endpgm
1487 ; GFX10-LABEL: global_xor_saddr_i64_nortn_neg128:
1489 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1490 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1491 ; GFX10-NEXT: global_atomic_xor_x2 v0, v[1:2], s[2:3] offset:-128
1492 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1493 ; GFX10-NEXT: buffer_gl0_inv
1494 ; GFX10-NEXT: buffer_gl1_inv
1495 ; GFX10-NEXT: s_endpgm
1496 %zext.offset = zext i32 %voffset to i64
1497 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1498 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1499 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1500 %unused = atomicrmw xor i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1504 ; --------------------------------------------------------------------------------
1506 ; --------------------------------------------------------------------------------
1508 define amdgpu_ps float @global_max_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1509 ; GFX9-LABEL: global_max_saddr_i32_rtn:
1511 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1512 ; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
1513 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1514 ; GFX9-NEXT: buffer_wbinvl1
1515 ; GFX9-NEXT: ; return to shader part epilog
1517 ; GFX10-LABEL: global_max_saddr_i32_rtn:
1519 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1520 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1521 ; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] glc
1522 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1523 ; GFX10-NEXT: buffer_gl0_inv
1524 ; GFX10-NEXT: buffer_gl1_inv
1525 ; GFX10-NEXT: ; return to shader part epilog
1526 %zext.offset = zext i32 %voffset to i64
1527 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1528 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1529 %rtn = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1530 %cast.rtn = bitcast i32 %rtn to float
1534 define amdgpu_ps float @global_max_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1535 ; GFX9-LABEL: global_max_saddr_i32_rtn_neg128:
1537 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1538 ; GFX9-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
1539 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1540 ; GFX9-NEXT: buffer_wbinvl1
1541 ; GFX9-NEXT: ; return to shader part epilog
1543 ; GFX10-LABEL: global_max_saddr_i32_rtn_neg128:
1545 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1546 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1547 ; GFX10-NEXT: global_atomic_smax v0, v0, v1, s[2:3] offset:-128 glc
1548 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1549 ; GFX10-NEXT: buffer_gl0_inv
1550 ; GFX10-NEXT: buffer_gl1_inv
1551 ; GFX10-NEXT: ; return to shader part epilog
1552 %zext.offset = zext i32 %voffset to i64
1553 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1554 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1555 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1556 %rtn = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1557 %cast.rtn = bitcast i32 %rtn to float
1561 define amdgpu_ps void @global_max_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1562 ; GFX9-LABEL: global_max_saddr_i32_nortn:
1564 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1565 ; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3]
1566 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1567 ; GFX9-NEXT: buffer_wbinvl1
1568 ; GFX9-NEXT: s_endpgm
1570 ; GFX10-LABEL: global_max_saddr_i32_nortn:
1572 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1573 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1574 ; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3]
1575 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1576 ; GFX10-NEXT: buffer_gl0_inv
1577 ; GFX10-NEXT: buffer_gl1_inv
1578 ; GFX10-NEXT: s_endpgm
1579 %zext.offset = zext i32 %voffset to i64
1580 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1581 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1582 %unused = atomicrmw max i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1586 define amdgpu_ps void @global_max_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1587 ; GFX9-LABEL: global_max_saddr_i32_nortn_neg128:
1589 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1590 ; GFX9-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
1591 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1592 ; GFX9-NEXT: buffer_wbinvl1
1593 ; GFX9-NEXT: s_endpgm
1595 ; GFX10-LABEL: global_max_saddr_i32_nortn_neg128:
1597 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1598 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1599 ; GFX10-NEXT: global_atomic_smax v0, v1, s[2:3] offset:-128
1600 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1601 ; GFX10-NEXT: buffer_gl0_inv
1602 ; GFX10-NEXT: buffer_gl1_inv
1603 ; GFX10-NEXT: s_endpgm
1604 %zext.offset = zext i32 %voffset to i64
1605 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1606 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1607 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1608 %unused = atomicrmw max i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1612 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1613 ; GFX9-LABEL: global_max_saddr_i64_rtn:
1615 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1616 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
1617 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1618 ; GFX9-NEXT: buffer_wbinvl1
1619 ; GFX9-NEXT: ; return to shader part epilog
1621 ; GFX10-LABEL: global_max_saddr_i64_rtn:
1623 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1624 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1625 ; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] glc
1626 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1627 ; GFX10-NEXT: buffer_gl0_inv
1628 ; GFX10-NEXT: buffer_gl1_inv
1629 ; GFX10-NEXT: ; return to shader part epilog
1630 %zext.offset = zext i32 %voffset to i64
1631 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1632 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1633 %rtn = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1634 %cast.rtn = bitcast i64 %rtn to <2 x float>
1635 ret <2 x float> %cast.rtn
1638 define amdgpu_ps <2 x float> @global_max_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1639 ; GFX9-LABEL: global_max_saddr_i64_rtn_neg128:
1641 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1642 ; GFX9-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1643 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1644 ; GFX9-NEXT: buffer_wbinvl1
1645 ; GFX9-NEXT: ; return to shader part epilog
1647 ; GFX10-LABEL: global_max_saddr_i64_rtn_neg128:
1649 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1650 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1651 ; GFX10-NEXT: global_atomic_smax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1652 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1653 ; GFX10-NEXT: buffer_gl0_inv
1654 ; GFX10-NEXT: buffer_gl1_inv
1655 ; GFX10-NEXT: ; return to shader part epilog
1656 %zext.offset = zext i32 %voffset to i64
1657 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1658 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1659 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1660 %rtn = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1661 %cast.rtn = bitcast i64 %rtn to <2 x float>
1662 ret <2 x float> %cast.rtn
1665 define amdgpu_ps void @global_max_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1666 ; GFX9-LABEL: global_max_saddr_i64_nortn:
1668 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1669 ; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
1670 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1671 ; GFX9-NEXT: buffer_wbinvl1
1672 ; GFX9-NEXT: s_endpgm
1674 ; GFX10-LABEL: global_max_saddr_i64_nortn:
1676 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1677 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1678 ; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3]
1679 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1680 ; GFX10-NEXT: buffer_gl0_inv
1681 ; GFX10-NEXT: buffer_gl1_inv
1682 ; GFX10-NEXT: s_endpgm
1683 %zext.offset = zext i32 %voffset to i64
1684 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1685 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1686 %unused = atomicrmw max i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1690 define amdgpu_ps void @global_max_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1691 ; GFX9-LABEL: global_max_saddr_i64_nortn_neg128:
1693 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1694 ; GFX9-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
1695 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1696 ; GFX9-NEXT: buffer_wbinvl1
1697 ; GFX9-NEXT: s_endpgm
1699 ; GFX10-LABEL: global_max_saddr_i64_nortn_neg128:
1701 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1702 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1703 ; GFX10-NEXT: global_atomic_smax_x2 v0, v[1:2], s[2:3] offset:-128
1704 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1705 ; GFX10-NEXT: buffer_gl0_inv
1706 ; GFX10-NEXT: buffer_gl1_inv
1707 ; GFX10-NEXT: s_endpgm
1708 %zext.offset = zext i32 %voffset to i64
1709 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1710 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1711 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1712 %unused = atomicrmw max i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1716 ; --------------------------------------------------------------------------------
1717 ; atomicrmw min
1718 ; --------------------------------------------------------------------------------
1720 define amdgpu_ps float @global_min_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1721 ; GFX9-LABEL: global_min_saddr_i32_rtn:
1723 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1724 ; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
1725 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1726 ; GFX9-NEXT: buffer_wbinvl1
1727 ; GFX9-NEXT: ; return to shader part epilog
1729 ; GFX10-LABEL: global_min_saddr_i32_rtn:
1731 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1732 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1733 ; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] glc
1734 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1735 ; GFX10-NEXT: buffer_gl0_inv
1736 ; GFX10-NEXT: buffer_gl1_inv
1737 ; GFX10-NEXT: ; return to shader part epilog
1738 %zext.offset = zext i32 %voffset to i64
1739 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1740 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1741 %rtn = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1742 %cast.rtn = bitcast i32 %rtn to float
1746 define amdgpu_ps float @global_min_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1747 ; GFX9-LABEL: global_min_saddr_i32_rtn_neg128:
1749 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1750 ; GFX9-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
1751 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1752 ; GFX9-NEXT: buffer_wbinvl1
1753 ; GFX9-NEXT: ; return to shader part epilog
1755 ; GFX10-LABEL: global_min_saddr_i32_rtn_neg128:
1757 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1758 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1759 ; GFX10-NEXT: global_atomic_smin v0, v0, v1, s[2:3] offset:-128 glc
1760 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1761 ; GFX10-NEXT: buffer_gl0_inv
1762 ; GFX10-NEXT: buffer_gl1_inv
1763 ; GFX10-NEXT: ; return to shader part epilog
1764 %zext.offset = zext i32 %voffset to i64
1765 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1766 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1767 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1768 %rtn = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1769 %cast.rtn = bitcast i32 %rtn to float
1773 define amdgpu_ps void @global_min_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1774 ; GFX9-LABEL: global_min_saddr_i32_nortn:
1776 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1777 ; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3]
1778 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1779 ; GFX9-NEXT: buffer_wbinvl1
1780 ; GFX9-NEXT: s_endpgm
1782 ; GFX10-LABEL: global_min_saddr_i32_nortn:
1784 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1785 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1786 ; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3]
1787 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1788 ; GFX10-NEXT: buffer_gl0_inv
1789 ; GFX10-NEXT: buffer_gl1_inv
1790 ; GFX10-NEXT: s_endpgm
1791 %zext.offset = zext i32 %voffset to i64
1792 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1793 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1794 %unused = atomicrmw min i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1798 define amdgpu_ps void @global_min_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1799 ; GFX9-LABEL: global_min_saddr_i32_nortn_neg128:
1801 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1802 ; GFX9-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
1803 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1804 ; GFX9-NEXT: buffer_wbinvl1
1805 ; GFX9-NEXT: s_endpgm
1807 ; GFX10-LABEL: global_min_saddr_i32_nortn_neg128:
1809 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1810 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1811 ; GFX10-NEXT: global_atomic_smin v0, v1, s[2:3] offset:-128
1812 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1813 ; GFX10-NEXT: buffer_gl0_inv
1814 ; GFX10-NEXT: buffer_gl1_inv
1815 ; GFX10-NEXT: s_endpgm
1816 %zext.offset = zext i32 %voffset to i64
1817 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1818 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1819 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1820 %unused = atomicrmw min i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1824 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1825 ; GFX9-LABEL: global_min_saddr_i64_rtn:
1827 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1828 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
1829 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1830 ; GFX9-NEXT: buffer_wbinvl1
1831 ; GFX9-NEXT: ; return to shader part epilog
1833 ; GFX10-LABEL: global_min_saddr_i64_rtn:
1835 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1836 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1837 ; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] glc
1838 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1839 ; GFX10-NEXT: buffer_gl0_inv
1840 ; GFX10-NEXT: buffer_gl1_inv
1841 ; GFX10-NEXT: ; return to shader part epilog
1842 %zext.offset = zext i32 %voffset to i64
1843 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1844 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1845 %rtn = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1846 %cast.rtn = bitcast i64 %rtn to <2 x float>
1847 ret <2 x float> %cast.rtn
1850 define amdgpu_ps <2 x float> @global_min_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1851 ; GFX9-LABEL: global_min_saddr_i64_rtn_neg128:
1853 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1854 ; GFX9-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1855 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1856 ; GFX9-NEXT: buffer_wbinvl1
1857 ; GFX9-NEXT: ; return to shader part epilog
1859 ; GFX10-LABEL: global_min_saddr_i64_rtn_neg128:
1861 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1862 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1863 ; GFX10-NEXT: global_atomic_smin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
1864 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1865 ; GFX10-NEXT: buffer_gl0_inv
1866 ; GFX10-NEXT: buffer_gl1_inv
1867 ; GFX10-NEXT: ; return to shader part epilog
1868 %zext.offset = zext i32 %voffset to i64
1869 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1870 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1871 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1872 %rtn = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1873 %cast.rtn = bitcast i64 %rtn to <2 x float>
1874 ret <2 x float> %cast.rtn
1877 define amdgpu_ps void @global_min_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1878 ; GFX9-LABEL: global_min_saddr_i64_nortn:
1880 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1881 ; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
1882 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1883 ; GFX9-NEXT: buffer_wbinvl1
1884 ; GFX9-NEXT: s_endpgm
1886 ; GFX10-LABEL: global_min_saddr_i64_nortn:
1888 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1889 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1890 ; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3]
1891 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1892 ; GFX10-NEXT: buffer_gl0_inv
1893 ; GFX10-NEXT: buffer_gl1_inv
1894 ; GFX10-NEXT: s_endpgm
1895 %zext.offset = zext i32 %voffset to i64
1896 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1897 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
1898 %unused = atomicrmw min i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
1902 define amdgpu_ps void @global_min_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
1903 ; GFX9-LABEL: global_min_saddr_i64_nortn_neg128:
1905 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1906 ; GFX9-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
1907 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1908 ; GFX9-NEXT: buffer_wbinvl1
1909 ; GFX9-NEXT: s_endpgm
1911 ; GFX10-LABEL: global_min_saddr_i64_nortn_neg128:
1913 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1914 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1915 ; GFX10-NEXT: global_atomic_smin_x2 v0, v[1:2], s[2:3] offset:-128
1916 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1917 ; GFX10-NEXT: buffer_gl0_inv
1918 ; GFX10-NEXT: buffer_gl1_inv
1919 ; GFX10-NEXT: s_endpgm
1920 %zext.offset = zext i32 %voffset to i64
1921 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1922 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1923 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
1924 %unused = atomicrmw min i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
1928 ; --------------------------------------------------------------------------------
1929 ; atomicrmw umax
1930 ; --------------------------------------------------------------------------------
1932 define amdgpu_ps float @global_umax_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1933 ; GFX9-LABEL: global_umax_saddr_i32_rtn:
1935 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1936 ; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
1937 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1938 ; GFX9-NEXT: buffer_wbinvl1
1939 ; GFX9-NEXT: ; return to shader part epilog
1941 ; GFX10-LABEL: global_umax_saddr_i32_rtn:
1943 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1944 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1945 ; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] glc
1946 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1947 ; GFX10-NEXT: buffer_gl0_inv
1948 ; GFX10-NEXT: buffer_gl1_inv
1949 ; GFX10-NEXT: ; return to shader part epilog
1950 %zext.offset = zext i32 %voffset to i64
1951 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1952 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
1953 %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
1954 %cast.rtn = bitcast i32 %rtn to float
1958 define amdgpu_ps float @global_umax_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1959 ; GFX9-LABEL: global_umax_saddr_i32_rtn_neg128:
1961 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1962 ; GFX9-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
1963 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1964 ; GFX9-NEXT: buffer_wbinvl1
1965 ; GFX9-NEXT: ; return to shader part epilog
1967 ; GFX10-LABEL: global_umax_saddr_i32_rtn_neg128:
1969 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1970 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1971 ; GFX10-NEXT: global_atomic_umax v0, v0, v1, s[2:3] offset:-128 glc
1972 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1973 ; GFX10-NEXT: buffer_gl0_inv
1974 ; GFX10-NEXT: buffer_gl1_inv
1975 ; GFX10-NEXT: ; return to shader part epilog
1976 %zext.offset = zext i32 %voffset to i64
1977 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
1978 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
1979 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
1980 %rtn = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
1981 %cast.rtn = bitcast i32 %rtn to float
1985 define amdgpu_ps void @global_umax_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
1986 ; GFX9-LABEL: global_umax_saddr_i32_nortn:
1988 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1989 ; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3]
1990 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1991 ; GFX9-NEXT: buffer_wbinvl1
1992 ; GFX9-NEXT: s_endpgm
1994 ; GFX10-LABEL: global_umax_saddr_i32_nortn:
1996 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1997 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1998 ; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3]
1999 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2000 ; GFX10-NEXT: buffer_gl0_inv
2001 ; GFX10-NEXT: buffer_gl1_inv
2002 ; GFX10-NEXT: s_endpgm
2003 %zext.offset = zext i32 %voffset to i64
2004 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2005 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2006 %unused = atomicrmw umax i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
2010 define amdgpu_ps void @global_umax_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2011 ; GFX9-LABEL: global_umax_saddr_i32_nortn_neg128:
2013 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2014 ; GFX9-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
2015 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2016 ; GFX9-NEXT: buffer_wbinvl1
2017 ; GFX9-NEXT: s_endpgm
2019 ; GFX10-LABEL: global_umax_saddr_i32_nortn_neg128:
2021 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2022 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2023 ; GFX10-NEXT: global_atomic_umax v0, v1, s[2:3] offset:-128
2024 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2025 ; GFX10-NEXT: buffer_gl0_inv
2026 ; GFX10-NEXT: buffer_gl1_inv
2027 ; GFX10-NEXT: s_endpgm
2028 %zext.offset = zext i32 %voffset to i64
2029 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2030 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2031 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2032 %unused = atomicrmw umax i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
2036 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2037 ; GFX9-LABEL: global_umax_saddr_i64_rtn:
2039 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2040 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
2041 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2042 ; GFX9-NEXT: buffer_wbinvl1
2043 ; GFX9-NEXT: ; return to shader part epilog
2045 ; GFX10-LABEL: global_umax_saddr_i64_rtn:
2047 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2048 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2049 ; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] glc
2050 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2051 ; GFX10-NEXT: buffer_gl0_inv
2052 ; GFX10-NEXT: buffer_gl1_inv
2053 ; GFX10-NEXT: ; return to shader part epilog
2054 %zext.offset = zext i32 %voffset to i64
2055 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2056 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2057 %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
2058 %cast.rtn = bitcast i64 %rtn to <2 x float>
2059 ret <2 x float> %cast.rtn
2062 define amdgpu_ps <2 x float> @global_umax_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2063 ; GFX9-LABEL: global_umax_saddr_i64_rtn_neg128:
2065 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2066 ; GFX9-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2067 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2068 ; GFX9-NEXT: buffer_wbinvl1
2069 ; GFX9-NEXT: ; return to shader part epilog
2071 ; GFX10-LABEL: global_umax_saddr_i64_rtn_neg128:
2073 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2074 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2075 ; GFX10-NEXT: global_atomic_umax_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2076 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2077 ; GFX10-NEXT: buffer_gl0_inv
2078 ; GFX10-NEXT: buffer_gl1_inv
2079 ; GFX10-NEXT: ; return to shader part epilog
2080 %zext.offset = zext i32 %voffset to i64
2081 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2082 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2083 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2084 %rtn = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
2085 %cast.rtn = bitcast i64 %rtn to <2 x float>
2086 ret <2 x float> %cast.rtn
2089 define amdgpu_ps void @global_umax_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2090 ; GFX9-LABEL: global_umax_saddr_i64_nortn:
2092 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2093 ; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
2094 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2095 ; GFX9-NEXT: buffer_wbinvl1
2096 ; GFX9-NEXT: s_endpgm
2098 ; GFX10-LABEL: global_umax_saddr_i64_nortn:
2100 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2101 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2102 ; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3]
2103 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2104 ; GFX10-NEXT: buffer_gl0_inv
2105 ; GFX10-NEXT: buffer_gl1_inv
2106 ; GFX10-NEXT: s_endpgm
2107 %zext.offset = zext i32 %voffset to i64
2108 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2109 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2110 %unused = atomicrmw umax i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
2114 define amdgpu_ps void @global_umax_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2115 ; GFX9-LABEL: global_umax_saddr_i64_nortn_neg128:
2117 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2118 ; GFX9-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
2119 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2120 ; GFX9-NEXT: buffer_wbinvl1
2121 ; GFX9-NEXT: s_endpgm
2123 ; GFX10-LABEL: global_umax_saddr_i64_nortn_neg128:
2125 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2126 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2127 ; GFX10-NEXT: global_atomic_umax_x2 v0, v[1:2], s[2:3] offset:-128
2128 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2129 ; GFX10-NEXT: buffer_gl0_inv
2130 ; GFX10-NEXT: buffer_gl1_inv
2131 ; GFX10-NEXT: s_endpgm
2132 %zext.offset = zext i32 %voffset to i64
2133 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2134 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2135 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2136 %unused = atomicrmw umax i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
2140 ; --------------------------------------------------------------------------------
2141 ; atomicrmw umin
2142 ; --------------------------------------------------------------------------------
2144 define amdgpu_ps float @global_umin_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2145 ; GFX9-LABEL: global_umin_saddr_i32_rtn:
2147 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2148 ; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
2149 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2150 ; GFX9-NEXT: buffer_wbinvl1
2151 ; GFX9-NEXT: ; return to shader part epilog
2153 ; GFX10-LABEL: global_umin_saddr_i32_rtn:
2155 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2156 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2157 ; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] glc
2158 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2159 ; GFX10-NEXT: buffer_gl0_inv
2160 ; GFX10-NEXT: buffer_gl1_inv
2161 ; GFX10-NEXT: ; return to shader part epilog
2162 %zext.offset = zext i32 %voffset to i64
2163 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2164 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2165 %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
2166 %cast.rtn = bitcast i32 %rtn to float
2170 define amdgpu_ps float @global_umin_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2171 ; GFX9-LABEL: global_umin_saddr_i32_rtn_neg128:
2173 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2174 ; GFX9-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
2175 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2176 ; GFX9-NEXT: buffer_wbinvl1
2177 ; GFX9-NEXT: ; return to shader part epilog
2179 ; GFX10-LABEL: global_umin_saddr_i32_rtn_neg128:
2181 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2182 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2183 ; GFX10-NEXT: global_atomic_umin v0, v0, v1, s[2:3] offset:-128 glc
2184 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2185 ; GFX10-NEXT: buffer_gl0_inv
2186 ; GFX10-NEXT: buffer_gl1_inv
2187 ; GFX10-NEXT: ; return to shader part epilog
2188 %zext.offset = zext i32 %voffset to i64
2189 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2190 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2191 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2192 %rtn = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
2193 %cast.rtn = bitcast i32 %rtn to float
2197 define amdgpu_ps void @global_umin_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2198 ; GFX9-LABEL: global_umin_saddr_i32_nortn:
2200 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2201 ; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3]
2202 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2203 ; GFX9-NEXT: buffer_wbinvl1
2204 ; GFX9-NEXT: s_endpgm
2206 ; GFX10-LABEL: global_umin_saddr_i32_nortn:
2208 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2209 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2210 ; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3]
2211 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2212 ; GFX10-NEXT: buffer_gl0_inv
2213 ; GFX10-NEXT: buffer_gl1_inv
2214 ; GFX10-NEXT: s_endpgm
2215 %zext.offset = zext i32 %voffset to i64
2216 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2217 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2218 %unused = atomicrmw umin i32 addrspace(1)* %cast.gep0, i32 %data seq_cst
2222 define amdgpu_ps void @global_umin_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2223 ; GFX9-LABEL: global_umin_saddr_i32_nortn_neg128:
2225 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2226 ; GFX9-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
2227 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2228 ; GFX9-NEXT: buffer_wbinvl1
2229 ; GFX9-NEXT: s_endpgm
2231 ; GFX10-LABEL: global_umin_saddr_i32_nortn_neg128:
2233 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2234 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2235 ; GFX10-NEXT: global_atomic_umin v0, v1, s[2:3] offset:-128
2236 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2237 ; GFX10-NEXT: buffer_gl0_inv
2238 ; GFX10-NEXT: buffer_gl1_inv
2239 ; GFX10-NEXT: s_endpgm
2240 %zext.offset = zext i32 %voffset to i64
2241 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2242 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2243 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2244 %unused = atomicrmw umin i32 addrspace(1)* %cast.gep1, i32 %data seq_cst
2248 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2249 ; GFX9-LABEL: global_umin_saddr_i64_rtn:
2251 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2252 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
2253 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2254 ; GFX9-NEXT: buffer_wbinvl1
2255 ; GFX9-NEXT: ; return to shader part epilog
2257 ; GFX10-LABEL: global_umin_saddr_i64_rtn:
2259 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2260 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2261 ; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] glc
2262 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2263 ; GFX10-NEXT: buffer_gl0_inv
2264 ; GFX10-NEXT: buffer_gl1_inv
2265 ; GFX10-NEXT: ; return to shader part epilog
2266 %zext.offset = zext i32 %voffset to i64
2267 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2268 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2269 %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
2270 %cast.rtn = bitcast i64 %rtn to <2 x float>
2271 ret <2 x float> %cast.rtn
; NOTE(review): the "; GFX9"/"; GFX10" lines below are FileCheck assertions
; autogenerated by update_llc_test_checks.py — regenerate them with that
; script rather than hand-editing.
; i64 atomicrmw umin, result used: the -128 byte offset is folded into the
; instruction's immediate offset field and glc is set to read back the old
; value (saddr form: SGPR base s[2:3] + 32-bit VGPR offset v0).
2274 define amdgpu_ps <2 x float> @global_umin_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2275 ; GFX9-LABEL: global_umin_saddr_i64_rtn_neg128:
2277 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2278 ; GFX9-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2279 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2280 ; GFX9-NEXT: buffer_wbinvl1
2281 ; GFX9-NEXT: ; return to shader part epilog
2283 ; GFX10-LABEL: global_umin_saddr_i64_rtn_neg128:
2285 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2286 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2287 ; GFX10-NEXT: global_atomic_umin_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2288 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2289 ; GFX10-NEXT: buffer_gl0_inv
2290 ; GFX10-NEXT: buffer_gl1_inv
2291 ; GFX10-NEXT: ; return to shader part epilog
2292 %zext.offset = zext i32 %voffset to i64
2293 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2294 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2295 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2296 %rtn = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
2297 %cast.rtn = bitcast i64 %rtn to <2 x float>
2298 ret <2 x float> %cast.rtn
; i64 atomicrmw umin, result unused: the no-return form (no glc bit, no
; destination VGPRs) is selected.
2301 define amdgpu_ps void @global_umin_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2302 ; GFX9-LABEL: global_umin_saddr_i64_nortn:
2304 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2305 ; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
2306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2307 ; GFX9-NEXT: buffer_wbinvl1
2308 ; GFX9-NEXT: s_endpgm
2310 ; GFX10-LABEL: global_umin_saddr_i64_nortn:
2312 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2313 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2314 ; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3]
2315 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2316 ; GFX10-NEXT: buffer_gl0_inv
2317 ; GFX10-NEXT: buffer_gl1_inv
2318 ; GFX10-NEXT: s_endpgm
2319 %zext.offset = zext i32 %voffset to i64
2320 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2321 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2322 %unused = atomicrmw umin i64 addrspace(1)* %cast.gep0, i64 %data seq_cst
; As above, but additionally checking the -128 offset folds into the
; immediate offset field of the no-return form.
2326 define amdgpu_ps void @global_umin_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2327 ; GFX9-LABEL: global_umin_saddr_i64_nortn_neg128:
2329 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2330 ; GFX9-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
2331 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2332 ; GFX9-NEXT: buffer_wbinvl1
2333 ; GFX9-NEXT: s_endpgm
2335 ; GFX10-LABEL: global_umin_saddr_i64_nortn_neg128:
2337 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2338 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2339 ; GFX10-NEXT: global_atomic_umin_x2 v0, v[1:2], s[2:3] offset:-128
2340 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2341 ; GFX10-NEXT: buffer_gl0_inv
2342 ; GFX10-NEXT: buffer_gl1_inv
2343 ; GFX10-NEXT: s_endpgm
2344 %zext.offset = zext i32 %voffset to i64
2345 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2346 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2347 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2348 %unused = atomicrmw umin i64 addrspace(1)* %cast.gep1, i64 %data seq_cst
2352 ; --------------------------------------------------------------------------------
2354 ; --------------------------------------------------------------------------------
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate instead of hand-editing.
; i32 cmpxchg, old value used: %data/%cmp are packed into the register pair
; v[2:3] (hence the v_mov of v1 into v3) and glc is set so the old value is
; returned; only the value half of the {i32, i1} result is extracted.
2356 define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
2357 ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn:
2359 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
2360 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2361 ; GFX9-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc
2362 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2363 ; GFX9-NEXT: buffer_wbinvl1
2364 ; GFX9-NEXT: ; return to shader part epilog
2366 ; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn:
2368 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
2369 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2370 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2371 ; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] glc
2372 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2373 ; GFX10-NEXT: buffer_gl0_inv
2374 ; GFX10-NEXT: buffer_gl1_inv
2375 ; GFX10-NEXT: ; return to shader part epilog
2376 %zext.offset = zext i32 %voffset to i64
2377 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2378 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2379 %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst
2380 %rtn = extractvalue { i32, i1 } %cmpxchg, 0
2381 %cast.rtn = bitcast i32 %rtn to float
; Same, plus the -128 byte offset folded into the immediate offset field.
2385 define amdgpu_ps float @global_cmpxchg_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
2386 ; GFX9-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
2388 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
2389 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2390 ; GFX9-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc
2391 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2392 ; GFX9-NEXT: buffer_wbinvl1
2393 ; GFX9-NEXT: ; return to shader part epilog
2395 ; GFX10-LABEL: global_cmpxchg_saddr_i32_rtn_neg128:
2397 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
2398 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2399 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2400 ; GFX10-NEXT: global_atomic_cmpswap v0, v0, v[2:3], s[2:3] offset:-128 glc
2401 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2402 ; GFX10-NEXT: buffer_gl0_inv
2403 ; GFX10-NEXT: buffer_gl1_inv
2404 ; GFX10-NEXT: ; return to shader part epilog
2405 %zext.offset = zext i32 %voffset to i64
2406 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2407 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2408 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2409 %cmpxchg = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst
2410 %rtn = extractvalue { i32, i1 } %cmpxchg, 0
2411 %cast.rtn = bitcast i32 %rtn to float
; i32 cmpxchg, result unused: no-return form (no glc, no destination VGPR).
2415 define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
2416 ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn:
2418 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
2419 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2420 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3]
2421 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2422 ; GFX9-NEXT: buffer_wbinvl1
2423 ; GFX9-NEXT: s_endpgm
2425 ; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn:
2427 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
2428 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2429 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2430 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3]
2431 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2432 ; GFX10-NEXT: buffer_gl0_inv
2433 ; GFX10-NEXT: buffer_gl1_inv
2434 ; GFX10-NEXT: s_endpgm
2435 %zext.offset = zext i32 %voffset to i64
2436 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2437 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2438 %unused = cmpxchg i32 addrspace(1)* %cast.gep0, i32 %cmp, i32 %data seq_cst seq_cst
; Same no-return form, with the -128 immediate offset.
2442 define amdgpu_ps void @global_cmpxchg_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
2443 ; GFX9-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
2445 ; GFX9-NEXT: v_mov_b32_e32 v3, v1
2446 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2447 ; GFX9-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128
2448 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2449 ; GFX9-NEXT: buffer_wbinvl1
2450 ; GFX9-NEXT: s_endpgm
2452 ; GFX10-LABEL: global_cmpxchg_saddr_i32_nortn_neg128:
2454 ; GFX10-NEXT: v_mov_b32_e32 v3, v1
2455 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2456 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2457 ; GFX10-NEXT: global_atomic_cmpswap v0, v[2:3], s[2:3] offset:-128
2458 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2459 ; GFX10-NEXT: buffer_gl0_inv
2460 ; GFX10-NEXT: buffer_gl1_inv
2461 ; GFX10-NEXT: s_endpgm
2462 %zext.offset = zext i32 %voffset to i64
2463 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2464 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2465 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2466 %unused = cmpxchg i32 addrspace(1)* %cast.gep1, i32 %cmp, i32 %data seq_cst seq_cst
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate instead of hand-editing.
; i64 cmpxchg, old value used: the cmp/data operands are packed into the
; quad v[3:6] (v_movs of v2->v6 and v1->v5 build it) and glc is set so the
; old value comes back in v[0:1].
2470 define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
2471 ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn:
2473 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
2474 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
2475 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2476 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc
2477 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2478 ; GFX9-NEXT: buffer_wbinvl1
2479 ; GFX9-NEXT: ; return to shader part epilog
2481 ; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn:
2483 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
2484 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
2485 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2486 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2487 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] glc
2488 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2489 ; GFX10-NEXT: buffer_gl0_inv
2490 ; GFX10-NEXT: buffer_gl1_inv
2491 ; GFX10-NEXT: ; return to shader part epilog
2492 %zext.offset = zext i32 %voffset to i64
2493 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2494 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2495 %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst
2496 %rtn = extractvalue { i64, i1 } %cmpxchg, 0
2497 %cast.rtn = bitcast i64 %rtn to <2 x float>
2498 ret <2 x float> %cast.rtn
; Same, plus the -128 byte offset folded into the immediate offset field.
2501 define amdgpu_ps <2 x float> @global_cmpxchg_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
2502 ; GFX9-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
2504 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
2505 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
2506 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2507 ; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
2508 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2509 ; GFX9-NEXT: buffer_wbinvl1
2510 ; GFX9-NEXT: ; return to shader part epilog
2512 ; GFX10-LABEL: global_cmpxchg_saddr_i64_rtn_neg128:
2514 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
2515 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
2516 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2517 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2518 ; GFX10-NEXT: global_atomic_cmpswap_x2 v[0:1], v0, v[3:6], s[2:3] offset:-128 glc
2519 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2520 ; GFX10-NEXT: buffer_gl0_inv
2521 ; GFX10-NEXT: buffer_gl1_inv
2522 ; GFX10-NEXT: ; return to shader part epilog
2523 %zext.offset = zext i32 %voffset to i64
2524 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2525 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2526 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2527 %cmpxchg = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst
2528 %rtn = extractvalue { i64, i1 } %cmpxchg, 0
2529 %cast.rtn = bitcast i64 %rtn to <2 x float>
2530 ret <2 x float> %cast.rtn
; i64 cmpxchg, result unused: no-return form (no glc, no destination VGPRs).
2533 define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
2534 ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn:
2536 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
2537 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
2538 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2539 ; GFX9-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3]
2540 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2541 ; GFX9-NEXT: buffer_wbinvl1
2542 ; GFX9-NEXT: s_endpgm
2544 ; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn:
2546 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
2547 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
2548 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2549 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2550 ; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3]
2551 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2552 ; GFX10-NEXT: buffer_gl0_inv
2553 ; GFX10-NEXT: buffer_gl1_inv
2554 ; GFX10-NEXT: s_endpgm
2555 %zext.offset = zext i32 %voffset to i64
2556 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2557 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2558 %unused = cmpxchg i64 addrspace(1)* %cast.gep0, i64 %cmp, i64 %data seq_cst seq_cst
; Same no-return form, with the -128 immediate offset.
2562 define amdgpu_ps void @global_cmpxchg_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
2563 ; GFX9-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
2565 ; GFX9-NEXT: v_mov_b32_e32 v6, v2
2566 ; GFX9-NEXT: v_mov_b32_e32 v5, v1
2567 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2568 ; GFX9-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128
2569 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2570 ; GFX9-NEXT: buffer_wbinvl1
2571 ; GFX9-NEXT: s_endpgm
2573 ; GFX10-LABEL: global_cmpxchg_saddr_i64_nortn_neg128:
2575 ; GFX10-NEXT: v_mov_b32_e32 v6, v2
2576 ; GFX10-NEXT: v_mov_b32_e32 v5, v1
2577 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2578 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2579 ; GFX10-NEXT: global_atomic_cmpswap_x2 v0, v[3:6], s[2:3] offset:-128
2580 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2581 ; GFX10-NEXT: buffer_gl0_inv
2582 ; GFX10-NEXT: buffer_gl1_inv
2583 ; GFX10-NEXT: s_endpgm
2584 %zext.offset = zext i32 %voffset to i64
2585 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2586 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2587 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2588 %unused = cmpxchg i64 addrspace(1)* %cast.gep1, i64 %cmp, i64 %data seq_cst seq_cst
2592 ; --------------------------------------------------------------------------------
2594 ; --------------------------------------------------------------------------------
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate instead of hand-editing.
; Tests for the llvm.amdgcn.atomic.inc wrap-around increment intrinsic.
; These use the shared GCN prefix: codegen is identical on gfx9 and gfx10,
; and no cache-invalidate/wait sequence is emitted (unlike the seq_cst
; atomicrmw/cmpxchg tests above).
2596 declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
2597 declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
; i32 inc, result used: glc set to return the old value.
2599 define amdgpu_ps float @global_inc_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2600 ; GCN-LABEL: global_inc_saddr_i32_rtn:
2602 ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] glc
2603 ; GCN-NEXT: s_waitcnt vmcnt(0)
2604 ; GCN-NEXT: ; return to shader part epilog
2605 %zext.offset = zext i32 %voffset to i64
2606 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2607 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2608 %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
2609 %cast.rtn = bitcast i32 %rtn to float
; Same, plus the -128 byte offset folded into the immediate offset field.
2613 define amdgpu_ps float @global_inc_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2614 ; GCN-LABEL: global_inc_saddr_i32_rtn_neg128:
2616 ; GCN-NEXT: global_atomic_inc v0, v0, v1, s[2:3] offset:-128 glc
2617 ; GCN-NEXT: s_waitcnt vmcnt(0)
2618 ; GCN-NEXT: ; return to shader part epilog
2619 %zext.offset = zext i32 %voffset to i64
2620 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2621 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2622 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2623 %rtn = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
2624 %cast.rtn = bitcast i32 %rtn to float
; i32 inc, result unused: no-return form (no glc).
2628 define amdgpu_ps void @global_inc_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2629 ; GCN-LABEL: global_inc_saddr_i32_nortn:
2631 ; GCN-NEXT: global_atomic_inc v0, v1, s[2:3]
2632 ; GCN-NEXT: s_endpgm
2633 %zext.offset = zext i32 %voffset to i64
2634 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2635 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2636 %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
; Same no-return form, with the -128 immediate offset.
2640 define amdgpu_ps void @global_inc_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2641 ; GCN-LABEL: global_inc_saddr_i32_nortn_neg128:
2643 ; GCN-NEXT: global_atomic_inc v0, v1, s[2:3] offset:-128
2644 ; GCN-NEXT: s_endpgm
2645 %zext.offset = zext i32 %voffset to i64
2646 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2647 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2648 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2649 %unused = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
; i64 inc, result used: _x2 variant, data in v[1:2], old value in v[0:1].
2653 define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2654 ; GCN-LABEL: global_inc_saddr_i64_rtn:
2656 ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] glc
2657 ; GCN-NEXT: s_waitcnt vmcnt(0)
2658 ; GCN-NEXT: ; return to shader part epilog
2659 %zext.offset = zext i32 %voffset to i64
2660 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2661 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2662 %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
2663 %cast.rtn = bitcast i64 %rtn to <2 x float>
2664 ret <2 x float> %cast.rtn
; Same, plus the -128 immediate offset.
2667 define amdgpu_ps <2 x float> @global_inc_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2668 ; GCN-LABEL: global_inc_saddr_i64_rtn_neg128:
2670 ; GCN-NEXT: global_atomic_inc_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2671 ; GCN-NEXT: s_waitcnt vmcnt(0)
2672 ; GCN-NEXT: ; return to shader part epilog
2673 %zext.offset = zext i32 %voffset to i64
2674 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2675 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2676 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2677 %rtn = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
2678 %cast.rtn = bitcast i64 %rtn to <2 x float>
2679 ret <2 x float> %cast.rtn
; i64 inc, result unused.
2682 define amdgpu_ps void @global_inc_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2683 ; GCN-LABEL: global_inc_saddr_i64_nortn:
2685 ; GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3]
2686 ; GCN-NEXT: s_endpgm
2687 %zext.offset = zext i32 %voffset to i64
2688 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2689 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2690 %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
; i64 inc, result unused, -128 immediate offset.
2694 define amdgpu_ps void @global_inc_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2695 ; GCN-LABEL: global_inc_saddr_i64_nortn_neg128:
2697 ; GCN-NEXT: global_atomic_inc_x2 v0, v[1:2], s[2:3] offset:-128
2698 ; GCN-NEXT: s_endpgm
2699 %zext.offset = zext i32 %voffset to i64
2700 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2701 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2702 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2703 %unused = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
2707 ; --------------------------------------------------------------------------------
2709 ; --------------------------------------------------------------------------------
; NOTE(review): CHECK lines are autogenerated by update_llc_test_checks.py;
; regenerate instead of hand-editing.
; Tests for the llvm.amdgcn.atomic.dec wrap-around decrement intrinsic.
; Mirrors the inc tests: shared GCN checks (gfx9/gfx10 identical), no
; cache-invalidate/wait sequence.
2711 declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
2712 declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32 immarg, i32 immarg, i1 immarg) #0
; i32 dec, result used: glc set to return the old value.
2714 define amdgpu_ps float @global_dec_saddr_i32_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2715 ; GCN-LABEL: global_dec_saddr_i32_rtn:
2717 ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] glc
2718 ; GCN-NEXT: s_waitcnt vmcnt(0)
2719 ; GCN-NEXT: ; return to shader part epilog
2720 %zext.offset = zext i32 %voffset to i64
2721 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2722 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2723 %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
2724 %cast.rtn = bitcast i32 %rtn to float
; Same, plus the -128 byte offset folded into the immediate offset field.
2728 define amdgpu_ps float @global_dec_saddr_i32_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2729 ; GCN-LABEL: global_dec_saddr_i32_rtn_neg128:
2731 ; GCN-NEXT: global_atomic_dec v0, v0, v1, s[2:3] offset:-128 glc
2732 ; GCN-NEXT: s_waitcnt vmcnt(0)
2733 ; GCN-NEXT: ; return to shader part epilog
2734 %zext.offset = zext i32 %voffset to i64
2735 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2736 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2737 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2738 %rtn = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
2739 %cast.rtn = bitcast i32 %rtn to float
; i32 dec, result unused: no-return form (no glc).
2743 define amdgpu_ps void @global_dec_saddr_i32_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2744 ; GCN-LABEL: global_dec_saddr_i32_nortn:
2746 ; GCN-NEXT: global_atomic_dec v0, v1, s[2:3]
2747 ; GCN-NEXT: s_endpgm
2748 %zext.offset = zext i32 %voffset to i64
2749 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2750 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i32 addrspace(1)*
2751 %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep0, i32 %data, i32 0, i32 0, i1 false)
; Same no-return form, with the -128 immediate offset.
2755 define amdgpu_ps void @global_dec_saddr_i32_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i32 %data) {
2756 ; GCN-LABEL: global_dec_saddr_i32_nortn_neg128:
2758 ; GCN-NEXT: global_atomic_dec v0, v1, s[2:3] offset:-128
2759 ; GCN-NEXT: s_endpgm
2760 %zext.offset = zext i32 %voffset to i64
2761 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2762 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2763 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i32 addrspace(1)*
2764 %unused = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %cast.gep1, i32 %data, i32 0, i32 0, i1 false)
; i64 dec, result used: _x2 variant, data in v[1:2], old value in v[0:1].
2768 define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2769 ; GCN-LABEL: global_dec_saddr_i64_rtn:
2771 ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] glc
2772 ; GCN-NEXT: s_waitcnt vmcnt(0)
2773 ; GCN-NEXT: ; return to shader part epilog
2774 %zext.offset = zext i32 %voffset to i64
2775 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2776 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2777 %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
2778 %cast.rtn = bitcast i64 %rtn to <2 x float>
2779 ret <2 x float> %cast.rtn
; Same, plus the -128 immediate offset.
2782 define amdgpu_ps <2 x float> @global_dec_saddr_i64_rtn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2783 ; GCN-LABEL: global_dec_saddr_i64_rtn_neg128:
2785 ; GCN-NEXT: global_atomic_dec_x2 v[0:1], v0, v[1:2], s[2:3] offset:-128 glc
2786 ; GCN-NEXT: s_waitcnt vmcnt(0)
2787 ; GCN-NEXT: ; return to shader part epilog
2788 %zext.offset = zext i32 %voffset to i64
2789 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2790 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2791 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2792 %rtn = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
2793 %cast.rtn = bitcast i64 %rtn to <2 x float>
2794 ret <2 x float> %cast.rtn
; i64 dec, result unused.
2797 define amdgpu_ps void @global_dec_saddr_i64_nortn(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2798 ; GCN-LABEL: global_dec_saddr_i64_nortn:
2800 ; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3]
2801 ; GCN-NEXT: s_endpgm
2802 %zext.offset = zext i32 %voffset to i64
2803 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2804 %cast.gep0 = bitcast i8 addrspace(1)* %gep0 to i64 addrspace(1)*
2805 %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep0, i64 %data, i32 0, i32 0, i1 false)
; i64 dec, result unused, -128 immediate offset.
2809 define amdgpu_ps void @global_dec_saddr_i64_nortn_neg128(i8 addrspace(1)* inreg %sbase, i32 %voffset, i64 %data) {
2810 ; GCN-LABEL: global_dec_saddr_i64_nortn_neg128:
2812 ; GCN-NEXT: global_atomic_dec_x2 v0, v[1:2], s[2:3] offset:-128
2813 ; GCN-NEXT: s_endpgm
2814 %zext.offset = zext i32 %voffset to i64
2815 %gep0 = getelementptr inbounds i8, i8 addrspace(1)* %sbase, i64 %zext.offset
2816 %gep1 = getelementptr inbounds i8, i8 addrspace(1)* %gep0, i64 -128
2817 %cast.gep1 = bitcast i8 addrspace(1)* %gep1 to i64 addrspace(1)*
2818 %unused = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %cast.gep1, i64 %data, i32 0, i32 0, i1 false)
; Attribute set referenced (#0) by the amdgcn.atomic.inc/dec declarations above.
2822 attributes #0 = { argmemonly nounwind willreturn }