1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX9
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX90A
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX10
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=+enable-flat-scratch,+precise-memory < %s | FileCheck --check-prefixes=GFX9-FLATSCR %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX11
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+precise-memory < %s | FileCheck %s -check-prefixes=GFX12
9 ; from atomicrmw-expand.ll
10 ; covers flat_load, flat_atomic_cmpswap (atomic with return); GFX12 selects flat_atomic_add_f32 (no return) instead of the cmpswap loop
12 define void @syncscope_workgroup_nortn(ptr %addr, float %val) {
13 ; GFX9-LABEL: syncscope_workgroup_nortn:
15 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
16 ; GFX9-NEXT: flat_load_dword v4, v[0:1]
17 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
18 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
19 ; GFX9-NEXT: .LBB0_1: ; %atomicrmw.start
20 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
21 ; GFX9-NEXT: v_add_f32_e32 v3, v4, v2
22 ; GFX9-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
23 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
24 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
25 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
26 ; GFX9-NEXT: v_mov_b32_e32 v4, v3
27 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
28 ; GFX9-NEXT: s_cbranch_execnz .LBB0_1
29 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
30 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
31 ; GFX9-NEXT: s_setpc_b64 s[30:31]
33 ; GFX90A-LABEL: syncscope_workgroup_nortn:
35 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
36 ; GFX90A-NEXT: flat_load_dword v5, v[0:1]
37 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
38 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
39 ; GFX90A-NEXT: .LBB0_1: ; %atomicrmw.start
40 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
41 ; GFX90A-NEXT: v_add_f32_e32 v4, v5, v2
42 ; GFX90A-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
43 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
44 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
45 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
46 ; GFX90A-NEXT: v_mov_b32_e32 v5, v3
47 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
48 ; GFX90A-NEXT: s_cbranch_execnz .LBB0_1
49 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
50 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
51 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
53 ; GFX10-LABEL: syncscope_workgroup_nortn:
55 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
56 ; GFX10-NEXT: flat_load_dword v4, v[0:1]
57 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
58 ; GFX10-NEXT: s_mov_b32 s4, 0
59 ; GFX10-NEXT: .LBB0_1: ; %atomicrmw.start
60 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
61 ; GFX10-NEXT: v_add_f32_e32 v3, v4, v2
62 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
63 ; GFX10-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
64 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
65 ; GFX10-NEXT: buffer_gl0_inv
66 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
67 ; GFX10-NEXT: v_mov_b32_e32 v4, v3
68 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
69 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
70 ; GFX10-NEXT: s_cbranch_execnz .LBB0_1
71 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
72 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
73 ; GFX10-NEXT: s_setpc_b64 s[30:31]
75 ; GFX9-FLATSCR-LABEL: syncscope_workgroup_nortn:
76 ; GFX9-FLATSCR: ; %bb.0:
77 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
78 ; GFX9-FLATSCR-NEXT: flat_load_dword v4, v[0:1]
79 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
80 ; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], 0
81 ; GFX9-FLATSCR-NEXT: .LBB0_1: ; %atomicrmw.start
82 ; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
83 ; GFX9-FLATSCR-NEXT: v_add_f32_e32 v3, v4, v2
84 ; GFX9-FLATSCR-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
85 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
86 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
87 ; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
88 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v4, v3
89 ; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1]
90 ; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB0_1
91 ; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end
92 ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
93 ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
95 ; GFX11-LABEL: syncscope_workgroup_nortn:
97 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
98 ; GFX11-NEXT: flat_load_b32 v4, v[0:1]
99 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
100 ; GFX11-NEXT: s_mov_b32 s0, 0
101 ; GFX11-NEXT: .LBB0_1: ; %atomicrmw.start
102 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
103 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
104 ; GFX11-NEXT: v_add_f32_e32 v3, v4, v2
105 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
106 ; GFX11-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
107 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
108 ; GFX11-NEXT: buffer_gl0_inv
109 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
110 ; GFX11-NEXT: v_mov_b32_e32 v4, v3
111 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
112 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
113 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
114 ; GFX11-NEXT: s_cbranch_execnz .LBB0_1
115 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
116 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
117 ; GFX11-NEXT: s_setpc_b64 s[30:31]
119 ; GFX12-LABEL: syncscope_workgroup_nortn:
121 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
122 ; GFX12-NEXT: s_wait_expcnt 0x0
123 ; GFX12-NEXT: s_wait_samplecnt 0x0
124 ; GFX12-NEXT: s_wait_bvhcnt 0x0
125 ; GFX12-NEXT: s_wait_kmcnt 0x0
126 ; GFX12-NEXT: s_wait_storecnt 0x0
127 ; GFX12-NEXT: flat_atomic_add_f32 v[0:1], v2 scope:SCOPE_SE
128 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
129 ; GFX12-NEXT: global_inv scope:SCOPE_SE
130 ; GFX12-NEXT: s_setpc_b64 s[30:31]
131 %res = atomicrmw fadd ptr %addr, float %val syncscope("workgroup") seq_cst
135 ; from atomicrmw-nand.ll
136 ; covers global_atomic (atomic with return), global_load
138 define i32 @atomic_nand_i32_global(ptr addrspace(1) %ptr) nounwind {
139 ; GFX9-LABEL: atomic_nand_i32_global:
141 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
142 ; GFX9-NEXT: global_load_dword v2, v[0:1], off
143 ; GFX9-NEXT: s_waitcnt vmcnt(0)
144 ; GFX9-NEXT: s_mov_b64 s[4:5], 0
145 ; GFX9-NEXT: .LBB1_1: ; %atomicrmw.start
146 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
147 ; GFX9-NEXT: v_mov_b32_e32 v3, v2
148 ; GFX9-NEXT: v_not_b32_e32 v2, v3
149 ; GFX9-NEXT: v_or_b32_e32 v2, -5, v2
150 ; GFX9-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
151 ; GFX9-NEXT: s_waitcnt vmcnt(0)
152 ; GFX9-NEXT: buffer_wbinvl1_vol
153 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
154 ; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
155 ; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5]
156 ; GFX9-NEXT: s_cbranch_execnz .LBB1_1
157 ; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
158 ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
159 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
160 ; GFX9-NEXT: s_setpc_b64 s[30:31]
162 ; GFX90A-LABEL: atomic_nand_i32_global:
164 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
165 ; GFX90A-NEXT: global_load_dword v2, v[0:1], off
166 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
167 ; GFX90A-NEXT: s_mov_b64 s[4:5], 0
168 ; GFX90A-NEXT: .LBB1_1: ; %atomicrmw.start
169 ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
170 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
171 ; GFX90A-NEXT: v_not_b32_e32 v2, v3
172 ; GFX90A-NEXT: v_or_b32_e32 v2, -5, v2
173 ; GFX90A-NEXT: buffer_wbl2
174 ; GFX90A-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
175 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
176 ; GFX90A-NEXT: buffer_invl2
177 ; GFX90A-NEXT: buffer_wbinvl1_vol
178 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
179 ; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
180 ; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
181 ; GFX90A-NEXT: s_cbranch_execnz .LBB1_1
182 ; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
183 ; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
184 ; GFX90A-NEXT: v_mov_b32_e32 v0, v2
185 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
187 ; GFX10-LABEL: atomic_nand_i32_global:
189 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190 ; GFX10-NEXT: global_load_dword v2, v[0:1], off
191 ; GFX10-NEXT: s_waitcnt vmcnt(0)
192 ; GFX10-NEXT: s_mov_b32 s4, 0
193 ; GFX10-NEXT: .LBB1_1: ; %atomicrmw.start
194 ; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
195 ; GFX10-NEXT: v_mov_b32_e32 v3, v2
196 ; GFX10-NEXT: v_not_b32_e32 v2, v3
197 ; GFX10-NEXT: v_or_b32_e32 v2, -5, v2
198 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
199 ; GFX10-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
200 ; GFX10-NEXT: s_waitcnt vmcnt(0)
201 ; GFX10-NEXT: buffer_gl1_inv
202 ; GFX10-NEXT: buffer_gl0_inv
203 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
204 ; GFX10-NEXT: s_or_b32 s4, vcc_lo, s4
205 ; GFX10-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
206 ; GFX10-NEXT: s_cbranch_execnz .LBB1_1
207 ; GFX10-NEXT: ; %bb.2: ; %atomicrmw.end
208 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s4
209 ; GFX10-NEXT: v_mov_b32_e32 v0, v2
210 ; GFX10-NEXT: s_setpc_b64 s[30:31]
212 ; GFX9-FLATSCR-LABEL: atomic_nand_i32_global:
213 ; GFX9-FLATSCR: ; %bb.0:
214 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
215 ; GFX9-FLATSCR-NEXT: global_load_dword v2, v[0:1], off
216 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
217 ; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], 0
218 ; GFX9-FLATSCR-NEXT: .LBB1_1: ; %atomicrmw.start
219 ; GFX9-FLATSCR-NEXT: ; =>This Inner Loop Header: Depth=1
220 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v3, v2
221 ; GFX9-FLATSCR-NEXT: v_not_b32_e32 v2, v3
222 ; GFX9-FLATSCR-NEXT: v_or_b32_e32 v2, -5, v2
223 ; GFX9-FLATSCR-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
224 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
225 ; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol
226 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
227 ; GFX9-FLATSCR-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
228 ; GFX9-FLATSCR-NEXT: s_andn2_b64 exec, exec, s[0:1]
229 ; GFX9-FLATSCR-NEXT: s_cbranch_execnz .LBB1_1
230 ; GFX9-FLATSCR-NEXT: ; %bb.2: ; %atomicrmw.end
231 ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
232 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, v2
233 ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
235 ; GFX11-LABEL: atomic_nand_i32_global:
237 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
238 ; GFX11-NEXT: global_load_b32 v2, v[0:1], off
239 ; GFX11-NEXT: s_waitcnt vmcnt(0)
240 ; GFX11-NEXT: s_mov_b32 s0, 0
241 ; GFX11-NEXT: .LBB1_1: ; %atomicrmw.start
242 ; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
243 ; GFX11-NEXT: v_mov_b32_e32 v3, v2
244 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
245 ; GFX11-NEXT: v_not_b32_e32 v2, v3
246 ; GFX11-NEXT: v_or_b32_e32 v2, -5, v2
247 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
248 ; GFX11-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
249 ; GFX11-NEXT: s_waitcnt vmcnt(0)
250 ; GFX11-NEXT: buffer_gl1_inv
251 ; GFX11-NEXT: buffer_gl0_inv
252 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
253 ; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0
254 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
255 ; GFX11-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
256 ; GFX11-NEXT: s_cbranch_execnz .LBB1_1
257 ; GFX11-NEXT: ; %bb.2: ; %atomicrmw.end
258 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
259 ; GFX11-NEXT: v_mov_b32_e32 v0, v2
260 ; GFX11-NEXT: s_setpc_b64 s[30:31]
262 ; GFX12-LABEL: atomic_nand_i32_global:
264 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
265 ; GFX12-NEXT: s_wait_expcnt 0x0
266 ; GFX12-NEXT: s_wait_samplecnt 0x0
267 ; GFX12-NEXT: s_wait_bvhcnt 0x0
268 ; GFX12-NEXT: s_wait_kmcnt 0x0
269 ; GFX12-NEXT: global_load_b32 v2, v[0:1], off
270 ; GFX12-NEXT: s_wait_loadcnt 0x0
271 ; GFX12-NEXT: s_mov_b32 s0, 0
272 ; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start
273 ; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
274 ; GFX12-NEXT: v_mov_b32_e32 v3, v2
275 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
276 ; GFX12-NEXT: v_not_b32_e32 v2, v3
277 ; GFX12-NEXT: v_or_b32_e32 v2, -5, v2
278 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
279 ; GFX12-NEXT: s_wait_storecnt 0x0
280 ; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
281 ; GFX12-NEXT: s_wait_loadcnt 0x0
282 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
283 ; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
284 ; GFX12-NEXT: s_wait_alu 0xfffe
285 ; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
286 ; GFX12-NEXT: s_wait_alu 0xfffe
287 ; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
288 ; GFX12-NEXT: s_cbranch_execnz .LBB1_1
289 ; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
290 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
291 ; GFX12-NEXT: v_mov_b32_e32 v0, v2
292 ; GFX12-NEXT: s_setpc_b64 s[30:31]
293 %result = atomicrmw nand ptr addrspace(1) %ptr, i32 4 seq_cst
297 ; from call-argument-types.ll
298 ; covers scratch_load, scratch_store (GFX9-FLATSCR, GFX11, GFX12) and buffer_load, buffer_store (GFX9, GFX90A, GFX10)
300 declare hidden void @byval_align16_f64_arg(<32 x i32>, ptr addrspace(5) byval(double) align 16)
301 define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) {
302 ; GFX9-LABEL: tail_call_byval_align16:
303 ; GFX9: ; %bb.0: ; %entry
304 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
305 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
306 ; GFX9-NEXT: s_waitcnt vmcnt(0)
307 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32
308 ; GFX9-NEXT: s_waitcnt vmcnt(0)
309 ; GFX9-NEXT: s_getpc_b64 s[16:17]
310 ; GFX9-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
311 ; GFX9-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
312 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20
313 ; GFX9-NEXT: s_waitcnt vmcnt(0)
314 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
315 ; GFX9-NEXT: s_waitcnt vmcnt(0)
316 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:16
317 ; GFX9-NEXT: s_waitcnt vmcnt(0)
318 ; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32
319 ; GFX9-NEXT: s_waitcnt vmcnt(0)
320 ; GFX9-NEXT: s_setpc_b64 s[16:17]
322 ; GFX90A-LABEL: tail_call_byval_align16:
323 ; GFX90A: ; %bb.0: ; %entry
324 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
325 ; GFX90A-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
326 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
327 ; GFX90A-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
328 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
329 ; GFX90A-NEXT: buffer_load_dword v34, off, s[0:3], s32
330 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
331 ; GFX90A-NEXT: s_getpc_b64 s[16:17]
332 ; GFX90A-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
333 ; GFX90A-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
334 ; GFX90A-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20
335 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
336 ; GFX90A-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:16
337 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
338 ; GFX90A-NEXT: buffer_store_dword v34, off, s[0:3], s32
339 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
340 ; GFX90A-NEXT: s_setpc_b64 s[16:17]
342 ; GFX10-LABEL: tail_call_byval_align16:
343 ; GFX10: ; %bb.0: ; %entry
344 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
346 ; GFX10-NEXT: s_waitcnt vmcnt(0)
347 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
348 ; GFX10-NEXT: s_waitcnt vmcnt(0)
349 ; GFX10-NEXT: buffer_load_dword v34, off, s[0:3], s32
350 ; GFX10-NEXT: s_waitcnt vmcnt(0)
351 ; GFX10-NEXT: s_getpc_b64 s[16:17]
352 ; GFX10-NEXT: s_add_u32 s16, s16, byval_align16_f64_arg@rel32@lo+4
353 ; GFX10-NEXT: s_addc_u32 s17, s17, byval_align16_f64_arg@rel32@hi+12
354 ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:20
355 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
356 ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:16
357 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
358 ; GFX10-NEXT: buffer_store_dword v34, off, s[0:3], s32
359 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
360 ; GFX10-NEXT: s_setpc_b64 s[16:17]
362 ; GFX9-FLATSCR-LABEL: tail_call_byval_align16:
363 ; GFX9-FLATSCR: ; %bb.0: ; %entry
364 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
365 ; GFX9-FLATSCR-NEXT: scratch_load_dword v32, off, s32
366 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
367 ; GFX9-FLATSCR-NEXT: s_getpc_b64 s[0:1]
368 ; GFX9-FLATSCR-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
369 ; GFX9-FLATSCR-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
370 ; GFX9-FLATSCR-NEXT: scratch_store_dword off, v32, s32
371 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
372 ; GFX9-FLATSCR-NEXT: scratch_load_dwordx2 v[32:33], off, s32 offset:24
373 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
374 ; GFX9-FLATSCR-NEXT: scratch_store_dwordx2 off, v[32:33], s32 offset:16
375 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
376 ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[0:1]
378 ; GFX11-LABEL: tail_call_byval_align16:
379 ; GFX11: ; %bb.0: ; %entry
380 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
381 ; GFX11-NEXT: scratch_load_b32 v32, off, s32
382 ; GFX11-NEXT: s_waitcnt vmcnt(0)
383 ; GFX11-NEXT: s_getpc_b64 s[0:1]
384 ; GFX11-NEXT: s_add_u32 s0, s0, byval_align16_f64_arg@rel32@lo+4
385 ; GFX11-NEXT: s_addc_u32 s1, s1, byval_align16_f64_arg@rel32@hi+12
386 ; GFX11-NEXT: scratch_store_b32 off, v32, s32
387 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
388 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24
389 ; GFX11-NEXT: s_waitcnt vmcnt(0)
390 ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16
391 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
392 ; GFX11-NEXT: s_setpc_b64 s[0:1]
394 ; GFX12-LABEL: tail_call_byval_align16:
395 ; GFX12: ; %bb.0: ; %entry
396 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
397 ; GFX12-NEXT: s_wait_expcnt 0x0
398 ; GFX12-NEXT: s_wait_samplecnt 0x0
399 ; GFX12-NEXT: s_wait_bvhcnt 0x0
400 ; GFX12-NEXT: s_wait_kmcnt 0x0
401 ; GFX12-NEXT: scratch_load_b32 v32, off, s32
402 ; GFX12-NEXT: s_wait_loadcnt 0x0
403 ; GFX12-NEXT: s_getpc_b64 s[0:1]
404 ; GFX12-NEXT: s_wait_alu 0xfffe
405 ; GFX12-NEXT: s_sext_i32_i16 s1, s1
406 ; GFX12-NEXT: s_add_co_u32 s0, s0, byval_align16_f64_arg@rel32@lo+12
407 ; GFX12-NEXT: s_wait_alu 0xfffe
408 ; GFX12-NEXT: s_add_co_ci_u32 s1, s1, byval_align16_f64_arg@rel32@hi+24
409 ; GFX12-NEXT: scratch_store_b32 off, v32, s32
410 ; GFX12-NEXT: s_wait_storecnt 0x0
411 ; GFX12-NEXT: scratch_load_b64 v[32:33], off, s32 offset:24
412 ; GFX12-NEXT: s_wait_loadcnt 0x0
413 ; GFX12-NEXT: scratch_store_b64 off, v[32:33], s32 offset:16
414 ; GFX12-NEXT: s_wait_storecnt 0x0
415 ; GFX12-NEXT: s_wait_alu 0xfffe
416 ; GFX12-NEXT: s_setpc_b64 s[0:1]
418 %alloca = alloca double, align 8, addrspace(5)
419 tail call void @byval_align16_f64_arg(<32 x i32> %val, ptr addrspace(5) byval(double) align 16 %alloca)
426 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
427 ; GFX9-LABEL: udiv_i32:
429 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
430 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
431 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
432 ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s3
433 ; GFX9-NEXT: s_sub_i32 s4, 0, s3
434 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
435 ; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
436 ; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
437 ; GFX9-NEXT: v_readfirstlane_b32 s5, v0
438 ; GFX9-NEXT: s_mul_i32 s4, s4, s5
439 ; GFX9-NEXT: s_mul_hi_u32 s4, s5, s4
440 ; GFX9-NEXT: s_add_i32 s5, s5, s4
441 ; GFX9-NEXT: s_mul_hi_u32 s4, s2, s5
442 ; GFX9-NEXT: s_mul_i32 s5, s4, s3
443 ; GFX9-NEXT: s_sub_i32 s2, s2, s5
444 ; GFX9-NEXT: s_add_i32 s6, s4, 1
445 ; GFX9-NEXT: s_sub_i32 s5, s2, s3
446 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
447 ; GFX9-NEXT: s_cselect_b32 s4, s6, s4
448 ; GFX9-NEXT: s_cselect_b32 s2, s5, s2
449 ; GFX9-NEXT: s_add_i32 s5, s4, 1
450 ; GFX9-NEXT: s_cmp_ge_u32 s2, s3
451 ; GFX9-NEXT: s_cselect_b32 s2, s5, s4
452 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
453 ; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
455 ; GFX9-NEXT: s_endpgm
457 ; GFX90A-LABEL: udiv_i32:
459 ; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
460 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
461 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0
462 ; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s3
463 ; GFX90A-NEXT: s_sub_i32 s4, 0, s3
464 ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
465 ; GFX90A-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
466 ; GFX90A-NEXT: v_cvt_u32_f32_e32 v0, v0
467 ; GFX90A-NEXT: v_readfirstlane_b32 s5, v0
468 ; GFX90A-NEXT: s_mul_i32 s4, s4, s5
469 ; GFX90A-NEXT: s_mul_hi_u32 s4, s5, s4
470 ; GFX90A-NEXT: s_add_i32 s5, s5, s4
471 ; GFX90A-NEXT: s_mul_hi_u32 s4, s2, s5
472 ; GFX90A-NEXT: s_mul_i32 s5, s4, s3
473 ; GFX90A-NEXT: s_sub_i32 s2, s2, s5
474 ; GFX90A-NEXT: s_add_i32 s6, s4, 1
475 ; GFX90A-NEXT: s_sub_i32 s5, s2, s3
476 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3
477 ; GFX90A-NEXT: s_cselect_b32 s4, s6, s4
478 ; GFX90A-NEXT: s_cselect_b32 s2, s5, s2
479 ; GFX90A-NEXT: s_add_i32 s5, s4, 1
480 ; GFX90A-NEXT: s_cmp_ge_u32 s2, s3
481 ; GFX90A-NEXT: s_cselect_b32 s2, s5, s4
482 ; GFX90A-NEXT: v_mov_b32_e32 v0, s2
483 ; GFX90A-NEXT: global_store_dword v1, v0, s[0:1]
484 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
485 ; GFX90A-NEXT: s_endpgm
487 ; GFX10-LABEL: udiv_i32:
489 ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
490 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
491 ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s3
492 ; GFX10-NEXT: s_sub_i32 s5, 0, s3
493 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
494 ; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
495 ; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
496 ; GFX10-NEXT: v_readfirstlane_b32 s4, v0
497 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
498 ; GFX10-NEXT: s_mul_i32 s5, s5, s4
499 ; GFX10-NEXT: s_mul_hi_u32 s5, s4, s5
500 ; GFX10-NEXT: s_add_i32 s4, s4, s5
501 ; GFX10-NEXT: s_mul_hi_u32 s4, s2, s4
502 ; GFX10-NEXT: s_mul_i32 s5, s4, s3
503 ; GFX10-NEXT: s_sub_i32 s2, s2, s5
504 ; GFX10-NEXT: s_add_i32 s5, s4, 1
505 ; GFX10-NEXT: s_sub_i32 s6, s2, s3
506 ; GFX10-NEXT: s_cmp_ge_u32 s2, s3
507 ; GFX10-NEXT: s_cselect_b32 s4, s5, s4
508 ; GFX10-NEXT: s_cselect_b32 s2, s6, s2
509 ; GFX10-NEXT: s_add_i32 s5, s4, 1
510 ; GFX10-NEXT: s_cmp_ge_u32 s2, s3
511 ; GFX10-NEXT: s_cselect_b32 s2, s5, s4
512 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
513 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1]
514 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
515 ; GFX10-NEXT: s_endpgm
517 ; GFX9-FLATSCR-LABEL: udiv_i32:
518 ; GFX9-FLATSCR: ; %bb.0:
519 ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
520 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
521 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 0
522 ; GFX9-FLATSCR-NEXT: v_cvt_f32_u32_e32 v0, s3
523 ; GFX9-FLATSCR-NEXT: s_sub_i32 s4, 0, s3
524 ; GFX9-FLATSCR-NEXT: v_rcp_iflag_f32_e32 v0, v0
525 ; GFX9-FLATSCR-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
526 ; GFX9-FLATSCR-NEXT: v_cvt_u32_f32_e32 v0, v0
527 ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s5, v0
528 ; GFX9-FLATSCR-NEXT: s_mul_i32 s4, s4, s5
529 ; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s5, s4
530 ; GFX9-FLATSCR-NEXT: s_add_i32 s5, s5, s4
531 ; GFX9-FLATSCR-NEXT: s_mul_hi_u32 s4, s2, s5
532 ; GFX9-FLATSCR-NEXT: s_mul_i32 s5, s4, s3
533 ; GFX9-FLATSCR-NEXT: s_sub_i32 s2, s2, s5
534 ; GFX9-FLATSCR-NEXT: s_add_i32 s6, s4, 1
535 ; GFX9-FLATSCR-NEXT: s_sub_i32 s5, s2, s3
536 ; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3
537 ; GFX9-FLATSCR-NEXT: s_cselect_b32 s4, s6, s4
538 ; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s2
539 ; GFX9-FLATSCR-NEXT: s_add_i32 s5, s4, 1
540 ; GFX9-FLATSCR-NEXT: s_cmp_ge_u32 s2, s3
541 ; GFX9-FLATSCR-NEXT: s_cselect_b32 s2, s5, s4
542 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2
543 ; GFX9-FLATSCR-NEXT: global_store_dword v1, v0, s[0:1]
544 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
545 ; GFX9-FLATSCR-NEXT: s_endpgm
547 ; GFX11-LABEL: udiv_i32:
549 ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
550 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
551 ; GFX11-NEXT: v_cvt_f32_u32_e32 v0, s3
552 ; GFX11-NEXT: s_sub_i32 s5, 0, s3
553 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
554 ; GFX11-NEXT: v_rcp_iflag_f32_e32 v0, v0
555 ; GFX11-NEXT: s_waitcnt_depctr 0xfff
556 ; GFX11-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
557 ; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0
558 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
559 ; GFX11-NEXT: v_readfirstlane_b32 s4, v0
560 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
561 ; GFX11-NEXT: s_mul_i32 s5, s5, s4
562 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
563 ; GFX11-NEXT: s_mul_hi_u32 s5, s4, s5
564 ; GFX11-NEXT: s_add_i32 s4, s4, s5
565 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
566 ; GFX11-NEXT: s_mul_hi_u32 s4, s2, s4
567 ; GFX11-NEXT: s_mul_i32 s5, s4, s3
568 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
569 ; GFX11-NEXT: s_sub_i32 s2, s2, s5
570 ; GFX11-NEXT: s_add_i32 s5, s4, 1
571 ; GFX11-NEXT: s_sub_i32 s6, s2, s3
572 ; GFX11-NEXT: s_cmp_ge_u32 s2, s3
573 ; GFX11-NEXT: s_cselect_b32 s4, s5, s4
574 ; GFX11-NEXT: s_cselect_b32 s2, s6, s2
575 ; GFX11-NEXT: s_add_i32 s5, s4, 1
576 ; GFX11-NEXT: s_cmp_ge_u32 s2, s3
577 ; GFX11-NEXT: s_cselect_b32 s2, s5, s4
578 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
579 ; GFX11-NEXT: v_mov_b32_e32 v1, s2
580 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1]
581 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
582 ; GFX11-NEXT: s_endpgm
584 ; GFX12-LABEL: udiv_i32:
586 ; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
587 ; GFX12-NEXT: s_wait_kmcnt 0x0
588 ; GFX12-NEXT: s_cvt_f32_u32 s4, s3
589 ; GFX12-NEXT: s_sub_co_i32 s5, 0, s3
590 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
591 ; GFX12-NEXT: v_rcp_iflag_f32_e32 v0, s4
592 ; GFX12-NEXT: v_readfirstlane_b32 s4, v0
593 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_2)
594 ; GFX12-NEXT: s_mul_f32 s4, s4, 0x4f7ffffe
595 ; GFX12-NEXT: s_wait_alu 0xfffe
596 ; GFX12-NEXT: s_cvt_u32_f32 s4, s4
597 ; GFX12-NEXT: s_wait_alu 0xfffe
598 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2)
599 ; GFX12-NEXT: s_mul_i32 s5, s5, s4
600 ; GFX12-NEXT: s_wait_alu 0xfffe
601 ; GFX12-NEXT: s_mul_hi_u32 s5, s4, s5
602 ; GFX12-NEXT: s_wait_alu 0xfffe
603 ; GFX12-NEXT: s_add_co_i32 s4, s4, s5
604 ; GFX12-NEXT: s_wait_alu 0xfffe
605 ; GFX12-NEXT: s_mul_hi_u32 s4, s2, s4
606 ; GFX12-NEXT: s_wait_alu 0xfffe
607 ; GFX12-NEXT: s_mul_i32 s5, s4, s3
608 ; GFX12-NEXT: s_wait_alu 0xfffe
609 ; GFX12-NEXT: s_sub_co_i32 s2, s2, s5
610 ; GFX12-NEXT: s_add_co_i32 s5, s4, 1
611 ; GFX12-NEXT: s_sub_co_i32 s6, s2, s3
612 ; GFX12-NEXT: s_cmp_ge_u32 s2, s3
613 ; GFX12-NEXT: s_wait_alu 0xfffe
614 ; GFX12-NEXT: s_cselect_b32 s4, s5, s4
615 ; GFX12-NEXT: s_cselect_b32 s2, s6, s2
616 ; GFX12-NEXT: s_wait_alu 0xfffe
617 ; GFX12-NEXT: s_add_co_i32 s5, s4, 1
618 ; GFX12-NEXT: s_cmp_ge_u32 s2, s3
619 ; GFX12-NEXT: s_wait_alu 0xfffe
620 ; GFX12-NEXT: s_cselect_b32 s2, s5, s4
621 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
622 ; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
623 ; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
624 ; GFX12-NEXT: s_wait_storecnt 0x0
625 ; GFX12-NEXT: s_endpgm
627 store i32 %r, ptr addrspace(1) %out
631 declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32)
634 ; covers s_buffer_load
636 define amdgpu_ps float @smrd_sgpr_offset(<4 x i32> inreg %desc, i32 inreg %offset) #0 {
637 ; GFX9-LABEL: smrd_sgpr_offset:
638 ; GFX9: ; %bb.0: ; %main_body
639 ; GFX9-NEXT: s_buffer_load_dword s0, s[0:3], s4 offset:0x0
640 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
641 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
642 ; GFX9-NEXT: ; return to shader part epilog
644 ; GFX90A-LABEL: smrd_sgpr_offset:
645 ; GFX90A: ; %bb.0: ; %main_body
646 ; GFX90A-NEXT: s_buffer_load_dword s0, s[0:3], s4 offset:0x0
647 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
648 ; GFX90A-NEXT: v_mov_b32_e32 v0, s0
649 ; GFX90A-NEXT: ; return to shader part epilog
651 ; GFX10-LABEL: smrd_sgpr_offset:
652 ; GFX10: ; %bb.0: ; %main_body
653 ; GFX10-NEXT: s_buffer_load_dword s0, s[0:3], s4 offset:0x0
654 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
655 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
656 ; GFX10-NEXT: ; return to shader part epilog
658 ; GFX9-FLATSCR-LABEL: smrd_sgpr_offset:
659 ; GFX9-FLATSCR: ; %bb.0: ; %main_body
660 ; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s5
661 ; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s4
662 ; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s3
663 ; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s2
664 ; GFX9-FLATSCR-NEXT: s_buffer_load_dword s0, s[8:11], s6 offset:0x0
665 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
666 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s0
667 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog
669 ; GFX11-LABEL: smrd_sgpr_offset:
670 ; GFX11: ; %bb.0: ; %main_body
671 ; GFX11-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
672 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
673 ; GFX11-NEXT: v_mov_b32_e32 v0, s0
674 ; GFX11-NEXT: ; return to shader part epilog
676 ; GFX12-LABEL: smrd_sgpr_offset:
677 ; GFX12: ; %bb.0: ; %main_body
678 ; GFX12-NEXT: s_buffer_load_b32 s0, s[0:3], s4 offset:0x0
679 ; GFX12-NEXT: s_wait_kmcnt 0x0
680 ; GFX12-NEXT: v_mov_b32_e32 v0, s0
681 ; GFX12-NEXT: ; return to shader part epilog
683 %r = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %desc, i32 %offset, i32 0)
687 ; from atomic_load_add.ll
688 ; covers s_load, ds_add (atomic without return)
690 define amdgpu_kernel void @atomic_add_local(ptr addrspace(3) %local) {
691 ; GFX9-LABEL: atomic_add_local:
693 ; GFX9-NEXT: s_mov_b64 s[0:1], exec
694 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
695 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
696 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
697 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
698 ; GFX9-NEXT: s_cbranch_execz .LBB5_2
699 ; GFX9-NEXT: ; %bb.1:
700 ; GFX9-NEXT: s_load_dword s2, s[4:5], 0x24
701 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
702 ; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
703 ; GFX9-NEXT: s_mul_i32 s0, s0, 5
704 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
705 ; GFX9-NEXT: v_mov_b32_e32 v0, s2
706 ; GFX9-NEXT: ds_add_u32 v0, v1
707 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
708 ; GFX9-NEXT: .LBB5_2:
709 ; GFX9-NEXT: s_endpgm
711 ; GFX90A-LABEL: atomic_add_local:
713 ; GFX90A-NEXT: s_mov_b64 s[0:1], exec
714 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
715 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
716 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
717 ; GFX90A-NEXT: s_and_saveexec_b64 s[2:3], vcc
718 ; GFX90A-NEXT: s_cbranch_execz .LBB5_2
719 ; GFX90A-NEXT: ; %bb.1:
720 ; GFX90A-NEXT: s_load_dword s2, s[4:5], 0x24
721 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
722 ; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
723 ; GFX90A-NEXT: s_mul_i32 s0, s0, 5
724 ; GFX90A-NEXT: v_mov_b32_e32 v1, s0
725 ; GFX90A-NEXT: v_mov_b32_e32 v0, s2
726 ; GFX90A-NEXT: ds_add_u32 v0, v1
727 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
728 ; GFX90A-NEXT: .LBB5_2:
729 ; GFX90A-NEXT: s_endpgm
731 ; GFX10-LABEL: atomic_add_local:
733 ; GFX10-NEXT: s_mov_b32 s0, exec_lo
734 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
735 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
736 ; GFX10-NEXT: s_and_saveexec_b32 s1, vcc_lo
737 ; GFX10-NEXT: s_cbranch_execz .LBB5_2
738 ; GFX10-NEXT: ; %bb.1:
739 ; GFX10-NEXT: s_load_dword s1, s[4:5], 0x24
740 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
741 ; GFX10-NEXT: s_bcnt1_i32_b32 s0, s0
742 ; GFX10-NEXT: s_mul_i32 s0, s0, 5
743 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
744 ; GFX10-NEXT: v_mov_b32_e32 v0, s1
745 ; GFX10-NEXT: ds_add_u32 v0, v1
746 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
747 ; GFX10-NEXT: buffer_gl0_inv
748 ; GFX10-NEXT: .LBB5_2:
749 ; GFX10-NEXT: s_endpgm
751 ; GFX9-FLATSCR-LABEL: atomic_add_local:
752 ; GFX9-FLATSCR: ; %bb.0:
753 ; GFX9-FLATSCR-NEXT: s_mov_b64 s[0:1], exec
754 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
755 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
756 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
757 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[2:3], vcc
758 ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB5_2
759 ; GFX9-FLATSCR-NEXT: ; %bb.1:
760 ; GFX9-FLATSCR-NEXT: s_load_dword s2, s[4:5], 0x24
761 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
762 ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
763 ; GFX9-FLATSCR-NEXT: s_mul_i32 s0, s0, 5
764 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s0
765 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, s2
766 ; GFX9-FLATSCR-NEXT: ds_add_u32 v0, v1
767 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
768 ; GFX9-FLATSCR-NEXT: .LBB5_2:
769 ; GFX9-FLATSCR-NEXT: s_endpgm
771 ; GFX11-LABEL: atomic_add_local:
773 ; GFX11-NEXT: s_mov_b32 s0, exec_lo
774 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
775 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
776 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
777 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
778 ; GFX11-NEXT: s_cbranch_execz .LBB5_2
779 ; GFX11-NEXT: ; %bb.1:
780 ; GFX11-NEXT: s_load_b32 s1, s[4:5], 0x24
781 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
782 ; GFX11-NEXT: s_bcnt1_i32_b32 s0, s0
783 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
784 ; GFX11-NEXT: s_mul_i32 s0, s0, 5
785 ; GFX11-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
786 ; GFX11-NEXT: ds_add_u32 v0, v1
787 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
788 ; GFX11-NEXT: buffer_gl0_inv
789 ; GFX11-NEXT: .LBB5_2:
790 ; GFX11-NEXT: s_endpgm
792 ; GFX12-LABEL: atomic_add_local:
794 ; GFX12-NEXT: s_mov_b32 s0, exec_lo
795 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
796 ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
797 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
798 ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
799 ; GFX12-NEXT: s_cbranch_execz .LBB5_2
800 ; GFX12-NEXT: ; %bb.1:
801 ; GFX12-NEXT: s_load_b32 s1, s[4:5], 0x24
802 ; GFX12-NEXT: s_wait_kmcnt 0x0
803 ; GFX12-NEXT: s_bcnt1_i32_b32 s0, s0
804 ; GFX12-NEXT: s_wait_alu 0xfffe
805 ; GFX12-NEXT: s_mul_i32 s0, s0, 5
806 ; GFX12-NEXT: s_wait_alu 0xfffe
807 ; GFX12-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v0, s1
808 ; GFX12-NEXT: ds_add_u32 v0, v1
809 ; GFX12-NEXT: s_wait_dscnt 0x0
810 ; GFX12-NEXT: global_inv scope:SCOPE_SE
811 ; GFX12-NEXT: .LBB5_2:
812 ; GFX12-NEXT: s_endpgm
813 %unused = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
817 ; from flat_atomics_i32_system.ll
818 ; covers flat_atomic_swap (atomic without return)
; Performs a seq_cst `atomicrmw xchg` of %in through the generic pointer %ptr.
; For every target the expected output shows the flat_atomic_swap instruction
; immediately followed by one or more wait instructions (s_waitcnt,
; s_waitcnt_vscnt or s_wait_storecnt_dscnt, depending on the target).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
820 define void @flat_atomic_xchg_i32_noret(ptr %ptr, i32 %in) {
821 ; GFX9-LABEL: flat_atomic_xchg_i32_noret:
823 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
824 ; GFX9-NEXT: flat_atomic_swap v[0:1], v2
825 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
826 ; GFX9-NEXT: buffer_wbinvl1_vol
827 ; GFX9-NEXT: s_setpc_b64 s[30:31]
829 ; GFX90A-LABEL: flat_atomic_xchg_i32_noret:
831 ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
832 ; GFX90A-NEXT: buffer_wbl2
833 ; GFX90A-NEXT: flat_atomic_swap v[0:1], v2
834 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
835 ; GFX90A-NEXT: buffer_invl2
836 ; GFX90A-NEXT: buffer_wbinvl1_vol
837 ; GFX90A-NEXT: s_setpc_b64 s[30:31]
839 ; GFX10-LABEL: flat_atomic_xchg_i32_noret:
841 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
842 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
843 ; GFX10-NEXT: flat_atomic_swap v[0:1], v2
844 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
845 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
846 ; GFX10-NEXT: buffer_gl1_inv
847 ; GFX10-NEXT: buffer_gl0_inv
848 ; GFX10-NEXT: s_setpc_b64 s[30:31]
850 ; GFX9-FLATSCR-LABEL: flat_atomic_xchg_i32_noret:
851 ; GFX9-FLATSCR: ; %bb.0:
852 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
853 ; GFX9-FLATSCR-NEXT: flat_atomic_swap v[0:1], v2
854 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
855 ; GFX9-FLATSCR-NEXT: buffer_wbinvl1_vol
856 ; GFX9-FLATSCR-NEXT: s_setpc_b64 s[30:31]
858 ; GFX11-LABEL: flat_atomic_xchg_i32_noret:
860 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
861 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
862 ; GFX11-NEXT: flat_atomic_swap_b32 v[0:1], v2
863 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
864 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
865 ; GFX11-NEXT: buffer_gl1_inv
866 ; GFX11-NEXT: buffer_gl0_inv
867 ; GFX11-NEXT: s_setpc_b64 s[30:31]
869 ; GFX12-LABEL: flat_atomic_xchg_i32_noret:
871 ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
872 ; GFX12-NEXT: s_wait_expcnt 0x0
873 ; GFX12-NEXT: s_wait_samplecnt 0x0
874 ; GFX12-NEXT: s_wait_bvhcnt 0x0
875 ; GFX12-NEXT: s_wait_kmcnt 0x0
876 ; GFX12-NEXT: global_wb scope:SCOPE_SYS
877 ; GFX12-NEXT: s_wait_storecnt 0x0
878 ; GFX12-NEXT: flat_atomic_swap_b32 v[0:1], v2 scope:SCOPE_SYS
879 ; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
880 ; GFX12-NEXT: global_inv scope:SCOPE_SYS
881 ; GFX12-NEXT: s_setpc_b64 s[30:31]
; The only memory operation in the visible IR body: the seq_cst exchange.
882 %tmp0 = atomicrmw xchg ptr %ptr, i32 %in seq_cst
886 ; from atomic_load_add.ll
887 ; covers s_load, ds_add_rtn (atomic with return)
; Kernel that does a volatile seq_cst `atomicrmw add` of 5 on the
; addrspace(3) pointer %local and stores the returned value to %out.
; In the expected output for every target, each memory instruction
; (the s_load of a kernel argument, ds_add_rtn_u32, and the global store)
; is immediately followed by a wait on the matching counter.
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
889 define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrspace(3) %local) {
890 ; GFX9-LABEL: atomic_add_ret_local:
892 ; GFX9-NEXT: s_mov_b64 s[2:3], exec
893 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
894 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
895 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
896 ; GFX9-NEXT: ; implicit-def: $vgpr1
897 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
898 ; GFX9-NEXT: s_cbranch_execz .LBB7_2
899 ; GFX9-NEXT: ; %bb.1:
900 ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x2c
901 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
902 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
903 ; GFX9-NEXT: s_mul_i32 s2, s2, 5
904 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
905 ; GFX9-NEXT: v_mov_b32_e32 v1, s6
906 ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2
907 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
908 ; GFX9-NEXT: .LBB7_2:
909 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
910 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
911 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
912 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
913 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
914 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
915 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
916 ; GFX9-NEXT: s_waitcnt vmcnt(0)
917 ; GFX9-NEXT: s_endpgm
919 ; GFX90A-LABEL: atomic_add_ret_local:
921 ; GFX90A-NEXT: s_mov_b64 s[2:3], exec
922 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
923 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
924 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
925 ; GFX90A-NEXT: ; implicit-def: $vgpr1
926 ; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc
927 ; GFX90A-NEXT: s_cbranch_execz .LBB7_2
928 ; GFX90A-NEXT: ; %bb.1:
929 ; GFX90A-NEXT: s_load_dword s6, s[4:5], 0x2c
930 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
931 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
932 ; GFX90A-NEXT: s_mul_i32 s2, s2, 5
933 ; GFX90A-NEXT: v_mov_b32_e32 v2, s2
934 ; GFX90A-NEXT: v_mov_b32_e32 v1, s6
935 ; GFX90A-NEXT: ds_add_rtn_u32 v1, v1, v2
936 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
937 ; GFX90A-NEXT: .LBB7_2:
938 ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1]
939 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
940 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
941 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
942 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
943 ; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
944 ; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
945 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
946 ; GFX90A-NEXT: s_endpgm
948 ; GFX10-LABEL: atomic_add_ret_local:
950 ; GFX10-NEXT: s_mov_b32 s1, exec_lo
951 ; GFX10-NEXT: ; implicit-def: $vgpr1
952 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
953 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
954 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
955 ; GFX10-NEXT: s_cbranch_execz .LBB7_2
956 ; GFX10-NEXT: ; %bb.1:
957 ; GFX10-NEXT: s_load_dword s2, s[4:5], 0x2c
958 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
959 ; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1
960 ; GFX10-NEXT: s_mul_i32 s1, s1, 5
961 ; GFX10-NEXT: v_mov_b32_e32 v2, s1
962 ; GFX10-NEXT: v_mov_b32_e32 v1, s2
963 ; GFX10-NEXT: ds_add_rtn_u32 v1, v1, v2
964 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
965 ; GFX10-NEXT: buffer_gl0_inv
966 ; GFX10-NEXT: .LBB7_2:
967 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
968 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
969 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
970 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
971 ; GFX10-NEXT: v_readfirstlane_b32 s2, v1
972 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
973 ; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
974 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
975 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
976 ; GFX10-NEXT: s_endpgm
978 ; GFX9-FLATSCR-LABEL: atomic_add_ret_local:
979 ; GFX9-FLATSCR: ; %bb.0:
980 ; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec
981 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
982 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
983 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
984 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1
985 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
986 ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB7_2
987 ; GFX9-FLATSCR-NEXT: ; %bb.1:
988 ; GFX9-FLATSCR-NEXT: s_load_dword s6, s[4:5], 0x2c
989 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
990 ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
991 ; GFX9-FLATSCR-NEXT: s_mul_i32 s2, s2, 5
992 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, s2
993 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s6
994 ; GFX9-FLATSCR-NEXT: ds_add_rtn_u32 v1, v1, v2
995 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
996 ; GFX9-FLATSCR-NEXT: .LBB7_2:
997 ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
998 ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
999 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1000 ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
1001 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
1002 ; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1003 ; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
1004 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
1005 ; GFX9-FLATSCR-NEXT: s_endpgm
1007 ; GFX11-LABEL: atomic_add_ret_local:
1009 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
1010 ; GFX11-NEXT: s_mov_b32 s0, exec_lo
1011 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
1012 ; GFX11-NEXT: ; implicit-def: $vgpr1
1013 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1014 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
1015 ; GFX11-NEXT: s_cbranch_execz .LBB7_2
1016 ; GFX11-NEXT: ; %bb.1:
1017 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
1018 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1019 ; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1
1020 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1021 ; GFX11-NEXT: s_mul_i32 s1, s1, 5
1022 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2
1023 ; GFX11-NEXT: ds_add_rtn_u32 v1, v1, v2
1024 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1025 ; GFX11-NEXT: buffer_gl0_inv
1026 ; GFX11-NEXT: .LBB7_2:
1027 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
1028 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1029 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1030 ; GFX11-NEXT: v_readfirstlane_b32 s2, v1
1031 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1032 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1033 ; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1034 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1035 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1036 ; GFX11-NEXT: s_endpgm
1038 ; GFX12-LABEL: atomic_add_ret_local:
1040 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
1041 ; GFX12-NEXT: s_mov_b32 s0, exec_lo
1042 ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
1043 ; GFX12-NEXT: ; implicit-def: $vgpr1
1044 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1045 ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
1046 ; GFX12-NEXT: s_cbranch_execz .LBB7_2
1047 ; GFX12-NEXT: ; %bb.1:
1048 ; GFX12-NEXT: s_load_b32 s2, s[4:5], 0x2c
1049 ; GFX12-NEXT: s_wait_kmcnt 0x0
1050 ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
1051 ; GFX12-NEXT: s_wait_alu 0xfffe
1052 ; GFX12-NEXT: s_mul_i32 s1, s1, 5
1053 ; GFX12-NEXT: s_wait_alu 0xfffe
1054 ; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s2
1055 ; GFX12-NEXT: ds_add_rtn_u32 v1, v1, v2
1056 ; GFX12-NEXT: s_wait_dscnt 0x0
1057 ; GFX12-NEXT: global_inv scope:SCOPE_SE
1058 ; GFX12-NEXT: .LBB7_2:
1059 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
1060 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1061 ; GFX12-NEXT: s_wait_kmcnt 0x0
1062 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1
1063 ; GFX12-NEXT: v_mov_b32_e32 v1, 0
1064 ; GFX12-NEXT: s_wait_alu 0xf1ff
1065 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1066 ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1067 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
1068 ; GFX12-NEXT: s_wait_storecnt 0x0
1069 ; GFX12-NEXT: s_endpgm
; The returned pre-add value is consumed by the store below, which is why the
; checks expect the returning form ds_add_rtn_u32.
1070 %val = atomicrmw volatile add ptr addrspace(3) %local, i32 5 seq_cst
1071 store i32 %val, ptr addrspace(1) %out
; Buffer atomic-add intrinsic called by @add_i32_constant below.
1075 declare i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32, ptr addrspace(8), i32, i32, i32 immarg)
1077 ; from atomic_optimizations_buffer.ll
1078 ; covers buffer_atomic (atomic with return)
; Kernel that calls the raw buffer atomic add with the constant 5 on %inout
; and stores the returned value to %out.
; In the expected output for every target, the s_load of the kernel
; arguments, the returning buffer_atomic_add (glc / TH_ATOMIC_RETURN) and the
; global store are each immediately followed by a wait on the matching counter.
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1080 define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
1081 ; GFX9-LABEL: add_i32_constant:
1082 ; GFX9: ; %bb.0: ; %entry
1083 ; GFX9-NEXT: s_mov_b64 s[2:3], exec
1084 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1085 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1086 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1087 ; GFX9-NEXT: ; implicit-def: $vgpr1
1088 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
1089 ; GFX9-NEXT: s_cbranch_execz .LBB8_2
1090 ; GFX9-NEXT: ; %bb.1:
1091 ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
1092 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1093 ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1094 ; GFX9-NEXT: s_mul_i32 s2, s2, 5
1095 ; GFX9-NEXT: v_mov_b32_e32 v1, s2
1096 ; GFX9-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
1097 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1098 ; GFX9-NEXT: .LBB8_2:
1099 ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
1100 ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1101 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1102 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1
1103 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1104 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1105 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
1106 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1107 ; GFX9-NEXT: s_endpgm
1109 ; GFX90A-LABEL: add_i32_constant:
1110 ; GFX90A: ; %bb.0: ; %entry
1111 ; GFX90A-NEXT: s_mov_b64 s[2:3], exec
1112 ; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1113 ; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1114 ; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1115 ; GFX90A-NEXT: ; implicit-def: $vgpr1
1116 ; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc
1117 ; GFX90A-NEXT: s_cbranch_execz .LBB8_2
1118 ; GFX90A-NEXT: ; %bb.1:
1119 ; GFX90A-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
1120 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1121 ; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1122 ; GFX90A-NEXT: s_mul_i32 s2, s2, 5
1123 ; GFX90A-NEXT: v_mov_b32_e32 v1, s2
1124 ; GFX90A-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
1125 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1126 ; GFX90A-NEXT: .LBB8_2:
1127 ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1]
1128 ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1129 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1130 ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1
1131 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0
1132 ; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1133 ; GFX90A-NEXT: global_store_dword v2, v0, s[0:1]
1134 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1135 ; GFX90A-NEXT: s_endpgm
1137 ; GFX10-LABEL: add_i32_constant:
1138 ; GFX10: ; %bb.0: ; %entry
1139 ; GFX10-NEXT: s_mov_b32 s1, exec_lo
1140 ; GFX10-NEXT: ; implicit-def: $vgpr1
1141 ; GFX10-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
1142 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
1143 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
1144 ; GFX10-NEXT: s_cbranch_execz .LBB8_2
1145 ; GFX10-NEXT: ; %bb.1:
1146 ; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
1147 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1148 ; GFX10-NEXT: s_bcnt1_i32_b32 s1, s1
1149 ; GFX10-NEXT: s_mul_i32 s1, s1, 5
1150 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1151 ; GFX10-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
1152 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1153 ; GFX10-NEXT: .LBB8_2:
1154 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
1155 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
1156 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1157 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1158 ; GFX10-NEXT: v_readfirstlane_b32 s2, v1
1159 ; GFX10-NEXT: v_mov_b32_e32 v1, 0
1160 ; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1161 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
1162 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1163 ; GFX10-NEXT: s_endpgm
1165 ; GFX9-FLATSCR-LABEL: add_i32_constant:
1166 ; GFX9-FLATSCR: ; %bb.0: ; %entry
1167 ; GFX9-FLATSCR-NEXT: s_mov_b64 s[2:3], exec
1168 ; GFX9-FLATSCR-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
1169 ; GFX9-FLATSCR-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
1170 ; GFX9-FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
1171 ; GFX9-FLATSCR-NEXT: ; implicit-def: $vgpr1
1172 ; GFX9-FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc
1173 ; GFX9-FLATSCR-NEXT: s_cbranch_execz .LBB8_2
1174 ; GFX9-FLATSCR-NEXT: ; %bb.1:
1175 ; GFX9-FLATSCR-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x34
1176 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1177 ; GFX9-FLATSCR-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
1178 ; GFX9-FLATSCR-NEXT: s_mul_i32 s2, s2, 5
1179 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, s2
1180 ; GFX9-FLATSCR-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc
1181 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
1182 ; GFX9-FLATSCR-NEXT: .LBB8_2:
1183 ; GFX9-FLATSCR-NEXT: s_or_b64 exec, exec, s[0:1]
1184 ; GFX9-FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
1185 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1186 ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1
1187 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0
1188 ; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1189 ; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1]
1190 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
1191 ; GFX9-FLATSCR-NEXT: s_endpgm
1193 ; GFX11-LABEL: add_i32_constant:
1194 ; GFX11: ; %bb.0: ; %entry
1195 ; GFX11-NEXT: s_mov_b32 s1, exec_lo
1196 ; GFX11-NEXT: s_mov_b32 s0, exec_lo
1197 ; GFX11-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
1198 ; GFX11-NEXT: ; implicit-def: $vgpr1
1199 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1200 ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0
1201 ; GFX11-NEXT: s_cbranch_execz .LBB8_2
1202 ; GFX11-NEXT: ; %bb.1:
1203 ; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
1204 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1205 ; GFX11-NEXT: s_bcnt1_i32_b32 s1, s1
1206 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1207 ; GFX11-NEXT: s_mul_i32 s1, s1, 5
1208 ; GFX11-NEXT: v_mov_b32_e32 v1, s1
1209 ; GFX11-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], 0 glc
1210 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1211 ; GFX11-NEXT: .LBB8_2:
1212 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
1213 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1214 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1215 ; GFX11-NEXT: v_readfirstlane_b32 s2, v1
1216 ; GFX11-NEXT: v_mov_b32_e32 v1, 0
1217 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1218 ; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1219 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
1220 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1221 ; GFX11-NEXT: s_endpgm
1223 ; GFX12-LABEL: add_i32_constant:
1224 ; GFX12: ; %bb.0: ; %entry
1225 ; GFX12-NEXT: s_mov_b32 s1, exec_lo
1226 ; GFX12-NEXT: s_mov_b32 s0, exec_lo
1227 ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, s1, 0
1228 ; GFX12-NEXT: ; implicit-def: $vgpr1
1229 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1230 ; GFX12-NEXT: v_cmpx_eq_u32_e32 0, v0
1231 ; GFX12-NEXT: s_cbranch_execz .LBB8_2
1232 ; GFX12-NEXT: ; %bb.1:
1233 ; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x34
1234 ; GFX12-NEXT: s_wait_kmcnt 0x0
1235 ; GFX12-NEXT: s_bcnt1_i32_b32 s1, s1
1236 ; GFX12-NEXT: s_wait_alu 0xfffe
1237 ; GFX12-NEXT: s_mul_i32 s1, s1, 5
1238 ; GFX12-NEXT: s_wait_alu 0xfffe
1239 ; GFX12-NEXT: v_mov_b32_e32 v1, s1
1240 ; GFX12-NEXT: buffer_atomic_add_u32 v1, off, s[8:11], null th:TH_ATOMIC_RETURN
1241 ; GFX12-NEXT: s_wait_loadcnt 0x0
1242 ; GFX12-NEXT: .LBB8_2:
1243 ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
1244 ; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1245 ; GFX12-NEXT: s_wait_kmcnt 0x0
1246 ; GFX12-NEXT: v_readfirstlane_b32 s2, v1
1247 ; GFX12-NEXT: v_mov_b32_e32 v1, 0
1248 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1249 ; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2
1250 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
1251 ; GFX12-NEXT: s_wait_storecnt 0x0
1252 ; GFX12-NEXT: s_endpgm
; The intrinsic's result (%old) is stored to %out, so the returning form of
; the buffer atomic is expected in the checks above.
1254 %old = call i32 @llvm.amdgcn.raw.ptr.buffer.atomic.add(i32 5, ptr addrspace(8) %inout, i32 0, i32 0, i32 0)
1255 store i32 %old, ptr addrspace(1) %out
; 1D image load intrinsic (i16 coordinate) called by @load.f32.1d below.
1259 declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32, i16, <8 x i32>, i32, i32)
1261 ; from llvm.amdgcn.image.load.a16.ll
; Pixel-shader function that extracts element 0 of %coords and uses it as the
; 16-bit coordinate of a 1D image load from %rsrc.
; In the expected output for every target the image_load is immediately
; followed by a wait (s_waitcnt vmcnt(0), or s_wait_loadcnt 0x0 on GFX12).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1264 define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
1265 ; GFX9-LABEL: load.f32.1d:
1266 ; GFX9: ; %bb.0: ; %main_body
1267 ; GFX9-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
1268 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1269 ; GFX9-NEXT: ; return to shader part epilog
1271 ; GFX90A-LABEL: load.f32.1d:
1272 ; GFX90A: ; %bb.0: ; %main_body
1273 ; GFX90A-NEXT: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
1274 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1275 ; GFX90A-NEXT: ; return to shader part epilog
1277 ; GFX10-LABEL: load.f32.1d:
1278 ; GFX10: ; %bb.0: ; %main_body
1279 ; GFX10-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1280 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1281 ; GFX10-NEXT: ; return to shader part epilog
1283 ; GFX9-FLATSCR-LABEL: load.f32.1d:
1284 ; GFX9-FLATSCR: ; %bb.0: ; %main_body
1285 ; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9
1286 ; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8
1287 ; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7
1288 ; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6
1289 ; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5
1290 ; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4
1291 ; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3
1292 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2
1293 ; GFX9-FLATSCR-NEXT: image_load v0, v0, s[4:11] dmask:0x1 unorm a16
1294 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
1295 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog
1297 ; GFX11-LABEL: load.f32.1d:
1298 ; GFX11: ; %bb.0: ; %main_body
1299 ; GFX11-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1300 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1301 ; GFX11-NEXT: ; return to shader part epilog
1303 ; GFX12-LABEL: load.f32.1d:
1304 ; GFX12: ; %bb.0: ; %main_body
1305 ; GFX12-NEXT: image_load v0, v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16
1306 ; GFX12-NEXT: s_wait_loadcnt 0x0
1307 ; GFX12-NEXT: ; return to shader part epilog
; Only element 0 of %coords is used; the first intrinsic operand (i32 1)
; corresponds to the dmask:0x1 seen in the checks above.
1309 %x = extractelement <2 x i16> %coords, i32 0
1310 %v = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
; 1D image store intrinsic (i16 coordinate) called by @store_f32_1d below.
1314 declare void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float>, i32, i16, <8 x i32>, i32, i32)
1316 ; from llvm.amdgcn.image.store.a16.ll
1317 ; covers image_store
; Pixel-shader function that extracts element 0 of %coords and uses it as the
; 16-bit coordinate of a 1D image store of %val to %rsrc.
; In the expected output for every target the image_store is immediately
; followed by a wait (s_waitcnt vmcnt(0), s_waitcnt_vscnt null, 0x0, or
; s_wait_storecnt 0x0, depending on the target).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1319 define amdgpu_ps void @store_f32_1d(<8 x i32> inreg %rsrc, <2 x i16> %coords, <4 x float> %val) {
1320 ; GFX9-LABEL: store_f32_1d:
1321 ; GFX9: ; %bb.0: ; %main_body
1322 ; GFX9-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 unorm a16
1323 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1324 ; GFX9-NEXT: s_endpgm
1326 ; GFX90A-LABEL: store_f32_1d:
1327 ; GFX90A: ; %bb.0: ; %main_body
1328 ; GFX90A-NEXT: v_mov_b32_e32 v5, v4
1329 ; GFX90A-NEXT: v_mov_b32_e32 v4, v3
1330 ; GFX90A-NEXT: v_mov_b32_e32 v3, v2
1331 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1
1332 ; GFX90A-NEXT: image_store v[2:5], v0, s[0:7] dmask:0x1 unorm a16
1333 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1334 ; GFX90A-NEXT: s_endpgm
1336 ; GFX10-LABEL: store_f32_1d:
1337 ; GFX10: ; %bb.0: ; %main_body
1338 ; GFX10-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1339 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1340 ; GFX10-NEXT: s_endpgm
1342 ; GFX9-FLATSCR-LABEL: store_f32_1d:
1343 ; GFX9-FLATSCR: ; %bb.0: ; %main_body
1344 ; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9
1345 ; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8
1346 ; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7
1347 ; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6
1348 ; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5
1349 ; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4
1350 ; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3
1351 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2
1352 ; GFX9-FLATSCR-NEXT: image_store v[1:4], v0, s[4:11] dmask:0x1 unorm a16
1353 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
1354 ; GFX9-FLATSCR-NEXT: s_endpgm
1356 ; GFX11-LABEL: store_f32_1d:
1357 ; GFX11: ; %bb.0: ; %main_body
1358 ; GFX11-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm a16
1359 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1360 ; GFX11-NEXT: s_endpgm
1362 ; GFX12-LABEL: store_f32_1d:
1363 ; GFX12: ; %bb.0: ; %main_body
1364 ; GFX12-NEXT: image_store v[1:4], v0, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D a16
1365 ; GFX12-NEXT: s_wait_storecnt 0x0
1366 ; GFX12-NEXT: s_endpgm
; Only element 0 of %coords is used; the second intrinsic operand (i32 1)
; corresponds to the dmask:0x1 seen in the checks above.
1369 %x = extractelement <2 x i16> %coords, i32 0
1370 call void @llvm.amdgcn.image.store.1d.v4f32.i16(<4 x float> %val, i32 1, i16 %x, <8 x i32> %rsrc, i32 0, i32 0)
; 1D image atomic swap intrinsic called by @atomic_swap_1d below.
1374 declare i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32, i32, <8 x i32>, i32, i32)
1376 ; from llvm.amdgcn.image.atomic.dim.ll
1377 ; covers image_atomic (atomic with return)
; Pixel-shader function that atomically swaps %data into the 1D image %rsrc at
; coordinate %s and bitcasts the returned i32 to float.
; In the expected output for every target the returning image_atomic_swap
; (glc, or th:TH_ATOMIC_RETURN on GFX12) is immediately followed by a wait
; (s_waitcnt vmcnt(0), or s_wait_loadcnt 0x0 on GFX12).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1379 define amdgpu_ps float @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
1380 ; GFX9-LABEL: atomic_swap_1d:
1381 ; GFX9: ; %bb.0: ; %main_body
1382 ; GFX9-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 unorm glc
1383 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1384 ; GFX9-NEXT: ; return to shader part epilog
1386 ; GFX90A-LABEL: atomic_swap_1d:
1387 ; GFX90A: ; %bb.0: ; %main_body
1388 ; GFX90A-NEXT: v_mov_b32_e32 v2, v1
1389 ; GFX90A-NEXT: image_atomic_swap v0, v2, s[0:7] dmask:0x1 unorm glc
1390 ; GFX90A-NEXT: s_waitcnt vmcnt(0)
1391 ; GFX90A-NEXT: ; return to shader part epilog
1393 ; GFX10-LABEL: atomic_swap_1d:
1394 ; GFX10: ; %bb.0: ; %main_body
1395 ; GFX10-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
1396 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1397 ; GFX10-NEXT: ; return to shader part epilog
1399 ; GFX9-FLATSCR-LABEL: atomic_swap_1d:
1400 ; GFX9-FLATSCR: ; %bb.0: ; %main_body
1401 ; GFX9-FLATSCR-NEXT: s_mov_b32 s11, s9
1402 ; GFX9-FLATSCR-NEXT: s_mov_b32 s10, s8
1403 ; GFX9-FLATSCR-NEXT: s_mov_b32 s9, s7
1404 ; GFX9-FLATSCR-NEXT: s_mov_b32 s8, s6
1405 ; GFX9-FLATSCR-NEXT: s_mov_b32 s7, s5
1406 ; GFX9-FLATSCR-NEXT: s_mov_b32 s6, s4
1407 ; GFX9-FLATSCR-NEXT: s_mov_b32 s5, s3
1408 ; GFX9-FLATSCR-NEXT: s_mov_b32 s4, s2
1409 ; GFX9-FLATSCR-NEXT: image_atomic_swap v0, v1, s[4:11] dmask:0x1 unorm glc
1410 ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0)
1411 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog
1413 ; GFX11-LABEL: atomic_swap_1d:
1414 ; GFX11: ; %bb.0: ; %main_body
1415 ; GFX11-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm glc
1416 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1417 ; GFX11-NEXT: ; return to shader part epilog
1419 ; GFX12-LABEL: atomic_swap_1d:
1420 ; GFX12: ; %bb.0: ; %main_body
1421 ; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
1422 ; GFX12-NEXT: s_wait_loadcnt 0x0
1423 ; GFX12-NEXT: ; return to shader part epilog
; The swapped-out value is reinterpreted as float to match the function's
; return type.
1425 %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
1426 %out = bitcast i32 %v to float
1430 ; from lds-bounds.ll
1431 ; covers ds_write_b64 (atomic without return)
; NOTE(review): the visible IR of @store_aligned uses plain (non-atomic) `store`
; instructions; "atomic without return" presumably refers to how this store
; is categorized for wait insertion -- confirm.
; External addrspace(3) array of 512 i32; it is referenced by
; @store_global_const_idx later in this file, not by @store_aligned.
1432 @compute_lds = external addrspace(3) global [512 x i32], align 16
; Compute-shader function that stores 42 to %ptr (align 8) and 43 to the next
; i32 element. The expected output for every target shows one 64-bit DS write
; (ds_write_b64 / ds_store_b64) immediately followed by a wait
; (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on GFX12).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1434 define amdgpu_cs void @store_aligned(ptr addrspace(3) %ptr) #0 {
1435 ; GFX9-LABEL: store_aligned:
1436 ; GFX9: ; %bb.0: ; %entry
1437 ; GFX9-NEXT: v_mov_b32_e32 v1, 42
1438 ; GFX9-NEXT: v_mov_b32_e32 v2, 43
1439 ; GFX9-NEXT: ds_write_b64 v0, v[1:2]
1440 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1441 ; GFX9-NEXT: s_endpgm
1443 ; GFX90A-LABEL: store_aligned:
1444 ; GFX90A: ; %bb.0: ; %entry
1445 ; GFX90A-NEXT: v_mov_b32_e32 v2, 42
1446 ; GFX90A-NEXT: v_mov_b32_e32 v3, 43
1447 ; GFX90A-NEXT: ds_write_b64 v0, v[2:3]
1448 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1449 ; GFX90A-NEXT: s_endpgm
1451 ; GFX10-LABEL: store_aligned:
1452 ; GFX10: ; %bb.0: ; %entry
1453 ; GFX10-NEXT: v_mov_b32_e32 v1, 42
1454 ; GFX10-NEXT: v_mov_b32_e32 v2, 43
1455 ; GFX10-NEXT: ds_write_b64 v0, v[1:2]
1456 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1457 ; GFX10-NEXT: s_endpgm
1459 ; GFX9-FLATSCR-LABEL: store_aligned:
1460 ; GFX9-FLATSCR: ; %bb.0: ; %entry
1461 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 42
1462 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 43
1463 ; GFX9-FLATSCR-NEXT: ds_write_b64 v0, v[1:2]
1464 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1465 ; GFX9-FLATSCR-NEXT: s_endpgm
1467 ; GFX11-LABEL: store_aligned:
1468 ; GFX11: ; %bb.0: ; %entry
1469 ; GFX11-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43
1470 ; GFX11-NEXT: ds_store_b64 v0, v[1:2]
1471 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1472 ; GFX11-NEXT: s_endpgm
1474 ; GFX12-LABEL: store_aligned:
1475 ; GFX12: ; %bb.0: ; %entry
1476 ; GFX12-NEXT: v_dual_mov_b32 v1, 42 :: v_dual_mov_b32 v2, 43
1477 ; GFX12-NEXT: ds_store_b64 v0, v[1:2]
1478 ; GFX12-NEXT: s_wait_dscnt 0x0
1479 ; GFX12-NEXT: s_endpgm
; Address of the i32 element directly after %ptr.
1481 %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
; Two adjacent i32 stores; the first carries the explicit 8-byte alignment.
1483 store i32 42, ptr addrspace(3) %ptr, align 8
1484 store i32 43, ptr addrspace(3) %ptr.gep.1
1489 ; from lds-bounds.ll
1490 ; covers ds_read_b64
; Compute-shader function that loads the i32 at %ptr (align 8) and the next
; i32 element, packs them into a <2 x i32> and returns it as <2 x float>.
; The expected output for every target shows one 64-bit DS read
; (ds_read_b64 / ds_load_b64) immediately followed by a wait
; (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on GFX12).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1492 define amdgpu_cs <2 x float> @load_aligned(ptr addrspace(3) %ptr) #0 {
1493 ; GFX9-LABEL: load_aligned:
1494 ; GFX9: ; %bb.0: ; %entry
1495 ; GFX9-NEXT: ds_read_b64 v[0:1], v0
1496 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1497 ; GFX9-NEXT: ; return to shader part epilog
1499 ; GFX90A-LABEL: load_aligned:
1500 ; GFX90A: ; %bb.0: ; %entry
1501 ; GFX90A-NEXT: ds_read_b64 v[0:1], v0
1502 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1503 ; GFX90A-NEXT: ; return to shader part epilog
1505 ; GFX10-LABEL: load_aligned:
1506 ; GFX10: ; %bb.0: ; %entry
1507 ; GFX10-NEXT: ds_read_b64 v[0:1], v0
1508 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1509 ; GFX10-NEXT: ; return to shader part epilog
1511 ; GFX9-FLATSCR-LABEL: load_aligned:
1512 ; GFX9-FLATSCR: ; %bb.0: ; %entry
1513 ; GFX9-FLATSCR-NEXT: ds_read_b64 v[0:1], v0
1514 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1515 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog
1517 ; GFX11-LABEL: load_aligned:
1518 ; GFX11: ; %bb.0: ; %entry
1519 ; GFX11-NEXT: ds_load_b64 v[0:1], v0
1520 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1521 ; GFX11-NEXT: ; return to shader part epilog
1523 ; GFX12-LABEL: load_aligned:
1524 ; GFX12: ; %bb.0: ; %entry
1525 ; GFX12-NEXT: ds_load_b64 v[0:1], v0
1526 ; GFX12-NEXT: s_wait_dscnt 0x0
1527 ; GFX12-NEXT: ; return to shader part epilog
; Address of the i32 element directly after %ptr.
1529 %ptr.gep.1 = getelementptr i32, ptr addrspace(3) %ptr, i32 1
; Two adjacent i32 loads; the first carries the explicit 8-byte alignment.
1531 %v.0 = load i32, ptr addrspace(3) %ptr, align 8
1532 %v.1 = load i32, ptr addrspace(3) %ptr.gep.1
; Pack both loaded values into a vector and reinterpret it as floats.
1534 %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0
1535 %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
1536 %bc = bitcast <2 x i32> %r.1 to <2 x float>
1540 ; from lds-bounds.ll
1541 ; covers ds_write2_b32
; Compute-shader function that stores 42 and 43 to elements 3 and 4 of the
; global array @compute_lds. The expected output for every target shows a
; single two-address DS write (ds_write2_b32 / ds_store_2addr_b32 with
; offset0:3 offset1:4) immediately followed by a wait
; (s_waitcnt lgkmcnt(0), or s_wait_dscnt 0x0 on GFX12).
; The GFX*-prefixed lines are autogenerated (see the NOTE on the first line of
; this file); regenerate them with the update script rather than editing them.
1543 define amdgpu_cs void @store_global_const_idx() #0 {
1544 ; GFX9-LABEL: store_global_const_idx:
1545 ; GFX9: ; %bb.0: ; %entry
1546 ; GFX9-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1547 ; GFX9-NEXT: v_mov_b32_e32 v1, 42
1548 ; GFX9-NEXT: v_mov_b32_e32 v2, 43
1549 ; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1550 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1551 ; GFX9-NEXT: s_endpgm
1553 ; GFX90A-LABEL: store_global_const_idx:
1554 ; GFX90A: ; %bb.0: ; %entry
1555 ; GFX90A-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1556 ; GFX90A-NEXT: v_mov_b32_e32 v1, 42
1557 ; GFX90A-NEXT: v_mov_b32_e32 v2, 43
1558 ; GFX90A-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1559 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1560 ; GFX90A-NEXT: s_endpgm
1562 ; GFX10-LABEL: store_global_const_idx:
1563 ; GFX10: ; %bb.0: ; %entry
1564 ; GFX10-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1565 ; GFX10-NEXT: v_mov_b32_e32 v1, 42
1566 ; GFX10-NEXT: v_mov_b32_e32 v2, 43
1567 ; GFX10-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1568 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1569 ; GFX10-NEXT: s_endpgm
1571 ; GFX9-FLATSCR-LABEL: store_global_const_idx:
1572 ; GFX9-FLATSCR: ; %bb.0: ; %entry
1573 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1574 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v1, 42
1575 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 43
1576 ; GFX9-FLATSCR-NEXT: ds_write2_b32 v0, v1, v2 offset0:3 offset1:4
1577 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1578 ; GFX9-FLATSCR-NEXT: s_endpgm
1580 ; GFX11-LABEL: store_global_const_idx:
1581 ; GFX11: ; %bb.0: ; %entry
1582 ; GFX11-NEXT: v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42
1583 ; GFX11-NEXT: v_mov_b32_e32 v2, 43
1584 ; GFX11-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4
1585 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1586 ; GFX11-NEXT: s_endpgm
1588 ; GFX12-LABEL: store_global_const_idx:
1589 ; GFX12: ; %bb.0: ; %entry
1590 ; GFX12-NEXT: v_dual_mov_b32 v0, compute_lds@abs32@lo :: v_dual_mov_b32 v1, 42
1591 ; GFX12-NEXT: v_mov_b32_e32 v2, 43
1592 ; GFX12-NEXT: ds_store_2addr_b32 v0, v1, v2 offset0:3 offset1:4
1593 ; GFX12-NEXT: s_wait_dscnt 0x0
1594 ; GFX12-NEXT: s_endpgm
; Constant-index addresses of elements 3 and 4 of @compute_lds.
1596 %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
1597 %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
; Two stores to adjacent elements.
1599 store i32 42, ptr addrspace(3) %ptr.a
1600 store i32 43, ptr addrspace(3) %ptr.b
1604 ; from lds-bounds.ll
1605 ; covers ds_read2_b32 (emitted as ds_load_2addr_b32 on gfx1100/gfx1200)
1607 define amdgpu_cs <2 x float> @load_global_const_idx() #0 {
1608 ; GFX9-LABEL: load_global_const_idx:
1609 ; GFX9: ; %bb.0: ; %entry
1610 ; GFX9-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1611 ; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1612 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1613 ; GFX9-NEXT: ; return to shader part epilog
1615 ; GFX90A-LABEL: load_global_const_idx:
1616 ; GFX90A: ; %bb.0: ; %entry
1617 ; GFX90A-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1618 ; GFX90A-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1619 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
1620 ; GFX90A-NEXT: ; return to shader part epilog
1622 ; GFX10-LABEL: load_global_const_idx:
1623 ; GFX10: ; %bb.0: ; %entry
1624 ; GFX10-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1625 ; GFX10-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1626 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1627 ; GFX10-NEXT: ; return to shader part epilog
1629 ; GFX9-FLATSCR-LABEL: load_global_const_idx:
1630 ; GFX9-FLATSCR: ; %bb.0: ; %entry
1631 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1632 ; GFX9-FLATSCR-NEXT: ds_read2_b32 v[0:1], v0 offset0:3 offset1:4
1633 ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
1634 ; GFX9-FLATSCR-NEXT: ; return to shader part epilog
1636 ; GFX11-LABEL: load_global_const_idx:
1637 ; GFX11: ; %bb.0: ; %entry
1638 ; GFX11-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1639 ; GFX11-NEXT: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4
1640 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1641 ; GFX11-NEXT: ; return to shader part epilog
1643 ; GFX12-LABEL: load_global_const_idx:
1644 ; GFX12: ; %bb.0: ; %entry
1645 ; GFX12-NEXT: v_mov_b32_e32 v0, compute_lds@abs32@lo
1646 ; GFX12-NEXT: ds_load_2addr_b32 v[0:1], v0 offset0:3 offset1:4
1647 ; GFX12-NEXT: s_wait_dscnt 0x0
1648 ; GFX12-NEXT: ; return to shader part epilog
1650 %ptr.a = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 3
1651 %ptr.b = getelementptr [512 x i32], ptr addrspace(3) @compute_lds, i32 0, i32 4
1653 %v.0 = load i32, ptr addrspace(3) %ptr.a
1654 %v.1 = load i32, ptr addrspace(3) %ptr.b
1656 %r.0 = insertelement <2 x i32> poison, i32 %v.0, i32 0
1657 %r.1 = insertelement <2 x i32> %r.0, i32 %v.1, i32 1
1658 %bc = bitcast <2 x i32> %r.1 to <2 x float>