1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -march=amdgcn -mcpu=gfx900 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1030 -global-isel -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx940 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -global-isel -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11 %s
6 ; Kernel with a scalar index argument: volatile store of 15 to %i[%idx], then volatile load of %i[%idx & 15], where %i is a 32 x float private alloca.
7 define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
8 ; GFX9-LABEL: store_load_sindex_kernel:
9 ; GFX9: ; %bb.0: ; %bb
10 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
11 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
12 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
13 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
14 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
15 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
16 ; GFX9-NEXT: s_and_b32 s0, s0, 15
17 ; GFX9-NEXT: s_add_i32 s1, s1, 4
18 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
19 ; GFX9-NEXT: scratch_store_dword off, v0, s1
20 ; GFX9-NEXT: s_waitcnt vmcnt(0)
21 ; GFX9-NEXT: s_add_i32 s0, s0, 4
22 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
23 ; GFX9-NEXT: s_waitcnt vmcnt(0)
24 ; GFX9-NEXT: s_endpgm
26 ; GFX10-LABEL: store_load_sindex_kernel:
27 ; GFX10: ; %bb.0: ; %bb
28 ; GFX10-NEXT: s_add_u32 s2, s2, s5
29 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
30 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
31 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
32 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
33 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
35 ; GFX10-NEXT: s_and_b32 s1, s0, 15
36 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
37 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
38 ; GFX10-NEXT: s_add_i32 s0, s0, 4
39 ; GFX10-NEXT: s_add_i32 s1, s1, 4
40 ; GFX10-NEXT: scratch_store_dword off, v0, s0
41 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
42 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
43 ; GFX10-NEXT: s_waitcnt vmcnt(0)
44 ; GFX10-NEXT: s_endpgm
46 ; GFX940-LABEL: store_load_sindex_kernel:
47 ; GFX940: ; %bb.0: ; %bb
48 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
49 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
50 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
51 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
52 ; GFX940-NEXT: s_and_b32 s0, s0, 15
53 ; GFX940-NEXT: s_add_i32 s1, s1, 4
54 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
55 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
56 ; GFX940-NEXT: s_waitcnt vmcnt(0)
57 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
58 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
59 ; GFX940-NEXT: s_waitcnt vmcnt(0)
60 ; GFX940-NEXT: s_endpgm
62 ; GFX11-LABEL: store_load_sindex_kernel:
63 ; GFX11: ; %bb.0: ; %bb
64 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
65 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
66 ; GFX11-NEXT: s_and_b32 s1, s0, 15
67 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
68 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
69 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
70 ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
71 ; GFX11-NEXT: s_add_i32 s0, s0, 4
72 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
73 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
74 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
75 ; GFX11-NEXT: s_waitcnt vmcnt(0)
76 ; GFX11-NEXT: s_endpgm
77 bb:
78 %i = alloca [32 x float], align 4, addrspace(5)
79 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
80 store volatile i32 15, ptr addrspace(5) %i7, align 4
81 %i9 = and i32 %idx, 15
82 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
83 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
84 ret void
85 }
86 ; Kernel indexed by workitem.id.x (%i2): volatile store of 15 to %i[%i2], then volatile load of %i[31 - %i2], where %i is a 32 x float private alloca.
87 define amdgpu_kernel void @store_load_vindex_kernel() {
88 ; GFX9-LABEL: store_load_vindex_kernel:
89 ; GFX9: ; %bb.0: ; %bb
90 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
91 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
92 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
93 ; GFX9-NEXT: v_add_u32_e32 v1, 4, v1
94 ; GFX9-NEXT: v_mov_b32_e32 v3, 15
95 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
96 ; GFX9-NEXT: v_mov_b32_e32 v2, 4
97 ; GFX9-NEXT: scratch_store_dword v1, v3, off
98 ; GFX9-NEXT: s_waitcnt vmcnt(0)
99 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
100 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
101 ; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
102 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
103 ; GFX9-NEXT: s_waitcnt vmcnt(0)
104 ; GFX9-NEXT: s_endpgm
106 ; GFX10-LABEL: store_load_vindex_kernel:
107 ; GFX10: ; %bb.0: ; %bb
108 ; GFX10-NEXT: s_add_u32 s0, s0, s3
109 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
110 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
111 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
112 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
113 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
114 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c
115 ; GFX10-NEXT: v_mov_b32_e32 v3, 15
116 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
117 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
118 ; GFX10-NEXT: v_add3_u32 v1, 4, v1, v2
119 ; GFX10-NEXT: scratch_store_dword v0, v3, off
120 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
121 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
122 ; GFX10-NEXT: s_waitcnt vmcnt(0)
123 ; GFX10-NEXT: s_endpgm
125 ; GFX940-LABEL: store_load_vindex_kernel:
126 ; GFX940: ; %bb.0: ; %bb
127 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
128 ; GFX940-NEXT: v_mov_b32_e32 v3, 15
129 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
130 ; GFX940-NEXT: v_mov_b32_e32 v2, 4
131 ; GFX940-NEXT: scratch_store_dword v1, v3, off offset:4 sc0 sc1
132 ; GFX940-NEXT: s_waitcnt vmcnt(0)
133 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
134 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c
135 ; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1
136 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
137 ; GFX940-NEXT: s_waitcnt vmcnt(0)
138 ; GFX940-NEXT: s_endpgm
140 ; GFX11-LABEL: store_load_vindex_kernel:
141 ; GFX11: ; %bb.0: ; %bb
142 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
143 ; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
144 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
145 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
146 ; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:4 dlc
147 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
148 ; GFX11-NEXT: v_add3_u32 v1, 4, v1, v2
149 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
150 ; GFX11-NEXT: s_waitcnt vmcnt(0)
151 ; GFX11-NEXT: s_endpgm
152 bb:
153 %i = alloca [32 x float], align 4, addrspace(5)
154 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
155 %i3 = zext i32 %i2 to i64
156 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
157 store volatile i32 15, ptr addrspace(5) %i7, align 4
158 %i9 = sub nsw i32 31, %i2
159 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
160 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
161 ret void
162 }
163 ; Callable (non-kernel) function taking the index as an argument: volatile store of 15 to %i[%idx], then volatile load of %i[%idx & 15] from a 32 x float private alloca.
164 define void @store_load_vindex_foo(i32 %idx) {
165 ; GFX9-LABEL: store_load_vindex_foo:
166 ; GFX9: ; %bb.0: ; %bb
167 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
168 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
169 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
170 ; GFX9-NEXT: v_add_u32_e32 v1, s32, v1
171 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
172 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
173 ; GFX9-NEXT: scratch_store_dword v1, v2, off
174 ; GFX9-NEXT: s_waitcnt vmcnt(0)
175 ; GFX9-NEXT: v_add_u32_e32 v0, s32, v0
176 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
177 ; GFX9-NEXT: s_waitcnt vmcnt(0)
178 ; GFX9-NEXT: s_setpc_b64 s[30:31]
180 ; GFX10-LABEL: store_load_vindex_foo:
181 ; GFX10: ; %bb.0: ; %bb
182 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
183 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
184 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
185 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
186 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
187 ; GFX10-NEXT: v_add_nc_u32_e32 v0, s32, v0
188 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s32, v1
189 ; GFX10-NEXT: scratch_store_dword v0, v2, off
190 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
191 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
192 ; GFX10-NEXT: s_waitcnt vmcnt(0)
193 ; GFX10-NEXT: s_setpc_b64 s[30:31]
195 ; GFX940-LABEL: store_load_vindex_foo:
196 ; GFX940: ; %bb.0: ; %bb
197 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
198 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
199 ; GFX940-NEXT: v_add_u32_e32 v1, s32, v1
200 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
201 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
202 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
203 ; GFX940-NEXT: s_waitcnt vmcnt(0)
204 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
205 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1
206 ; GFX940-NEXT: s_waitcnt vmcnt(0)
207 ; GFX940-NEXT: s_setpc_b64 s[30:31]
209 ; GFX11-LABEL: store_load_vindex_foo:
210 ; GFX11: ; %bb.0: ; %bb
211 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
213 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
214 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
215 ; GFX11-NEXT: v_add_nc_u32_e32 v1, s32, v1
216 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
217 ; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc
218 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
219 ; GFX11-NEXT: scratch_load_b32 v0, v0, s32 glc dlc
220 ; GFX11-NEXT: s_waitcnt vmcnt(0)
221 ; GFX11-NEXT: s_setpc_b64 s[30:31]
222 bb:
223 %i = alloca [32 x float], align 4, addrspace(5)
224 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
225 store volatile i32 15, ptr addrspace(5) %i7, align 4
226 %i9 = and i32 %idx, 15
227 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
228 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
229 ret void
230 }
231 ; Callable function taking a private (addrspace(5)) pointer: stores float 10.0 (bit pattern 0x41200000) to the element one float past %arg.
232 define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
233 ; GFX9-LABEL: private_ptr_foo:
234 ; GFX9: ; %bb.0:
235 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
236 ; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
237 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
238 ; GFX9-NEXT: scratch_store_dword v0, v1, off
239 ; GFX9-NEXT: s_waitcnt vmcnt(0)
240 ; GFX9-NEXT: s_setpc_b64 s[30:31]
242 ; GFX10-LABEL: private_ptr_foo:
243 ; GFX10: ; %bb.0:
244 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
245 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
246 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
247 ; GFX10-NEXT: scratch_store_dword v0, v1, off
248 ; GFX10-NEXT: s_setpc_b64 s[30:31]
250 ; GFX940-LABEL: private_ptr_foo:
251 ; GFX940: ; %bb.0:
252 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
253 ; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
254 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
255 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
256 ; GFX940-NEXT: s_waitcnt vmcnt(0)
257 ; GFX940-NEXT: s_setpc_b64 s[30:31]
259 ; GFX11-LABEL: private_ptr_foo:
260 ; GFX11: ; %bb.0:
261 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
262 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
263 ; GFX11-NEXT: scratch_store_b32 v0, v1, off
264 ; GFX11-NEXT: s_setpc_b64 s[30:31]
265 %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
266 store float 1.000000e+01, ptr addrspace(5) %gep, align 4
267 ret void
268 }
269 ; Scalar-index kernel with a 64 x i32 (256-byte) %padding alloca declared and volatile-loaded ahead of %i; the checks below add 0x104 (260) to the scaled index.
270 define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
271 ; GFX9-LABEL: store_load_sindex_small_offset_kernel:
272 ; GFX9: ; %bb.0: ; %bb
273 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
274 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
275 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
276 ; GFX9-NEXT: s_mov_b32 s1, 0
277 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
278 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
279 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
280 ; GFX9-NEXT: s_and_b32 s0, s0, 15
281 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
282 ; GFX9-NEXT: s_addk_i32 s1, 0x104
283 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
284 ; GFX9-NEXT: scratch_store_dword off, v0, s1
285 ; GFX9-NEXT: s_waitcnt vmcnt(0)
286 ; GFX9-NEXT: s_addk_i32 s0, 0x104
287 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
288 ; GFX9-NEXT: s_waitcnt vmcnt(0)
289 ; GFX9-NEXT: s_endpgm
291 ; GFX10-LABEL: store_load_sindex_small_offset_kernel:
292 ; GFX10: ; %bb.0: ; %bb
293 ; GFX10-NEXT: s_add_u32 s2, s2, s5
294 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
295 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
296 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
297 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
298 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
299 ; GFX10-NEXT: s_waitcnt vmcnt(0)
300 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
301 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
302 ; GFX10-NEXT: s_and_b32 s1, s0, 15
303 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
304 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
305 ; GFX10-NEXT: s_addk_i32 s0, 0x104
306 ; GFX10-NEXT: s_addk_i32 s1, 0x104
307 ; GFX10-NEXT: scratch_store_dword off, v0, s0
308 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
309 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
310 ; GFX10-NEXT: s_waitcnt vmcnt(0)
311 ; GFX10-NEXT: s_endpgm
313 ; GFX940-LABEL: store_load_sindex_small_offset_kernel:
314 ; GFX940: ; %bb.0: ; %bb
315 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
316 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
317 ; GFX940-NEXT: s_waitcnt vmcnt(0)
318 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
319 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
320 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
321 ; GFX940-NEXT: s_and_b32 s0, s0, 15
322 ; GFX940-NEXT: s_addk_i32 s1, 0x104
323 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
324 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
325 ; GFX940-NEXT: s_waitcnt vmcnt(0)
326 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
327 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:260 sc0 sc1
328 ; GFX940-NEXT: s_waitcnt vmcnt(0)
329 ; GFX940-NEXT: s_endpgm
331 ; GFX11-LABEL: store_load_sindex_small_offset_kernel:
332 ; GFX11: ; %bb.0: ; %bb
333 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
334 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
335 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
336 ; GFX11-NEXT: s_and_b32 s1, s0, 15
337 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
338 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
339 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
340 ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
341 ; GFX11-NEXT: s_addk_i32 s0, 0x104
342 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
343 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
344 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:260 glc dlc
345 ; GFX11-NEXT: s_waitcnt vmcnt(0)
346 ; GFX11-NEXT: s_endpgm
347 bb:
348 %padding = alloca [64 x i32], align 4, addrspace(5)
349 %i = alloca [32 x float], align 4, addrspace(5)
350 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
351 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
352 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
353 store volatile i32 15, ptr addrspace(5) %i7, align 4
354 %i9 = and i32 %idx, 15
355 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
356 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
357 ret void
358 }
359 ; Kernel indexed by workitem.id.x (%i2) with a 64 x i32 (256-byte) %padding alloca declared and volatile-loaded ahead of %i; store goes to %i[%i2], load comes from %i[31 - %i2].
360 define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
361 ; GFX9-LABEL: store_load_vindex_small_offset_kernel:
362 ; GFX9: ; %bb.0: ; %bb
363 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
364 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
365 ; GFX9-NEXT: s_mov_b32 s0, 0
366 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
367 ; GFX9-NEXT: s_waitcnt vmcnt(0)
368 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
369 ; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v1
370 ; GFX9-NEXT: v_mov_b32_e32 v3, 15
371 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
372 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x104
373 ; GFX9-NEXT: scratch_store_dword v1, v3, off
374 ; GFX9-NEXT: s_waitcnt vmcnt(0)
375 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
376 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
377 ; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
378 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
379 ; GFX9-NEXT: s_waitcnt vmcnt(0)
380 ; GFX9-NEXT: s_endpgm
382 ; GFX10-LABEL: store_load_vindex_small_offset_kernel:
383 ; GFX10: ; %bb.0: ; %bb
384 ; GFX10-NEXT: s_add_u32 s0, s0, s3
385 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
386 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
387 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
388 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
389 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
390 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c
391 ; GFX10-NEXT: v_mov_b32_e32 v3, 15
392 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
393 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0
394 ; GFX10-NEXT: v_add3_u32 v1, 0x104, v1, v2
395 ; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc
396 ; GFX10-NEXT: s_waitcnt vmcnt(0)
397 ; GFX10-NEXT: scratch_store_dword v0, v3, off
398 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
399 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
400 ; GFX10-NEXT: s_waitcnt vmcnt(0)
401 ; GFX10-NEXT: s_endpgm
403 ; GFX940-LABEL: store_load_vindex_small_offset_kernel:
404 ; GFX940: ; %bb.0: ; %bb
405 ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
406 ; GFX940-NEXT: s_waitcnt vmcnt(0)
407 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
408 ; GFX940-NEXT: v_mov_b32_e32 v3, 15
409 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
410 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x104
411 ; GFX940-NEXT: scratch_store_dword v1, v3, off offset:260 sc0 sc1
412 ; GFX940-NEXT: s_waitcnt vmcnt(0)
413 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
414 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c
415 ; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1
416 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
417 ; GFX940-NEXT: s_waitcnt vmcnt(0)
418 ; GFX940-NEXT: s_endpgm
420 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
421 ; GFX11: ; %bb.0: ; %bb
422 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
423 ; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
424 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
425 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
426 ; GFX11-NEXT: v_add3_u32 v1, 0x104, v1, v2
427 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
428 ; GFX11-NEXT: s_waitcnt vmcnt(0)
429 ; GFX11-NEXT: scratch_store_b32 v0, v3, off offset:260 dlc
430 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
431 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
432 ; GFX11-NEXT: s_waitcnt vmcnt(0)
433 ; GFX11-NEXT: s_endpgm
434 bb:
435 %padding = alloca [64 x i32], align 4, addrspace(5)
436 %i = alloca [32 x float], align 4, addrspace(5)
437 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
438 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
439 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
440 %i3 = zext i32 %i2 to i64
441 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
442 store volatile i32 15, ptr addrspace(5) %i7, align 4
443 %i9 = sub nsw i32 31, %i2
444 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
445 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
446 ret void
447 }
448 ; Callable function with a 64 x i32 (256-byte) %padding alloca declared and volatile-loaded ahead of %i; store goes to %i[%idx], load comes from %i[%idx & 15]; the checks below address %i at s32 + 0x100.
449 define void @store_load_vindex_small_offset_foo(i32 %idx) {
450 ; GFX9-LABEL: store_load_vindex_small_offset_foo:
451 ; GFX9: ; %bb.0: ; %bb
452 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
453 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
454 ; GFX9-NEXT: s_waitcnt vmcnt(0)
455 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
456 ; GFX9-NEXT: s_add_i32 s0, s32, 0x100
457 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
458 ; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
459 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
460 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
461 ; GFX9-NEXT: s_add_i32 s0, s32, 0x100
462 ; GFX9-NEXT: scratch_store_dword v1, v2, off
463 ; GFX9-NEXT: s_waitcnt vmcnt(0)
464 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
465 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
466 ; GFX9-NEXT: s_waitcnt vmcnt(0)
467 ; GFX9-NEXT: s_setpc_b64 s[30:31]
469 ; GFX10-LABEL: store_load_vindex_small_offset_foo:
470 ; GFX10: ; %bb.0: ; %bb
471 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
472 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
473 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
474 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100
475 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
476 ; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
477 ; GFX10-NEXT: s_waitcnt vmcnt(0)
478 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
479 ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
480 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100
481 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
482 ; GFX10-NEXT: scratch_store_dword v0, v2, off
483 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
484 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
485 ; GFX10-NEXT: s_waitcnt vmcnt(0)
486 ; GFX10-NEXT: s_setpc_b64 s[30:31]
488 ; GFX940-LABEL: store_load_vindex_small_offset_foo:
489 ; GFX940: ; %bb.0: ; %bb
490 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
491 ; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
492 ; GFX940-NEXT: s_waitcnt vmcnt(0)
493 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
494 ; GFX940-NEXT: s_add_i32 s0, s32, 0x100
495 ; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
496 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
497 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
498 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
499 ; GFX940-NEXT: s_waitcnt vmcnt(0)
500 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
501 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
502 ; GFX940-NEXT: s_waitcnt vmcnt(0)
503 ; GFX940-NEXT: s_setpc_b64 s[30:31]
505 ; GFX11-LABEL: store_load_vindex_small_offset_foo:
506 ; GFX11: ; %bb.0: ; %bb
507 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
508 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
509 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
510 ; GFX11-NEXT: s_add_i32 s0, s32, 0x100
511 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc
512 ; GFX11-NEXT: s_waitcnt vmcnt(0)
513 ; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1
514 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
515 ; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc
516 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
517 ; GFX11-NEXT: scratch_load_b32 v0, v0, s32 offset:256 glc dlc
518 ; GFX11-NEXT: s_waitcnt vmcnt(0)
519 ; GFX11-NEXT: s_setpc_b64 s[30:31]
520 bb:
521 %padding = alloca [64 x i32], align 4, addrspace(5)
522 %i = alloca [32 x float], align 4, addrspace(5)
523 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
524 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
525 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
526 store volatile i32 15, ptr addrspace(5) %i7, align 4
527 %i9 = and i32 %idx, 15
528 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
529 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
530 ret void
531 }
532 ; Scalar-index kernel with a 4096 x i32 (16384-byte) %padding alloca declared and volatile-loaded ahead of %i; the checks below add 0x4004 to the scaled index.
533 define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
534 ; GFX9-LABEL: store_load_sindex_large_offset_kernel:
535 ; GFX9: ; %bb.0: ; %bb
536 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
537 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
538 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
539 ; GFX9-NEXT: s_mov_b32 s1, 0
540 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
541 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
542 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
543 ; GFX9-NEXT: s_and_b32 s0, s0, 15
544 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
545 ; GFX9-NEXT: s_addk_i32 s1, 0x4004
546 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
547 ; GFX9-NEXT: scratch_store_dword off, v0, s1
548 ; GFX9-NEXT: s_waitcnt vmcnt(0)
549 ; GFX9-NEXT: s_addk_i32 s0, 0x4004
550 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
551 ; GFX9-NEXT: s_waitcnt vmcnt(0)
552 ; GFX9-NEXT: s_endpgm
554 ; GFX10-LABEL: store_load_sindex_large_offset_kernel:
555 ; GFX10: ; %bb.0: ; %bb
556 ; GFX10-NEXT: s_add_u32 s2, s2, s5
557 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
558 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
559 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
560 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
561 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
562 ; GFX10-NEXT: s_waitcnt vmcnt(0)
563 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
564 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
565 ; GFX10-NEXT: s_and_b32 s1, s0, 15
566 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
567 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
568 ; GFX10-NEXT: s_addk_i32 s0, 0x4004
569 ; GFX10-NEXT: s_addk_i32 s1, 0x4004
570 ; GFX10-NEXT: scratch_store_dword off, v0, s0
571 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
572 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
573 ; GFX10-NEXT: s_waitcnt vmcnt(0)
574 ; GFX10-NEXT: s_endpgm
576 ; GFX940-LABEL: store_load_sindex_large_offset_kernel:
577 ; GFX940: ; %bb.0: ; %bb
578 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
579 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
580 ; GFX940-NEXT: s_waitcnt vmcnt(0)
581 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
582 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
583 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
584 ; GFX940-NEXT: s_and_b32 s0, s0, 15
585 ; GFX940-NEXT: s_addk_i32 s1, 0x4004
586 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
587 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
588 ; GFX940-NEXT: s_waitcnt vmcnt(0)
589 ; GFX940-NEXT: v_mov_b32_e32 v0, s0
590 ; GFX940-NEXT: s_movk_i32 s0, 0x4004
591 ; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
592 ; GFX940-NEXT: s_waitcnt vmcnt(0)
593 ; GFX940-NEXT: s_endpgm
595 ; GFX11-LABEL: store_load_sindex_large_offset_kernel:
596 ; GFX11: ; %bb.0: ; %bb
597 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
598 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
599 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
600 ; GFX11-NEXT: s_and_b32 s1, s0, 15
601 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
602 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
603 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
604 ; GFX11-NEXT: v_dual_mov_b32 v0, 15 :: v_dual_mov_b32 v1, s1
605 ; GFX11-NEXT: s_addk_i32 s0, 0x4004
606 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
607 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
608 ; GFX11-NEXT: s_movk_i32 s0, 0x4004
609 ; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
610 ; GFX11-NEXT: s_waitcnt vmcnt(0)
611 ; GFX11-NEXT: s_endpgm
612 bb:
613 %padding = alloca [4096 x i32], align 4, addrspace(5)
614 %i = alloca [32 x float], align 4, addrspace(5)
615 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
616 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
617 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
618 store volatile i32 15, ptr addrspace(5) %i7, align 4
619 %i9 = and i32 %idx, 15
620 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
621 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
622 ret void
623 }
624 ; Kernel indexed by workitem.id.x (%i2) with a 4096 x i32 (16384-byte) %padding alloca declared and volatile-loaded ahead of %i; store goes to %i[%i2], load comes from %i[31 - %i2].
625 define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
626 ; GFX9-LABEL: store_load_vindex_large_offset_kernel:
627 ; GFX9: ; %bb.0: ; %bb
628 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
629 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
630 ; GFX9-NEXT: s_mov_b32 s0, 0
631 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
632 ; GFX9-NEXT: s_waitcnt vmcnt(0)
633 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
634 ; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v1
635 ; GFX9-NEXT: v_mov_b32_e32 v3, 15
636 ; GFX9-NEXT: v_sub_u32_e32 v0, 0, v0
637 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4004
638 ; GFX9-NEXT: scratch_store_dword v1, v3, off
639 ; GFX9-NEXT: s_waitcnt vmcnt(0)
640 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
641 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x7c
642 ; GFX9-NEXT: v_add3_u32 v0, v2, v0, v1
643 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
644 ; GFX9-NEXT: s_waitcnt vmcnt(0)
645 ; GFX9-NEXT: s_endpgm
647 ; GFX10-LABEL: store_load_vindex_large_offset_kernel:
648 ; GFX10: ; %bb.0: ; %bb
649 ; GFX10-NEXT: s_add_u32 s0, s0, s3
650 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
651 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
652 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
653 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0, v0
654 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
655 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x7c
656 ; GFX10-NEXT: v_mov_b32_e32 v3, 15
657 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
658 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
659 ; GFX10-NEXT: v_add3_u32 v1, 0x4004, v1, v2
660 ; GFX10-NEXT: scratch_load_dword v2, off, off offset:4 glc dlc
661 ; GFX10-NEXT: s_waitcnt vmcnt(0)
662 ; GFX10-NEXT: scratch_store_dword v0, v3, off
663 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
664 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
665 ; GFX10-NEXT: s_waitcnt vmcnt(0)
666 ; GFX10-NEXT: s_endpgm
668 ; GFX940-LABEL: store_load_vindex_large_offset_kernel:
669 ; GFX940: ; %bb.0: ; %bb
670 ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
671 ; GFX940-NEXT: s_waitcnt vmcnt(0)
672 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
673 ; GFX940-NEXT: v_mov_b32_e32 v3, 15
674 ; GFX940-NEXT: s_movk_i32 s0, 0x4004
675 ; GFX940-NEXT: v_sub_u32_e32 v0, 0, v0
676 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x4004
677 ; GFX940-NEXT: scratch_store_dword v1, v3, s0 sc0 sc1
678 ; GFX940-NEXT: s_waitcnt vmcnt(0)
679 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
680 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x7c
681 ; GFX940-NEXT: v_add3_u32 v0, v2, v0, v1
682 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
683 ; GFX940-NEXT: s_waitcnt vmcnt(0)
684 ; GFX940-NEXT: s_endpgm
686 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
687 ; GFX11: ; %bb.0: ; %bb
688 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0, v0
689 ; GFX11-NEXT: v_dual_mov_b32 v3, 15 :: v_dual_lshlrev_b32 v0, 2, v0
690 ; GFX11-NEXT: s_movk_i32 s0, 0x4004
691 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
692 ; GFX11-NEXT: v_dual_mov_b32 v2, 0x7c :: v_dual_lshlrev_b32 v1, 2, v1
693 ; GFX11-NEXT: v_add3_u32 v1, 0x4004, v1, v2
694 ; GFX11-NEXT: scratch_load_b32 v2, off, off offset:4 glc dlc
695 ; GFX11-NEXT: s_waitcnt vmcnt(0)
696 ; GFX11-NEXT: scratch_store_b32 v0, v3, s0 dlc
697 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
698 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
699 ; GFX11-NEXT: s_waitcnt vmcnt(0)
700 ; GFX11-NEXT: s_endpgm
701 bb:
702 %padding = alloca [4096 x i32], align 4, addrspace(5)
703 %i = alloca [32 x float], align 4, addrspace(5)
704 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
705 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
706 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
707 %i3 = zext i32 %i2 to i64
708 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
709 store volatile i32 15, ptr addrspace(5) %i7, align 4
710 %i9 = sub nsw i32 31, %i2
711 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
712 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
713 ret void
714 }
; Callable (non-kernel) function: store to and reload from a 32-element
; private array that is indexed by the i32 argument and that sits behind a
; 4096 x i32 padding alloca (16384 bytes). The autogenerated checks below
; expect the base of the small array to be formed as s32 + 0x4004 on every
; target before the per-lane offset is applied.
716 define void @store_load_vindex_large_offset_foo(i32 %idx) {
717 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
718 ; GFX9: ; %bb.0: ; %bb
719 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
720 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
721 ; GFX9-NEXT: s_waitcnt vmcnt(0)
722 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
723 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
724 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
725 ; GFX9-NEXT: v_add_u32_e32 v1, s0, v1
726 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
727 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
728 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
729 ; GFX9-NEXT: scratch_store_dword v1, v2, off
730 ; GFX9-NEXT: s_waitcnt vmcnt(0)
731 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
732 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
733 ; GFX9-NEXT: s_waitcnt vmcnt(0)
734 ; GFX9-NEXT: s_setpc_b64 s[30:31]
736 ; GFX10-LABEL: store_load_vindex_large_offset_foo:
737 ; GFX10: ; %bb.0: ; %bb
738 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
739 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
740 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
741 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
742 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
743 ; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
744 ; GFX10-NEXT: s_waitcnt vmcnt(0)
745 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v1
746 ; GFX10-NEXT: v_add_nc_u32_e32 v0, s0, v0
747 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
748 ; GFX10-NEXT: v_add_nc_u32_e32 v1, s0, v1
749 ; GFX10-NEXT: scratch_store_dword v0, v2, off
750 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
751 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
752 ; GFX10-NEXT: s_waitcnt vmcnt(0)
753 ; GFX10-NEXT: s_setpc_b64 s[30:31]
755 ; GFX940-LABEL: store_load_vindex_large_offset_foo:
756 ; GFX940: ; %bb.0: ; %bb
757 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
758 ; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
759 ; GFX940-NEXT: s_waitcnt vmcnt(0)
760 ; GFX940-NEXT: v_lshlrev_b32_e32 v1, 2, v0
761 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
762 ; GFX940-NEXT: v_add_u32_e32 v1, s0, v1
763 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
764 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
765 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
766 ; GFX940-NEXT: s_waitcnt vmcnt(0)
767 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
768 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
769 ; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
770 ; GFX940-NEXT: s_waitcnt vmcnt(0)
771 ; GFX940-NEXT: s_setpc_b64 s[30:31]
773 ; GFX11-LABEL: store_load_vindex_large_offset_foo:
774 ; GFX11: ; %bb.0: ; %bb
775 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
776 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_lshlrev_b32 v1, 2, v0
777 ; GFX11-NEXT: v_and_b32_e32 v0, 15, v0
778 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
779 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
780 ; GFX11-NEXT: s_waitcnt vmcnt(0)
781 ; GFX11-NEXT: v_add_nc_u32_e32 v1, s0, v1
782 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
783 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
784 ; GFX11-NEXT: scratch_store_b32 v1, v2, off dlc
785 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
786 ; GFX11-NEXT: scratch_load_b32 v0, v0, s0 glc dlc
787 ; GFX11-NEXT: s_waitcnt vmcnt(0)
788 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Test input. %padding exists only to occupy the first 16 KiB of the frame;
; the volatile load from it keeps the alloca in use. The store writes 15 to
; %i[%idx]; the reload reads %i[%idx & 15]. All accesses are volatile so each
; one must appear in the generated code.
790 %padding = alloca [4096 x i32], align 4, addrspace(5)
791 %i = alloca [32 x float], align 4, addrspace(5)
792 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
793 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
794 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
795 store volatile i32 15, ptr addrspace(5) %i7, align 4
796 %i9 = and i32 %idx, 15
797 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
798 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel entry point: store to and reload from element 4000 of a 4096 x i32
; private array. Element 4000 lies 16000 bytes (0x3e80) into the array, and
; the autogenerated checks below expect that constant to be materialised in a
; register on every target rather than encoded as the instruction offset.
802 define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
803 ; GFX9-LABEL: store_load_large_imm_offset_kernel:
804 ; GFX9: ; %bb.0: ; %bb
805 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
806 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
807 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
808 ; GFX9-NEXT: s_mov_b32 s0, 0
809 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
810 ; GFX9-NEXT: s_waitcnt vmcnt(0)
811 ; GFX9-NEXT: s_movk_i32 s0, 0x3e80
812 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
813 ; GFX9-NEXT: s_add_i32 s0, s0, 4
814 ; GFX9-NEXT: scratch_store_dword off, v0, s0
815 ; GFX9-NEXT: s_waitcnt vmcnt(0)
816 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
817 ; GFX9-NEXT: s_waitcnt vmcnt(0)
818 ; GFX9-NEXT: s_endpgm
820 ; GFX10-LABEL: store_load_large_imm_offset_kernel:
821 ; GFX10: ; %bb.0: ; %bb
822 ; GFX10-NEXT: s_add_u32 s0, s0, s3
823 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
824 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
825 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
826 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
827 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
828 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80
829 ; GFX10-NEXT: s_add_i32 s0, s0, 4
830 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
831 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
832 ; GFX10-NEXT: scratch_store_dword off, v1, s0
833 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
834 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
835 ; GFX10-NEXT: s_waitcnt vmcnt(0)
836 ; GFX10-NEXT: s_endpgm
838 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
839 ; GFX940: ; %bb.0: ; %bb
840 ; GFX940-NEXT: v_mov_b32_e32 v0, 13
841 ; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
842 ; GFX940-NEXT: s_waitcnt vmcnt(0)
843 ; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
844 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
845 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
846 ; GFX940-NEXT: s_waitcnt vmcnt(0)
847 ; GFX940-NEXT: scratch_load_dword v0, v0, off offset:4 sc0 sc1
848 ; GFX940-NEXT: s_waitcnt vmcnt(0)
849 ; GFX940-NEXT: s_endpgm
851 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
852 ; GFX11: ; %bb.0: ; %bb
853 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
854 ; GFX11-NEXT: v_mov_b32_e32 v2, 15
855 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
856 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
857 ; GFX11-NEXT: scratch_store_b32 v1, v2, off offset:4 dlc
858 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
859 ; GFX11-NEXT: scratch_load_b32 v0, v1, off offset:4 glc dlc
860 ; GFX11-NEXT: s_waitcnt vmcnt(0)
861 ; GFX11-NEXT: s_endpgm
; Test input. The first store (value 13, undef index) touches the array at an
; unspecified element; the second store (value 15) and the reload both address
; element 4000. All accesses are volatile so each one must appear in the
; generated code.
863 %i = alloca [4096 x i32], align 4, addrspace(5)
864 %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
865 store volatile i32 13, ptr addrspace(5) %i1, align 4
866 %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
867 store volatile i32 15, ptr addrspace(5) %i7, align 4
868 %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
869 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Callable (non-kernel) variant of the large-immediate-offset test: same IR
; body as the kernel version, but the autogenerated checks below address the
; array relative to s32 and return with s_setpc_b64 instead of s_endpgm.
; Element 4000 is again 16000 bytes (0x3e80) into the array.
873 define void @store_load_large_imm_offset_foo() {
874 ; GFX9-LABEL: store_load_large_imm_offset_foo:
875 ; GFX9: ; %bb.0: ; %bb
876 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
877 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
878 ; GFX9-NEXT: s_movk_i32 s0, 0x3e80
879 ; GFX9-NEXT: s_add_i32 s1, s32, 4
880 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
881 ; GFX9-NEXT: s_waitcnt vmcnt(0)
882 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
883 ; GFX9-NEXT: s_add_i32 s0, s0, s1
884 ; GFX9-NEXT: scratch_store_dword off, v0, s0
885 ; GFX9-NEXT: s_waitcnt vmcnt(0)
886 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
887 ; GFX9-NEXT: s_waitcnt vmcnt(0)
888 ; GFX9-NEXT: s_setpc_b64 s[30:31]
890 ; GFX10-LABEL: store_load_large_imm_offset_foo:
891 ; GFX10: ; %bb.0: ; %bb
892 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
893 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
894 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
895 ; GFX10-NEXT: s_movk_i32 s0, 0x3e80
896 ; GFX10-NEXT: s_add_i32 s1, s32, 4
897 ; GFX10-NEXT: s_add_i32 s0, s0, s1
898 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
899 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
900 ; GFX10-NEXT: scratch_store_dword off, v1, s0
901 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
902 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
903 ; GFX10-NEXT: s_waitcnt vmcnt(0)
904 ; GFX10-NEXT: s_setpc_b64 s[30:31]
906 ; GFX940-LABEL: store_load_large_imm_offset_foo:
907 ; GFX940: ; %bb.0: ; %bb
908 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
909 ; GFX940-NEXT: v_mov_b32_e32 v0, 13
910 ; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
911 ; GFX940-NEXT: s_waitcnt vmcnt(0)
912 ; GFX940-NEXT: v_mov_b32_e32 v0, 0x3e80
913 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
914 ; GFX940-NEXT: scratch_store_dword v0, v1, s32 offset:4 sc0 sc1
915 ; GFX940-NEXT: s_waitcnt vmcnt(0)
916 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:4 sc0 sc1
917 ; GFX940-NEXT: s_waitcnt vmcnt(0)
918 ; GFX940-NEXT: s_setpc_b64 s[30:31]
920 ; GFX11-LABEL: store_load_large_imm_offset_foo:
921 ; GFX11: ; %bb.0: ; %bb
922 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
923 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3e80
924 ; GFX11-NEXT: v_mov_b32_e32 v2, 15
925 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
926 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
927 ; GFX11-NEXT: scratch_store_b32 v1, v2, s32 offset:4 dlc
928 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
929 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:4 glc dlc
930 ; GFX11-NEXT: s_waitcnt vmcnt(0)
931 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Test input. The first store (value 13, undef index) touches the array at an
; unspecified element; the second store (value 15) and the reload both address
; element 4000. All accesses are volatile so each one must appear in the
; generated code.
933 %i = alloca [4096 x i32], align 4, addrspace(5)
934 %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
935 store volatile i32 13, ptr addrspace(5) %i1, align 4
936 %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
937 store volatile i32 15, ptr addrspace(5) %i7, align 4
938 %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
939 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel entry point: the private-array index is the sum of a kernel argument
; (%sidx), the workitem id and the constant 256. The autogenerated checks
; below expect the two variable parts to be combined with v_add_lshl_u32 and
; the constant part (256 * 4 = 0x400) plus the frame offset 4 to be folded in
; with a single v_add3_u32.
943 define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
944 ; GFX9-LABEL: store_load_vidx_sidx_offset:
945 ; GFX9: ; %bb.0: ; %bb
946 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
947 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
948 ; GFX9-NEXT: v_mov_b32_e32 v1, 4
949 ; GFX9-NEXT: v_mov_b32_e32 v2, 0x400
950 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
951 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
952 ; GFX9-NEXT: v_add_lshl_u32 v0, s0, v0, 2
953 ; GFX9-NEXT: v_add3_u32 v0, v1, v0, v2
954 ; GFX9-NEXT: v_mov_b32_e32 v1, 15
955 ; GFX9-NEXT: scratch_store_dword v0, v1, off
956 ; GFX9-NEXT: s_waitcnt vmcnt(0)
957 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
958 ; GFX9-NEXT: s_waitcnt vmcnt(0)
959 ; GFX9-NEXT: s_endpgm
961 ; GFX10-LABEL: store_load_vidx_sidx_offset:
962 ; GFX10: ; %bb.0: ; %bb
963 ; GFX10-NEXT: s_add_u32 s2, s2, s5
964 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
965 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
966 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
967 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
968 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x400
969 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
970 ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2
971 ; GFX10-NEXT: v_add3_u32 v0, 4, v0, v1
972 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
973 ; GFX10-NEXT: scratch_store_dword v0, v1, off
974 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
975 ; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc
976 ; GFX10-NEXT: s_waitcnt vmcnt(0)
977 ; GFX10-NEXT: s_endpgm
979 ; GFX940-LABEL: store_load_vidx_sidx_offset:
980 ; GFX940: ; %bb.0: ; %bb
981 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
982 ; GFX940-NEXT: v_mov_b32_e32 v1, 4
983 ; GFX940-NEXT: v_mov_b32_e32 v2, 0x400
984 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
985 ; GFX940-NEXT: v_add_lshl_u32 v0, s0, v0, 2
986 ; GFX940-NEXT: v_add3_u32 v0, v1, v0, v2
987 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
988 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
989 ; GFX940-NEXT: s_waitcnt vmcnt(0)
990 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
991 ; GFX940-NEXT: s_waitcnt vmcnt(0)
992 ; GFX940-NEXT: s_endpgm
994 ; GFX11-LABEL: store_load_vidx_sidx_offset:
995 ; GFX11: ; %bb.0: ; %bb
996 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
997 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x400
998 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
999 ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2
1000 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1001 ; GFX11-NEXT: v_add3_u32 v0, 4, v0, v1
1002 ; GFX11-NEXT: v_mov_b32_e32 v1, 15
1003 ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
1004 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1005 ; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc
1006 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1007 ; GFX11-NEXT: s_endpgm
; Test input. The index is (%sidx + workitem.id.x) + 256, which is past the
; end of the 32-element alloca; the test only inspects address selection.
; The store and the reload use the same pointer, and both are volatile so
; each must appear in the generated code.
1009 %alloca = alloca [32 x i32], align 4, addrspace(5)
1010 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
1011 %add1 = add nsw i32 %sidx, %vidx
1012 %add2 = add nsw i32 %add1, 256
1013 %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2
1014 store volatile i32 15, ptr addrspace(5) %gep, align 4
1015 %load = load volatile i32, ptr addrspace(5) %gep, align 4
; Callable function: volatile 64-bit store of the constant 15 followed by a
; volatile 64-bit reload through the addrspace(5) pointer argument, both with
; 8-byte alignment. The autogenerated checks below expect one 64-bit scratch
; store and one 64-bit scratch load on every target.
1019 define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
1020 ; GFX9-LABEL: store_load_i64_aligned:
1021 ; GFX9: ; %bb.0: ; %bb
1022 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1023 ; GFX9-NEXT: v_mov_b32_e32 v1, 15
1024 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1025 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
1026 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1027 ; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
1028 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1029 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1031 ; GFX10-LABEL: store_load_i64_aligned:
1032 ; GFX10: ; %bb.0: ; %bb
1033 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1034 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
1035 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1036 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
1037 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1038 ; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
1039 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1040 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1042 ; GFX940-LABEL: store_load_i64_aligned:
1043 ; GFX940: ; %bb.0: ; %bb
1044 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1045 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 15
1046 ; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
1047 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1048 ; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
1049 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1050 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1052 ; GFX11-LABEL: store_load_i64_aligned:
1053 ; GFX11: ; %bb.0: ; %bb
1054 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1055 ; GFX11-NEXT: v_mov_b32_e32 v1, 15
1056 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1057 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc
1058 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1059 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
1060 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1061 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Test input: naturally aligned (align 8) i64 store and reload via %arg.
1063 store volatile i64 15, ptr addrspace(5) %arg, align 8
1064 %load = load volatile i64, ptr addrspace(5) %arg, align 8
; Callable function: same as the aligned i64 test but with align 1 on both
; accesses. The autogenerated checks below still expect a single 64-bit
; scratch store and a single 64-bit scratch load, i.e. the under-aligned
; access is not split into narrower pieces.
1068 define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
1069 ; GFX9-LABEL: store_load_i64_unaligned:
1070 ; GFX9: ; %bb.0: ; %bb
1071 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1072 ; GFX9-NEXT: v_mov_b32_e32 v1, 15
1073 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
1074 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
1075 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1076 ; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
1077 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1078 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1080 ; GFX10-LABEL: store_load_i64_unaligned:
1081 ; GFX10: ; %bb.0: ; %bb
1082 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1083 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
1084 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
1085 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
1086 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1087 ; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
1088 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1089 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1091 ; GFX940-LABEL: store_load_i64_unaligned:
1092 ; GFX940: ; %bb.0: ; %bb
1093 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1094 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], 15
1095 ; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
1096 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1097 ; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
1098 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1099 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1101 ; GFX11-LABEL: store_load_i64_unaligned:
1102 ; GFX11: ; %bb.0: ; %bb
1103 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1104 ; GFX11-NEXT: v_mov_b32_e32 v1, 15
1105 ; GFX11-NEXT: v_mov_b32_e32 v2, 0
1106 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc
1107 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1108 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
1109 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1110 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Test input: byte-aligned (align 1) i64 store and reload via %arg.
1112 store volatile i64 15, ptr addrspace(5) %arg, align 1
1113 %load = load volatile i64, ptr addrspace(5) %arg, align 1
; Callable function: volatile store of the vector <1, 2, 3> followed by a
; volatile <3 x i32> reload through the addrspace(5) pointer argument, both
; with align 1. The autogenerated checks below expect one 96-bit scratch
; store and one 96-bit scratch load on every target.
1117 define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
1118 ; GFX9-LABEL: store_load_v3i32_unaligned:
1119 ; GFX9: ; %bb.0: ; %bb
1120 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1121 ; GFX9-NEXT: s_mov_b32 s2, 3
1122 ; GFX9-NEXT: s_mov_b32 s1, 2
1123 ; GFX9-NEXT: s_mov_b32 s0, 1
1124 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1125 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1126 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
1127 ; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
1128 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1129 ; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
1130 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1131 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1133 ; GFX10-LABEL: store_load_v3i32_unaligned:
1134 ; GFX10: ; %bb.0: ; %bb
1135 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1136 ; GFX10-NEXT: s_mov_b32 s2, 3
1137 ; GFX10-NEXT: s_mov_b32 s1, 2
1138 ; GFX10-NEXT: s_mov_b32 s0, 1
1139 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
1140 ; GFX10-NEXT: v_mov_b32_e32 v2, s1
1141 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
1142 ; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
1143 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1144 ; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
1145 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1146 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1148 ; GFX940-LABEL: store_load_v3i32_unaligned:
1149 ; GFX940: ; %bb.0: ; %bb
1150 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1151 ; GFX940-NEXT: s_mov_b32 s2, 3
1152 ; GFX940-NEXT: s_mov_b32 s1, 2
1153 ; GFX940-NEXT: s_mov_b32 s0, 1
1154 ; GFX940-NEXT: v_mov_b32_e32 v4, s2
1155 ; GFX940-NEXT: v_mov_b32_e32 v3, s1
1156 ; GFX940-NEXT: v_mov_b32_e32 v2, s0
1157 ; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
1158 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1159 ; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
1160 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1161 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1163 ; GFX11-LABEL: store_load_v3i32_unaligned:
1164 ; GFX11: ; %bb.0: ; %bb
1165 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1166 ; GFX11-NEXT: s_mov_b32 s2, 3
1167 ; GFX11-NEXT: s_mov_b32 s1, 2
1168 ; GFX11-NEXT: s_mov_b32 s0, 1
1169 ; GFX11-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
1170 ; GFX11-NEXT: v_mov_b32_e32 v1, s0
1171 ; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc
1172 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1173 ; GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc
1174 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1175 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Test input: byte-aligned (align 1) <3 x i32> store and reload via %arg.
1177 store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
1178 %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1
; Callable function: volatile store of the vector <1, 2, 3, 4> followed by a
; volatile <4 x i32> reload through the addrspace(5) pointer argument, both
; with align 1. The autogenerated checks below expect one 128-bit scratch
; store and one 128-bit scratch load on every target.
1182 define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
1183 ; GFX9-LABEL: store_load_v4i32_unaligned:
1184 ; GFX9: ; %bb.0: ; %bb
1185 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1186 ; GFX9-NEXT: s_mov_b32 s3, 4
1187 ; GFX9-NEXT: s_mov_b32 s2, 3
1188 ; GFX9-NEXT: s_mov_b32 s1, 2
1189 ; GFX9-NEXT: s_mov_b32 s0, 1
1190 ; GFX9-NEXT: v_mov_b32_e32 v4, s3
1191 ; GFX9-NEXT: v_mov_b32_e32 v3, s2
1192 ; GFX9-NEXT: v_mov_b32_e32 v2, s1
1193 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
1194 ; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
1195 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1196 ; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc
1197 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1200 ; GFX10-LABEL: store_load_v4i32_unaligned:
1201 ; GFX10: ; %bb.0: ; %bb
1202 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1203 ; GFX10-NEXT: s_mov_b32 s3, 4
1204 ; GFX10-NEXT: s_mov_b32 s2, 3
1205 ; GFX10-NEXT: s_mov_b32 s1, 2
1206 ; GFX10-NEXT: s_mov_b32 s0, 1
1207 ; GFX10-NEXT: v_mov_b32_e32 v4, s3
1208 ; GFX10-NEXT: v_mov_b32_e32 v3, s2
1209 ; GFX10-NEXT: v_mov_b32_e32 v2, s1
1210 ; GFX10-NEXT: v_mov_b32_e32 v1, s0
1211 ; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
1212 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1213 ; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc
1214 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1215 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1217 ; GFX940-LABEL: store_load_v4i32_unaligned:
1218 ; GFX940: ; %bb.0: ; %bb
1219 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1220 ; GFX940-NEXT: s_mov_b32 s3, 4
1221 ; GFX940-NEXT: s_mov_b32 s2, 3
1222 ; GFX940-NEXT: s_mov_b32 s1, 2
1223 ; GFX940-NEXT: s_mov_b32 s0, 1
1224 ; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
1225 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
1226 ; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
1227 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1228 ; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
1229 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1230 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1232 ; GFX11-LABEL: store_load_v4i32_unaligned:
1233 ; GFX11: ; %bb.0: ; %bb
1234 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1235 ; GFX11-NEXT: s_mov_b32 s3, 4
1236 ; GFX11-NEXT: s_mov_b32 s2, 3
1237 ; GFX11-NEXT: s_mov_b32 s1, 2
1238 ; GFX11-NEXT: s_mov_b32 s0, 1
1239 ; GFX11-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
1240 ; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
1241 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc
1242 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1243 ; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc
1244 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1245 ; GFX11-NEXT: s_setpc_b64 s[30:31]
; Test input: byte-aligned (align 1) <4 x i32> store and reload via %arg.
1247 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
1248 %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1
; Intrinsic called by the tests in this file that need an index which differs
; per work-item (it takes no arguments and yields an i32).
1252 declare i32 @llvm.amdgcn.workitem.id.x()