1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
5 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
7 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
8 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
9 ; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s
; zero_init_kernel: amdgpu_kernel function that allocates a [32 x i16] array
; in addrspace(5) and zero-fills all 64 bytes of it with llvm.memset.
; The autogenerated assertions below record, per llc configuration listed in
; the file header, the scratch stores emitted for that fill (four 16-byte
; stores at offsets 52, 36, 20 and 4).
11 define amdgpu_kernel void @zero_init_kernel() {
12 ; GFX9-LABEL: zero_init_kernel:
14 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
15 ; GFX9-NEXT: s_mov_b32 s0, 0
16 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
17 ; GFX9-NEXT: s_mov_b32 s1, s0
18 ; GFX9-NEXT: s_mov_b32 s2, s0
19 ; GFX9-NEXT: s_mov_b32 s3, s0
20 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
21 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
22 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
23 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
24 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52
25 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
26 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
27 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4
30 ; GFX10-LABEL: zero_init_kernel:
32 ; GFX10-NEXT: s_add_u32 s0, s0, s3
33 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
34 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
35 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
36 ; GFX10-NEXT: s_mov_b32 s0, 0
37 ; GFX10-NEXT: s_mov_b32 s1, s0
38 ; GFX10-NEXT: s_mov_b32 s2, s0
39 ; GFX10-NEXT: s_mov_b32 s3, s0
40 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
41 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
42 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
43 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
44 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52
45 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
46 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
47 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
48 ; GFX10-NEXT: s_endpgm
50 ; GFX11-LABEL: zero_init_kernel:
52 ; GFX11-NEXT: s_mov_b32 s0, 0
53 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
54 ; GFX11-NEXT: s_mov_b32 s1, s0
55 ; GFX11-NEXT: s_mov_b32 s2, s0
56 ; GFX11-NEXT: s_mov_b32 s3, s0
57 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
58 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
59 ; GFX11-NEXT: s_clause 0x3
60 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:52
61 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:36
62 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:20
63 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:4
64 ; GFX11-NEXT: s_endpgm
66 ; GFX9-PAL-LABEL: zero_init_kernel:
68 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
69 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
70 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
71 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
72 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
73 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
74 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
75 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
76 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0
77 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
78 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0
79 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
80 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
81 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
82 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
83 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52
84 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
85 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
86 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4
87 ; GFX9-PAL-NEXT: s_endpgm
89 ; GFX940-LABEL: zero_init_kernel:
91 ; GFX940-NEXT: s_mov_b32 s0, 0
92 ; GFX940-NEXT: s_mov_b32 s1, s0
93 ; GFX940-NEXT: s_mov_b32 s2, s0
94 ; GFX940-NEXT: s_mov_b32 s3, s0
95 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
96 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
97 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52 sc0 sc1
98 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36 sc0 sc1
99 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20 sc0 sc1
100 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4 sc0 sc1
101 ; GFX940-NEXT: s_endpgm
103 ; GFX1010-PAL-LABEL: zero_init_kernel:
104 ; GFX1010-PAL: ; %bb.0:
105 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
106 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
107 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
108 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
109 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
110 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
111 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
112 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
113 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
114 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
115 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
116 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
117 ; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
118 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
119 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
120 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
121 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
122 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:52
123 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:36
124 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:20
125 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:4
126 ; GFX1010-PAL-NEXT: s_endpgm
128 ; GFX1030-PAL-LABEL: zero_init_kernel:
129 ; GFX1030-PAL: ; %bb.0:
130 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
131 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
132 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
133 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
134 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
135 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
136 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
137 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
138 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
139 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
140 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
141 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
142 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
143 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
144 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
145 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
146 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
147 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:52
148 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:36
149 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:20
150 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:4
151 ; GFX1030-PAL-NEXT: s_endpgm
153 ; GFX11-PAL-LABEL: zero_init_kernel:
154 ; GFX11-PAL: ; %bb.0:
155 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0
156 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
157 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0
158 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0
159 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0
160 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
161 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
162 ; GFX11-PAL-NEXT: s_clause 0x3
163 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:52
164 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:36
165 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:20
166 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:4
167 ; GFX11-PAL-NEXT: s_endpgm
; IR under test: 32 x i16 = 64 bytes, cleared by a non-volatile memset.
168 %alloca = alloca [32 x i16], align 2, addrspace(5)
169 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
; zero_init_foo: the same 64-byte zero-fill of a [32 x i16] addrspace(5)
; alloca as zero_init_kernel, but in a function with the default calling
; convention. The recorded output addresses the array relative to s32
; (offsets 48, 32, 16 and 0) and returns with s_setpc_b64 s[30:31].
173 define void @zero_init_foo() {
174 ; GFX9-LABEL: zero_init_foo:
176 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
177 ; GFX9-NEXT: s_mov_b32 s0, 0
178 ; GFX9-NEXT: s_mov_b32 s1, s0
179 ; GFX9-NEXT: s_mov_b32 s2, s0
180 ; GFX9-NEXT: s_mov_b32 s3, s0
181 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
182 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
183 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
184 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
185 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
186 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
187 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
188 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32
189 ; GFX9-NEXT: s_waitcnt vmcnt(0)
190 ; GFX9-NEXT: s_setpc_b64 s[30:31]
192 ; GFX10-LABEL: zero_init_foo:
194 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
195 ; GFX10-NEXT: s_mov_b32 s0, 0
196 ; GFX10-NEXT: s_mov_b32 s1, s0
197 ; GFX10-NEXT: s_mov_b32 s2, s0
198 ; GFX10-NEXT: s_mov_b32 s3, s0
199 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
200 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
201 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
202 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
203 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
204 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
205 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
206 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32
207 ; GFX10-NEXT: s_setpc_b64 s[30:31]
209 ; GFX11-LABEL: zero_init_foo:
211 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
212 ; GFX11-NEXT: s_mov_b32 s0, 0
213 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
214 ; GFX11-NEXT: s_mov_b32 s1, s0
215 ; GFX11-NEXT: s_mov_b32 s2, s0
216 ; GFX11-NEXT: s_mov_b32 s3, s0
217 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
218 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
219 ; GFX11-NEXT: s_clause 0x3
220 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48
221 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
222 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
223 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32
224 ; GFX11-NEXT: s_setpc_b64 s[30:31]
226 ; GFX9-PAL-LABEL: zero_init_foo:
228 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
229 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
230 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0
231 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
232 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0
233 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
234 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
235 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
236 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
237 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
238 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
239 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
240 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32
241 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
242 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
244 ; GFX940-LABEL: zero_init_foo:
246 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
247 ; GFX940-NEXT: s_mov_b32 s0, 0
248 ; GFX940-NEXT: s_mov_b32 s1, s0
249 ; GFX940-NEXT: s_mov_b32 s2, s0
250 ; GFX940-NEXT: s_mov_b32 s3, s0
251 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
252 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
253 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48 sc0 sc1
254 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32 sc0 sc1
255 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16 sc0 sc1
256 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 sc0 sc1
257 ; GFX940-NEXT: s_waitcnt vmcnt(0)
258 ; GFX940-NEXT: s_setpc_b64 s[30:31]
260 ; GFX10-PAL-LABEL: zero_init_foo:
261 ; GFX10-PAL: ; %bb.0:
262 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
263 ; GFX10-PAL-NEXT: s_mov_b32 s0, 0
264 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0
265 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0
266 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0
267 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
268 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
269 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
270 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
271 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:48
272 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:32
273 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:16
274 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32
275 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
277 ; GFX11-PAL-LABEL: zero_init_foo:
278 ; GFX11-PAL: ; %bb.0:
279 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
280 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0
281 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
282 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0
283 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0
284 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0
285 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
286 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
287 ; GFX11-PAL-NEXT: s_clause 0x3
288 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48
289 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32
290 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:16
291 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32
292 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: 32 x i16 = 64 bytes, cleared by a non-volatile memset.
293 %alloca = alloca [32 x i16], align 2, addrspace(5)
294 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
; store_load_sindex_kernel: amdgpu_kernel function taking an i32 %idx.
; It volatile-stores 15 into element %idx of a [32 x float] addrspace(5)
; alloca, then volatile-loads element (%idx & 15) of the same array.
; In the recorded output the index is fetched with a scalar load and both
; addresses are formed with scalar shift/and/add before the scratch accesses.
298 define amdgpu_kernel void @store_load_sindex_kernel(i32 %idx) {
299 ; GFX9-LABEL: store_load_sindex_kernel:
300 ; GFX9: ; %bb.0: ; %bb
301 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
302 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
303 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
304 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
305 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
306 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
307 ; GFX9-NEXT: s_and_b32 s0, s0, 15
308 ; GFX9-NEXT: s_add_i32 s1, s1, 4
309 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
310 ; GFX9-NEXT: scratch_store_dword off, v0, s1
311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
312 ; GFX9-NEXT: s_add_i32 s0, s0, 4
313 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
314 ; GFX9-NEXT: s_waitcnt vmcnt(0)
315 ; GFX9-NEXT: s_endpgm
317 ; GFX10-LABEL: store_load_sindex_kernel:
318 ; GFX10: ; %bb.0: ; %bb
319 ; GFX10-NEXT: s_add_u32 s2, s2, s5
320 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
321 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
322 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
323 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
324 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
325 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
326 ; GFX10-NEXT: s_and_b32 s1, s0, 15
327 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
328 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
329 ; GFX10-NEXT: s_add_i32 s0, s0, 4
330 ; GFX10-NEXT: s_add_i32 s1, s1, 4
331 ; GFX10-NEXT: scratch_store_dword off, v0, s0
332 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
333 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
334 ; GFX10-NEXT: s_waitcnt vmcnt(0)
335 ; GFX10-NEXT: s_endpgm
337 ; GFX11-LABEL: store_load_sindex_kernel:
338 ; GFX11: ; %bb.0: ; %bb
339 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
340 ; GFX11-NEXT: v_mov_b32_e32 v0, 15
341 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
342 ; GFX11-NEXT: s_and_b32 s1, s0, 15
343 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
344 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
345 ; GFX11-NEXT: s_add_i32 s0, s0, 4
346 ; GFX11-NEXT: s_add_i32 s1, s1, 4
347 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
348 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
349 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
350 ; GFX11-NEXT: s_waitcnt vmcnt(0)
351 ; GFX11-NEXT: s_endpgm
353 ; GFX9-PAL-LABEL: store_load_sindex_kernel:
354 ; GFX9-PAL: ; %bb.0: ; %bb
355 ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
356 ; GFX9-PAL-NEXT: s_mov_b32 s4, s0
357 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
358 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
359 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
360 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
361 ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
362 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
363 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
364 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
365 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
366 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4
367 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
368 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
369 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
370 ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
371 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
372 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
373 ; GFX9-PAL-NEXT: s_endpgm
375 ; GFX940-LABEL: store_load_sindex_kernel:
376 ; GFX940: ; %bb.0: ; %bb
377 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
378 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
379 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
380 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
381 ; GFX940-NEXT: s_and_b32 s0, s0, 15
382 ; GFX940-NEXT: s_add_i32 s1, s1, 4
383 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
384 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
385 ; GFX940-NEXT: s_waitcnt vmcnt(0)
386 ; GFX940-NEXT: s_add_i32 s0, s0, 4
387 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
388 ; GFX940-NEXT: s_waitcnt vmcnt(0)
389 ; GFX940-NEXT: s_endpgm
391 ; GFX10-PAL-LABEL: store_load_sindex_kernel:
392 ; GFX10-PAL: ; %bb.0: ; %bb
393 ; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
394 ; GFX10-PAL-NEXT: s_mov_b32 s4, s0
395 ; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
396 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
397 ; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
398 ; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
399 ; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
400 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
401 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
402 ; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
403 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
404 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
405 ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
406 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
407 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
408 ; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4
409 ; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4
410 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
411 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
412 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
413 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
414 ; GFX10-PAL-NEXT: s_endpgm
416 ; GFX11-PAL-LABEL: store_load_sindex_kernel:
417 ; GFX11-PAL: ; %bb.0: ; %bb
418 ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
419 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
420 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
421 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
422 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
423 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
424 ; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4
425 ; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4
426 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
427 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
428 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
429 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
430 ; GFX11-PAL-NEXT: s_endpgm
; IR under test: store to element %idx, then load from element (%idx & 15).
; Both accesses are volatile, so neither may be dropped or merged.
432 %i = alloca [32 x float], align 4, addrspace(5)
433 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
434 store volatile i32 15, ptr addrspace(5) %i7, align 4
435 %i9 = and i32 %idx, 15
436 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
437 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; store_load_sindex_foo: same store/load pattern as store_load_sindex_kernel
; (volatile store of 15 to element %idx, volatile load of element %idx & 15
; of a [32 x float] addrspace(5) alloca), but declared amdgpu_ps with the
; index passed as an inreg i32 argument. The recorded output uses the
; incoming scalar register directly instead of loading the index from memory.
441 define amdgpu_ps void @store_load_sindex_foo(i32 inreg %idx) {
442 ; GFX9-LABEL: store_load_sindex_foo:
443 ; GFX9: ; %bb.0: ; %bb
444 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
445 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
446 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2
447 ; GFX9-NEXT: s_add_i32 s0, s0, 4
448 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
449 ; GFX9-NEXT: scratch_store_dword off, v0, s0
450 ; GFX9-NEXT: s_waitcnt vmcnt(0)
451 ; GFX9-NEXT: s_and_b32 s0, s2, 15
452 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
453 ; GFX9-NEXT: s_add_i32 s0, s0, 4
454 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
455 ; GFX9-NEXT: s_waitcnt vmcnt(0)
456 ; GFX9-NEXT: s_endpgm
458 ; GFX10-LABEL: store_load_sindex_foo:
459 ; GFX10: ; %bb.0: ; %bb
460 ; GFX10-NEXT: s_add_u32 s0, s0, s3
461 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
462 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
463 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
464 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
465 ; GFX10-NEXT: s_and_b32 s0, s2, 15
466 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2
467 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
468 ; GFX10-NEXT: s_add_i32 s1, s1, 4
469 ; GFX10-NEXT: s_add_i32 s0, s0, 4
470 ; GFX10-NEXT: scratch_store_dword off, v0, s1
471 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
472 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
473 ; GFX10-NEXT: s_waitcnt vmcnt(0)
474 ; GFX10-NEXT: s_endpgm
476 ; GFX11-LABEL: store_load_sindex_foo:
477 ; GFX11: ; %bb.0: ; %bb
478 ; GFX11-NEXT: v_mov_b32_e32 v0, 15
479 ; GFX11-NEXT: s_and_b32 s1, s0, 15
480 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
481 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
482 ; GFX11-NEXT: s_add_i32 s0, s0, 4
483 ; GFX11-NEXT: s_add_i32 s1, s1, 4
484 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
485 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
486 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
487 ; GFX11-NEXT: s_waitcnt vmcnt(0)
488 ; GFX11-NEXT: s_endpgm
490 ; GFX9-PAL-LABEL: store_load_sindex_foo:
491 ; GFX9-PAL: ; %bb.0: ; %bb
492 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
493 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
494 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
495 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
496 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
497 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
498 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
499 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
500 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
501 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
502 ; GFX9-PAL-NEXT: s_add_i32 s1, s1, 4
503 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
504 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
505 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
506 ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
507 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
508 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
509 ; GFX9-PAL-NEXT: s_endpgm
511 ; GFX940-LABEL: store_load_sindex_foo:
512 ; GFX940: ; %bb.0: ; %bb
513 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
514 ; GFX940-NEXT: s_and_b32 s0, s0, 15
515 ; GFX940-NEXT: s_add_i32 s1, s1, 4
516 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
517 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
518 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
519 ; GFX940-NEXT: s_waitcnt vmcnt(0)
520 ; GFX940-NEXT: s_add_i32 s0, s0, 4
521 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
522 ; GFX940-NEXT: s_waitcnt vmcnt(0)
523 ; GFX940-NEXT: s_endpgm
525 ; GFX10-PAL-LABEL: store_load_sindex_foo:
526 ; GFX10-PAL: ; %bb.0: ; %bb
527 ; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
528 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0
529 ; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
530 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
531 ; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
532 ; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
533 ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
534 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
535 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
536 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 15
537 ; GFX10-PAL-NEXT: s_and_b32 s1, s0, 15
538 ; GFX10-PAL-NEXT: s_lshl_b32 s0, s0, 2
539 ; GFX10-PAL-NEXT: s_lshl_b32 s1, s1, 2
540 ; GFX10-PAL-NEXT: s_add_i32 s0, s0, 4
541 ; GFX10-PAL-NEXT: s_add_i32 s1, s1, 4
542 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s0
543 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
544 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
545 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
546 ; GFX10-PAL-NEXT: s_endpgm
548 ; GFX11-PAL-LABEL: store_load_sindex_foo:
549 ; GFX11-PAL: ; %bb.0: ; %bb
550 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
551 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
552 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
553 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
554 ; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4
555 ; GFX11-PAL-NEXT: s_add_i32 s1, s1, 4
556 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
557 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
558 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
559 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
560 ; GFX11-PAL-NEXT: s_endpgm
; IR under test: store to element %idx, then load from element (%idx & 15).
; Both accesses are volatile, so neither may be dropped or merged.
562 %i = alloca [32 x float], align 4, addrspace(5)
563 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
564 store volatile i32 15, ptr addrspace(5) %i7, align 4
565 %i9 = and i32 %idx, 15
566 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
567 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; store_load_vindex_kernel: amdgpu_kernel function whose array index comes
; from llvm.amdgcn.workitem.id.x rather than from an argument. It
; volatile-stores 15 into element id of a [32 x float] addrspace(5) alloca
; and then volatile-loads element (31 - id). In the recorded output both
; addresses are computed with vector ALU instructions from v0.
571 define amdgpu_kernel void @store_load_vindex_kernel() {
572 ; GFX9-LABEL: store_load_vindex_kernel:
573 ; GFX9: ; %bb.0: ; %bb
574 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
575 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
576 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
577 ; GFX9-NEXT: v_add_u32_e32 v1, 4, v0
578 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
579 ; GFX9-NEXT: v_sub_u32_e32 v0, 4, v0
580 ; GFX9-NEXT: scratch_store_dword v1, v2, off
581 ; GFX9-NEXT: s_waitcnt vmcnt(0)
582 ; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0
583 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
584 ; GFX9-NEXT: s_waitcnt vmcnt(0)
585 ; GFX9-NEXT: s_endpgm
587 ; GFX10-LABEL: store_load_vindex_kernel:
588 ; GFX10: ; %bb.0: ; %bb
589 ; GFX10-NEXT: s_add_u32 s0, s0, s3
590 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
591 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
592 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
593 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
594 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
595 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 4, v0
596 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
597 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
598 ; GFX10-NEXT: scratch_store_dword v0, v2, off
599 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
600 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
601 ; GFX10-NEXT: s_waitcnt vmcnt(0)
602 ; GFX10-NEXT: s_endpgm
604 ; GFX11-LABEL: store_load_vindex_kernel:
605 ; GFX11: ; %bb.0: ; %bb
606 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
607 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
608 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 4, v0
609 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
610 ; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
611 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
612 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
613 ; GFX11-NEXT: s_waitcnt vmcnt(0)
614 ; GFX11-NEXT: s_endpgm
616 ; GFX9-PAL-LABEL: store_load_vindex_kernel:
617 ; GFX9-PAL: ; %bb.0: ; %bb
618 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
619 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
620 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
621 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
622 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 4, v0
623 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
624 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 4, v0
625 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
626 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
627 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
628 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
629 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
630 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
631 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0
632 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
633 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
634 ; GFX9-PAL-NEXT: s_endpgm
636 ; GFX940-LABEL: store_load_vindex_kernel:
637 ; GFX940: ; %bb.0: ; %bb
638 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
639 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
640 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:4 sc0 sc1
641 ; GFX940-NEXT: s_waitcnt vmcnt(0)
642 ; GFX940-NEXT: v_sub_u32_e32 v0, 4, v0
643 ; GFX940-NEXT: v_add_u32_e32 v0, 0x7c, v0
644 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
645 ; GFX940-NEXT: s_waitcnt vmcnt(0)
646 ; GFX940-NEXT: s_endpgm
648 ; GFX10-PAL-LABEL: store_load_vindex_kernel:
649 ; GFX10-PAL: ; %bb.0: ; %bb
650 ; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
651 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0
652 ; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
653 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
654 ; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
655 ; GFX10-PAL-NEXT: s_add_u32 s2, s2, s1
656 ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
657 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
658 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
659 ; GFX10-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
660 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
661 ; GFX10-PAL-NEXT: v_sub_nc_u32_e32 v1, 4, v0
662 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, 4, v0
663 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
664 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
665 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
666 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
667 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
668 ; GFX10-PAL-NEXT: s_endpgm
670 ; GFX11-PAL-LABEL: store_load_vindex_kernel:
671 ; GFX11-PAL: ; %bb.0: ; %bb
672 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
673 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
674 ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v1, 4, v0
675 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
676 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off offset:4 dlc
677 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
678 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off glc dlc
679 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
680 ; GFX11-PAL-NEXT: s_endpgm
; IR under test: store to element id, then load from element (31 - id).
; Both accesses are volatile, so neither may be dropped or merged.
682 %i = alloca [32 x float], align 4, addrspace(5)
683 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
; Review note - %i3 has no use among the instructions visible in this
; function; confirm it is intentionally kept before removing it.
684 %i3 = zext i32 %i2 to i64
685 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
686 store volatile i32 15, ptr addrspace(5) %i7, align 4
687 %i9 = sub nsw i32 31, %i2
688 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
689 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Plain function (no amdgpu_kernel / amdgpu_ps calling convention) that takes
; the index as an ordinary i32 argument. It performs a volatile store of 15
; into a [32 x float] private (addrspace(5)) alloca at index %idx, then a
; volatile load from index (%idx & 15) of the same alloca.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
693 define void @store_load_vindex_foo(i32 %idx) {
694 ; GFX9-LABEL: store_load_vindex_foo:
695 ; GFX9: ; %bb.0: ; %bb
696 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
697 ; GFX9-NEXT: v_mov_b32_e32 v1, s32
698 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
699 ; GFX9-NEXT: v_mov_b32_e32 v3, 15
700 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
701 ; GFX9-NEXT: scratch_store_dword v2, v3, off
702 ; GFX9-NEXT: s_waitcnt vmcnt(0)
703 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
704 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
705 ; GFX9-NEXT: s_waitcnt vmcnt(0)
706 ; GFX9-NEXT: s_setpc_b64 s[30:31]
708 ; GFX10-LABEL: store_load_vindex_foo:
709 ; GFX10: ; %bb.0: ; %bb
710 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
711 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
712 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s32
713 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
714 ; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s32
715 ; GFX10-NEXT: scratch_store_dword v0, v2, off
716 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
717 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
718 ; GFX10-NEXT: s_waitcnt vmcnt(0)
719 ; GFX10-NEXT: s_setpc_b64 s[30:31]
721 ; GFX11-LABEL: store_load_vindex_foo:
722 ; GFX11: ; %bb.0: ; %bb
723 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
724 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
725 ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s32
726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
727 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
728 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc
729 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
730 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 glc dlc
731 ; GFX11-NEXT: s_waitcnt vmcnt(0)
732 ; GFX11-NEXT: s_setpc_b64 s[30:31]
734 ; GFX9-PAL-LABEL: store_load_vindex_foo:
735 ; GFX9-PAL: ; %bb.0: ; %bb
736 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
737 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s32
738 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
739 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
740 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
741 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
742 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
743 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
744 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
745 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
746 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
748 ; GFX940-LABEL: store_load_vindex_foo:
749 ; GFX940: ; %bb.0: ; %bb
750 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
751 ; GFX940-NEXT: v_mov_b32_e32 v1, s32
752 ; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1
753 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
754 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
755 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
756 ; GFX940-NEXT: s_waitcnt vmcnt(0)
757 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
758 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 sc0 sc1
759 ; GFX940-NEXT: s_waitcnt vmcnt(0)
760 ; GFX940-NEXT: s_setpc_b64 s[30:31]
762 ; GFX10-PAL-LABEL: store_load_vindex_foo:
763 ; GFX10-PAL: ; %bb.0: ; %bb
764 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
765 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
766 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32
767 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
768 ; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s32
769 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
770 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
771 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
772 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
773 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
775 ; GFX11-PAL-LABEL: store_load_vindex_foo:
776 ; GFX11-PAL: ; %bb.0: ; %bb
777 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
778 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
779 ; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s32
780 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
781 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
782 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc
783 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
784 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 glc dlc
785 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
786 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
788 %i = alloca [32 x float], align 4, addrspace(5)
789 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
790 store volatile i32 15, ptr addrspace(5) %i7, align 4
; The reload uses the index masked to the range 0..15.
791 %i9 = and i32 %idx, 15
792 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
793 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Plain function taking a private (addrspace(5)) pointer argument. It stores
; the float constant 10.0 (materialized as 0x41200000 in the checks) to
; element 1 of %arg, i.e. 4 bytes past the incoming pointer.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
797 define void @private_ptr_foo(ptr addrspace(5) nocapture %arg) {
798 ; GFX9-LABEL: private_ptr_foo:
800 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
801 ; GFX9-NEXT: v_add_u32_e32 v0, 4, v0
802 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x41200000
803 ; GFX9-NEXT: scratch_store_dword v0, v1, off
804 ; GFX9-NEXT: s_waitcnt vmcnt(0)
805 ; GFX9-NEXT: s_setpc_b64 s[30:31]
807 ; GFX10-LABEL: private_ptr_foo:
809 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
810 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 4, v0
811 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41200000
812 ; GFX10-NEXT: scratch_store_dword v0, v1, off
813 ; GFX10-NEXT: s_setpc_b64 s[30:31]
815 ; GFX11-LABEL: private_ptr_foo:
817 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
818 ; GFX11-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
819 ; GFX11-NEXT: scratch_store_b32 v0, v1, off
820 ; GFX11-NEXT: s_setpc_b64 s[30:31]
822 ; GFX9-PAL-LABEL: private_ptr_foo:
824 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
825 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 4, v0
826 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
827 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off
828 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
829 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
831 ; GFX940-LABEL: private_ptr_foo:
833 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
834 ; GFX940-NEXT: v_add_u32_e32 v0, 4, v0
835 ; GFX940-NEXT: v_mov_b32_e32 v1, 0x41200000
836 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
837 ; GFX940-NEXT: s_waitcnt vmcnt(0)
838 ; GFX940-NEXT: s_setpc_b64 s[30:31]
840 ; GFX10-PAL-LABEL: private_ptr_foo:
841 ; GFX10-PAL: ; %bb.0:
842 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
843 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, 4, v0
844 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x41200000
845 ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
846 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
848 ; GFX11-PAL-LABEL: private_ptr_foo:
849 ; GFX11-PAL: ; %bb.0:
850 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
851 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 0x41200000 :: v_dual_add_nc_u32 v0, 4, v0
852 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off
853 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
854 %gep = getelementptr inbounds float, ptr addrspace(5) %arg, i32 1
855 store float 1.000000e+01, ptr addrspace(5) %gep, align 4
; Kernel with two private (addrspace(5)) allocas: a 256-byte [64 x i32]
; %padding and a 64-byte [32 x i16] %alloca. A volatile load reads %padding
; through an undef index, and a 64-byte memset zeroes %alloca. The checks show
; the memset lowered to four 16-byte scratch stores at offsets 260, 276, 292
; and 308.
; NOTE(review): %padding presumably exists only to place %alloca at a small
; non-zero scratch offset - confirm against the test's original intent.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
859 define amdgpu_kernel void @zero_init_small_offset_kernel() {
860 ; GFX9-LABEL: zero_init_small_offset_kernel:
862 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
863 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
864 ; GFX9-NEXT: s_mov_b32 s0, 0
865 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
866 ; GFX9-NEXT: s_waitcnt vmcnt(0)
867 ; GFX9-NEXT: s_mov_b32 s1, s0
868 ; GFX9-NEXT: s_mov_b32 s2, s0
869 ; GFX9-NEXT: s_mov_b32 s3, s0
870 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
871 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
872 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
873 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
874 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260
875 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
876 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
877 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308
878 ; GFX9-NEXT: s_endpgm
880 ; GFX10-LABEL: zero_init_small_offset_kernel:
882 ; GFX10-NEXT: s_add_u32 s0, s0, s3
883 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
884 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
885 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
886 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
887 ; GFX10-NEXT: s_waitcnt vmcnt(0)
888 ; GFX10-NEXT: s_mov_b32 s0, 0
889 ; GFX10-NEXT: s_mov_b32 s1, s0
890 ; GFX10-NEXT: s_mov_b32 s2, s0
891 ; GFX10-NEXT: s_mov_b32 s3, s0
892 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
893 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
894 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
895 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
896 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260
897 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
898 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
899 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
900 ; GFX10-NEXT: s_endpgm
902 ; GFX11-LABEL: zero_init_small_offset_kernel:
904 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
905 ; GFX11-NEXT: s_waitcnt vmcnt(0)
906 ; GFX11-NEXT: s_mov_b32 s0, 0
907 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
908 ; GFX11-NEXT: s_mov_b32 s1, s0
909 ; GFX11-NEXT: s_mov_b32 s2, s0
910 ; GFX11-NEXT: s_mov_b32 s3, s0
911 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
912 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
913 ; GFX11-NEXT: s_clause 0x3
914 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:260
915 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:276
916 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:292
917 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:308
918 ; GFX11-NEXT: s_endpgm
920 ; GFX9-PAL-LABEL: zero_init_small_offset_kernel:
922 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
923 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
924 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
925 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
926 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
927 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
928 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
929 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
930 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
931 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
932 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0
933 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
934 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0
935 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
936 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
937 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
938 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
939 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260
940 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
941 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
942 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308
943 ; GFX9-PAL-NEXT: s_endpgm
945 ; GFX940-LABEL: zero_init_small_offset_kernel:
947 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
948 ; GFX940-NEXT: s_waitcnt vmcnt(0)
949 ; GFX940-NEXT: s_mov_b32 s0, 0
950 ; GFX940-NEXT: s_mov_b32 s1, s0
951 ; GFX940-NEXT: s_mov_b32 s2, s0
952 ; GFX940-NEXT: s_mov_b32 s3, s0
953 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
954 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
955 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260 sc0 sc1
956 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276 sc0 sc1
957 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292 sc0 sc1
958 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308 sc0 sc1
959 ; GFX940-NEXT: s_endpgm
961 ; GFX1010-PAL-LABEL: zero_init_small_offset_kernel:
962 ; GFX1010-PAL: ; %bb.0:
963 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
964 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
965 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
966 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
967 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
968 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
969 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
970 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
971 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
972 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
973 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc
974 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
975 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
976 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
977 ; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
978 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
979 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
980 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
981 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
982 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:260
983 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:276
984 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:292
985 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:308
986 ; GFX1010-PAL-NEXT: s_endpgm
988 ; GFX1030-PAL-LABEL: zero_init_small_offset_kernel:
989 ; GFX1030-PAL: ; %bb.0:
990 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
991 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
992 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
993 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
994 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
995 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
996 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
997 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
998 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
999 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1000 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1001 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
1002 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
1003 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
1004 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
1005 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
1006 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
1007 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
1008 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
1009 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:260
1010 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:276
1011 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:292
1012 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:308
1013 ; GFX1030-PAL-NEXT: s_endpgm
1015 ; GFX11-PAL-LABEL: zero_init_small_offset_kernel:
1016 ; GFX11-PAL: ; %bb.0:
1017 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
1018 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1019 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0
1020 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1021 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0
1022 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0
1023 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0
1024 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1025 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1026 ; GFX11-PAL-NEXT: s_clause 0x3
1027 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:260
1028 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:276
1029 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:292
1030 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:308
1031 ; GFX11-PAL-NEXT: s_endpgm
1032 %padding = alloca [64 x i32], align 4, addrspace(5)
1033 %alloca = alloca [32 x i16], align 2, addrspace(5)
; Volatile read of the padding alloca (index is undef).
1034 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1035 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Zero all 64 bytes of %alloca.
1036 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
; Plain function (no amdgpu_kernel / amdgpu_ps calling convention) with two
; private (addrspace(5)) allocas: a 256-byte [64 x i32] %padding and a 64-byte
; [32 x i16] %alloca. A volatile load reads %padding through an undef index,
; and a 64-byte memset zeroes %alloca. The checks show the memset lowered to
; four 16-byte scratch stores addressed from s32 at offsets 256, 272, 288 and
; 304.
; NOTE(review): %padding presumably exists only to place %alloca at a small
; non-zero offset from the stack pointer - confirm against the test's intent.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
1040 define void @zero_init_small_offset_foo() {
1041 ; GFX9-LABEL: zero_init_small_offset_foo:
1043 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1044 ; GFX9-NEXT: scratch_load_dword v0, off, s32 glc
1045 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1046 ; GFX9-NEXT: s_mov_b32 s0, 0
1047 ; GFX9-NEXT: s_mov_b32 s1, s0
1048 ; GFX9-NEXT: s_mov_b32 s2, s0
1049 ; GFX9-NEXT: s_mov_b32 s3, s0
1050 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1051 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1052 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1053 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1054 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1055 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1056 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1057 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1058 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1059 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1061 ; GFX10-LABEL: zero_init_small_offset_foo:
1063 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1064 ; GFX10-NEXT: scratch_load_dword v0, off, s32 glc dlc
1065 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1066 ; GFX10-NEXT: s_mov_b32 s0, 0
1067 ; GFX10-NEXT: s_mov_b32 s1, s0
1068 ; GFX10-NEXT: s_mov_b32 s2, s0
1069 ; GFX10-NEXT: s_mov_b32 s3, s0
1070 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1071 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1072 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
1073 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
1074 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1075 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1076 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1077 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1078 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1080 ; GFX11-LABEL: zero_init_small_offset_foo:
1082 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1083 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 glc dlc
1084 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1085 ; GFX11-NEXT: s_mov_b32 s0, 0
1086 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1087 ; GFX11-NEXT: s_mov_b32 s1, s0
1088 ; GFX11-NEXT: s_mov_b32 s2, s0
1089 ; GFX11-NEXT: s_mov_b32 s3, s0
1090 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1091 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1092 ; GFX11-NEXT: s_clause 0x3
1093 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256
1094 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
1095 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
1096 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
1097 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1099 ; GFX9-PAL-LABEL: zero_init_small_offset_foo:
1100 ; GFX9-PAL: ; %bb.0:
1101 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1102 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 glc
1103 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1104 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
1105 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0
1106 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
1107 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0
1108 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
1109 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
1110 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
1111 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
1112 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1113 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1114 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1115 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1116 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1117 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
1119 ; GFX940-LABEL: zero_init_small_offset_foo:
1121 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1122 ; GFX940-NEXT: scratch_load_dword v0, off, s32 sc0 sc1
1123 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1124 ; GFX940-NEXT: s_mov_b32 s0, 0
1125 ; GFX940-NEXT: s_mov_b32 s1, s0
1126 ; GFX940-NEXT: s_mov_b32 s2, s0
1127 ; GFX940-NEXT: s_mov_b32 s3, s0
1128 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1129 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1130 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256 sc0 sc1
1131 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272 sc0 sc1
1132 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288 sc0 sc1
1133 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304 sc0 sc1
1134 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1135 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1137 ; GFX10-PAL-LABEL: zero_init_small_offset_foo:
1138 ; GFX10-PAL: ; %bb.0:
1139 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1140 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s32 glc dlc
1141 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
1142 ; GFX10-PAL-NEXT: s_mov_b32 s0, 0
1143 ; GFX10-PAL-NEXT: s_mov_b32 s1, s0
1144 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0
1145 ; GFX10-PAL-NEXT: s_mov_b32 s3, s0
1146 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, s0
1147 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, s1
1148 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, s2
1149 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, s3
1150 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:256
1151 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:272
1152 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:288
1153 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s32 offset:304
1154 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
1156 ; GFX11-PAL-LABEL: zero_init_small_offset_foo:
1157 ; GFX11-PAL: ; %bb.0:
1158 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1159 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 glc dlc
1160 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1161 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0
1162 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1163 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0
1164 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0
1165 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0
1166 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1167 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1168 ; GFX11-PAL-NEXT: s_clause 0x3
1169 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256
1170 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272
1171 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:288
1172 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:304
1173 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
1174 %padding = alloca [64 x i32], align 4, addrspace(5)
1175 %alloca = alloca [32 x i16], align 2, addrspace(5)
; Volatile read of the padding alloca (index is undef).
1176 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1177 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Zero all 64 bytes of %alloca.
1178 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
; Kernel taking the index as an i32 kernel argument. It has two private
; (addrspace(5)) allocas: a 256-byte [64 x i32] %padding and a [32 x float]
; %i. After a volatile load from %padding (undef index), it does a volatile
; store of 15 to %i[%idx] and a volatile load from %i[%idx & 15]. The checks
; show both addresses computed in SGPRs with 0x104 added to the scaled index.
; NOTE(review): %padding presumably exists only to place %i at a small
; non-zero scratch offset - confirm against the test's original intent.
; The check lines below are autogenerated by utils/update_llc_test_checks.py;
; regenerate them with that script rather than editing them by hand.
1182 define amdgpu_kernel void @store_load_sindex_small_offset_kernel(i32 %idx) {
1183 ; GFX9-LABEL: store_load_sindex_small_offset_kernel:
1184 ; GFX9: ; %bb.0: ; %bb
1185 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
1186 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
1187 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
1188 ; GFX9-NEXT: s_mov_b32 s1, 0
1189 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
1190 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1191 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
1192 ; GFX9-NEXT: s_and_b32 s0, s0, 15
1193 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
1194 ; GFX9-NEXT: s_addk_i32 s1, 0x104
1195 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
1196 ; GFX9-NEXT: scratch_store_dword off, v0, s1
1197 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1198 ; GFX9-NEXT: s_addk_i32 s0, 0x104
1199 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
1200 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1201 ; GFX9-NEXT: s_endpgm
1203 ; GFX10-LABEL: store_load_sindex_small_offset_kernel:
1204 ; GFX10: ; %bb.0: ; %bb
1205 ; GFX10-NEXT: s_add_u32 s2, s2, s5
1206 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
1207 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1208 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1209 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
1210 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1211 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1212 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
1213 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
1214 ; GFX10-NEXT: s_and_b32 s1, s0, 15
1215 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
1216 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
1217 ; GFX10-NEXT: s_addk_i32 s0, 0x104
1218 ; GFX10-NEXT: s_addk_i32 s1, 0x104
1219 ; GFX10-NEXT: scratch_store_dword off, v0, s0
1220 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1221 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
1222 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1223 ; GFX10-NEXT: s_endpgm
1225 ; GFX11-LABEL: store_load_sindex_small_offset_kernel:
1226 ; GFX11: ; %bb.0: ; %bb
1227 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
1228 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
1229 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1230 ; GFX11-NEXT: v_mov_b32_e32 v0, 15
1231 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1232 ; GFX11-NEXT: s_and_b32 s1, s0, 15
1233 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
1234 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
1235 ; GFX11-NEXT: s_addk_i32 s0, 0x104
1236 ; GFX11-NEXT: s_addk_i32 s1, 0x104
1237 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
1238 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1239 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
1240 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1241 ; GFX11-NEXT: s_endpgm
1243 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_kernel:
1244 ; GFX9-PAL: ; %bb.0: ; %bb
1245 ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
1246 ; GFX9-PAL-NEXT: s_mov_b32 s4, s0
1247 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1248 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
1249 ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
1250 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
1251 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
1252 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
1253 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0
1254 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
1255 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1256 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
1257 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
1258 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
1259 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104
1260 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
1261 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
1262 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1263 ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
1264 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
1265 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1266 ; GFX9-PAL-NEXT: s_endpgm
1268 ; GFX940-LABEL: store_load_sindex_small_offset_kernel:
1269 ; GFX940: ; %bb.0: ; %bb
1270 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
1271 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
1272 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1273 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
1274 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
1275 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
1276 ; GFX940-NEXT: s_and_b32 s0, s0, 15
1277 ; GFX940-NEXT: s_addk_i32 s1, 0x104
1278 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
1279 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
1280 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1281 ; GFX940-NEXT: s_addk_i32 s0, 0x104
1282 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
1283 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1284 ; GFX940-NEXT: s_endpgm
1286 ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_kernel:
1287 ; GFX1010-PAL: ; %bb.0: ; %bb
1288 ; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5]
1289 ; GFX1010-PAL-NEXT: s_mov_b32 s4, s0
1290 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1291 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
1292 ; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff
1293 ; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3
1294 ; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0
1295 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1296 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1297 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
1298 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
1299 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc
1300 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1301 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
1302 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
1303 ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
1304 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
1305 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
1306 ; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104
1307 ; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104
1308 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
1309 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1310 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
1311 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1312 ; GFX1010-PAL-NEXT: s_endpgm
1314 ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_kernel:
1315 ; GFX1030-PAL: ; %bb.0: ; %bb
1316 ; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5]
1317 ; GFX1030-PAL-NEXT: s_mov_b32 s4, s0
1318 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
1319 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
1320 ; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff
1321 ; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3
1322 ; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0
1323 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
1324 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
1325 ; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
1326 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1327 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1328 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
1329 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
1330 ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
1331 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
1332 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
1333 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104
1334 ; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104
1335 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
1336 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1337 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
1338 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1339 ; GFX1030-PAL-NEXT: s_endpgm
1341 ; GFX11-PAL-LABEL: store_load_sindex_small_offset_kernel:
1342 ; GFX11-PAL: ; %bb.0: ; %bb
1343 ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
1344 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
1345 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1346 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
1347 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
1348 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
1349 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
1350 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
1351 ; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104
1352 ; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104
1353 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
1354 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1355 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
1356 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1357 ; GFX11-PAL-NEXT: s_endpgm
1359 %padding = alloca [64 x i32], align 4, addrspace(5)
1360 %i = alloca [32 x float], align 4, addrspace(5)
; Volatile read of the padding alloca (index is undef).
1361 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1362 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Store through the raw index, then reload through the index masked to 0..15.
1363 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
1364 store volatile i32 15, ptr addrspace(5) %i7, align 4
1365 %i9 = and i32 %idx, 15
1366 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
1367 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
1371 define amdgpu_ps void @store_load_sindex_small_offset_foo(i32 inreg %idx) {
1372 ; GFX9-LABEL: store_load_sindex_small_offset_foo:
1373 ; GFX9: ; %bb.0: ; %bb
1374 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
1375 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1376 ; GFX9-NEXT: s_mov_b32 s0, 0
1377 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
1378 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1379 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2
1380 ; GFX9-NEXT: s_addk_i32 s0, 0x104
1381 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
1382 ; GFX9-NEXT: scratch_store_dword off, v0, s0
1383 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1384 ; GFX9-NEXT: s_and_b32 s0, s2, 15
1385 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
1386 ; GFX9-NEXT: s_addk_i32 s0, 0x104
1387 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
1388 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1389 ; GFX9-NEXT: s_endpgm
1391 ; GFX10-LABEL: store_load_sindex_small_offset_foo:
1392 ; GFX10: ; %bb.0: ; %bb
1393 ; GFX10-NEXT: s_add_u32 s0, s0, s3
1394 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1395 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1396 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1397 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1398 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1399 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
1400 ; GFX10-NEXT: s_and_b32 s0, s2, 15
1401 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2
1402 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
1403 ; GFX10-NEXT: s_addk_i32 s1, 0x104
1404 ; GFX10-NEXT: s_addk_i32 s0, 0x104
1405 ; GFX10-NEXT: scratch_store_dword off, v0, s1
1406 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1407 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
1408 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1409 ; GFX10-NEXT: s_endpgm
1411 ; GFX11-LABEL: store_load_sindex_small_offset_foo:
1412 ; GFX11: ; %bb.0: ; %bb
1413 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
1414 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1415 ; GFX11-NEXT: v_mov_b32_e32 v0, 15
1416 ; GFX11-NEXT: s_and_b32 s1, s0, 15
1417 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
1418 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
1419 ; GFX11-NEXT: s_addk_i32 s0, 0x104
1420 ; GFX11-NEXT: s_addk_i32 s1, 0x104
1421 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
1422 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1423 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
1424 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1425 ; GFX11-NEXT: s_endpgm
1427 ; GFX9-PAL-LABEL: store_load_sindex_small_offset_foo:
1428 ; GFX9-PAL: ; %bb.0: ; %bb
1429 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
1430 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
1431 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1432 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
1433 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1434 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
1435 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
1436 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0
1437 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
1438 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1439 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
1440 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
1441 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x104
1442 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
1443 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
1444 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
1445 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1446 ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x104
1447 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
1448 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1449 ; GFX9-PAL-NEXT: s_endpgm
1451 ; GFX940-LABEL: store_load_sindex_small_offset_foo:
1452 ; GFX940: ; %bb.0: ; %bb
1453 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
1454 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1455 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
1456 ; GFX940-NEXT: s_and_b32 s0, s0, 15
1457 ; GFX940-NEXT: s_addk_i32 s1, 0x104
1458 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
1459 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
1460 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
1461 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1462 ; GFX940-NEXT: s_addk_i32 s0, 0x104
1463 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
1464 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1465 ; GFX940-NEXT: s_endpgm
1467 ; GFX1010-PAL-LABEL: store_load_sindex_small_offset_foo:
1468 ; GFX1010-PAL: ; %bb.0: ; %bb
1469 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
1470 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
1471 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1472 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
1473 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1474 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
1475 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
1476 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1477 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1478 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
1479 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc
1480 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1481 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
1482 ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
1483 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
1484 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
1485 ; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x104
1486 ; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x104
1487 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
1488 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1489 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
1490 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1491 ; GFX1010-PAL-NEXT: s_endpgm
1493 ; GFX1030-PAL-LABEL: store_load_sindex_small_offset_foo:
1494 ; GFX1030-PAL: ; %bb.0: ; %bb
1495 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
1496 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
1497 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1498 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
1499 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1500 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
1501 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
1502 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1503 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1504 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1505 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1506 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
1507 ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
1508 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
1509 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
1510 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x104
1511 ; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x104
1512 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
1513 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1514 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
1515 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1516 ; GFX1030-PAL-NEXT: s_endpgm
1518 ; GFX11-PAL-LABEL: store_load_sindex_small_offset_foo:
1519 ; GFX11-PAL: ; %bb.0: ; %bb
1520 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
1521 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1522 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
1523 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
1524 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
1525 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
1526 ; GFX11-PAL-NEXT: s_addk_i32 s0, 0x104
1527 ; GFX11-PAL-NEXT: s_addk_i32 s1, 0x104
1528 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
1529 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1530 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
1531 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1532 ; GFX11-PAL-NEXT: s_endpgm
1534 %padding = alloca [64 x i32], align 4, addrspace(5)
1535 %i = alloca [32 x float], align 4, addrspace(5)
1536 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1537 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
1538 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
1539 store volatile i32 15, ptr addrspace(5) %i7, align 4
1540 %i9 = and i32 %idx, 15
1541 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
1542 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel variant of the "vindex small offset" case.
; The IR at the end of this function allocates a 64 x i32 padding array
; (256 bytes) and a 32 x float array %i in addrspace(5), does a volatile load
; from the padding, stores 15 to %i[workitem.id.x] and then loads
; %i[31 - workitem.id.x].
; In the expected output for every check prefix below, the padding load is a
; scratch load with offset 4, and %i is addressed relative to 0x104 (260),
; either as an explicit add of 0x104 or as an immediate offset of 260 on the
; store. The reversed index appears as a subtract from 0x104 followed by an
; add of 0x7c (124 = 31 * 4).
1546 define amdgpu_kernel void @store_load_vindex_small_offset_kernel() {
1547 ; GFX9-LABEL: store_load_vindex_small_offset_kernel:
1548 ; GFX9: ; %bb.0: ; %bb
1549 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
1550 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1551 ; GFX9-NEXT: s_mov_b32 s0, 0
1552 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
1553 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1554 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1555 ; GFX9-NEXT: v_add_u32_e32 v1, 0x104, v0
1556 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
1557 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x104, v0
1558 ; GFX9-NEXT: scratch_store_dword v1, v2, off
1559 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1560 ; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0
1561 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
1562 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1563 ; GFX9-NEXT: s_endpgm
1565 ; GFX10-LABEL: store_load_vindex_small_offset_kernel:
1566 ; GFX10: ; %bb.0: ; %bb
1567 ; GFX10-NEXT: s_add_u32 s0, s0, s3
1568 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1569 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1570 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1571 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1572 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
1573 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
1574 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1575 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0
1576 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x104, v0
1577 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
1578 ; GFX10-NEXT: scratch_store_dword v0, v2, off
1579 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1580 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
1581 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1582 ; GFX10-NEXT: s_endpgm
1584 ; GFX11-LABEL: store_load_vindex_small_offset_kernel:
1585 ; GFX11: ; %bb.0: ; %bb
1586 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1587 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
1588 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1589 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0
1590 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1591 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
1592 ; GFX11-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc
1593 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1594 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
1595 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1596 ; GFX11-NEXT: s_endpgm
1598 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_kernel:
1599 ; GFX9-PAL: ; %bb.0: ; %bb
1600 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
1601 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
1602 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1603 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
1604 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1605 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
1606 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
1607 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1608 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
1609 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
1610 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
1611 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1612 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x104, v0
1613 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x104, v0
1614 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
1615 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1616 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0
1617 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
1618 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1619 ; GFX9-PAL-NEXT: s_endpgm
1621 ; GFX940-LABEL: store_load_vindex_small_offset_kernel:
1622 ; GFX940: ; %bb.0: ; %bb
1623 ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
1624 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1625 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1626 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
1627 ; GFX940-NEXT: scratch_store_dword v0, v1, off offset:260 sc0 sc1
1628 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1629 ; GFX940-NEXT: v_sub_u32_e32 v0, 0x104, v0
1630 ; GFX940-NEXT: v_add_u32_e32 v0, 0x7c, v0
1631 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
1632 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1633 ; GFX940-NEXT: s_endpgm
1635 ; GFX1010-PAL-LABEL: store_load_vindex_small_offset_kernel:
1636 ; GFX1010-PAL: ; %bb.0: ; %bb
1637 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
1638 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
1639 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1640 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
1641 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1642 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
1643 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
1644 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1645 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1646 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1647 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
1648 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
1649 ; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc
1650 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1651 ; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0
1652 ; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0x104, v0
1653 ; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
1654 ; GFX1010-PAL-NEXT: scratch_store_dword v0, v2, off
1655 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1656 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
1657 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1658 ; GFX1010-PAL-NEXT: s_endpgm
1660 ; GFX1030-PAL-LABEL: store_load_vindex_small_offset_kernel:
1661 ; GFX1030-PAL: ; %bb.0: ; %bb
1662 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
1663 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
1664 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1665 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
1666 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1667 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
1668 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
1669 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1670 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1671 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1672 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
1673 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
1674 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1675 ; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0
1676 ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0x104, v0
1677 ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
1678 ; GFX1030-PAL-NEXT: scratch_store_dword v0, v2, off
1679 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1680 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
1681 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1682 ; GFX1030-PAL-NEXT: s_endpgm
1684 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel:
1685 ; GFX11-PAL: ; %bb.0: ; %bb
1686 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1687 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
1688 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1689 ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x104, v0
1690 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
1691 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
1692 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off offset:260 dlc
1693 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1694 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off glc dlc
1695 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1696 ; GFX11-PAL-NEXT: s_endpgm
; IR under test starts here. Both allocas are in addrspace(5).
1698 %padding = alloca [64 x i32], align 4, addrspace(5)
1699 %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from the padding array at an undef index.
; NOTE(review): presumably this keeps %padding in use so that %i is placed at
; a non-zero offset - confirm.
1700 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1701 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
1702 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %i3 has no user among the instructions visible in this
; function.
1703 %i3 = zext i32 %i2 to i64
; Store 15 to %i[workitem.id.x].
1704 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
1705 store volatile i32 15, ptr addrspace(5) %i7, align 4
; Load back from the mirrored element %i[31 - workitem.id.x].
1706 %i9 = sub nsw i32 31, %i2
1707 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
1708 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Non-kernel variant (no amdgpu_kernel calling convention): the index is the
; i32 argument %idx instead of workitem.id.x.
; The IR at the end of this function does a volatile load from a 64 x i32
; padding array, stores 15 to %i[%idx] and then loads %i[%idx & 15], where %i
; is a 32 x float array in addrspace(5).
; In the expected output both arrays are addressed relative to s32: the
; padding load uses s32 with no offset, and %i is reached either through
; s32 + 0x100 (256) or through an s32-based access with an immediate offset
; of 256. Every sequence ends with s_setpc_b64 s[30:31].
; The gfx1010 and gfx1030 amdpal runs share the common GFX10-PAL check prefix
; for this function.
1712 define void @store_load_vindex_small_offset_foo(i32 %idx) {
1713 ; GFX9-LABEL: store_load_vindex_small_offset_foo:
1714 ; GFX9: ; %bb.0: ; %bb
1715 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1716 ; GFX9-NEXT: scratch_load_dword v1, off, s32 glc
1717 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1718 ; GFX9-NEXT: s_add_i32 s0, s32, 0x100
1719 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
1720 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
1721 ; GFX9-NEXT: v_mov_b32_e32 v3, 15
1722 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
1723 ; GFX9-NEXT: scratch_store_dword v2, v3, off
1724 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1725 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
1726 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
1727 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1728 ; GFX9-NEXT: s_setpc_b64 s[30:31]
1730 ; GFX10-LABEL: store_load_vindex_small_offset_foo:
1731 ; GFX10: ; %bb.0: ; %bb
1732 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1733 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
1734 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100
1735 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
1736 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0
1737 ; GFX10-NEXT: s_add_i32 s0, s32, 0x100
1738 ; GFX10-NEXT: scratch_load_dword v3, off, s32 glc dlc
1739 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1740 ; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0
1741 ; GFX10-NEXT: scratch_store_dword v0, v2, off
1742 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
1743 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
1744 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1745 ; GFX10-NEXT: s_setpc_b64 s[30:31]
1747 ; GFX11-LABEL: store_load_vindex_small_offset_foo:
1748 ; GFX11: ; %bb.0: ; %bb
1749 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1750 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
1751 ; GFX11-NEXT: s_add_i32 s0, s32, 0x100
1752 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc
1753 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1754 ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
1755 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
1756 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc
1757 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
1758 ; GFX11-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc
1759 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1760 ; GFX11-NEXT: s_setpc_b64 s[30:31]
1762 ; GFX9-PAL-LABEL: store_load_vindex_small_offset_foo:
1763 ; GFX9-PAL: ; %bb.0: ; %bb
1764 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1765 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 glc
1766 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1767 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x100
1768 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
1769 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
1770 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
1771 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
1772 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
1773 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1774 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
1775 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
1776 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1777 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
1779 ; GFX940-LABEL: store_load_vindex_small_offset_foo:
1780 ; GFX940: ; %bb.0: ; %bb
1781 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1782 ; GFX940-NEXT: scratch_load_dword v1, off, s32 sc0 sc1
1783 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1784 ; GFX940-NEXT: s_add_i32 s0, s32, 0x100
1785 ; GFX940-NEXT: v_mov_b32_e32 v1, s0
1786 ; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1
1787 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
1788 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
1789 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
1790 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1791 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1792 ; GFX940-NEXT: scratch_load_dword v0, v0, s32 offset:256 sc0 sc1
1793 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1794 ; GFX940-NEXT: s_setpc_b64 s[30:31]
1796 ; GFX10-PAL-LABEL: store_load_vindex_small_offset_foo:
1797 ; GFX10-PAL: ; %bb.0: ; %bb
1798 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1799 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
1800 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x100
1801 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
1802 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
1803 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x100
1804 ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 glc dlc
1805 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
1806 ; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0
1807 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
1808 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1809 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
1810 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
1811 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
1813 ; GFX11-PAL-LABEL: store_load_vindex_small_offset_foo:
1814 ; GFX11-PAL: ; %bb.0: ; %bb
1815 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1816 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
1817 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x100
1818 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc
1819 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1820 ; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
1821 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
1822 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc
1823 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
1824 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 glc dlc
1825 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
1826 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test starts here. Both allocas are in addrspace(5).
1828 %padding = alloca [64 x i32], align 4, addrspace(5)
1829 %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from the padding array at an undef index.
; NOTE(review): presumably this keeps %padding in use so that %i is placed at
; a non-zero offset - confirm.
1830 %pad_gep = getelementptr inbounds [64 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
1831 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Store 15 to %i[%idx]; %idx is used unmasked here.
1832 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
1833 store volatile i32 15, ptr addrspace(5) %i7, align 4
; Load from %i[%idx & 15]; the mask limits this index to 0..15.
1834 %i9 = and i32 %idx, 15
1835 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
1836 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel that zero-fills a 64-byte alloca in the presence of a large padding
; alloca.
; The IR at the end of this function allocates 4096 x i32 of padding
; (16384 bytes = 0x4000) and a 32 x i16 array in addrspace(5), does a volatile
; load from the padding, and calls llvm.memset to write 64 zero bytes to the
; second array.
; In the expected output for every check prefix below, the padding load uses
; offset 4 and the memset becomes four 16-byte scratch stores at offsets
; 0, 16, 32 and 48 from an SGPR that was loaded with 0x4004. The gfx1100
; sequences group the four stores under s_clause 0x3.
1840 define amdgpu_kernel void @zero_init_large_offset_kernel() {
1841 ; GFX9-LABEL: zero_init_large_offset_kernel:
1843 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
1844 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
1845 ; GFX9-NEXT: s_mov_b32 s0, 0
1846 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
1847 ; GFX9-NEXT: s_waitcnt vmcnt(0)
1848 ; GFX9-NEXT: s_mov_b32 s1, s0
1849 ; GFX9-NEXT: s_mov_b32 s2, s0
1850 ; GFX9-NEXT: s_mov_b32 s3, s0
1851 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
1852 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
1853 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
1854 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
1855 ; GFX9-NEXT: s_movk_i32 s0, 0x4004
1856 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0
1857 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
1858 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
1859 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
1860 ; GFX9-NEXT: s_endpgm
1862 ; GFX10-LABEL: zero_init_large_offset_kernel:
1864 ; GFX10-NEXT: s_add_u32 s0, s0, s3
1865 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1866 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1867 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1868 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1869 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1870 ; GFX10-NEXT: s_mov_b32 s0, 0
1871 ; GFX10-NEXT: s_mov_b32 s1, s0
1872 ; GFX10-NEXT: s_mov_b32 s2, s0
1873 ; GFX10-NEXT: s_mov_b32 s3, s0
1874 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
1875 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
1876 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
1877 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
1878 ; GFX10-NEXT: s_movk_i32 s0, 0x4004
1879 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0
1880 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
1881 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
1882 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
1883 ; GFX10-NEXT: s_endpgm
1885 ; GFX11-LABEL: zero_init_large_offset_kernel:
1887 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
1888 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1889 ; GFX11-NEXT: s_mov_b32 s0, 0
1890 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1891 ; GFX11-NEXT: s_mov_b32 s1, s0
1892 ; GFX11-NEXT: s_mov_b32 s2, s0
1893 ; GFX11-NEXT: s_mov_b32 s3, s0
1894 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
1895 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
1896 ; GFX11-NEXT: s_movk_i32 s0, 0x4004
1897 ; GFX11-NEXT: s_clause 0x3
1898 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
1899 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16
1900 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
1901 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48
1902 ; GFX11-NEXT: s_endpgm
1904 ; GFX9-PAL-LABEL: zero_init_large_offset_kernel:
1905 ; GFX9-PAL: ; %bb.0:
1906 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
1907 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
1908 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1909 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
1910 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
1911 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1912 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
1913 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
1914 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
1915 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
1916 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0
1917 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
1918 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0
1919 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
1920 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
1921 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
1922 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
1923 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x4004
1924 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
1925 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
1926 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
1927 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
1928 ; GFX9-PAL-NEXT: s_endpgm
1930 ; GFX940-LABEL: zero_init_large_offset_kernel:
1932 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
1933 ; GFX940-NEXT: s_waitcnt vmcnt(0)
1934 ; GFX940-NEXT: s_mov_b32 s0, 0
1935 ; GFX940-NEXT: s_mov_b32 s1, s0
1936 ; GFX940-NEXT: s_mov_b32 s2, s0
1937 ; GFX940-NEXT: s_mov_b32 s3, s0
1938 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
1939 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
1940 ; GFX940-NEXT: s_movk_i32 s0, 0x4004
1941 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1
1942 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1
1943 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
1944 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
1945 ; GFX940-NEXT: s_endpgm
1947 ; GFX1010-PAL-LABEL: zero_init_large_offset_kernel:
1948 ; GFX1010-PAL: ; %bb.0:
1949 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
1950 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
1951 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1952 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
1953 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1954 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
1955 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
1956 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1957 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1958 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
1959 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:4 glc dlc
1960 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
1961 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
1962 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
1963 ; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
1964 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
1965 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
1966 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
1967 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
1968 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x4004
1969 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
1970 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
1971 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
1972 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
1973 ; GFX1010-PAL-NEXT: s_endpgm
1975 ; GFX1030-PAL-LABEL: zero_init_large_offset_kernel:
1976 ; GFX1030-PAL: ; %bb.0:
1977 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
1978 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
1979 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
1980 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
1981 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
1982 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
1983 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
1984 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
1985 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
1986 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
1987 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
1988 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
1989 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
1990 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
1991 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
1992 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
1993 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
1994 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
1995 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
1996 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x4004
1997 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
1998 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
1999 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
2000 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
2001 ; GFX1030-PAL-NEXT: s_endpgm
2003 ; GFX11-PAL-LABEL: zero_init_large_offset_kernel:
2004 ; GFX11-PAL: ; %bb.0:
2005 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
2006 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2007 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0
2008 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2009 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0
2010 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0
2011 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0
2012 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2013 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2014 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004
2015 ; GFX11-PAL-NEXT: s_clause 0x3
2016 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0
2017 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16
2018 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
2019 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48
2020 ; GFX11-PAL-NEXT: s_endpgm
; IR under test starts here. 4096 x i32 is 16384 bytes (0x4000) of padding;
; the zero-filled array is 32 x i16 = 64 bytes.
2021 %padding = alloca [4096 x i32], align 4, addrspace(5)
2022 %alloca = alloca [32 x i16], align 2, addrspace(5)
; Volatile load from the padding array at an undef index.
; NOTE(review): presumably this keeps %padding in use so that %alloca is
; placed at a large offset - confirm.
2023 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2024 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Non-volatile memset (last argument is i1 false) of 64 zero bytes.
2025 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
2029 define void @zero_init_large_offset_foo() {
2030 ; GFX9-LABEL: zero_init_large_offset_foo:
2032 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2033 ; GFX9-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
2034 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2035 ; GFX9-NEXT: s_mov_b32 s0, 0
2036 ; GFX9-NEXT: s_mov_b32 s1, s0
2037 ; GFX9-NEXT: s_mov_b32 s2, s0
2038 ; GFX9-NEXT: s_mov_b32 s3, s0
2039 ; GFX9-NEXT: v_mov_b32_e32 v0, s0
2040 ; GFX9-NEXT: v_mov_b32_e32 v1, s1
2041 ; GFX9-NEXT: v_mov_b32_e32 v2, s2
2042 ; GFX9-NEXT: v_mov_b32_e32 v3, s3
2043 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
2044 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0
2045 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
2046 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
2047 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
2048 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
2049 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
2050 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
2051 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2052 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2054 ; GFX10-LABEL: zero_init_large_offset_foo:
2056 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2057 ; GFX10-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc
2058 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2059 ; GFX10-NEXT: s_mov_b32 s0, 0
2060 ; GFX10-NEXT: s_mov_b32 s1, s0
2061 ; GFX10-NEXT: s_mov_b32 s2, s0
2062 ; GFX10-NEXT: s_mov_b32 s3, s0
2063 ; GFX10-NEXT: v_mov_b32_e32 v0, s0
2064 ; GFX10-NEXT: v_mov_b32_e32 v1, s1
2065 ; GFX10-NEXT: v_mov_b32_e32 v2, s2
2066 ; GFX10-NEXT: v_mov_b32_e32 v3, s3
2067 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
2068 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0
2069 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
2070 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
2071 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
2072 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
2073 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
2074 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
2075 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2077 ; GFX11-LABEL: zero_init_large_offset_foo:
2079 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2080 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc
2081 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2082 ; GFX11-NEXT: s_mov_b32 s0, 0
2083 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2084 ; GFX11-NEXT: s_mov_b32 s1, s0
2085 ; GFX11-NEXT: s_mov_b32 s2, s0
2086 ; GFX11-NEXT: s_mov_b32 s3, s0
2087 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2088 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2089 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
2090 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0
2091 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
2092 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16
2093 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
2094 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
2095 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
2096 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48
2097 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2099 ; GFX9-PAL-LABEL: zero_init_large_offset_foo:
2100 ; GFX9-PAL: ; %bb.0:
2101 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2102 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc
2103 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2104 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
2105 ; GFX9-PAL-NEXT: s_mov_b32 s1, s0
2106 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
2107 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0
2108 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0
2109 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1
2110 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2
2111 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3
2112 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2113 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
2114 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2115 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
2116 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2117 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
2118 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2119 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
2120 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2121 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
2123 ; GFX940-LABEL: zero_init_large_offset_foo:
2125 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2126 ; GFX940-NEXT: scratch_load_dword v0, off, s32 offset:4 sc0 sc1
2127 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2128 ; GFX940-NEXT: s_mov_b32 s0, 0
2129 ; GFX940-NEXT: s_mov_b32 s1, s0
2130 ; GFX940-NEXT: s_mov_b32 s2, s0
2131 ; GFX940-NEXT: s_mov_b32 s3, s0
2132 ; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
2133 ; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
2134 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
2135 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 sc0 sc1
2136 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
2137 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16 sc0 sc1
2138 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
2139 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32 sc0 sc1
2140 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
2141 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48 sc0 sc1
2142 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2143 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2145 ; GFX1010-PAL-LABEL: zero_init_large_offset_foo:
2146 ; GFX1010-PAL: ; %bb.0:
2147 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2148 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc
2149 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2150 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
2151 ; GFX1010-PAL-NEXT: s_mov_b32 s1, s0
2152 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
2153 ; GFX1010-PAL-NEXT: s_mov_b32 s3, s0
2154 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, s0
2155 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, s1
2156 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, s2
2157 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v3, s3
2158 ; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2159 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
2160 ; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
2161 ; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2162 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
2163 ; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
2164 ; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2165 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
2166 ; GFX1010-PAL-NEXT: s_waitcnt_depctr 0xffe3
2167 ; GFX1010-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2168 ; GFX1010-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
2169 ; GFX1010-PAL-NEXT: s_setpc_b64 s[30:31]
2171 ; GFX1030-PAL-LABEL: zero_init_large_offset_foo:
2172 ; GFX1030-PAL: ; %bb.0:
2173 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2174 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s32 offset:4 glc dlc
2175 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2176 ; GFX1030-PAL-NEXT: s_mov_b32 s0, 0
2177 ; GFX1030-PAL-NEXT: s_mov_b32 s1, s0
2178 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
2179 ; GFX1030-PAL-NEXT: s_mov_b32 s3, s0
2180 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, s0
2181 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, s1
2182 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, s2
2183 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v3, s3
2184 ; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2185 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
2186 ; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2187 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:16
2188 ; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2189 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:32
2190 ; GFX1030-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2191 ; GFX1030-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:48
2192 ; GFX1030-PAL-NEXT: s_setpc_b64 s[30:31]
2194 ; GFX11-PAL-LABEL: zero_init_large_offset_foo:
2195 ; GFX11-PAL: ; %bb.0:
2196 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2197 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s32 offset:4 glc dlc
2198 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2199 ; GFX11-PAL-NEXT: s_mov_b32 s0, 0
2200 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2201 ; GFX11-PAL-NEXT: s_mov_b32 s1, s0
2202 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0
2203 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0
2204 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
2205 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
2206 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2207 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0
2208 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2209 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:16
2210 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2211 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:32
2212 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2213 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s0 offset:48
2214 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
2215 %padding = alloca [4096 x i32], align 4, addrspace(5)
2216 %alloca = alloca [32 x i16], align 2, addrspace(5)
2217 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2218 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
2219 call void @llvm.memset.p5.i64(ptr addrspace(5) align 2 dereferenceable(64) %alloca, i8 0, i64 64, i1 false)
; Kernel test: store to and load from a scratch array using an index that is
; passed in as the kernel parameter %idx, with a large padding object also
; allocated in scratch.
;   * %padding is [4096 x i32] (16384 bytes); %i is the [32 x float] array
;     that is actually indexed.
;   * A volatile i32 store of 15 goes to %i[%idx]; a volatile i32 load then
;     reads %i[%idx & 15].
; In the expected output for every target below, both addresses are formed by
; shifting the index left by 2 and adding 0x4004 with s_addk_i32.
2223 define amdgpu_kernel void @store_load_sindex_large_offset_kernel(i32 %idx) {
2224 ; GFX9-LABEL: store_load_sindex_large_offset_kernel:
2225 ; GFX9: ; %bb.0: ; %bb
2226 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
2227 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
2228 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
2229 ; GFX9-NEXT: s_mov_b32 s1, 0
2230 ; GFX9-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
2231 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2232 ; GFX9-NEXT: s_lshl_b32 s1, s0, 2
2233 ; GFX9-NEXT: s_and_b32 s0, s0, 15
2234 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
2235 ; GFX9-NEXT: s_addk_i32 s1, 0x4004
2236 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
2237 ; GFX9-NEXT: scratch_store_dword off, v0, s1
2238 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2239 ; GFX9-NEXT: s_addk_i32 s0, 0x4004
2240 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
2241 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2242 ; GFX9-NEXT: s_endpgm
2244 ; GFX10-LABEL: store_load_sindex_large_offset_kernel:
2245 ; GFX10: ; %bb.0: ; %bb
2246 ; GFX10-NEXT: s_add_u32 s2, s2, s5
2247 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
2248 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2249 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2250 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
2251 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
2252 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2253 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
2254 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
2255 ; GFX10-NEXT: s_and_b32 s1, s0, 15
2256 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
2257 ; GFX10-NEXT: s_lshl_b32 s1, s1, 2
2258 ; GFX10-NEXT: s_addk_i32 s0, 0x4004
2259 ; GFX10-NEXT: s_addk_i32 s1, 0x4004
2260 ; GFX10-NEXT: scratch_store_dword off, v0, s0
2261 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2262 ; GFX10-NEXT: scratch_load_dword v0, off, s1 glc dlc
2263 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2264 ; GFX10-NEXT: s_endpgm
2266 ; GFX11-LABEL: store_load_sindex_large_offset_kernel:
2267 ; GFX11: ; %bb.0: ; %bb
2268 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
2269 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
2270 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2271 ; GFX11-NEXT: v_mov_b32_e32 v0, 15
2272 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
2273 ; GFX11-NEXT: s_and_b32 s1, s0, 15
2274 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
2275 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
2276 ; GFX11-NEXT: s_addk_i32 s0, 0x4004
2277 ; GFX11-NEXT: s_addk_i32 s1, 0x4004
2278 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
2279 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2280 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
2281 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2282 ; GFX11-NEXT: s_endpgm
2284 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_kernel:
2285 ; GFX9-PAL: ; %bb.0: ; %bb
2286 ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
2287 ; GFX9-PAL-NEXT: s_mov_b32 s4, s0
2288 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2289 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
2290 ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
2291 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
2292 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
2293 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
2294 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0
2295 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
2296 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
2297 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
2298 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
2299 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
2300 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004
2301 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
2302 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
2303 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2304 ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004
2305 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
2306 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2307 ; GFX9-PAL-NEXT: s_endpgm
2309 ; GFX940-LABEL: store_load_sindex_large_offset_kernel:
2310 ; GFX940: ; %bb.0: ; %bb
2311 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
2312 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
2313 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2314 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
2315 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
2316 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
2317 ; GFX940-NEXT: s_and_b32 s0, s0, 15
2318 ; GFX940-NEXT: s_addk_i32 s1, 0x4004
2319 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
2320 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
2321 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2322 ; GFX940-NEXT: s_addk_i32 s0, 0x4004
2323 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
2324 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2325 ; GFX940-NEXT: s_endpgm
2327 ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_kernel:
2328 ; GFX1010-PAL: ; %bb.0: ; %bb
2329 ; GFX1010-PAL-NEXT: s_getpc_b64 s[4:5]
2330 ; GFX1010-PAL-NEXT: s_mov_b32 s4, s0
2331 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2332 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
2333 ; GFX1010-PAL-NEXT: s_and_b32 s5, s5, 0xffff
2334 ; GFX1010-PAL-NEXT: s_add_u32 s4, s4, s3
2335 ; GFX1010-PAL-NEXT: s_addc_u32 s5, s5, 0
2336 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2337 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2338 ; GFX1010-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
2339 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
2340 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc
2341 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2342 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
2343 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
2344 ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
2345 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
2346 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
2347 ; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004
2348 ; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004
2349 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
2350 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2351 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
2352 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2353 ; GFX1010-PAL-NEXT: s_endpgm
2355 ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_kernel:
2356 ; GFX1030-PAL: ; %bb.0: ; %bb
2357 ; GFX1030-PAL-NEXT: s_getpc_b64 s[4:5]
2358 ; GFX1030-PAL-NEXT: s_mov_b32 s4, s0
2359 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
2360 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
2361 ; GFX1030-PAL-NEXT: s_and_b32 s5, s5, 0xffff
2362 ; GFX1030-PAL-NEXT: s_add_u32 s4, s4, s3
2363 ; GFX1030-PAL-NEXT: s_addc_u32 s5, s5, 0
2364 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
2365 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
2366 ; GFX1030-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
2367 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
2368 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2369 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
2370 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
2371 ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
2372 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
2373 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
2374 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004
2375 ; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004
2376 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
2377 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2378 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
2379 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2380 ; GFX1030-PAL-NEXT: s_endpgm
2382 ; GFX11-PAL-LABEL: store_load_sindex_large_offset_kernel:
2383 ; GFX11-PAL: ; %bb.0: ; %bb
2384 ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
2385 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
2386 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2387 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
2388 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
2389 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
2390 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
2391 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
2392 ; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004
2393 ; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004
2394 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
2395 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2396 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
2397 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2398 ; GFX11-PAL-NEXT: s_endpgm
; 16 KiB padding object followed by the 32-element array under test.
; NOTE(review): the padding presumably exists to push %i beyond the range of
; the scratch instruction's immediate offset field - confirm.
2400 %padding = alloca [4096 x i32], align 4, addrspace(5)
2401 %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from %padding at an undef index; presumably this is what keeps
; the padding object from being discarded as unused (TODO confirm).
2402 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2403 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Volatile store of the constant 15 to %i[%idx].
2404 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
2405 store volatile i32 15, ptr addrspace(5) %i7, align 4
; Volatile load back from %i[%idx & 15]; the loaded value is not used further
; in the lines visible here.
2406 %i9 = and i32 %idx, 15
2407 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
2408 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Same store/load pattern as a kernel-style test, but for an amdgpu_ps
; function whose index arrives as the `inreg` parameter %idx.
;   * %padding is [4096 x i32] (16384 bytes); %i is the [32 x float] array
;     that is actually indexed.
;   * A volatile i32 store of 15 goes to %i[%idx]; a volatile i32 load then
;     reads %i[%idx & 15].
; In the expected output for every target below, both addresses are formed by
; shifting the index left by 2 and adding 0x4004 with s_addk_i32.
2412 define amdgpu_ps void @store_load_sindex_large_offset_foo(i32 inreg %idx) {
2413 ; GFX9-LABEL: store_load_sindex_large_offset_foo:
2414 ; GFX9: ; %bb.0: ; %bb
2415 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
2416 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
2417 ; GFX9-NEXT: s_mov_b32 s0, 0
2418 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:4 glc
2419 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2420 ; GFX9-NEXT: s_lshl_b32 s0, s2, 2
2421 ; GFX9-NEXT: s_addk_i32 s0, 0x4004
2422 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
2423 ; GFX9-NEXT: scratch_store_dword off, v0, s0
2424 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2425 ; GFX9-NEXT: s_and_b32 s0, s2, 15
2426 ; GFX9-NEXT: s_lshl_b32 s0, s0, 2
2427 ; GFX9-NEXT: s_addk_i32 s0, 0x4004
2428 ; GFX9-NEXT: scratch_load_dword v0, off, s0 glc
2429 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2430 ; GFX9-NEXT: s_endpgm
2432 ; GFX10-LABEL: store_load_sindex_large_offset_foo:
2433 ; GFX10: ; %bb.0: ; %bb
2434 ; GFX10-NEXT: s_add_u32 s0, s0, s3
2435 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
2436 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2437 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2438 ; GFX10-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
2439 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2440 ; GFX10-NEXT: v_mov_b32_e32 v0, 15
2441 ; GFX10-NEXT: s_and_b32 s0, s2, 15
2442 ; GFX10-NEXT: s_lshl_b32 s1, s2, 2
2443 ; GFX10-NEXT: s_lshl_b32 s0, s0, 2
2444 ; GFX10-NEXT: s_addk_i32 s1, 0x4004
2445 ; GFX10-NEXT: s_addk_i32 s0, 0x4004
2446 ; GFX10-NEXT: scratch_store_dword off, v0, s1
2447 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2448 ; GFX10-NEXT: scratch_load_dword v0, off, s0 glc dlc
2449 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2450 ; GFX10-NEXT: s_endpgm
2452 ; GFX11-LABEL: store_load_sindex_large_offset_foo:
2453 ; GFX11: ; %bb.0: ; %bb
2454 ; GFX11-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
2455 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2456 ; GFX11-NEXT: v_mov_b32_e32 v0, 15
2457 ; GFX11-NEXT: s_and_b32 s1, s0, 15
2458 ; GFX11-NEXT: s_lshl_b32 s0, s0, 2
2459 ; GFX11-NEXT: s_lshl_b32 s1, s1, 2
2460 ; GFX11-NEXT: s_addk_i32 s0, 0x4004
2461 ; GFX11-NEXT: s_addk_i32 s1, 0x4004
2462 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 dlc
2463 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2464 ; GFX11-NEXT: scratch_load_b32 v0, off, s1 glc dlc
2465 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2466 ; GFX11-NEXT: s_endpgm
2468 ; GFX9-PAL-LABEL: store_load_sindex_large_offset_foo:
2469 ; GFX9-PAL: ; %bb.0: ; %bb
2470 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
2471 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
2472 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2473 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
2474 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2475 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
2476 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
2477 ; GFX9-PAL-NEXT: s_mov_b32 s1, 0
2478 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc
2479 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2480 ; GFX9-PAL-NEXT: s_lshl_b32 s1, s0, 2
2481 ; GFX9-PAL-NEXT: s_and_b32 s0, s0, 15
2482 ; GFX9-PAL-NEXT: s_addk_i32 s1, 0x4004
2483 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
2484 ; GFX9-PAL-NEXT: s_lshl_b32 s0, s0, 2
2485 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s1
2486 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2487 ; GFX9-PAL-NEXT: s_addk_i32 s0, 0x4004
2488 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 glc
2489 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2490 ; GFX9-PAL-NEXT: s_endpgm
2492 ; GFX940-LABEL: store_load_sindex_large_offset_foo:
2493 ; GFX940: ; %bb.0: ; %bb
2494 ; GFX940-NEXT: scratch_load_dword v0, off, off offset:4 sc0 sc1
2495 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2496 ; GFX940-NEXT: s_lshl_b32 s1, s0, 2
2497 ; GFX940-NEXT: s_and_b32 s0, s0, 15
2498 ; GFX940-NEXT: s_addk_i32 s1, 0x4004
2499 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
2500 ; GFX940-NEXT: s_lshl_b32 s0, s0, 2
2501 ; GFX940-NEXT: scratch_store_dword off, v0, s1 sc0 sc1
2502 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2503 ; GFX940-NEXT: s_addk_i32 s0, 0x4004
2504 ; GFX940-NEXT: scratch_load_dword v0, off, s0 sc0 sc1
2505 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2506 ; GFX940-NEXT: s_endpgm
2508 ; GFX1010-PAL-LABEL: store_load_sindex_large_offset_foo:
2509 ; GFX1010-PAL: ; %bb.0: ; %bb
2510 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
2511 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
2512 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2513 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
2514 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2515 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
2516 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
2517 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2518 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2519 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
2520 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 offset:4 glc dlc
2521 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2522 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 15
2523 ; GFX1010-PAL-NEXT: s_and_b32 s1, s0, 15
2524 ; GFX1010-PAL-NEXT: s_lshl_b32 s0, s0, 2
2525 ; GFX1010-PAL-NEXT: s_lshl_b32 s1, s1, 2
2526 ; GFX1010-PAL-NEXT: s_addk_i32 s0, 0x4004
2527 ; GFX1010-PAL-NEXT: s_addk_i32 s1, 0x4004
2528 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s0
2529 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2530 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
2531 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2532 ; GFX1010-PAL-NEXT: s_endpgm
2534 ; GFX1030-PAL-LABEL: store_load_sindex_large_offset_foo:
2535 ; GFX1030-PAL: ; %bb.0: ; %bb
2536 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
2537 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
2538 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2539 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
2540 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2541 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
2542 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
2543 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2544 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2545 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, off offset:4 glc dlc
2546 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2547 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 15
2548 ; GFX1030-PAL-NEXT: s_and_b32 s1, s0, 15
2549 ; GFX1030-PAL-NEXT: s_lshl_b32 s0, s0, 2
2550 ; GFX1030-PAL-NEXT: s_lshl_b32 s1, s1, 2
2551 ; GFX1030-PAL-NEXT: s_addk_i32 s0, 0x4004
2552 ; GFX1030-PAL-NEXT: s_addk_i32 s1, 0x4004
2553 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, s0
2554 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2555 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s1 glc dlc
2556 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2557 ; GFX1030-PAL-NEXT: s_endpgm
2559 ; GFX11-PAL-LABEL: store_load_sindex_large_offset_foo:
2560 ; GFX11-PAL: ; %bb.0: ; %bb
2561 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, off offset:4 glc dlc
2562 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2563 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 15
2564 ; GFX11-PAL-NEXT: s_and_b32 s1, s0, 15
2565 ; GFX11-PAL-NEXT: s_lshl_b32 s0, s0, 2
2566 ; GFX11-PAL-NEXT: s_lshl_b32 s1, s1, 2
2567 ; GFX11-PAL-NEXT: s_addk_i32 s0, 0x4004
2568 ; GFX11-PAL-NEXT: s_addk_i32 s1, 0x4004
2569 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s0 dlc
2570 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2571 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s1 glc dlc
2572 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2573 ; GFX11-PAL-NEXT: s_endpgm
; 16 KiB padding object followed by the 32-element array under test.
; NOTE(review): the padding presumably exists to push %i beyond the range of
; the scratch instruction's immediate offset field - confirm.
2575 %padding = alloca [4096 x i32], align 4, addrspace(5)
2576 %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from %padding at an undef index; presumably this is what keeps
; the padding object from being discarded as unused (TODO confirm).
2577 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2578 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Volatile store of the constant 15 to %i[%idx].
2579 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
2580 store volatile i32 15, ptr addrspace(5) %i7, align 4
; Volatile load back from %i[%idx & 15]; the loaded value is not used further
; in the lines visible here.
2581 %i9 = and i32 %idx, 15
2582 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
2583 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel test: store to and load from a scratch array using a per-lane index
; taken from llvm.amdgcn.workitem.id.x, with a large padding object also
; allocated in scratch.
;   * %padding is [4096 x i32] (16384 bytes); %i is the [32 x float] array
;     that is actually indexed.
;   * A volatile i32 store of 15 goes to %i[id]; a volatile i32 load then
;     reads %i[31 - id].
; In the expected output the store address is the id shifted left by 2 plus
; 0x4004: some targets add the constant with a vector add, others place it in
; s0 and use s0 as the scalar base of the store. The load address is computed
; by subtracting the shifted id from 0x4004 and then adding 0x7c
; (124 = 31 * 4).
2587 define amdgpu_kernel void @store_load_vindex_large_offset_kernel() {
2588 ; GFX9-LABEL: store_load_vindex_large_offset_kernel:
2589 ; GFX9: ; %bb.0: ; %bb
2590 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
2591 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
2592 ; GFX9-NEXT: s_mov_b32 s0, 0
2593 ; GFX9-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
2594 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2595 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2596 ; GFX9-NEXT: v_add_u32_e32 v1, 0x4004, v0
2597 ; GFX9-NEXT: v_mov_b32_e32 v2, 15
2598 ; GFX9-NEXT: v_sub_u32_e32 v0, 0x4004, v0
2599 ; GFX9-NEXT: scratch_store_dword v1, v2, off
2600 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2601 ; GFX9-NEXT: v_add_u32_e32 v0, 0x7c, v0
2602 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
2603 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2604 ; GFX9-NEXT: s_endpgm
2606 ; GFX10-LABEL: store_load_vindex_large_offset_kernel:
2607 ; GFX10: ; %bb.0: ; %bb
2608 ; GFX10-NEXT: s_add_u32 s0, s0, s3
2609 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
2610 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2611 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2612 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2613 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
2614 ; GFX10-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
2615 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2616 ; GFX10-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0
2617 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
2618 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
2619 ; GFX10-NEXT: scratch_store_dword v0, v2, off
2620 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2621 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
2622 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2623 ; GFX10-NEXT: s_endpgm
2625 ; GFX11-LABEL: store_load_vindex_large_offset_kernel:
2626 ; GFX11: ; %bb.0: ; %bb
2627 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2628 ; GFX11-NEXT: s_movk_i32 s0, 0x4004
2629 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
2630 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2631 ; GFX11-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0
2632 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
2633 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
2634 ; GFX11-NEXT: scratch_store_b32 v0, v2, s0 dlc
2635 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2636 ; GFX11-NEXT: scratch_load_b32 v0, v1, off glc dlc
2637 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2638 ; GFX11-NEXT: s_endpgm
2640 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_kernel:
2641 ; GFX9-PAL: ; %bb.0: ; %bb
2642 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
2643 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
2644 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2645 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
2646 ; GFX9-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2647 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 15
2648 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
2649 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2650 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
2651 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
2652 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s0 offset:4 glc
2653 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2654 ; GFX9-PAL-NEXT: v_add_u32_e32 v1, 0x4004, v0
2655 ; GFX9-PAL-NEXT: v_sub_u32_e32 v0, 0x4004, v0
2656 ; GFX9-PAL-NEXT: scratch_store_dword v1, v2, off
2657 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2658 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x7c, v0
2659 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
2660 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2661 ; GFX9-PAL-NEXT: s_endpgm
2663 ; GFX940-LABEL: store_load_vindex_large_offset_kernel:
2664 ; GFX940: ; %bb.0: ; %bb
2665 ; GFX940-NEXT: scratch_load_dword v1, off, off offset:4 sc0 sc1
2666 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2667 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2668 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
2669 ; GFX940-NEXT: s_movk_i32 s0, 0x4004
2670 ; GFX940-NEXT: scratch_store_dword v0, v1, s0 sc0 sc1
2671 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2672 ; GFX940-NEXT: v_sub_u32_e32 v0, 0x4004, v0
2673 ; GFX940-NEXT: v_add_u32_e32 v0, 0x7c, v0
2674 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
2675 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2676 ; GFX940-NEXT: s_endpgm
2678 ; GFX1010-PAL-LABEL: store_load_vindex_large_offset_kernel:
2679 ; GFX1010-PAL: ; %bb.0: ; %bb
2680 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
2681 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
2682 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2683 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
2684 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2685 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
2686 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
2687 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2688 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2689 ; GFX1010-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2690 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v2, 15
2691 ; GFX1010-PAL-NEXT: s_mov_b32 s0, 0
2692 ; GFX1010-PAL-NEXT: scratch_load_dword v3, off, s0 offset:4 glc dlc
2693 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2694 ; GFX1010-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0
2695 ; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
2696 ; GFX1010-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
2697 ; GFX1010-PAL-NEXT: scratch_store_dword v0, v2, off
2698 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2699 ; GFX1010-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
2700 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2701 ; GFX1010-PAL-NEXT: s_endpgm
2703 ; GFX1030-PAL-LABEL: store_load_vindex_large_offset_kernel:
2704 ; GFX1030-PAL: ; %bb.0: ; %bb
2705 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
2706 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
2707 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2708 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
2709 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2710 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
2711 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
2712 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2713 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2714 ; GFX1030-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2715 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v2, 15
2716 ; GFX1030-PAL-NEXT: scratch_load_dword v3, off, off offset:4 glc dlc
2717 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2718 ; GFX1030-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0
2719 ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v0, 0x4004, v0
2720 ; GFX1030-PAL-NEXT: v_add_nc_u32_e32 v1, 0x7c, v1
2721 ; GFX1030-PAL-NEXT: scratch_store_dword v0, v2, off
2722 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2723 ; GFX1030-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
2724 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
2725 ; GFX1030-PAL-NEXT: s_endpgm
2727 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel:
2728 ; GFX11-PAL: ; %bb.0: ; %bb
2729 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2730 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x4004
2731 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc
2732 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2733 ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v1, 0x4004, v0
2734 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
2735 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_add_nc_u32 v1, 0x7c, v1
2736 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s0 dlc
2737 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2738 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, off glc dlc
2739 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2740 ; GFX11-PAL-NEXT: s_endpgm
; 16 KiB padding object followed by the 32-element array under test.
; NOTE(review): the padding presumably exists to push %i beyond the range of
; the scratch instruction's immediate offset field - confirm.
2742 %padding = alloca [4096 x i32], align 4, addrspace(5)
2743 %i = alloca [32 x float], align 4, addrspace(5)
; Volatile load from %padding at an undef index; presumably this is what keeps
; the padding object from being discarded as unused (TODO confirm).
2744 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2745 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
; Per-lane index from the workitem id in x.
2746 %i2 = tail call i32 @llvm.amdgcn.workitem.id.x()
; NOTE(review): %i3 is not referenced by any instruction visible in this
; function; it looks like a leftover - confirm before removing.
2747 %i3 = zext i32 %i2 to i64
; Volatile store of the constant 15 to %i[id].
2748 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i2
2749 store volatile i32 15, ptr addrspace(5) %i7, align 4
; Volatile load back from the mirrored element %i[31 - id]; the loaded value
; is not used further in the lines visible here.
2750 %i9 = sub nsw i32 31, %i2
2751 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
2752 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
2756 define void @store_load_vindex_large_offset_foo(i32 %idx) {
2757 ; GFX9-LABEL: store_load_vindex_large_offset_foo:
2758 ; GFX9: ; %bb.0: ; %bb
2759 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2760 ; GFX9-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
2761 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2762 ; GFX9-NEXT: s_add_i32 s0, s32, 0x4004
2763 ; GFX9-NEXT: v_mov_b32_e32 v1, s0
2764 ; GFX9-NEXT: v_lshl_add_u32 v2, v0, 2, v1
2765 ; GFX9-NEXT: v_mov_b32_e32 v3, 15
2766 ; GFX9-NEXT: v_and_b32_e32 v0, 15, v0
2767 ; GFX9-NEXT: scratch_store_dword v2, v3, off
2768 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2769 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
2770 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
2771 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2772 ; GFX9-NEXT: s_setpc_b64 s[30:31]
2774 ; GFX10-LABEL: store_load_vindex_large_offset_foo:
2775 ; GFX10: ; %bb.0: ; %bb
2776 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2777 ; GFX10-NEXT: v_and_b32_e32 v1, 15, v0
2778 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
2779 ; GFX10-NEXT: v_mov_b32_e32 v2, 15
2780 ; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s0
2781 ; GFX10-NEXT: s_add_i32 s0, s32, 0x4004
2782 ; GFX10-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
2783 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2784 ; GFX10-NEXT: v_lshl_add_u32 v1, v1, 2, s0
2785 ; GFX10-NEXT: scratch_store_dword v0, v2, off
2786 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2787 ; GFX10-NEXT: scratch_load_dword v0, v1, off glc dlc
2788 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2789 ; GFX10-NEXT: s_setpc_b64 s[30:31]
2791 ; GFX11-LABEL: store_load_vindex_large_offset_foo:
2792 ; GFX11: ; %bb.0: ; %bb
2793 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2794 ; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
2795 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
2796 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
2797 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2798 ; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
2799 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1
2800 ; GFX11-NEXT: s_add_i32 s0, s32, 0x4004
2801 ; GFX11-NEXT: scratch_store_b32 v0, v2, off dlc
2802 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2803 ; GFX11-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
2804 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2805 ; GFX11-NEXT: s_setpc_b64 s[30:31]
2807 ; GFX9-PAL-LABEL: store_load_vindex_large_offset_foo:
2808 ; GFX9-PAL: ; %bb.0: ; %bb
2809 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2810 ; GFX9-PAL-NEXT: scratch_load_dword v1, off, s32 offset:4 glc
2811 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2812 ; GFX9-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2813 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s0
2814 ; GFX9-PAL-NEXT: v_lshl_add_u32 v2, v0, 2, v1
2815 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 15
2816 ; GFX9-PAL-NEXT: v_and_b32_e32 v0, 15, v0
2817 ; GFX9-PAL-NEXT: scratch_store_dword v2, v3, off
2818 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2819 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
2820 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
2821 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2822 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
2824 ; GFX940-LABEL: store_load_vindex_large_offset_foo:
2825 ; GFX940: ; %bb.0: ; %bb
2826 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2827 ; GFX940-NEXT: scratch_load_dword v1, off, s32 offset:4 sc0 sc1
2828 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2829 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
2830 ; GFX940-NEXT: v_mov_b32_e32 v1, s0
2831 ; GFX940-NEXT: v_lshl_add_u32 v1, v0, 2, v1
2832 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
2833 ; GFX940-NEXT: v_and_b32_e32 v0, 15, v0
2834 ; GFX940-NEXT: scratch_store_dword v1, v2, off sc0 sc1
2835 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2836 ; GFX940-NEXT: v_lshlrev_b32_e32 v0, 2, v0
2837 ; GFX940-NEXT: s_add_i32 s0, s32, 0x4004
2838 ; GFX940-NEXT: scratch_load_dword v0, v0, s0 sc0 sc1
2839 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2840 ; GFX940-NEXT: s_setpc_b64 s[30:31]
2842 ; GFX10-PAL-LABEL: store_load_vindex_large_offset_foo:
2843 ; GFX10-PAL: ; %bb.0: ; %bb
2844 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2845 ; GFX10-PAL-NEXT: v_and_b32_e32 v1, 15, v0
2846 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2847 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 15
2848 ; GFX10-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
2849 ; GFX10-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2850 ; GFX10-PAL-NEXT: scratch_load_dword v3, off, s32 offset:4 glc dlc
2851 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
2852 ; GFX10-PAL-NEXT: v_lshl_add_u32 v1, v1, 2, s0
2853 ; GFX10-PAL-NEXT: scratch_store_dword v0, v2, off
2854 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2855 ; GFX10-PAL-NEXT: scratch_load_dword v0, v1, off glc dlc
2856 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
2857 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
2859 ; GFX11-PAL-LABEL: store_load_vindex_large_offset_foo:
2860 ; GFX11-PAL: ; %bb.0: ; %bb
2861 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2862 ; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0
2863 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2864 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc
2865 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2866 ; GFX11-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, s0
2867 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
2868 ; GFX11-PAL-NEXT: s_add_i32 s0, s32, 0x4004
2869 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, off dlc
2870 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2871 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v1, s0 glc dlc
2872 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
2873 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
2875 %padding = alloca [4096 x i32], align 4, addrspace(5)
2876 %i = alloca [32 x float], align 4, addrspace(5)
2877 %pad_gep = getelementptr inbounds [4096 x i32], ptr addrspace(5) %padding, i32 0, i32 undef
2878 %pad_load = load volatile i32, ptr addrspace(5) %pad_gep, align 4
2879 %i7 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %idx
2880 store volatile i32 15, ptr addrspace(5) %i7, align 4
2881 %i9 = and i32 %idx, 15
2882 %i10 = getelementptr inbounds [32 x float], ptr addrspace(5) %i, i32 0, i32 %i9
2883 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel test for a scratch access whose constant offset is large.
; The IR allocates a [4096 x i32] array in addrspace(5), does one volatile store
; of 13 through an undef index, then a volatile store of 15 to element 4000
; (4000 * 4 = 16000 bytes into the array) followed by a volatile load of that
; same element.
; The expected code forms the address from an SGPR constant plus an immediate
; offset: 0x3000 with offset 3712 for the gfx900, gfx940 and gfx1100 runs, and
; 0x3800 with offset 1664 for the gfx1010 and gfx1030 runs.
; NOTE(review): the differing split presumably reflects different immediate
; offset ranges per target; confirm against the ISA documentation.
; The assertions below are autogenerated; regenerate them instead of hand-editing.
2887 define amdgpu_kernel void @store_load_large_imm_offset_kernel() {
2888 ; GFX9-LABEL: store_load_large_imm_offset_kernel:
2889 ; GFX9: ; %bb.0: ; %bb
2890 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s3
2891 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
2892 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
2893 ; GFX9-NEXT: s_mov_b32 s0, 0
2894 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:4
2895 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2896 ; GFX9-NEXT: s_movk_i32 s0, 0x3000
2897 ; GFX9-NEXT: s_add_i32 s0, s0, 4
2898 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
2899 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
2900 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2901 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
2902 ; GFX9-NEXT: s_waitcnt vmcnt(0)
2903 ; GFX9-NEXT: s_endpgm
2905 ; GFX10-LABEL: store_load_large_imm_offset_kernel:
2906 ; GFX10: ; %bb.0: ; %bb
2907 ; GFX10-NEXT: s_add_u32 s0, s0, s3
2908 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
2909 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
2910 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
2911 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
2912 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
2913 ; GFX10-NEXT: s_movk_i32 s0, 0x3800
2914 ; GFX10-NEXT: s_add_i32 s0, s0, 4
2915 ; GFX10-NEXT: scratch_store_dword off, v0, off offset:4
2916 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2917 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
2918 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
2919 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
2920 ; GFX10-NEXT: s_waitcnt vmcnt(0)
2921 ; GFX10-NEXT: s_endpgm
2923 ; GFX11-LABEL: store_load_large_imm_offset_kernel:
2924 ; GFX11: ; %bb.0: ; %bb
2925 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
2926 ; GFX11-NEXT: s_movk_i32 s0, 0x3000
2927 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
2928 ; GFX11-NEXT: s_add_i32 s0, s0, 4
2929 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
2930 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2931 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc
2932 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
2933 ; GFX11-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc
2934 ; GFX11-NEXT: s_waitcnt vmcnt(0)
2935 ; GFX11-NEXT: s_endpgm
2937 ; GFX9-PAL-LABEL: store_load_large_imm_offset_kernel:
2938 ; GFX9-PAL: ; %bb.0: ; %bb
2939 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
2940 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
2941 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2942 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
2943 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
2944 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
2945 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2946 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s1
2947 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
2948 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:4
2949 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2950 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
2951 ; GFX9-PAL-NEXT: s_add_i32 s0, s0, 4
2952 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
2953 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
2954 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2955 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
2956 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
2957 ; GFX9-PAL-NEXT: s_endpgm
2959 ; GFX940-LABEL: store_load_large_imm_offset_kernel:
2960 ; GFX940: ; %bb.0: ; %bb
2961 ; GFX940-NEXT: v_mov_b32_e32 v0, 13
2962 ; GFX940-NEXT: s_movk_i32 s0, 0x3000
2963 ; GFX940-NEXT: scratch_store_dword off, v0, off offset:4 sc0 sc1
2964 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2965 ; GFX940-NEXT: s_add_i32 s0, s0, 4
2966 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
2967 ; GFX940-NEXT: scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
2968 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2969 ; GFX940-NEXT: scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
2970 ; GFX940-NEXT: s_waitcnt vmcnt(0)
2971 ; GFX940-NEXT: s_endpgm
2973 ; GFX1010-PAL-LABEL: store_load_large_imm_offset_kernel:
2974 ; GFX1010-PAL: ; %bb.0: ; %bb
2975 ; GFX1010-PAL-NEXT: s_getpc_b64 s[2:3]
2976 ; GFX1010-PAL-NEXT: s_mov_b32 s2, s0
2977 ; GFX1010-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
2978 ; GFX1010-PAL-NEXT: s_waitcnt lgkmcnt(0)
2979 ; GFX1010-PAL-NEXT: s_and_b32 s3, s3, 0xffff
2980 ; GFX1010-PAL-NEXT: s_add_u32 s2, s2, s1
2981 ; GFX1010-PAL-NEXT: s_addc_u32 s3, s3, 0
2982 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
2983 ; GFX1010-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
2984 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v0, 13
2985 ; GFX1010-PAL-NEXT: v_mov_b32_e32 v1, 15
2986 ; GFX1010-PAL-NEXT: s_movk_i32 s0, 0x3800
2987 ; GFX1010-PAL-NEXT: s_mov_b32 s1, 0
2988 ; GFX1010-PAL-NEXT: s_add_i32 s0, s0, 4
2989 ; GFX1010-PAL-NEXT: scratch_store_dword off, v0, s1 offset:4
2990 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2991 ; GFX1010-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
2992 ; GFX1010-PAL-NEXT: s_waitcnt_vscnt null, 0x0
2993 ; GFX1010-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
2994 ; GFX1010-PAL-NEXT: s_waitcnt vmcnt(0)
2995 ; GFX1010-PAL-NEXT: s_endpgm
2997 ; GFX1030-PAL-LABEL: store_load_large_imm_offset_kernel:
2998 ; GFX1030-PAL: ; %bb.0: ; %bb
2999 ; GFX1030-PAL-NEXT: s_getpc_b64 s[2:3]
3000 ; GFX1030-PAL-NEXT: s_mov_b32 s2, s0
3001 ; GFX1030-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
3002 ; GFX1030-PAL-NEXT: s_waitcnt lgkmcnt(0)
3003 ; GFX1030-PAL-NEXT: s_and_b32 s3, s3, 0xffff
3004 ; GFX1030-PAL-NEXT: s_add_u32 s2, s2, s1
3005 ; GFX1030-PAL-NEXT: s_addc_u32 s3, s3, 0
3006 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3007 ; GFX1030-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3008 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v0, 13
3009 ; GFX1030-PAL-NEXT: v_mov_b32_e32 v1, 15
3010 ; GFX1030-PAL-NEXT: s_movk_i32 s0, 0x3800
3011 ; GFX1030-PAL-NEXT: s_add_i32 s0, s0, 4
3012 ; GFX1030-PAL-NEXT: scratch_store_dword off, v0, off offset:4
3013 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3014 ; GFX1030-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
3015 ; GFX1030-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3016 ; GFX1030-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
3017 ; GFX1030-PAL-NEXT: s_waitcnt vmcnt(0)
3018 ; GFX1030-PAL-NEXT: s_endpgm
3020 ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel:
3021 ; GFX11-PAL: ; %bb.0: ; %bb
3022 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3023 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x3000
3024 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3025 ; GFX11-PAL-NEXT: s_add_i32 s0, s0, 4
3026 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc
3027 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3028 ; GFX11-PAL-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc
3029 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3030 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc
3031 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3032 ; GFX11-PAL-NEXT: s_endpgm
3034 %i = alloca [4096 x i32], align 4, addrspace(5)
3035 %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
3036 store volatile i32 13, ptr addrspace(5) %i1, align 4
3037 %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3038 store volatile i32 15, ptr addrspace(5) %i7, align 4
3039 %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3040 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Non-kernel (callable function) test for a scratch access whose constant
; offset is large. The IR body is the same as in
; @store_load_large_imm_offset_kernel: a [4096 x i32] addrspace(5) array, a
; volatile store of 13 through an undef index, then a volatile store of 15 to
; element 4000 and a volatile load of that element.
; In the expected code the addresses are formed relative to s32 (s32 + 4 is
; added to the 0x3000 / 0x3800 constant) and the function returns with
; s_setpc_b64 s[30:31] instead of ending the program.
; The assertions below are autogenerated; regenerate them instead of hand-editing.
3044 define void @store_load_large_imm_offset_foo() {
3045 ; GFX9-LABEL: store_load_large_imm_offset_foo:
3046 ; GFX9: ; %bb.0: ; %bb
3047 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3048 ; GFX9-NEXT: v_mov_b32_e32 v0, 13
3049 ; GFX9-NEXT: s_movk_i32 s0, 0x3000
3050 ; GFX9-NEXT: s_add_i32 s1, s32, 4
3051 ; GFX9-NEXT: scratch_store_dword off, v0, s32 offset:4
3052 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3053 ; GFX9-NEXT: s_add_i32 s0, s0, s1
3054 ; GFX9-NEXT: v_mov_b32_e32 v0, 15
3055 ; GFX9-NEXT: scratch_store_dword off, v0, s0 offset:3712
3056 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3057 ; GFX9-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
3058 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3059 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3061 ; GFX10-LABEL: store_load_large_imm_offset_foo:
3062 ; GFX10: ; %bb.0: ; %bb
3063 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3064 ; GFX10-NEXT: v_mov_b32_e32 v0, 13
3065 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
3066 ; GFX10-NEXT: s_movk_i32 s0, 0x3800
3067 ; GFX10-NEXT: s_add_i32 s1, s32, 4
3068 ; GFX10-NEXT: s_add_i32 s0, s0, s1
3069 ; GFX10-NEXT: scratch_store_dword off, v0, s32 offset:4
3070 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3071 ; GFX10-NEXT: scratch_store_dword off, v1, s0 offset:1664
3072 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3073 ; GFX10-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
3074 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3075 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3077 ; GFX11-LABEL: store_load_large_imm_offset_foo:
3078 ; GFX11: ; %bb.0: ; %bb
3079 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3080 ; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3081 ; GFX11-NEXT: s_movk_i32 s0, 0x3000
3082 ; GFX11-NEXT: s_add_i32 s1, s32, 4
3083 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3084 ; GFX11-NEXT: s_add_i32 s0, s0, s1
3085 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
3086 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3087 ; GFX11-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc
3088 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3089 ; GFX11-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc
3090 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3091 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3093 ; GFX9-PAL-LABEL: store_load_large_imm_offset_foo:
3094 ; GFX9-PAL: ; %bb.0: ; %bb
3095 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3096 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 13
3097 ; GFX9-PAL-NEXT: s_movk_i32 s0, 0x3000
3098 ; GFX9-PAL-NEXT: s_add_i32 s1, s32, 4
3099 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
3100 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3101 ; GFX9-PAL-NEXT: s_add_i32 s0, s0, s1
3102 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 15
3103 ; GFX9-PAL-NEXT: scratch_store_dword off, v0, s0 offset:3712
3104 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3105 ; GFX9-PAL-NEXT: scratch_load_dword v0, off, s0 offset:3712 glc
3106 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3107 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3109 ; GFX940-LABEL: store_load_large_imm_offset_foo:
3110 ; GFX940: ; %bb.0: ; %bb
3111 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3112 ; GFX940-NEXT: v_mov_b32_e32 v0, 13
3113 ; GFX940-NEXT: s_movk_i32 s0, 0x3000
3114 ; GFX940-NEXT: s_add_i32 s1, s32, 4
3115 ; GFX940-NEXT: scratch_store_dword off, v0, s32 offset:4 sc0 sc1
3116 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3117 ; GFX940-NEXT: s_add_i32 s0, s0, s1
3118 ; GFX940-NEXT: v_mov_b32_e32 v0, 15
3119 ; GFX940-NEXT: scratch_store_dword off, v0, s0 offset:3712 sc0 sc1
3120 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3121 ; GFX940-NEXT: scratch_load_dword v0, off, s0 offset:3712 sc0 sc1
3122 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3123 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3125 ; GFX10-PAL-LABEL: store_load_large_imm_offset_foo:
3126 ; GFX10-PAL: ; %bb.0: ; %bb
3127 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3128 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 13
3129 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
3130 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x3800
3131 ; GFX10-PAL-NEXT: s_add_i32 s1, s32, 4
3132 ; GFX10-PAL-NEXT: s_add_i32 s0, s0, s1
3133 ; GFX10-PAL-NEXT: scratch_store_dword off, v0, s32 offset:4
3134 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3135 ; GFX10-PAL-NEXT: scratch_store_dword off, v1, s0 offset:1664
3136 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3137 ; GFX10-PAL-NEXT: scratch_load_dword v0, off, s0 offset:1664 glc dlc
3138 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3139 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3141 ; GFX11-PAL-LABEL: store_load_large_imm_offset_foo:
3142 ; GFX11-PAL: ; %bb.0: ; %bb
3143 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3144 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3145 ; GFX11-PAL-NEXT: s_movk_i32 s0, 0x3000
3146 ; GFX11-PAL-NEXT: s_add_i32 s1, s32, 4
3147 ; GFX11-PAL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3148 ; GFX11-PAL-NEXT: s_add_i32 s0, s0, s1
3149 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc
3150 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3151 ; GFX11-PAL-NEXT: scratch_store_b32 off, v1, s0 offset:3712 dlc
3152 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3153 ; GFX11-PAL-NEXT: scratch_load_b32 v0, off, s0 offset:3712 glc dlc
3154 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3155 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
3157 %i = alloca [4096 x i32], align 4, addrspace(5)
3158 %i1 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 undef
3159 store volatile i32 13, ptr addrspace(5) %i1, align 4
3160 %i7 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3161 store volatile i32 15, ptr addrspace(5) %i7, align 4
3162 %i10 = getelementptr inbounds [4096 x i32], ptr addrspace(5) %i, i32 0, i32 4000
3163 %i12 = load volatile i32, ptr addrspace(5) %i10, align 4
; Kernel test for a scratch address built from three parts: the i32 kernel
; argument %sidx, the workitem id from llvm.amdgcn.workitem.id.x, and the
; constant 256. Their sum indexes a [32 x i32] addrspace(5) array; the kernel
; does a volatile store of 15 to that element and a volatile load of it.
; In the expected code the constant 256 shows up scaled by the i32 element
; size as 0x400, added into the VGPR address (v_add_u32 or v_add3_u32), and
; both memory operations use that VGPR with no SGPR or immediate offset.
; The assertions below are autogenerated; regenerate them instead of hand-editing.
3167 define amdgpu_kernel void @store_load_vidx_sidx_offset(i32 %sidx) {
3168 ; GFX9-LABEL: store_load_vidx_sidx_offset:
3169 ; GFX9: ; %bb.0: ; %bb
3170 ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x24
3171 ; GFX9-NEXT: v_mov_b32_e32 v1, 4
3172 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s2, s5
3173 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
3174 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
3175 ; GFX9-NEXT: v_add_u32_e32 v0, s0, v0
3176 ; GFX9-NEXT: v_lshl_add_u32 v0, v0, 2, v1
3177 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0
3178 ; GFX9-NEXT: v_mov_b32_e32 v1, 15
3179 ; GFX9-NEXT: scratch_store_dword v0, v1, off
3180 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3181 ; GFX9-NEXT: scratch_load_dword v0, v0, off glc
3182 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3183 ; GFX9-NEXT: s_endpgm
3185 ; GFX10-LABEL: store_load_vidx_sidx_offset:
3186 ; GFX10: ; %bb.0: ; %bb
3187 ; GFX10-NEXT: s_add_u32 s2, s2, s5
3188 ; GFX10-NEXT: s_addc_u32 s3, s3, 0
3189 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3190 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3191 ; GFX10-NEXT: s_load_dword s0, s[0:1], 0x24
3192 ; GFX10-NEXT: v_mov_b32_e32 v1, 4
3193 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)
3194 ; GFX10-NEXT: v_add_lshl_u32 v0, s0, v0, 2
3195 ; GFX10-NEXT: v_add3_u32 v0, v1, v0, 0x400
3196 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
3197 ; GFX10-NEXT: scratch_store_dword v0, v1, off
3198 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3199 ; GFX10-NEXT: scratch_load_dword v0, v0, off glc dlc
3200 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3201 ; GFX10-NEXT: s_endpgm
3203 ; GFX11-LABEL: store_load_vidx_sidx_offset:
3204 ; GFX11: ; %bb.0: ; %bb
3205 ; GFX11-NEXT: s_load_b32 s0, s[0:1], 0x24
3206 ; GFX11-NEXT: v_mov_b32_e32 v1, 4
3207 ; GFX11-NEXT: s_waitcnt lgkmcnt(0)
3208 ; GFX11-NEXT: v_add_lshl_u32 v0, s0, v0, 2
3209 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3210 ; GFX11-NEXT: v_add3_u32 v0, v1, v0, 0x400
3211 ; GFX11-NEXT: v_mov_b32_e32 v1, 15
3212 ; GFX11-NEXT: scratch_store_b32 v0, v1, off dlc
3213 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3214 ; GFX11-NEXT: scratch_load_b32 v0, v0, off glc dlc
3215 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3216 ; GFX11-NEXT: s_endpgm
3218 ; GFX9-PAL-LABEL: store_load_vidx_sidx_offset:
3219 ; GFX9-PAL: ; %bb.0: ; %bb
3220 ; GFX9-PAL-NEXT: s_getpc_b64 s[4:5]
3221 ; GFX9-PAL-NEXT: s_mov_b32 s4, s0
3222 ; GFX9-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
3223 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 4
3224 ; GFX9-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
3225 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
3226 ; GFX9-PAL-NEXT: s_and_b32 s5, s5, 0xffff
3227 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s4, s3
3228 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, s0, v0
3229 ; GFX9-PAL-NEXT: v_lshl_add_u32 v0, v0, 2, v1
3230 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s5, 0
3231 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0x400, v0
3232 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
3233 ; GFX9-PAL-NEXT: scratch_store_dword v0, v1, off
3234 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3235 ; GFX9-PAL-NEXT: scratch_load_dword v0, v0, off glc
3236 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3237 ; GFX9-PAL-NEXT: s_endpgm
3239 ; GFX940-LABEL: store_load_vidx_sidx_offset:
3240 ; GFX940: ; %bb.0: ; %bb
3241 ; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
3242 ; GFX940-NEXT: v_mov_b32_e32 v1, 4
3243 ; GFX940-NEXT: s_waitcnt lgkmcnt(0)
3244 ; GFX940-NEXT: v_add_u32_e32 v0, s0, v0
3245 ; GFX940-NEXT: v_lshl_add_u32 v0, v0, 2, v1
3246 ; GFX940-NEXT: v_add_u32_e32 v0, 0x400, v0
3247 ; GFX940-NEXT: v_mov_b32_e32 v1, 15
3248 ; GFX940-NEXT: scratch_store_dword v0, v1, off sc0 sc1
3249 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3250 ; GFX940-NEXT: scratch_load_dword v0, v0, off sc0 sc1
3251 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3252 ; GFX940-NEXT: s_endpgm
3254 ; GFX10-PAL-LABEL: store_load_vidx_sidx_offset:
3255 ; GFX10-PAL: ; %bb.0: ; %bb
3256 ; GFX10-PAL-NEXT: s_getpc_b64 s[4:5]
3257 ; GFX10-PAL-NEXT: s_mov_b32 s4, s0
3258 ; GFX10-PAL-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
3259 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
3260 ; GFX10-PAL-NEXT: s_and_b32 s5, s5, 0xffff
3261 ; GFX10-PAL-NEXT: s_add_u32 s4, s4, s3
3262 ; GFX10-PAL-NEXT: s_addc_u32 s5, s5, 0
3263 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s4
3264 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s5
3265 ; GFX10-PAL-NEXT: s_load_dword s0, s[0:1], 0x0
3266 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 4
3267 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
3268 ; GFX10-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
3269 ; GFX10-PAL-NEXT: v_add3_u32 v0, v1, v0, 0x400
3270 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
3271 ; GFX10-PAL-NEXT: scratch_store_dword v0, v1, off
3272 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3273 ; GFX10-PAL-NEXT: scratch_load_dword v0, v0, off glc dlc
3274 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3275 ; GFX10-PAL-NEXT: s_endpgm
3277 ; GFX11-PAL-LABEL: store_load_vidx_sidx_offset:
3278 ; GFX11-PAL: ; %bb.0: ; %bb
3279 ; GFX11-PAL-NEXT: s_load_b32 s0, s[0:1], 0x0
3280 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 4
3281 ; GFX11-PAL-NEXT: s_waitcnt lgkmcnt(0)
3282 ; GFX11-PAL-NEXT: v_add_lshl_u32 v0, s0, v0, 2
3283 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
3284 ; GFX11-PAL-NEXT: v_add3_u32 v0, v1, v0, 0x400
3285 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15
3286 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off dlc
3287 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3288 ; GFX11-PAL-NEXT: scratch_load_b32 v0, v0, off glc dlc
3289 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3290 ; GFX11-PAL-NEXT: s_endpgm
3292 %alloca = alloca [32 x i32], align 4, addrspace(5)
3293 %vidx = tail call i32 @llvm.amdgcn.workitem.id.x()
3294 %add1 = add nsw i32 %sidx, %vidx
3295 %add2 = add nsw i32 %add1, 256
3296 %gep = getelementptr inbounds [32 x i32], ptr addrspace(5) %alloca, i32 0, i32 %add2
3297 store volatile i32 15, ptr addrspace(5) %gep, align 4
3298 %load = load volatile i32, ptr addrspace(5) %gep, align 4
; Function test for a naturally aligned 64-bit scratch access: a volatile i64
; store of 15 and a volatile i64 load, both through the addrspace(5) pointer
; argument %arg with align 8.
; Every run expects one 64-bit store (scratch_store_dwordx2 or
; scratch_store_b64) and one 64-bit load addressed by v0, i.e. the access is
; not split into narrower pieces.
; The assertions below are autogenerated; regenerate them instead of hand-editing.
3302 define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
3303 ; GFX9-LABEL: store_load_i64_aligned:
3304 ; GFX9: ; %bb.0: ; %bb
3305 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3306 ; GFX9-NEXT: v_mov_b32_e32 v1, 15
3307 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3308 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3309 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3310 ; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
3311 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3312 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3314 ; GFX10-LABEL: store_load_i64_aligned:
3315 ; GFX10: ; %bb.0: ; %bb
3316 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3317 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
3318 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3319 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3320 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3321 ; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
3322 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3323 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3325 ; GFX11-LABEL: store_load_i64_aligned:
3326 ; GFX11: ; %bb.0: ; %bb
3327 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3328 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
3329 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc
3330 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3331 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
3332 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3333 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3335 ; GFX9-PAL-LABEL: store_load_i64_aligned:
3336 ; GFX9-PAL: ; %bb.0: ; %bb
3337 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3338 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
3339 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
3340 ; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3341 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3342 ; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
3343 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3344 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3346 ; GFX940-LABEL: store_load_i64_aligned:
3347 ; GFX940: ; %bb.0: ; %bb
3348 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3349 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
3350 ; GFX940-NEXT: v_mov_b32_e32 v3, 0
3351 ; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
3352 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3353 ; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
3354 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3355 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3357 ; GFX10-PAL-LABEL: store_load_i64_aligned:
3358 ; GFX10-PAL: ; %bb.0: ; %bb
3359 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3360 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
3361 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
3362 ; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3363 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3364 ; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
3365 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3366 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3368 ; GFX11-PAL-LABEL: store_load_i64_aligned:
3369 ; GFX11-PAL: ; %bb.0: ; %bb
3370 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3371 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
3372 ; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc
3373 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3374 ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
3375 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3376 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
3378 store volatile i64 15, ptr addrspace(5) %arg, align 8
3379 %load = load volatile i64, ptr addrspace(5) %arg, align 8
; Function test for an under-aligned 64-bit scratch access: a volatile i64
; store of 15 and a volatile i64 load through the addrspace(5) pointer
; argument %arg, both declared with align 1.
; Despite the align 1, every run still expects a single 64-bit store and a
; single 64-bit load (scratch_*_dwordx2 or scratch_*_b64), the same
; instruction sequence as the align 8 expectations in @store_load_i64_aligned.
; The assertions below are autogenerated; regenerate them instead of hand-editing.
3383 define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
3384 ; GFX9-LABEL: store_load_i64_unaligned:
3385 ; GFX9: ; %bb.0: ; %bb
3386 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3387 ; GFX9-NEXT: v_mov_b32_e32 v1, 15
3388 ; GFX9-NEXT: v_mov_b32_e32 v2, 0
3389 ; GFX9-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3390 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3391 ; GFX9-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
3392 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3393 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3395 ; GFX10-LABEL: store_load_i64_unaligned:
3396 ; GFX10: ; %bb.0: ; %bb
3397 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3398 ; GFX10-NEXT: v_mov_b32_e32 v1, 15
3399 ; GFX10-NEXT: v_mov_b32_e32 v2, 0
3400 ; GFX10-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3401 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3402 ; GFX10-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
3403 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3404 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3406 ; GFX11-LABEL: store_load_i64_unaligned:
3407 ; GFX11: ; %bb.0: ; %bb
3408 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3409 ; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
3410 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc
3411 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3412 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
3413 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3414 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3416 ; GFX9-PAL-LABEL: store_load_i64_unaligned:
3417 ; GFX9-PAL: ; %bb.0: ; %bb
3418 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3419 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 15
3420 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 0
3421 ; GFX9-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3422 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3423 ; GFX9-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc
3424 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3425 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3427 ; GFX940-LABEL: store_load_i64_unaligned:
3428 ; GFX940: ; %bb.0: ; %bb
3429 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3430 ; GFX940-NEXT: v_mov_b32_e32 v2, 15
3431 ; GFX940-NEXT: v_mov_b32_e32 v3, 0
3432 ; GFX940-NEXT: scratch_store_dwordx2 v0, v[2:3], off sc0 sc1
3433 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3434 ; GFX940-NEXT: scratch_load_dwordx2 v[0:1], v0, off sc0 sc1
3435 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3436 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3438 ; GFX10-PAL-LABEL: store_load_i64_unaligned:
3439 ; GFX10-PAL: ; %bb.0: ; %bb
3440 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3441 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 15
3442 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 0
3443 ; GFX10-PAL-NEXT: scratch_store_dwordx2 v0, v[1:2], off
3444 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3445 ; GFX10-PAL-NEXT: scratch_load_dwordx2 v[0:1], v0, off glc dlc
3446 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3447 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3449 ; GFX11-PAL-LABEL: store_load_i64_unaligned:
3450 ; GFX11-PAL: ; %bb.0: ; %bb
3451 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3452 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
3453 ; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc
3454 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3455 ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc
3456 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3457 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
3459 store volatile i64 15, ptr addrspace(5) %arg, align 1
3460 %load = load volatile i64, ptr addrspace(5) %arg, align 1
; Function test for an under-aligned 96-bit scratch access: a volatile store
; of the vector <i32 1, i32 2, i32 3> and a volatile <3 x i32> load through
; the addrspace(5) pointer argument %arg, both declared with align 1.
; Every run expects one three-dword store and one three-dword load
; (scratch_*_dwordx3 or scratch_*_b96) addressed by v0, i.e. the align 1
; access is not broken into smaller operations.
; The assertions below are autogenerated; regenerate them instead of hand-editing.
3464 define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
3465 ; GFX9-LABEL: store_load_v3i32_unaligned:
3466 ; GFX9: ; %bb.0: ; %bb
3467 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3468 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
3469 ; GFX9-NEXT: v_mov_b32_e32 v2, 2
3470 ; GFX9-NEXT: v_mov_b32_e32 v3, 3
3471 ; GFX9-NEXT: scratch_store_dwordx3 v0, v[1:3], off
3472 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3473 ; GFX9-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
3474 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3475 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3477 ; GFX10-LABEL: store_load_v3i32_unaligned:
3478 ; GFX10: ; %bb.0: ; %bb
3479 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3480 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
3481 ; GFX10-NEXT: v_mov_b32_e32 v2, 2
3482 ; GFX10-NEXT: v_mov_b32_e32 v3, 3
3483 ; GFX10-NEXT: scratch_store_dwordx3 v0, v[1:3], off
3484 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3485 ; GFX10-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
3486 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3487 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3489 ; GFX11-LABEL: store_load_v3i32_unaligned:
3490 ; GFX11: ; %bb.0: ; %bb
3491 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3492 ; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
3493 ; GFX11-NEXT: v_mov_b32_e32 v3, 3
3494 ; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc
3495 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3496 ; GFX11-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc
3497 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3498 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3500 ; GFX9-PAL-LABEL: store_load_v3i32_unaligned:
3501 ; GFX9-PAL: ; %bb.0: ; %bb
3502 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3503 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
3504 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2
3505 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3
3506 ; GFX9-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off
3507 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3508 ; GFX9-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc
3509 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3510 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3512 ; GFX940-LABEL: store_load_v3i32_unaligned:
3513 ; GFX940: ; %bb.0: ; %bb
3514 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3515 ; GFX940-NEXT: v_mov_b32_e32 v2, 1
3516 ; GFX940-NEXT: v_mov_b32_e32 v3, 2
3517 ; GFX940-NEXT: v_mov_b32_e32 v4, 3
3518 ; GFX940-NEXT: scratch_store_dwordx3 v0, v[2:4], off sc0 sc1
3519 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3520 ; GFX940-NEXT: scratch_load_dwordx3 v[0:2], v0, off sc0 sc1
3521 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3522 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3524 ; GFX10-PAL-LABEL: store_load_v3i32_unaligned:
3525 ; GFX10-PAL: ; %bb.0: ; %bb
3526 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3527 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
3528 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2
3529 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3
3530 ; GFX10-PAL-NEXT: scratch_store_dwordx3 v0, v[1:3], off
3531 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3532 ; GFX10-PAL-NEXT: scratch_load_dwordx3 v[0:2], v0, off glc dlc
3533 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3534 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3536 ; GFX11-PAL-LABEL: store_load_v3i32_unaligned:
3537 ; GFX11-PAL: ; %bb.0: ; %bb
3538 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3539 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
3540 ; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3
3541 ; GFX11-PAL-NEXT: scratch_store_b96 v0, v[1:3], off dlc
3542 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3543 ; GFX11-PAL-NEXT: scratch_load_b96 v[0:2], v0, off glc dlc
3544 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3545 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
3547 store volatile <3 x i32> <i32 1, i32 2, i32 3>, ptr addrspace(5) %arg, align 1
3548 %load = load volatile <3 x i32>, ptr addrspace(5) %arg, align 1
; Volatile <4 x i32> store to the pointer argument followed by a volatile
; <4 x i32> load from the same pointer, both marked align 1.
; The expectations below show every tested configuration emitting a single
; 128-bit scratch store and a single 128-bit scratch load addressed through v0,
; i.e. the byte-aligned vector access is not broken into narrower accesses.
3552 define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
3553 ; GFX9-LABEL: store_load_v4i32_unaligned:
3554 ; GFX9: ; %bb.0: ; %bb
3555 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3556 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
3557 ; GFX9-NEXT: v_mov_b32_e32 v2, 2
3558 ; GFX9-NEXT: v_mov_b32_e32 v3, 3
3559 ; GFX9-NEXT: v_mov_b32_e32 v4, 4
3560 ; GFX9-NEXT: scratch_store_dwordx4 v0, v[1:4], off
3561 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3562 ; GFX9-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc
3563 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3564 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3566 ; GFX10-LABEL: store_load_v4i32_unaligned:
3567 ; GFX10: ; %bb.0: ; %bb
3568 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3569 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
3570 ; GFX10-NEXT: v_mov_b32_e32 v2, 2
3571 ; GFX10-NEXT: v_mov_b32_e32 v3, 3
3572 ; GFX10-NEXT: v_mov_b32_e32 v4, 4
3573 ; GFX10-NEXT: scratch_store_dwordx4 v0, v[1:4], off
3574 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3575 ; GFX10-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc
3576 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3577 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3579 ; GFX11-LABEL: store_load_v4i32_unaligned:
3580 ; GFX11: ; %bb.0: ; %bb
3581 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3582 ; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
3583 ; GFX11-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
3584 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc
3585 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3586 ; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc
3587 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3588 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3590 ; GFX9-PAL-LABEL: store_load_v4i32_unaligned:
3591 ; GFX9-PAL: ; %bb.0: ; %bb
3592 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3593 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
3594 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, 2
3595 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, 3
3596 ; GFX9-PAL-NEXT: v_mov_b32_e32 v4, 4
3597 ; GFX9-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off
3598 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3599 ; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc
3600 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3601 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3603 ; GFX940-LABEL: store_load_v4i32_unaligned:
3604 ; GFX940: ; %bb.0: ; %bb
3605 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3606 ; GFX940-NEXT: v_mov_b32_e32 v2, 1
3607 ; GFX940-NEXT: v_mov_b32_e32 v3, 2
3608 ; GFX940-NEXT: v_mov_b32_e32 v4, 3
3609 ; GFX940-NEXT: v_mov_b32_e32 v5, 4
3610 ; GFX940-NEXT: scratch_store_dwordx4 v0, v[2:5], off sc0 sc1
3611 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3612 ; GFX940-NEXT: scratch_load_dwordx4 v[0:3], v0, off sc0 sc1
3613 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3614 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3616 ; GFX10-PAL-LABEL: store_load_v4i32_unaligned:
3617 ; GFX10-PAL: ; %bb.0: ; %bb
3618 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3619 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
3620 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, 2
3621 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, 3
3622 ; GFX10-PAL-NEXT: v_mov_b32_e32 v4, 4
3623 ; GFX10-PAL-NEXT: scratch_store_dwordx4 v0, v[1:4], off
3624 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3625 ; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], v0, off glc dlc
3626 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3627 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3629 ; GFX11-PAL-LABEL: store_load_v4i32_unaligned:
3630 ; GFX11-PAL: ; %bb.0: ; %bb
3631 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3632 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
3633 ; GFX11-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
3634 ; GFX11-PAL-NEXT: scratch_store_b128 v0, v[1:4], off dlc
3635 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3636 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc
3637 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3638 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: both accesses are volatile and only byte-aligned (align 1).
3640 store volatile <4 x i32> <i32 1, i32 2, i32 3, i32 4>, ptr addrspace(5) %arg, align 1
3641 %load = load volatile <4 x i32>, ptr addrspace(5) %arg, align 1
; Volatile one-byte store and load at the pointer argument minus one byte.
; Despite the "i32" in the function name, the visible IR accesses an i8.
; The expectations below show the -1 displacement being applied to v0 with a
; separate vector add on every tested configuration; the scratch byte
; store/load that follow carry no offset field of their own.
3645 define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg) {
3646 ; GFX9-LABEL: store_load_i32_negative_unaligned:
3647 ; GFX9: ; %bb.0: ; %bb
3648 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3649 ; GFX9-NEXT: v_add_u32_e32 v0, -1, v0
3650 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
3651 ; GFX9-NEXT: scratch_store_byte v0, v1, off
3652 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3653 ; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc
3654 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3655 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3657 ; GFX10-LABEL: store_load_i32_negative_unaligned:
3658 ; GFX10: ; %bb.0: ; %bb
3659 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3660 ; GFX10-NEXT: v_add_nc_u32_e32 v0, -1, v0
3661 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
3662 ; GFX10-NEXT: scratch_store_byte v0, v1, off
3663 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3664 ; GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc
3665 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3666 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3668 ; GFX11-LABEL: store_load_i32_negative_unaligned:
3669 ; GFX11: ; %bb.0: ; %bb
3670 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3671 ; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
3672 ; GFX11-NEXT: scratch_store_b8 v0, v1, off dlc
3673 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3674 ; GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc
3675 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3676 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3678 ; GFX9-PAL-LABEL: store_load_i32_negative_unaligned:
3679 ; GFX9-PAL: ; %bb.0: ; %bb
3680 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3681 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, -1, v0
3682 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
3683 ; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off
3684 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3685 ; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc
3686 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3687 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3689 ; GFX940-LABEL: store_load_i32_negative_unaligned:
3690 ; GFX940: ; %bb.0: ; %bb
3691 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3692 ; GFX940-NEXT: v_add_u32_e32 v0, -1, v0
3693 ; GFX940-NEXT: v_mov_b32_e32 v1, 1
3694 ; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1
3695 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3696 ; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1
3697 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3698 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3700 ; GFX10-PAL-LABEL: store_load_i32_negative_unaligned:
3701 ; GFX10-PAL: ; %bb.0: ; %bb
3702 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3703 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, -1, v0
3704 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
3705 ; GFX10-PAL-NEXT: scratch_store_byte v0, v1, off
3706 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3707 ; GFX10-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc
3708 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3709 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3711 ; GFX11-PAL-LABEL: store_load_i32_negative_unaligned:
3712 ; GFX11-PAL: ; %bb.0: ; %bb
3713 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3714 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, -1, v0
3715 ; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off dlc
3716 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3717 ; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off glc dlc
3718 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3719 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: address is %arg - 1 byte; the store and load are volatile i8.
3721 %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -1
3722 store volatile i8 1, ptr addrspace(5) %ptr, align 1
3723 %load = load volatile i8, ptr addrspace(5) %ptr, align 1
; Same shape as the small-negative-offset test, but with a displacement of
; -4225 bytes from the pointer argument. Despite the "i32" in the function
; name, the visible IR accesses an i8.
; The expectations below show the displacement as the 32-bit literal
; 0xffffef7f (the two's-complement encoding of -4225) added to v0 by a separate
; vector add, with the scratch byte store/load carrying no offset field.
3727 define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture %arg) {
3728 ; GFX9-LABEL: store_load_i32_large_negative_unaligned:
3729 ; GFX9: ; %bb.0: ; %bb
3730 ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3731 ; GFX9-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0
3732 ; GFX9-NEXT: v_mov_b32_e32 v1, 1
3733 ; GFX9-NEXT: scratch_store_byte v0, v1, off
3734 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3735 ; GFX9-NEXT: scratch_load_ubyte v0, v0, off glc
3736 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3737 ; GFX9-NEXT: s_setpc_b64 s[30:31]
3739 ; GFX10-LABEL: store_load_i32_large_negative_unaligned:
3740 ; GFX10: ; %bb.0: ; %bb
3741 ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3742 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0xffffef7f, v0
3743 ; GFX10-NEXT: v_mov_b32_e32 v1, 1
3744 ; GFX10-NEXT: scratch_store_byte v0, v1, off
3745 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3746 ; GFX10-NEXT: scratch_load_ubyte v0, v0, off glc dlc
3747 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3748 ; GFX10-NEXT: s_setpc_b64 s[30:31]
3750 ; GFX11-LABEL: store_load_i32_large_negative_unaligned:
3751 ; GFX11: ; %bb.0: ; %bb
3752 ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3753 ; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
3754 ; GFX11-NEXT: scratch_store_b8 v0, v1, off dlc
3755 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3756 ; GFX11-NEXT: scratch_load_u8 v0, v0, off glc dlc
3757 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3758 ; GFX11-NEXT: s_setpc_b64 s[30:31]
3760 ; GFX9-PAL-LABEL: store_load_i32_large_negative_unaligned:
3761 ; GFX9-PAL: ; %bb.0: ; %bb
3762 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3763 ; GFX9-PAL-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0
3764 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, 1
3765 ; GFX9-PAL-NEXT: scratch_store_byte v0, v1, off
3766 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3767 ; GFX9-PAL-NEXT: scratch_load_ubyte v0, v0, off glc
3768 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3769 ; GFX9-PAL-NEXT: s_setpc_b64 s[30:31]
3771 ; GFX940-LABEL: store_load_i32_large_negative_unaligned:
3772 ; GFX940: ; %bb.0: ; %bb
3773 ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3774 ; GFX940-NEXT: v_add_u32_e32 v0, 0xffffef7f, v0
3775 ; GFX940-NEXT: v_mov_b32_e32 v1, 1
3776 ; GFX940-NEXT: scratch_store_byte v0, v1, off sc0 sc1
3777 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3778 ; GFX940-NEXT: scratch_load_ubyte v0, v0, off sc0 sc1
3779 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3780 ; GFX940-NEXT: s_setpc_b64 s[30:31]
3782 ; GFX10-PAL-LABEL: store_load_i32_large_negative_unaligned:
3783 ; GFX10-PAL: ; %bb.0: ; %bb
3784 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3785 ; GFX10-PAL-NEXT: v_add_nc_u32_e32 v0, 0xffffef7f, v0
3786 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 1
3787 ; GFX10-PAL-NEXT: scratch_store_byte v0, v1, off
3788 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3789 ; GFX10-PAL-NEXT: scratch_load_ubyte v0, v0, off glc dlc
3790 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3791 ; GFX10-PAL-NEXT: s_setpc_b64 s[30:31]
3793 ; GFX11-PAL-LABEL: store_load_i32_large_negative_unaligned:
3794 ; GFX11-PAL: ; %bb.0: ; %bb
3795 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3796 ; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xffffef7f, v0
3797 ; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off dlc
3798 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3799 ; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off glc dlc
3800 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3801 ; GFX11-PAL-NEXT: s_setpc_b64 s[30:31]
; IR under test: address is %arg - 4225 bytes; the store and load are volatile i8.
3803 %ptr = getelementptr inbounds i8, ptr addrspace(5) %arg, i32 -4225
3804 store volatile i8 1, ptr addrspace(5) %ptr, align 1
3805 %load = load volatile i8, ptr addrspace(5) %ptr, align 1
; Pixel-shader entry point with two 2048-byte allocas ([128 x <4 x i32>] each).
; It performs a volatile <4 x i32> zero store and reload at element 60 of the
; second alloca, then passes the address of each alloca to an inline-asm "use".
; In the expectations below the two alloca addresses appear as 16 and 0x810,
; and the accessed element sits at 0x810 + 60 * 16 = 0x810 + 0x3c0 = 3024:
;  - some configurations encode that as an immediate "offset:3024" on the
;    scratch instruction;
;  - the GFX10 configurations instead build the same value in s0 with
;    s_movk_i32 0x810 followed by s_addk_i32 0x3c0 and use no offset field.
3809 define amdgpu_ps void @large_offset() {
3810 ; GFX9-LABEL: large_offset:
3811 ; GFX9: ; %bb.0: ; %bb
3812 ; GFX9-NEXT: s_add_u32 flat_scratch_lo, s0, s2
3813 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
3814 ; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s1, 0
3815 ; GFX9-NEXT: v_mov_b32_e32 v1, v0
3816 ; GFX9-NEXT: v_mov_b32_e32 v2, v0
3817 ; GFX9-NEXT: v_mov_b32_e32 v3, v0
3818 ; GFX9-NEXT: s_mov_b32 s0, 0
3819 ; GFX9-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:3024
3820 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3821 ; GFX9-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc
3822 ; GFX9-NEXT: s_waitcnt vmcnt(0)
3823 ; GFX9-NEXT: v_mov_b32_e32 v0, 16
3824 ; GFX9-NEXT: ;;#ASMSTART
3825 ; GFX9-NEXT: ; use v0
3826 ; GFX9-NEXT: ;;#ASMEND
3827 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x810
3828 ; GFX9-NEXT: ;;#ASMSTART
3829 ; GFX9-NEXT: ; use v0
3830 ; GFX9-NEXT: ;;#ASMEND
3831 ; GFX9-NEXT: s_endpgm
3833 ; GFX10-LABEL: large_offset:
3834 ; GFX10: ; %bb.0: ; %bb
3835 ; GFX10-NEXT: s_add_u32 s0, s0, s2
3836 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
3837 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
3838 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
3839 ; GFX10-NEXT: v_mov_b32_e32 v0, 0
3840 ; GFX10-NEXT: s_movk_i32 s0, 0x810
3841 ; GFX10-NEXT: s_addk_i32 s0, 0x3c0
3842 ; GFX10-NEXT: v_mov_b32_e32 v1, v0
3843 ; GFX10-NEXT: v_mov_b32_e32 v2, v0
3844 ; GFX10-NEXT: v_mov_b32_e32 v3, v0
3845 ; GFX10-NEXT: scratch_store_dwordx4 off, v[0:3], s0
3846 ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
3847 ; GFX10-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc
3848 ; GFX10-NEXT: s_waitcnt vmcnt(0)
3849 ; GFX10-NEXT: v_mov_b32_e32 v0, 16
3850 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x810
3851 ; GFX10-NEXT: ;;#ASMSTART
3852 ; GFX10-NEXT: ; use v0
3853 ; GFX10-NEXT: ;;#ASMEND
3854 ; GFX10-NEXT: ;;#ASMSTART
3855 ; GFX10-NEXT: ; use v1
3856 ; GFX10-NEXT: ;;#ASMEND
3857 ; GFX10-NEXT: s_endpgm
3859 ; GFX11-LABEL: large_offset:
3860 ; GFX11: ; %bb.0: ; %bb
3861 ; GFX11-NEXT: v_mov_b32_e32 v0, 0
3862 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
3863 ; GFX11-NEXT: v_mov_b32_e32 v1, v0
3864 ; GFX11-NEXT: v_mov_b32_e32 v2, v0
3865 ; GFX11-NEXT: v_mov_b32_e32 v3, v0
3866 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc
3867 ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
3868 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
3869 ; GFX11-NEXT: s_waitcnt vmcnt(0)
3870 ; GFX11-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
3871 ; GFX11-NEXT: ;;#ASMSTART
3872 ; GFX11-NEXT: ; use v0
3873 ; GFX11-NEXT: ;;#ASMEND
3874 ; GFX11-NEXT: ;;#ASMSTART
3875 ; GFX11-NEXT: ; use v1
3876 ; GFX11-NEXT: ;;#ASMEND
3877 ; GFX11-NEXT: s_endpgm
3879 ; GFX9-PAL-LABEL: large_offset:
3880 ; GFX9-PAL: ; %bb.0: ; %bb
3881 ; GFX9-PAL-NEXT: s_getpc_b64 s[2:3]
3882 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0
3883 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
3884 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0
3885 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, v0
3886 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, v0
3887 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, v0
3888 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0)
3889 ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff
3890 ; GFX9-PAL-NEXT: s_add_u32 flat_scratch_lo, s2, s0
3891 ; GFX9-PAL-NEXT: s_addc_u32 flat_scratch_hi, s3, 0
3892 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0
3893 ; GFX9-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0 offset:3024
3894 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3895 ; GFX9-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 offset:3024 glc
3896 ; GFX9-PAL-NEXT: s_waitcnt vmcnt(0)
3897 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 16
3898 ; GFX9-PAL-NEXT: ;;#ASMSTART
3899 ; GFX9-PAL-NEXT: ; use v0
3900 ; GFX9-PAL-NEXT: ;;#ASMEND
3901 ; GFX9-PAL-NEXT: v_mov_b32_e32 v0, 0x810
3902 ; GFX9-PAL-NEXT: ;;#ASMSTART
3903 ; GFX9-PAL-NEXT: ; use v0
3904 ; GFX9-PAL-NEXT: ;;#ASMEND
3905 ; GFX9-PAL-NEXT: s_endpgm
3907 ; GFX940-LABEL: large_offset:
3908 ; GFX940: ; %bb.0: ; %bb
3909 ; GFX940-NEXT: v_mov_b32_e32 v0, 0
3910 ; GFX940-NEXT: v_mov_b32_e32 v1, v0
3911 ; GFX940-NEXT: v_mov_b32_e32 v2, v0
3912 ; GFX940-NEXT: v_mov_b32_e32 v3, v0
3913 ; GFX940-NEXT: scratch_store_dwordx4 off, v[0:3], off offset:3024 sc0 sc1
3914 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3915 ; GFX940-NEXT: scratch_load_dwordx4 v[0:3], off, off offset:3024 sc0 sc1
3916 ; GFX940-NEXT: s_waitcnt vmcnt(0)
3917 ; GFX940-NEXT: v_mov_b32_e32 v0, 16
3918 ; GFX940-NEXT: ;;#ASMSTART
3919 ; GFX940-NEXT: ; use v0
3920 ; GFX940-NEXT: ;;#ASMEND
3921 ; GFX940-NEXT: v_mov_b32_e32 v0, 0x810
3922 ; GFX940-NEXT: ;;#ASMSTART
3923 ; GFX940-NEXT: ; use v0
3924 ; GFX940-NEXT: ;;#ASMEND
3925 ; GFX940-NEXT: s_endpgm
3927 ; GFX10-PAL-LABEL: large_offset:
3928 ; GFX10-PAL: ; %bb.0: ; %bb
3929 ; GFX10-PAL-NEXT: s_getpc_b64 s[2:3]
3930 ; GFX10-PAL-NEXT: s_mov_b32 s2, s0
3931 ; GFX10-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
3932 ; GFX10-PAL-NEXT: s_waitcnt lgkmcnt(0)
3933 ; GFX10-PAL-NEXT: s_and_b32 s3, s3, 0xffff
3934 ; GFX10-PAL-NEXT: s_add_u32 s2, s2, s0
3935 ; GFX10-PAL-NEXT: s_addc_u32 s3, s3, 0
3936 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
3937 ; GFX10-PAL-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
3938 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 0
3939 ; GFX10-PAL-NEXT: s_movk_i32 s0, 0x810
3940 ; GFX10-PAL-NEXT: s_addk_i32 s0, 0x3c0
3941 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, v0
3942 ; GFX10-PAL-NEXT: v_mov_b32_e32 v2, v0
3943 ; GFX10-PAL-NEXT: v_mov_b32_e32 v3, v0
3944 ; GFX10-PAL-NEXT: scratch_store_dwordx4 off, v[0:3], s0
3945 ; GFX10-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3946 ; GFX10-PAL-NEXT: scratch_load_dwordx4 v[0:3], off, s0 glc dlc
3947 ; GFX10-PAL-NEXT: s_waitcnt vmcnt(0)
3948 ; GFX10-PAL-NEXT: v_mov_b32_e32 v0, 16
3949 ; GFX10-PAL-NEXT: v_mov_b32_e32 v1, 0x810
3950 ; GFX10-PAL-NEXT: ;;#ASMSTART
3951 ; GFX10-PAL-NEXT: ; use v0
3952 ; GFX10-PAL-NEXT: ;;#ASMEND
3953 ; GFX10-PAL-NEXT: ;;#ASMSTART
3954 ; GFX10-PAL-NEXT: ; use v1
3955 ; GFX10-PAL-NEXT: ;;#ASMEND
3956 ; GFX10-PAL-NEXT: s_endpgm
3958 ; GFX11-PAL-LABEL: large_offset:
3959 ; GFX11-PAL: ; %bb.0: ; %bb
3960 ; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 0
3961 ; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1)
3962 ; GFX11-PAL-NEXT: v_mov_b32_e32 v1, v0
3963 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, v0
3964 ; GFX11-PAL-NEXT: v_mov_b32_e32 v3, v0
3965 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:3024 dlc
3966 ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0
3967 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc
3968 ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0)
3969 ; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810
3970 ; GFX11-PAL-NEXT: ;;#ASMSTART
3971 ; GFX11-PAL-NEXT: ; use v0
3972 ; GFX11-PAL-NEXT: ;;#ASMEND
3973 ; GFX11-PAL-NEXT: ;;#ASMSTART
3974 ; GFX11-PAL-NEXT: ; use v1
3975 ; GFX11-PAL-NEXT: ;;#ASMEND
3976 ; GFX11-PAL-NEXT: s_endpgm
; IR under test. Each alloca is 128 elements of 16 bytes; only %alloca2 is
; accessed through memory, at element index 60.
3978 %alloca = alloca [128 x <4 x i32>], align 16, addrspace(5)
3979 %alloca2 = alloca [128 x <4 x i32>], align 16, addrspace(5)
3980 %gep = getelementptr inbounds [128 x <4 x i32>], ptr addrspace(5) %alloca2, i32 0, i32 60
3981 store volatile <4 x i32> zeroinitializer, ptr addrspace(5) %gep, align 16
3982 %load = load volatile <4 x i32>, ptr addrspace(5) %gep, align 16
; The side-effecting asm statements take the address of each alloca as an operand.
3983 call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca) #0
3984 call void asm sideeffect "; use $0", "s"(ptr addrspace(5) %alloca2) #0
; Intrinsic declarations. Neither is referenced by the functions in this part
; of the file; presumably they are used by tests earlier in the file - confirm
; before removing either one.
3988 declare void @llvm.memset.p5.i64(ptr addrspace(5) nocapture writeonly, i8, i64, i1 immarg)
3989 declare i32 @llvm.amdgcn.workitem.id.x()