1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; Zero-extending i8 load from an addrspace(5) pointer that is passed as a plain
; (non-inreg) argument. The checks expect one scratch_load_ubyte (gfx1010 run) /
; scratch_load_u8 (gfx1100 run) addressed through v0, with no separate extend
; instruction, followed by a dword flat store of the result to %out.
7 define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %out) {
8 ; GFX10-LABEL: test_scratch_load_i8_zext_v:
10 ; GFX10-NEXT: s_add_u32 s0, s0, s2
11 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
12 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
13 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
14 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
15 ; GFX10-NEXT: scratch_load_ubyte v0, v0, off
16 ; GFX10-NEXT: s_waitcnt vmcnt(0)
17 ; GFX10-NEXT: flat_store_dword v[1:2], v0
18 ; GFX10-NEXT: s_endpgm
20 ; GFX11-LABEL: test_scratch_load_i8_zext_v:
22 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
23 ; GFX11-NEXT: scratch_load_u8 v0, v0, off
24 ; GFX11-NEXT: s_waitcnt vmcnt(0)
25 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
26 ; GFX11-NEXT: s_endpgm
; Address is %in + 1 byte; this is the "add 1 to v0" seen in the checks above.
27 %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
28 %load = load i8, ptr addrspace(5) %gep, align 4
29 %ext = zext i8 %load to i32
30 store i32 %ext, ptr %out, align 4
; Sign-extending i8 load from an addrspace(5) pointer that is passed as a plain
; (non-inreg) argument. The checks expect one scratch_load_sbyte (gfx1010 run) /
; scratch_load_i8 (gfx1100 run) addressed through v0, with no separate extend
; instruction, followed by a dword flat store of the result to %out.
34 define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %out) {
35 ; GFX10-LABEL: test_scratch_load_i8_sext_v:
37 ; GFX10-NEXT: s_add_u32 s0, s0, s2
38 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
39 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
40 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
41 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
42 ; GFX10-NEXT: scratch_load_sbyte v0, v0, off
43 ; GFX10-NEXT: s_waitcnt vmcnt(0)
44 ; GFX10-NEXT: flat_store_dword v[1:2], v0
45 ; GFX10-NEXT: s_endpgm
47 ; GFX11-LABEL: test_scratch_load_i8_sext_v:
49 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 1, v0
50 ; GFX11-NEXT: scratch_load_i8 v0, v0, off
51 ; GFX11-NEXT: s_waitcnt vmcnt(0)
52 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
53 ; GFX11-NEXT: s_endpgm
; Address is %in + 1 byte; this is the "add 1 to v0" seen in the checks above.
54 %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
55 %load = load i8, ptr addrspace(5) %gep, align 4
56 %ext = sext i8 %load to i32
57 store i32 %ext, ptr %out, align 4
; Zero-extending i16 load from an addrspace(5) pointer that is passed as a plain
; (non-inreg) argument. The checks expect one scratch_load_ushort (gfx1010 run) /
; scratch_load_u16 (gfx1100 run) addressed through v0, with no separate extend
; instruction, followed by a dword flat store of the result to %out.
61 define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %out) {
62 ; GFX10-LABEL: test_scratch_load_i16_zext_v:
64 ; GFX10-NEXT: s_add_u32 s0, s0, s2
65 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
66 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
67 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
68 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
69 ; GFX10-NEXT: scratch_load_ushort v0, v0, off
70 ; GFX10-NEXT: s_waitcnt vmcnt(0)
71 ; GFX10-NEXT: flat_store_dword v[1:2], v0
72 ; GFX10-NEXT: s_endpgm
74 ; GFX11-LABEL: test_scratch_load_i16_zext_v:
76 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
77 ; GFX11-NEXT: scratch_load_u16 v0, v0, off
78 ; GFX11-NEXT: s_waitcnt vmcnt(0)
79 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
80 ; GFX11-NEXT: s_endpgm
; One i16 element past %in, i.e. +2 bytes ("add 2 to v0" in the checks above).
81 %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
82 %load = load i16, ptr addrspace(5) %gep, align 4
83 %ext = zext i16 %load to i32
84 store i32 %ext, ptr %out, align 4
; Sign-extending i16 load from an addrspace(5) pointer that is passed as a plain
; (non-inreg) argument. The checks expect one scratch_load_sshort (gfx1010 run) /
; scratch_load_i16 (gfx1100 run) addressed through v0, with no separate extend
; instruction, followed by a dword flat store of the result to %out.
88 define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %out) {
89 ; GFX10-LABEL: test_scratch_load_i16_sext_v:
91 ; GFX10-NEXT: s_add_u32 s0, s0, s2
92 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
93 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
94 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
95 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
96 ; GFX10-NEXT: scratch_load_sshort v0, v0, off
97 ; GFX10-NEXT: s_waitcnt vmcnt(0)
98 ; GFX10-NEXT: flat_store_dword v[1:2], v0
99 ; GFX10-NEXT: s_endpgm
101 ; GFX11-LABEL: test_scratch_load_i16_sext_v:
103 ; GFX11-NEXT: v_add_nc_u32_e32 v0, 2, v0
104 ; GFX11-NEXT: scratch_load_i16 v0, v0, off
105 ; GFX11-NEXT: s_waitcnt vmcnt(0)
106 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
107 ; GFX11-NEXT: s_endpgm
; One i16 element past %in, i.e. +2 bytes ("add 2 to v0" in the checks above).
108 %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
109 %load = load i16, ptr addrspace(5) %gep, align 4
110 %ext = sext i16 %load to i32
111 store i32 %ext, ptr %out, align 4
; i8 load from a non-inreg addrspace(5) pointer, zero-extended to i16 and
; inserted into element 0 of <i16 -1, i16 -1>. The checks expect the destination
; VGPR to be pre-loaded with 0xffff0000 and then written by
; scratch_load_ubyte_d16 (gfx1010 run) / scratch_load_d16_u8 (gfx1100 run).
115 define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
116 ; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
117 ; GFX10: ; %bb.0: ; %bb
118 ; GFX10-NEXT: s_add_u32 s0, s0, s2
119 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
120 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
121 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
122 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
123 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
124 ; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
125 ; GFX10-NEXT: s_waitcnt vmcnt(0)
126 ; GFX10-NEXT: flat_store_dword v[1:2], v3
127 ; GFX10-NEXT: s_endpgm
129 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
130 ; GFX11: ; %bb.0: ; %bb
131 ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
132 ; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
133 ; GFX11-NEXT: s_waitcnt vmcnt(0)
134 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
135 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the low (index 0) half of the
; vector; element 1 keeps the constant i16 -1.
137 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
138 %load_lo = load i8, ptr addrspace(5) %gep
139 %ext = zext i8 %load_lo to i16
140 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
141 store <2 x i16> %result, ptr %out, align 4
; i8 load from a non-inreg addrspace(5) pointer, sign-extended to i16 and
; inserted into element 0 of <i16 -1, i16 -1>. The checks expect the destination
; VGPR to be pre-loaded with 0xffff0000 and then written by
; scratch_load_sbyte_d16 (gfx1010 run) / scratch_load_d16_i8 (gfx1100 run).
145 define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
146 ; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
147 ; GFX10: ; %bb.0: ; %bb
148 ; GFX10-NEXT: s_add_u32 s0, s0, s2
149 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
150 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
151 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
152 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
153 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
154 ; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
155 ; GFX10-NEXT: s_waitcnt vmcnt(0)
156 ; GFX10-NEXT: flat_store_dword v[1:2], v3
157 ; GFX10-NEXT: s_endpgm
159 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
160 ; GFX11: ; %bb.0: ; %bb
161 ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
162 ; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
163 ; GFX11-NEXT: s_waitcnt vmcnt(0)
164 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
165 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the low (index 0) half of the
; vector; element 1 keeps the constant i16 -1.
167 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
168 %load_lo = load i8, ptr addrspace(5) %gep
169 %ext = sext i8 %load_lo to i16
170 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
171 store <2 x i16> %result, ptr %out, align 4
; i16 load from a non-inreg addrspace(5) pointer inserted into element 0 of
; <i16 -1, i16 -1>. The checks expect the destination VGPR to be pre-loaded with
; 0xffff0000 and then written by scratch_load_short_d16 (gfx1010 run) /
; scratch_load_d16_b16 (gfx1100 run).
175 define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
176 ; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_v:
177 ; GFX10: ; %bb.0: ; %bb
178 ; GFX10-NEXT: s_add_u32 s0, s0, s2
179 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
180 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
181 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
182 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
183 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
184 ; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
185 ; GFX10-NEXT: s_waitcnt vmcnt(0)
186 ; GFX10-NEXT: flat_store_dword v[1:2], v3
187 ; GFX10-NEXT: s_endpgm
189 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
190 ; GFX11: ; %bb.0: ; %bb
191 ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
192 ; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
193 ; GFX11-NEXT: s_waitcnt vmcnt(0)
194 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
195 ; GFX11-NEXT: s_endpgm
; Load the i16 at %in + 2 bytes and place it in the low (index 0) half of the
; vector; element 1 keeps the constant i16 -1.
197 %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
198 %load_lo = load i16, ptr addrspace(5) %gep
199 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
200 store <2 x i16> %result, ptr %out, align 4
; i8 load from a non-inreg addrspace(5) pointer, zero-extended to i16 and
; inserted into element 1 of <i16 -1, i16 -1>. The checks expect the destination
; VGPR to be pre-loaded with -1 and then written by scratch_load_ubyte_d16_hi
; (gfx1010 run) / scratch_load_d16_hi_u8 (gfx1100 run).
205 define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
206 ; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
207 ; GFX10: ; %bb.0: ; %bb
208 ; GFX10-NEXT: s_add_u32 s0, s0, s2
209 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
210 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
211 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
212 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
213 ; GFX10-NEXT: v_mov_b32_e32 v3, -1
214 ; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
215 ; GFX10-NEXT: s_waitcnt vmcnt(0)
216 ; GFX10-NEXT: flat_store_dword v[1:2], v3
217 ; GFX10-NEXT: s_endpgm
219 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
220 ; GFX11: ; %bb.0: ; %bb
221 ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
222 ; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
223 ; GFX11-NEXT: s_waitcnt vmcnt(0)
224 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
225 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the high (index 1) half of the
; vector; element 0 keeps the constant i16 -1. (The value is named %load_lo
; even though it feeds the high half.)
227 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
228 %load_lo = load i8, ptr addrspace(5) %gep
229 %ext = zext i8 %load_lo to i16
230 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
231 store <2 x i16> %result, ptr %out, align 4
; i8 load from a non-inreg addrspace(5) pointer, sign-extended to i16 and
; inserted into element 1 of <i16 -1, i16 -1>. The checks expect the destination
; VGPR to be pre-loaded with -1 and then written by scratch_load_sbyte_d16_hi
; (gfx1010 run) / scratch_load_d16_hi_i8 (gfx1100 run).
235 define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
236 ; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
237 ; GFX10: ; %bb.0: ; %bb
238 ; GFX10-NEXT: s_add_u32 s0, s0, s2
239 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
240 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
241 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
242 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
243 ; GFX10-NEXT: v_mov_b32_e32 v3, -1
244 ; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
245 ; GFX10-NEXT: s_waitcnt vmcnt(0)
246 ; GFX10-NEXT: flat_store_dword v[1:2], v3
247 ; GFX10-NEXT: s_endpgm
249 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
250 ; GFX11: ; %bb.0: ; %bb
251 ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
252 ; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
253 ; GFX11-NEXT: s_waitcnt vmcnt(0)
254 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
255 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the high (index 1) half of the
; vector; element 0 keeps the constant i16 -1. (The value is named %load_lo
; even though it feeds the high half.)
257 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
258 %load_lo = load i8, ptr addrspace(5) %gep
259 %ext = sext i8 %load_lo to i16
260 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
261 store <2 x i16> %result, ptr %out, align 4
; i16 load from a non-inreg addrspace(5) pointer inserted into element 1 of
; <i16 -1, i16 -1>. The checks expect the destination VGPR to be pre-loaded with
; -1 and then written by scratch_load_short_d16_hi (gfx1010 run) /
; scratch_load_d16_hi_b16 (gfx1100 run).
265 define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
266 ; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_v:
267 ; GFX10: ; %bb.0: ; %bb
268 ; GFX10-NEXT: s_add_u32 s0, s0, s2
269 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
270 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
271 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
272 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
273 ; GFX10-NEXT: v_mov_b32_e32 v3, -1
274 ; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
275 ; GFX10-NEXT: s_waitcnt vmcnt(0)
276 ; GFX10-NEXT: flat_store_dword v[1:2], v3
277 ; GFX10-NEXT: s_endpgm
279 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
280 ; GFX11: ; %bb.0: ; %bb
281 ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
282 ; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
283 ; GFX11-NEXT: s_waitcnt vmcnt(0)
284 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
285 ; GFX11-NEXT: s_endpgm
; Load the i16 at %in + 2 bytes and place it in the high (index 1) half of the
; vector; element 0 keeps the constant i16 -1. (The value is named %load_lo
; even though it feeds the high half.)
287 %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
288 %load_lo = load i16, ptr addrspace(5) %gep
289 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
290 store <2 x i16> %result, ptr %out, align 4
; Stores byte element 2 of a <4 x i8> loaded through a flat pointer to a
; non-inreg addrspace(5) pointer. The checks expect the byte to be written
; directly from the loaded dword by scratch_store_byte_d16_hi (gfx1010 run) /
; scratch_store_d16_hi_b8 (gfx1100 run), with no separate shift or extract.
294 define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
295 ; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_v:
296 ; GFX10: ; %bb.0: ; %bb
297 ; GFX10-NEXT: s_add_u32 s0, s0, s2
298 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
299 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
300 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
301 ; GFX10-NEXT: flat_load_dword v0, v[0:1]
302 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v2
303 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
304 ; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
305 ; GFX10-NEXT: s_endpgm
307 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
308 ; GFX11: ; %bb.0: ; %bb
309 ; GFX11-NEXT: flat_load_b32 v0, v[0:1]
310 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v2
311 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
312 ; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
313 ; GFX11-NEXT: s_endpgm
; The GEP steps one <4 x i8> element past %out, i.e. +4 bytes, matching the
; "add 4" on the address register in the checks above.
315 %load = load <4 x i8>, ptr %in
316 %element = extractelement <4 x i8> %load, i32 2
317 %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
318 store i8 %element, ptr addrspace(5) %gep, align 4
; Stores i16 element 1 of a <2 x i16> loaded through a flat pointer to a
; non-inreg addrspace(5) pointer. The checks expect the half to be written
; directly from the loaded dword by scratch_store_short_d16_hi (gfx1010 run) /
; scratch_store_d16_hi_b16 (gfx1100 run), with no separate shift or extract.
322 define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
323 ; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_v:
324 ; GFX10: ; %bb.0: ; %bb
325 ; GFX10-NEXT: s_add_u32 s0, s0, s2
326 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
327 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
328 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
329 ; GFX10-NEXT: flat_load_dword v0, v[0:1]
330 ; GFX10-NEXT: v_add_nc_u32_e32 v1, 2, v2
331 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
332 ; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
333 ; GFX10-NEXT: s_endpgm
335 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
336 ; GFX11: ; %bb.0: ; %bb
337 ; GFX11-NEXT: flat_load_b32 v0, v[0:1]
338 ; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v2
339 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
340 ; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
341 ; GFX11-NEXT: s_endpgm
; The GEP uses <2 x i8> as its element type, so index 1 is +2 bytes from %out,
; matching the "add 2" on the address register in the checks above.
343 %load = load <2 x i16>, ptr %in
344 %element = extractelement <2 x i16> %load, i32 1
345 %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
346 store i16 %element, ptr addrspace(5) %gep, align 4
; Zero-extending i8 load from an addrspace(5) pointer passed inreg. The checks
; expect the address to stay in an SGPR (s_add_i32, then "off, sN" addressing)
; and a single scratch_load_ubyte (gfx1010 run) / scratch_load_u8 (gfx1100 run),
; followed by a dword flat store of the result to %out.
355 define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
356 ; GFX10-LABEL: test_scratch_load_i8_zext_s:
358 ; GFX10-NEXT: s_add_u32 s0, s0, s3
359 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
360 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
361 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
362 ; GFX10-NEXT: s_add_i32 s2, s2, 1
363 ; GFX10-NEXT: scratch_load_ubyte v2, off, s2
364 ; GFX10-NEXT: s_waitcnt vmcnt(0)
365 ; GFX10-NEXT: flat_store_dword v[0:1], v2
366 ; GFX10-NEXT: s_endpgm
368 ; GFX11-LABEL: test_scratch_load_i8_zext_s:
370 ; GFX11-NEXT: s_add_i32 s0, s0, 1
371 ; GFX11-NEXT: scratch_load_u8 v2, off, s0
372 ; GFX11-NEXT: s_waitcnt vmcnt(0)
373 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
374 ; GFX11-NEXT: s_endpgm
; Address is %in + 1 byte; this is the scalar "add 1" seen in the checks above.
375 %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
376 %load = load i8, ptr addrspace(5) %gep, align 4
377 %ext = zext i8 %load to i32
378 store i32 %ext, ptr %out, align 4
; Sign-extending i8 load from an addrspace(5) pointer passed inreg. The checks
; expect the address to stay in an SGPR (s_add_i32, then "off, sN" addressing)
; and a single scratch_load_sbyte (gfx1010 run) / scratch_load_i8 (gfx1100 run),
; followed by a dword flat store of the result to %out.
382 define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
383 ; GFX10-LABEL: test_scratch_load_i8_sext_s:
385 ; GFX10-NEXT: s_add_u32 s0, s0, s3
386 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
387 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
388 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
389 ; GFX10-NEXT: s_add_i32 s2, s2, 1
390 ; GFX10-NEXT: scratch_load_sbyte v2, off, s2
391 ; GFX10-NEXT: s_waitcnt vmcnt(0)
392 ; GFX10-NEXT: flat_store_dword v[0:1], v2
393 ; GFX10-NEXT: s_endpgm
395 ; GFX11-LABEL: test_scratch_load_i8_sext_s:
397 ; GFX11-NEXT: s_add_i32 s0, s0, 1
398 ; GFX11-NEXT: scratch_load_i8 v2, off, s0
399 ; GFX11-NEXT: s_waitcnt vmcnt(0)
400 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
401 ; GFX11-NEXT: s_endpgm
; Address is %in + 1 byte; this is the scalar "add 1" seen in the checks above.
402 %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
403 %load = load i8, ptr addrspace(5) %gep, align 4
404 %ext = sext i8 %load to i32
405 store i32 %ext, ptr %out, align 4
; Zero-extending i16 load from an addrspace(5) pointer passed inreg. The checks
; expect the address to stay in an SGPR (s_add_i32, then "off, sN" addressing)
; and a single scratch_load_ushort (gfx1010 run) / scratch_load_u16 (gfx1100
; run), followed by a dword flat store of the result to %out.
409 define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
410 ; GFX10-LABEL: test_scratch_load_i16_zext_s:
412 ; GFX10-NEXT: s_add_u32 s0, s0, s3
413 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
414 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
415 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
416 ; GFX10-NEXT: s_add_i32 s2, s2, 2
417 ; GFX10-NEXT: scratch_load_ushort v2, off, s2
418 ; GFX10-NEXT: s_waitcnt vmcnt(0)
419 ; GFX10-NEXT: flat_store_dword v[0:1], v2
420 ; GFX10-NEXT: s_endpgm
422 ; GFX11-LABEL: test_scratch_load_i16_zext_s:
424 ; GFX11-NEXT: s_add_i32 s0, s0, 2
425 ; GFX11-NEXT: scratch_load_u16 v2, off, s0
426 ; GFX11-NEXT: s_waitcnt vmcnt(0)
427 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
428 ; GFX11-NEXT: s_endpgm
; One i16 element past %in, i.e. +2 bytes (the scalar "add 2" in the checks).
429 %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
430 %load = load i16, ptr addrspace(5) %gep, align 4
431 %ext = zext i16 %load to i32
432 store i32 %ext, ptr %out, align 4
; Sign-extending i16 load from an addrspace(5) pointer passed inreg. The checks
; expect the address to stay in an SGPR (s_add_i32, then "off, sN" addressing)
; and a single scratch_load_sshort (gfx1010 run) / scratch_load_i16 (gfx1100
; run), followed by a dword flat store of the result to %out.
436 define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
437 ; GFX10-LABEL: test_scratch_load_i16_sext_s:
439 ; GFX10-NEXT: s_add_u32 s0, s0, s3
440 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
441 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
442 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
443 ; GFX10-NEXT: s_add_i32 s2, s2, 2
444 ; GFX10-NEXT: scratch_load_sshort v2, off, s2
445 ; GFX10-NEXT: s_waitcnt vmcnt(0)
446 ; GFX10-NEXT: flat_store_dword v[0:1], v2
447 ; GFX10-NEXT: s_endpgm
449 ; GFX11-LABEL: test_scratch_load_i16_sext_s:
451 ; GFX11-NEXT: s_add_i32 s0, s0, 2
452 ; GFX11-NEXT: scratch_load_i16 v2, off, s0
453 ; GFX11-NEXT: s_waitcnt vmcnt(0)
454 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
455 ; GFX11-NEXT: s_endpgm
; One i16 element past %in, i.e. +2 bytes (the scalar "add 2" in the checks).
456 %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
457 %load = load i16, ptr addrspace(5) %gep, align 4
458 %ext = sext i16 %load to i32
459 store i32 %ext, ptr %out, align 4
; i8 load from an inreg addrspace(5) pointer, zero-extended to i16 and inserted
; into element 0 of <i16 -1, i16 -1>. The checks expect the destination VGPR to
; be pre-loaded with 0xffff0000, the address to be formed with s_add_i32, and
; scratch_load_ubyte_d16 (gfx1010 run) / scratch_load_d16_u8 (gfx1100 run).
463 define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
464 ; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
465 ; GFX10: ; %bb.0: ; %bb
466 ; GFX10-NEXT: s_add_u32 s0, s0, s3
467 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
468 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
469 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
470 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
471 ; GFX10-NEXT: s_add_i32 s2, s2, 1
472 ; GFX10-NEXT: scratch_load_ubyte_d16 v2, off, s2
473 ; GFX10-NEXT: s_waitcnt vmcnt(0)
474 ; GFX10-NEXT: flat_store_dword v[0:1], v2
475 ; GFX10-NEXT: s_endpgm
477 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
478 ; GFX11: ; %bb.0: ; %bb
479 ; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
480 ; GFX11-NEXT: s_add_i32 s0, s0, 1
481 ; GFX11-NEXT: scratch_load_d16_u8 v2, off, s0
482 ; GFX11-NEXT: s_waitcnt vmcnt(0)
483 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
484 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the low (index 0) half of the
; vector; element 1 keeps the constant i16 -1.
486 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
487 %load_lo = load i8, ptr addrspace(5) %gep
488 %ext = zext i8 %load_lo to i16
489 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
490 store <2 x i16> %result, ptr %out, align 4
; i8 load from an inreg addrspace(5) pointer, sign-extended to i16 and inserted
; into element 0 of <i16 -1, i16 -1>. The checks expect the destination VGPR to
; be pre-loaded with 0xffff0000, the address to be formed with s_add_i32, and
; scratch_load_sbyte_d16 (gfx1010 run) / scratch_load_d16_i8 (gfx1100 run).
494 define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
495 ; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
496 ; GFX10: ; %bb.0: ; %bb
497 ; GFX10-NEXT: s_add_u32 s0, s0, s3
498 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
499 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
500 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
501 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
502 ; GFX10-NEXT: s_add_i32 s2, s2, 1
503 ; GFX10-NEXT: scratch_load_sbyte_d16 v2, off, s2
504 ; GFX10-NEXT: s_waitcnt vmcnt(0)
505 ; GFX10-NEXT: flat_store_dword v[0:1], v2
506 ; GFX10-NEXT: s_endpgm
508 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
509 ; GFX11: ; %bb.0: ; %bb
510 ; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
511 ; GFX11-NEXT: s_add_i32 s0, s0, 1
512 ; GFX11-NEXT: scratch_load_d16_i8 v2, off, s0
513 ; GFX11-NEXT: s_waitcnt vmcnt(0)
514 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
515 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the low (index 0) half of the
; vector; element 1 keeps the constant i16 -1.
517 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
518 %load_lo = load i8, ptr addrspace(5) %gep
519 %ext = sext i8 %load_lo to i16
520 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
521 store <2 x i16> %result, ptr %out, align 4
; i16 load from an inreg addrspace(5) pointer inserted into element 0 of
; <i16 -1, i16 -1>. The checks expect the destination VGPR to be pre-loaded with
; 0xffff0000, the address to be formed with s_add_i32, and
; scratch_load_short_d16 (gfx1010 run) / scratch_load_d16_b16 (gfx1100 run).
525 define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
526 ; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_s:
527 ; GFX10: ; %bb.0: ; %bb
528 ; GFX10-NEXT: s_add_u32 s0, s0, s3
529 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
530 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
531 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
532 ; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
533 ; GFX10-NEXT: s_add_i32 s2, s2, 2
534 ; GFX10-NEXT: scratch_load_short_d16 v2, off, s2
535 ; GFX10-NEXT: s_waitcnt vmcnt(0)
536 ; GFX10-NEXT: flat_store_dword v[0:1], v2
537 ; GFX10-NEXT: s_endpgm
539 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
540 ; GFX11: ; %bb.0: ; %bb
541 ; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
542 ; GFX11-NEXT: s_add_i32 s0, s0, 2
543 ; GFX11-NEXT: scratch_load_d16_b16 v2, off, s0
544 ; GFX11-NEXT: s_waitcnt vmcnt(0)
545 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
546 ; GFX11-NEXT: s_endpgm
; Load the i16 at %in + 2 bytes and place it in the low (index 0) half of the
; vector; element 1 keeps the constant i16 -1.
548 %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
549 %load_lo = load i16, ptr addrspace(5) %gep
550 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
551 store <2 x i16> %result, ptr %out, align 4
; i8 load from an inreg addrspace(5) pointer, zero-extended to i16 and inserted
; into element 1 of <i16 -1, i16 -1>. The checks expect the destination VGPR to
; be pre-loaded with -1, the address to be formed with s_add_i32, and
; scratch_load_ubyte_d16_hi (gfx1010 run) / scratch_load_d16_hi_u8 (gfx1100 run).
556 define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
557 ; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
558 ; GFX10: ; %bb.0: ; %bb
559 ; GFX10-NEXT: s_add_u32 s0, s0, s3
560 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
561 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
562 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
563 ; GFX10-NEXT: v_mov_b32_e32 v2, -1
564 ; GFX10-NEXT: s_add_i32 s2, s2, 1
565 ; GFX10-NEXT: scratch_load_ubyte_d16_hi v2, off, s2
566 ; GFX10-NEXT: s_waitcnt vmcnt(0)
567 ; GFX10-NEXT: flat_store_dword v[0:1], v2
568 ; GFX10-NEXT: s_endpgm
570 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
571 ; GFX11: ; %bb.0: ; %bb
572 ; GFX11-NEXT: v_mov_b32_e32 v2, -1
573 ; GFX11-NEXT: s_add_i32 s0, s0, 1
574 ; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0
575 ; GFX11-NEXT: s_waitcnt vmcnt(0)
576 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
577 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the high (index 1) half of the
; vector; element 0 keeps the constant i16 -1. (The value is named %load_lo
; even though it feeds the high half.)
579 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
580 %load_lo = load i8, ptr addrspace(5) %gep
581 %ext = zext i8 %load_lo to i16
582 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
583 store <2 x i16> %result, ptr %out, align 4
; i8 load from an inreg addrspace(5) pointer, sign-extended to i16 and inserted
; into element 1 of <i16 -1, i16 -1>. The checks expect the destination VGPR to
; be pre-loaded with -1, the address to be formed with s_add_i32, and
; scratch_load_sbyte_d16_hi (gfx1010 run) / scratch_load_d16_hi_i8 (gfx1100 run).
587 define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
588 ; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
589 ; GFX10: ; %bb.0: ; %bb
590 ; GFX10-NEXT: s_add_u32 s0, s0, s3
591 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
592 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
593 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
594 ; GFX10-NEXT: v_mov_b32_e32 v2, -1
595 ; GFX10-NEXT: s_add_i32 s2, s2, 1
596 ; GFX10-NEXT: scratch_load_sbyte_d16_hi v2, off, s2
597 ; GFX10-NEXT: s_waitcnt vmcnt(0)
598 ; GFX10-NEXT: flat_store_dword v[0:1], v2
599 ; GFX10-NEXT: s_endpgm
601 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
602 ; GFX11: ; %bb.0: ; %bb
603 ; GFX11-NEXT: v_mov_b32_e32 v2, -1
604 ; GFX11-NEXT: s_add_i32 s0, s0, 1
605 ; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0
606 ; GFX11-NEXT: s_waitcnt vmcnt(0)
607 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
608 ; GFX11-NEXT: s_endpgm
; Load the byte at %in + 1 and place it in the high (index 1) half of the
; vector; element 0 keeps the constant i16 -1. (The value is named %load_lo
; even though it feeds the high half.)
610 %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
611 %load_lo = load i8, ptr addrspace(5) %gep
612 %ext = sext i8 %load_lo to i16
613 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
614 store <2 x i16> %result, ptr %out, align 4
; i16 load from an inreg addrspace(5) pointer inserted into element 1 of
; <i16 -1, i16 -1>. The checks expect the destination VGPR to be pre-loaded with
; -1, the address to be formed with s_add_i32, and scratch_load_short_d16_hi
; (gfx1010 run) / scratch_load_d16_hi_b16 (gfx1100 run).
618 define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
619 ; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_s:
620 ; GFX10: ; %bb.0: ; %bb
621 ; GFX10-NEXT: s_add_u32 s0, s0, s3
622 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
623 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
624 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
625 ; GFX10-NEXT: v_mov_b32_e32 v2, -1
626 ; GFX10-NEXT: s_add_i32 s2, s2, 2
627 ; GFX10-NEXT: scratch_load_short_d16_hi v2, off, s2
628 ; GFX10-NEXT: s_waitcnt vmcnt(0)
629 ; GFX10-NEXT: flat_store_dword v[0:1], v2
630 ; GFX10-NEXT: s_endpgm
632 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
633 ; GFX11: ; %bb.0: ; %bb
634 ; GFX11-NEXT: v_mov_b32_e32 v2, -1
635 ; GFX11-NEXT: s_add_i32 s0, s0, 2
636 ; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0
637 ; GFX11-NEXT: s_waitcnt vmcnt(0)
638 ; GFX11-NEXT: flat_store_b32 v[0:1], v2
639 ; GFX11-NEXT: s_endpgm
; Load the i16 at %in + 2 bytes and place it in the high (index 1) half of the
; vector; element 0 keeps the constant i16 -1. (The value is named %load_lo
; even though it feeds the high half.)
641 %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
642 %load_lo = load i16, ptr addrspace(5) %gep
643 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
644 store <2 x i16> %result, ptr %out, align 4
; Stores byte element 2 of a <4 x i8> loaded through a flat pointer to an inreg
; addrspace(5) pointer. The checks expect the address to be formed with
; s_add_i32 and the byte to be written directly from the loaded dword by
; scratch_store_byte_d16_hi (gfx1010 run) / scratch_store_d16_hi_b8 (gfx1100 run).
648 define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
649 ; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_s:
650 ; GFX10: ; %bb.0: ; %bb
651 ; GFX10-NEXT: s_add_u32 s0, s0, s3
652 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
653 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
654 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
655 ; GFX10-NEXT: flat_load_dword v0, v[0:1]
656 ; GFX10-NEXT: s_add_i32 s2, s2, 4
657 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
658 ; GFX10-NEXT: scratch_store_byte_d16_hi off, v0, s2
659 ; GFX10-NEXT: s_endpgm
661 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
662 ; GFX11: ; %bb.0: ; %bb
663 ; GFX11-NEXT: flat_load_b32 v0, v[0:1]
664 ; GFX11-NEXT: s_add_i32 s0, s0, 4
665 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
666 ; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0
667 ; GFX11-NEXT: s_endpgm
; The GEP steps one <4 x i8> element past %out, i.e. +4 bytes, matching the
; scalar "add 4" on the address register in the checks above.
669 %load = load <4 x i8>, ptr %in
670 %element = extractelement <4 x i8> %load, i32 2
671 %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
672 store i8 %element, ptr addrspace(5) %gep, align 4
; Stores i16 element 1 of a <2 x i16> loaded through a flat pointer to an inreg
; addrspace(5) pointer. The checks expect the address to be formed with
; s_add_i32 and the half to be written directly from the loaded dword by
; scratch_store_short_d16_hi (gfx1010 run) / scratch_store_d16_hi_b16 (gfx1100
; run).
676 define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
677 ; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_s:
678 ; GFX10: ; %bb.0: ; %bb
679 ; GFX10-NEXT: s_add_u32 s0, s0, s3
680 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
681 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
682 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
683 ; GFX10-NEXT: flat_load_dword v0, v[0:1]
684 ; GFX10-NEXT: s_add_i32 s2, s2, 2
685 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
686 ; GFX10-NEXT: scratch_store_short_d16_hi off, v0, s2
687 ; GFX10-NEXT: s_endpgm
689 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
690 ; GFX11: ; %bb.0: ; %bb
691 ; GFX11-NEXT: flat_load_b32 v0, v[0:1]
692 ; GFX11-NEXT: s_add_i32 s0, s0, 2
693 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
694 ; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0
695 ; GFX11-NEXT: s_endpgm
; The GEP uses <2 x i8> as its element type, so index 1 is +2 bytes from %out,
; matching the scalar "add 2" on the address register in the checks above.
697 %load = load <2 x i16>, ptr %in
698 %element = extractelement <2 x i16> %load, i32 1
699 %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
700 store i16 %element, ptr addrspace(5) %gep, align 4
; Zero-extending i8 load whose address is an inreg addrspace(5) base plus a
; non-inreg i32 %voffset scaled by 4 plus a constant 1. The checks expect the
; scale as a left shift by 2, the three-way sum as one v_add3_u32, and a single
; scratch_load_ubyte (gfx1010 run) / scratch_load_u8 (gfx1100 run) through v0.
709 define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
710 ; GFX10-LABEL: test_scratch_load_i8_zext_svs:
712 ; GFX10-NEXT: s_add_u32 s0, s0, s3
713 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
714 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
715 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
716 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
717 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
718 ; GFX10-NEXT: scratch_load_ubyte v0, v0, off
719 ; GFX10-NEXT: s_waitcnt vmcnt(0)
720 ; GFX10-NEXT: flat_store_dword v[1:2], v0
721 ; GFX10-NEXT: s_endpgm
723 ; GFX11-LABEL: test_scratch_load_i8_zext_svs:
725 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
726 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
727 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
728 ; GFX11-NEXT: scratch_load_u8 v0, v0, off
729 ; GFX11-NEXT: s_waitcnt vmcnt(0)
730 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
731 ; GFX11-NEXT: s_endpgm
; Address = %in + 4 * %voffset + 1, built from two chained GEPs.
732 %voffset4 = mul i32 %voffset, 4
733 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
734 %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
735 %load = load i8, ptr addrspace(5) %gep, align 4
736 %ext = zext i8 %load to i32
737 store i32 %ext, ptr %out, align 4
; Sign-extending i8 load whose address is an inreg addrspace(5) base plus a
; non-inreg i32 %voffset scaled by 4 plus a constant 1. The checks expect the
; scale as a left shift by 2, the three-way sum as one v_add3_u32, and a single
; scratch_load_sbyte (gfx1010 run) / scratch_load_i8 (gfx1100 run) through v0.
741 define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
742 ; GFX10-LABEL: test_scratch_load_i8_sext_svs:
744 ; GFX10-NEXT: s_add_u32 s0, s0, s3
745 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
746 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
747 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
748 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
749 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
750 ; GFX10-NEXT: scratch_load_sbyte v0, v0, off
751 ; GFX10-NEXT: s_waitcnt vmcnt(0)
752 ; GFX10-NEXT: flat_store_dword v[1:2], v0
753 ; GFX10-NEXT: s_endpgm
755 ; GFX11-LABEL: test_scratch_load_i8_sext_svs:
757 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
758 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
759 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
760 ; GFX11-NEXT: scratch_load_i8 v0, v0, off
761 ; GFX11-NEXT: s_waitcnt vmcnt(0)
762 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
763 ; GFX11-NEXT: s_endpgm
; Address = %in + 4 * %voffset + 1, built from two chained GEPs.
764 %voffset4 = mul i32 %voffset, 4
765 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
766 %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
767 %load = load i8, ptr addrspace(5) %gep, align 4
768 %ext = sext i8 %load to i32
769 store i32 %ext, ptr %out, align 4
; Zero-extending i16 load whose address is an inreg addrspace(5) base plus a
; non-inreg i32 %voffset scaled by 4 plus a constant 2. The checks expect the
; scale as a left shift by 2, the three-way sum as one v_add3_u32, and a single
; scratch_load_ushort (gfx1010 run) / scratch_load_u16 (gfx1100 run) through v0.
773 define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
774 ; GFX10-LABEL: test_scratch_load_i16_zext_svs:
776 ; GFX10-NEXT: s_add_u32 s0, s0, s3
777 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
778 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
779 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
780 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
781 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
782 ; GFX10-NEXT: scratch_load_ushort v0, v0, off
783 ; GFX10-NEXT: s_waitcnt vmcnt(0)
784 ; GFX10-NEXT: flat_store_dword v[1:2], v0
785 ; GFX10-NEXT: s_endpgm
787 ; GFX11-LABEL: test_scratch_load_i16_zext_svs:
789 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
790 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
791 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
792 ; GFX11-NEXT: scratch_load_u16 v0, v0, off
793 ; GFX11-NEXT: s_waitcnt vmcnt(0)
794 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
795 ; GFX11-NEXT: s_endpgm
; Address = %in + 4 * %voffset + 2; the second GEP steps one i16 element.
796 %voffset4 = mul i32 %voffset, 4
797 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
798 %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
799 %load = load i16, ptr addrspace(5) %gep, align 4
800 %ext = zext i16 %load to i32
801 store i32 %ext, ptr %out, align 4
; Sign-extending i16 load whose address is an inreg addrspace(5) base plus a
; non-inreg i32 %voffset scaled by 4 plus a constant 2. The checks expect the
; scale as a left shift by 2, the three-way sum as one v_add3_u32, and a single
; scratch_load_sshort (gfx1010 run) / scratch_load_i16 (gfx1100 run) through v0.
805 define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
806 ; GFX10-LABEL: test_scratch_load_i16_sext_svs:
808 ; GFX10-NEXT: s_add_u32 s0, s0, s3
809 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
810 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
811 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
812 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
813 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
814 ; GFX10-NEXT: scratch_load_sshort v0, v0, off
815 ; GFX10-NEXT: s_waitcnt vmcnt(0)
816 ; GFX10-NEXT: flat_store_dword v[1:2], v0
817 ; GFX10-NEXT: s_endpgm
819 ; GFX11-LABEL: test_scratch_load_i16_sext_svs:
821 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0
822 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
823 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
824 ; GFX11-NEXT: scratch_load_i16 v0, v0, off
825 ; GFX11-NEXT: s_waitcnt vmcnt(0)
826 ; GFX11-NEXT: flat_store_b32 v[1:2], v0
827 ; GFX11-NEXT: s_endpgm
; Address = %in + 4 * %voffset + 2; the second GEP steps one i16 element.
828 %voffset4 = mul i32 %voffset, 4
829 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
830 %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
831 %load = load i16, ptr addrspace(5) %gep, align 4
832 %ext = sext i16 %load to i32
833 store i32 %ext, ptr %out, align 4
; i8 load at (inreg addrspace(5) base + 4 * %voffset + 1), zero-extended to i16
; and inserted into element 0 of <i16 -1, i16 -1>. The checks expect the
; destination VGPR to be pre-loaded with 0xffff0000, the address to be formed by
; a shift plus v_add3_u32, and scratch_load_ubyte_d16 (gfx1010 run) /
; scratch_load_d16_u8 (gfx1100 run).
837 define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
838 ; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
839 ; GFX10: ; %bb.0: ; %bb
840 ; GFX10-NEXT: s_add_u32 s0, s0, s3
841 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
842 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
843 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
844 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
845 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
846 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
847 ; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
848 ; GFX10-NEXT: s_waitcnt vmcnt(0)
849 ; GFX10-NEXT: flat_store_dword v[1:2], v3
850 ; GFX10-NEXT: s_endpgm
852 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
853 ; GFX11: ; %bb.0: ; %bb
854 ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
855 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
856 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
857 ; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
858 ; GFX11-NEXT: s_waitcnt vmcnt(0)
859 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
860 ; GFX11-NEXT: s_endpgm
; Address = %in + 4 * %voffset + 1; the loaded byte goes into the low
; (index 0) half of the vector while element 1 keeps the constant i16 -1.
862 %voffset4 = mul i32 %voffset, 4
863 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
864 %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
865 %load_lo = load i8, ptr addrspace(5) %gep
866 %ext = zext i8 %load_lo to i16
867 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
868 store <2 x i16> %result, ptr %out, align 4
; Loads an i8 from scratch at %in + %voffset * 4 + 1, sign-extends it to i16 and
; inserts it into element 0 of <2 x i16> <-1, -1> before storing the vector to
; %out. The checks expect the result VGPR to be preset to 0xffff0000 and the load
; to be selected as scratch_load_sbyte_d16 (gfx1010) / scratch_load_d16_i8
; (gfx1100) targeting that same register, which is then stored as one dword.
872 define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
873 ; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
874 ; GFX10: ; %bb.0: ; %bb
875 ; GFX10-NEXT: s_add_u32 s0, s0, s3
876 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
877 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
878 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
879 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
880 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
881 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
882 ; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
883 ; GFX10-NEXT: s_waitcnt vmcnt(0)
884 ; GFX10-NEXT: flat_store_dword v[1:2], v3
885 ; GFX10-NEXT: s_endpgm
887 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
888 ; GFX11: ; %bb.0: ; %bb
889 ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
890 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
891 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
892 ; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
893 ; GFX11-NEXT: s_waitcnt vmcnt(0)
894 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
895 ; GFX11-NEXT: s_endpgm
897 %voffset4 = mul i32 %voffset, 4
898 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
899 %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
900 %load_lo = load i8, ptr addrspace(5) %gep
901 %ext = sext i8 %load_lo to i16
902 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
903 store <2 x i16> %result, ptr %out, align 4
; Loads an i16 from scratch at %in + %voffset * 4 + 2 (one i16 element) and
; inserts it, without extension, into element 0 of <2 x i16> <-1, -1> before
; storing the vector to %out. The checks expect the result VGPR to be preset to
; 0xffff0000 and the load to be selected as scratch_load_short_d16 (gfx1010) /
; scratch_load_d16_b16 (gfx1100) targeting that same register.
907 define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
908 ; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_svs:
909 ; GFX10: ; %bb.0: ; %bb
910 ; GFX10-NEXT: s_add_u32 s0, s0, s3
911 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
912 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
913 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
914 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
915 ; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
916 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
917 ; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
918 ; GFX10-NEXT: s_waitcnt vmcnt(0)
919 ; GFX10-NEXT: flat_store_dword v[1:2], v3
920 ; GFX10-NEXT: s_endpgm
922 ; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs:
923 ; GFX11: ; %bb.0: ; %bb
924 ; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
925 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
926 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
927 ; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
928 ; GFX11-NEXT: s_waitcnt vmcnt(0)
929 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
930 ; GFX11-NEXT: s_endpgm
932 %voffset4 = mul i32 %voffset, 4
933 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
934 %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
935 %load_lo = load i16, ptr addrspace(5) %gep
936 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
937 store <2 x i16> %result, ptr %out, align 4
; Loads an i8 from scratch at %in + %voffset * 4 + 1, zero-extends it to i16 and
; inserts it into element 1 of <2 x i16> <-1, -1> before storing the vector to
; %out. The checks expect the result VGPR to be preset to -1 and the load to be
; selected as scratch_load_ubyte_d16_hi (gfx1010) / scratch_load_d16_hi_u8
; (gfx1100) targeting that same register, which is then stored as one dword.
942 define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
943 ; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
944 ; GFX10: ; %bb.0: ; %bb
945 ; GFX10-NEXT: s_add_u32 s0, s0, s3
946 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
947 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
948 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
949 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
950 ; GFX10-NEXT: v_mov_b32_e32 v3, -1
951 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
952 ; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
953 ; GFX10-NEXT: s_waitcnt vmcnt(0)
954 ; GFX10-NEXT: flat_store_dword v[1:2], v3
955 ; GFX10-NEXT: s_endpgm
957 ; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
958 ; GFX11: ; %bb.0: ; %bb
959 ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
960 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
961 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
962 ; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
963 ; GFX11-NEXT: s_waitcnt vmcnt(0)
964 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
965 ; GFX11-NEXT: s_endpgm
967 %voffset4 = mul i32 %voffset, 4
968 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
969 %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
970 %load_lo = load i8, ptr addrspace(5) %gep
971 %ext = zext i8 %load_lo to i16
972 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
973 store <2 x i16> %result, ptr %out, align 4
; Loads an i8 from scratch at %in + %voffset * 4 + 1, sign-extends it to i16 and
; inserts it into element 1 of <2 x i16> <-1, -1> before storing the vector to
; %out. The checks expect the result VGPR to be preset to -1 and the load to be
; selected as scratch_load_sbyte_d16_hi (gfx1010) / scratch_load_d16_hi_i8
; (gfx1100) targeting that same register, which is then stored as one dword.
977 define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
978 ; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
979 ; GFX10: ; %bb.0: ; %bb
980 ; GFX10-NEXT: s_add_u32 s0, s0, s3
981 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
982 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
983 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
984 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
985 ; GFX10-NEXT: v_mov_b32_e32 v3, -1
986 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
987 ; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
988 ; GFX10-NEXT: s_waitcnt vmcnt(0)
989 ; GFX10-NEXT: flat_store_dword v[1:2], v3
990 ; GFX10-NEXT: s_endpgm
992 ; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
993 ; GFX11: ; %bb.0: ; %bb
994 ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
995 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
996 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
997 ; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
998 ; GFX11-NEXT: s_waitcnt vmcnt(0)
999 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
1000 ; GFX11-NEXT: s_endpgm
1002 %voffset4 = mul i32 %voffset, 4
1003 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
1004 %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
1005 %load_lo = load i8, ptr addrspace(5) %gep
1006 %ext = sext i8 %load_lo to i16
1007 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
1008 store <2 x i16> %result, ptr %out, align 4
; Loads an i16 from scratch at %in + %voffset * 4 + 2 (one i16 element) and
; inserts it, without extension, into element 1 of <2 x i16> <-1, -1> before
; storing the vector to %out. The checks expect the result VGPR to be preset to
; -1 and the load to be selected as scratch_load_short_d16_hi (gfx1010) /
; scratch_load_d16_hi_b16 (gfx1100) targeting that same register.
1012 define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
1013 ; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_svs:
1014 ; GFX10: ; %bb.0: ; %bb
1015 ; GFX10-NEXT: s_add_u32 s0, s0, s3
1016 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1017 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1018 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1019 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
1020 ; GFX10-NEXT: v_mov_b32_e32 v3, -1
1021 ; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
1022 ; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
1023 ; GFX10-NEXT: s_waitcnt vmcnt(0)
1024 ; GFX10-NEXT: flat_store_dword v[1:2], v3
1025 ; GFX10-NEXT: s_endpgm
1027 ; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
1028 ; GFX11: ; %bb.0: ; %bb
1029 ; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
1030 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1031 ; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
1032 ; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
1033 ; GFX11-NEXT: s_waitcnt vmcnt(0)
1034 ; GFX11-NEXT: flat_store_b32 v[1:2], v3
1035 ; GFX11-NEXT: s_endpgm
1037 %voffset4 = mul i32 %voffset, 4
1038 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
1039 %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
1040 %load_lo = load i16, ptr addrspace(5) %gep
1041 %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
1042 store <2 x i16> %result, ptr %out, align 4
; Loads <4 x i8> through the flat pointer %in, extracts element 2 and stores that
; byte to scratch at %out + %voffset * 4 + 4 (the second gep steps over one
; <4 x i8>, i.e. 4 bytes). The checks expect a single dword flat load followed
; directly by scratch_store_byte_d16_hi (gfx1010) / scratch_store_d16_hi_b8
; (gfx1100), with no separate instruction to extract the element.
1046 define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
1047 ; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_svs:
1048 ; GFX10: ; %bb.0: ; %bb
1049 ; GFX10-NEXT: s_add_u32 s0, s0, s3
1050 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1051 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1052 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1053 ; GFX10-NEXT: flat_load_dword v0, v[0:1]
1054 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
1055 ; GFX10-NEXT: v_add3_u32 v1, s2, v1, 4
1056 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1057 ; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
1058 ; GFX10-NEXT: s_endpgm
1060 ; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
1061 ; GFX11: ; %bb.0: ; %bb
1062 ; GFX11-NEXT: flat_load_b32 v0, v[0:1]
1063 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
1064 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1065 ; GFX11-NEXT: v_add3_u32 v1, s0, v1, 4
1066 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1067 ; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
1068 ; GFX11-NEXT: s_endpgm
1070 %load = load <4 x i8>, ptr %in
1071 %element = extractelement <4 x i8> %load, i32 2
1072 %voffset4 = mul i32 %voffset, 4
1073 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
1074 %gep = getelementptr <4 x i8>, ptr addrspace(5) %gep0, i64 1
1075 store i8 %element, ptr addrspace(5) %gep, align 4
; Loads <2 x i16> through the flat pointer %in, extracts element 1 and stores
; that i16 to scratch at %out + %voffset * 4 + 2 (the second gep steps over one
; <2 x i8>, i.e. 2 bytes). The checks expect a single dword flat load followed
; directly by scratch_store_short_d16_hi (gfx1010) / scratch_store_d16_hi_b16
; (gfx1100), with no separate instruction to extract the element.
1079 define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
1080 ; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_svs:
1081 ; GFX10: ; %bb.0: ; %bb
1082 ; GFX10-NEXT: s_add_u32 s0, s0, s3
1083 ; GFX10-NEXT: s_addc_u32 s1, s1, 0
1084 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
1085 ; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
1086 ; GFX10-NEXT: flat_load_dword v0, v[0:1]
1087 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
1088 ; GFX10-NEXT: v_add3_u32 v1, s2, v1, 2
1089 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1090 ; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
1091 ; GFX10-NEXT: s_endpgm
1093 ; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
1094 ; GFX11: ; %bb.0: ; %bb
1095 ; GFX11-NEXT: flat_load_b32 v0, v[0:1]
1096 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
1097 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
1098 ; GFX11-NEXT: v_add3_u32 v1, s0, v1, 2
1099 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1100 ; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
1101 ; GFX11-NEXT: s_endpgm
1103 %load = load <2 x i16>, ptr %in
1104 %element = extractelement <2 x i16> %load, i32 1
1105 %voffset4 = mul i32 %voffset, 4
1106 %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
1107 %gep = getelementptr <2 x i8>, ptr addrspace(5) %gep0, i64 1
1108 store i16 %element, ptr addrspace(5) %gep, align 4