; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s
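
; Check that sub-dword (i8/i16) scratch accesses select the flat-scratch
; extending loads, D16 lo/hi loads, and D16-hi stores. Tests are grouped by
; address kind: VGPR offset (_v), uniform SGPR offset (_s), and SGPR base
; plus VGPR offset (_svs).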

define amdgpu_ps void @test_scratch_load_i8_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_zext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_zext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_v:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_sext_v:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_sext_v:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}
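
; The d16_hi variants load into the high 16 bits of the destination register.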

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 1, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 1, v0
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, off offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_v(ptr addrspace(5) %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_add_nc_u32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_add_nc_u32 v0, 2, v0
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v3, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, off offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_add_nc_u32_e32 v1, 4, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_add_nc_u32_e32 v1, 4, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 v2, v0, off offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_v(ptr %in, ptr addrspace(5) %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s2
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_add_nc_u32_e32 v1, 2, v2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_add_nc_u32_e32 v1, 2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_v:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 v2, v0, off offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}
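
; Same tests as above, but the scratch address is a uniform (inreg) SGPR value.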

define amdgpu_ps void @test_scratch_load_i8_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ubyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sbyte v2, off, s2 offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i8, ptr addrspace(5) %in, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_ushort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_zext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_zext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_u16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_s:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: scratch_load_sshort v2, off, s2 offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_sext_s:
; GFX11: ; %bb.0:
; GFX11-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_sext_s:
; GFX12: ; %bb.0:
; GFX12-NEXT: scratch_load_i16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
  %gep = getelementptr inbounds i16, ptr addrspace(5) %in, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_ubyte_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_sbyte_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: scratch_load_short_d16 v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, 0xffff0000
; GFX12-NEXT: scratch_load_d16_b16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}
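
; d16_hi loads with a uniform SGPR address.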

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_u8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_i8 v2, off, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i8, ptr addrspace(5) %in, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_s(ptr addrspace(5) inreg %in, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_mov_b32_e32 v2, -1
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: scratch_load_short_d16_hi v2, off, s2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[0:1], v2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_mov_b32_e32 v2, -1
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v2, off, s0
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[0:1], v2
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_mov_b32_e32 v2, -1
; GFX12-NEXT: scratch_load_d16_hi_b16 v2, off, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[0:1], v2
; GFX12-NEXT: s_endpgm
bb:
  %gep = getelementptr i16, ptr addrspace(5) %in, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_add_i32 s2, s2, 4
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi off, v0, s2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_add_i32 s0, s0, 4
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 off, v0, s0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 off, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %out, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_s(ptr %in, ptr addrspace(5) inreg %out) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: s_add_i32 s2, s2, 2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi off, v0, s2
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: s_add_i32 s0, s0, 2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 off, v0, s0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_s:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 off, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %out, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}
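
; Same tests as above, but the scratch address is an SGPR base plus a scaled VGPR offset.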

define amdgpu_ps void @test_scratch_load_i8_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_ubyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u8 v0, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = zext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_sbyte v0, v0, off offset:1
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_i8 v0, v0, off offset:1
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i8 v0, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i8, ptr addrspace(5) %gep0, i32 1
  %load = load i8, ptr addrspace(5) %gep, align 4
  %ext = sext i8 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_zext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_zext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_ushort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_zext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_u16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_zext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_u16 v0, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = zext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_sext_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_sext_svs:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshl_add_u32 v0, v0, 2, s2
; GFX10-NEXT: scratch_load_sshort v0, v0, off offset:2
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v0
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_sext_svs:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_lshl_add_u32 v0, v0, 2, s0
; GFX11-NEXT: scratch_load_i16 v0, v0, off offset:2
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v0
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_sext_svs:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX12-NEXT: scratch_load_i16 v0, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v0
; GFX12-NEXT: s_endpgm
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr inbounds i16, ptr addrspace(5) %gep0, i32 1
  %load = load i16, ptr addrspace(5) %gep, align 4
  %ext = sext i16 %load to i32
  store i32 %ext, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_ubyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_sbyte_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_lo_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 0xffff0000
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT: scratch_load_short_d16 v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_to_d16_lo_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, 0xffff0000 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 0
  store <2 x i16> %result, ptr %out, align 4
  ret void
}
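
; d16_hi loads with an SGPR base plus VGPR offset.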

define amdgpu_ps void @test_scratch_load_i8_zext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_ubyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_u8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_zext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_u8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = zext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i8_sext_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 1
; GFX10-NEXT: scratch_load_sbyte_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 1
; GFX11-NEXT: scratch_load_d16_hi_i8 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i8_sext_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_i8 v3, v0, s0 offset:1
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i8, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i8, ptr addrspace(5) %gep
  %ext = sext i8 %load_lo to i16
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %ext, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_load_i16_to_d16_hi_svs(ptr addrspace(5) inreg %in, i32 %voffset, ptr %out) {
; GFX10-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, -1
; GFX10-NEXT: v_add3_u32 v0, s2, v0, 2
; GFX10-NEXT: scratch_load_short_d16_hi v3, v0, off
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: flat_store_dword v[1:2], v3
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v0, s0, v0, 2
; GFX11-NEXT: scratch_load_d16_hi_b16 v3, v0, off
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: flat_store_b32 v[1:2], v3
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_load_i16_to_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: v_dual_mov_b32 v3, -1 :: v_dual_lshlrev_b32 v0, 2, v0
; GFX12-NEXT: scratch_load_d16_hi_b16 v3, v0, s0 offset:2
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: flat_store_b32 v[1:2], v3
; GFX12-NEXT: s_endpgm
bb:
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %in, i32 %voffset4
  %gep = getelementptr i16, ptr addrspace(5) %gep0, i64 1
  %load_lo = load i16, ptr addrspace(5) %gep
  %result = insertelement <2 x i16> <i16 -1, i16 -1>, i16 %load_lo, i32 1
  store <2 x i16> %result, ptr %out, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b8_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_add3_u32 v1, s2, v1, 4
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_byte_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, s0, v1, 4
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b8 v1, v0, off
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_store_b8_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b8 v1, v0, s0 offset:4
; GFX12-NEXT: s_endpgm
bb:
  %load = load <4 x i8>, ptr %in
  %element = extractelement <4 x i8> %load, i32 2
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <4 x i8>, ptr addrspace(5) %gep0, i64 1
  store i8 %element, ptr addrspace(5) %gep, align 4
  ret void
}

define amdgpu_ps void @test_scratch_store_b16_from_d16_hi_svs(ptr %in, ptr addrspace(5) inreg %out, i32 %voffset) {
; GFX10-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX10: ; %bb.0: ; %bb
; GFX10-NEXT: s_add_u32 s0, s0, s3
; GFX10-NEXT: s_addc_u32 s1, s1, 0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s0
; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s1
; GFX10-NEXT: flat_load_dword v0, v[0:1]
; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX10-NEXT: v_add3_u32 v1, s2, v1, 2
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX10-NEXT: scratch_store_short_d16_hi v1, v0, off
; GFX10-NEXT: s_endpgm
; GFX11-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX11: ; %bb.0: ; %bb
; GFX11-NEXT: flat_load_b32 v0, v[0:1]
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add3_u32 v1, s0, v1, 2
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX11-NEXT: scratch_store_d16_hi_b16 v1, v0, off
; GFX11-NEXT: s_endpgm
; GFX12-LABEL: test_scratch_store_b16_from_d16_hi_svs:
; GFX12: ; %bb.0: ; %bb
; GFX12-NEXT: flat_load_b32 v0, v[0:1]
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v2
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: scratch_store_d16_hi_b16 v1, v0, s0 offset:2
; GFX12-NEXT: s_endpgm
bb:
  %load = load <2 x i16>, ptr %in
  %element = extractelement <2 x i16> %load, i32 1
  %voffset4 = mul i32 %voffset, 4
  %gep0 = getelementptr inbounds i8, ptr addrspace(5) %out, i32 %voffset4
  %gep = getelementptr <2 x i8>, ptr addrspace(5) %gep0, i64 1
  store i16 %element, ptr addrspace(5) %gep, align 4
  ret void
}