; Three llc configurations are exercised: gfx900, gfx906 with sram-ecc, and fiji.
; GFX9 is the check prefix shared by the gfx900 and gfx906 invocations; it is
; used by the load_local_lo_v2i16_reglo and load_local_lo_v2i16_reglo_vreg
; checks, which would otherwise never be evaluated.
1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca,+sram-ecc -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9,GFX906,NO-D16-HI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
; Builds a <2 x i16> whose lane 0 is an i16 loaded through an addrspace(3)
; pointer and whose lane 1 is undef. The gfx900 checks expect a single
; ds_read_u16_d16; the gfx906 and fiji checks expect a plain ds_read_u16.
5 ; GCN-LABEL: {{^}}load_local_lo_v2i16_undeflo:
7 ; GFX900-NEXT: ds_read_u16_d16 v0, v0
8 ; GFX900-NEXT: s_waitcnt
9 ; GFX900-NEXT: s_setpc_b64
11 ; NO-D16-HI: ds_read_u16
12 define <2 x i16> @load_local_lo_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
14 %load = load i16, i16 addrspace(3)* %in
15 %build = insertelement <2 x i16> undef, i16 %load, i32 0
; Lane 1 comes from the i16 argument %reg and lane 0 from an addrspace(3) load.
; The expected code is a plain ds_read_u16 followed by v_and_b32 / v_lshl_or_b32
; to pack the two halves.
19 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo:
21 ; GCN: ds_read_u16 v0, v0
22 ; GFX9: v_and_b32_e32 v0, 0xffff, v0
23 ; GFX9: v_lshl_or_b32 v0, v1, 16, v0
25 define <2 x i16> @load_local_lo_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
27 %load = load i16, i16 addrspace(3)* %in
28 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
29 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
33 ; Show that we get reasonable regalloc without physreg constraints.
; Same vector build as load_local_lo_v2i16_reglo, but the result is stored to
; an undef addrspace(1) pointer instead of being the function's return value.
34 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg:
36 ; GCN: ds_read_u16 v0, v0
38 ; GFX9: v_and_b32_e32 v0, 0xffff, v0
39 ; GFX9: v_lshl_or_b32 v0, v1, 16, v0
40 ; GFX9: global_store_dword v
41 define void @load_local_lo_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
43 %load = load i16, i16 addrspace(3)* %in
44 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
45 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
46 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; The base vector is zeroinitializer, so lane 1 is constant zero and lane 0 is
; the loaded i16. The gfx900 checks expect the zero to be materialized with
; v_mov_b32 first and ds_read_u16_d16 to target that same register.
50 ; GCN-LABEL: {{^}}load_local_lo_v2i16_zerolo:
52 ; GFX900-NEXT: v_mov_b32_e32 v1, 0
53 ; GFX900-NEXT: ds_read_u16_d16 v1, v0
54 ; GFX900-NEXT: s_waitcnt
55 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
56 ; GFX900-NEXT: s_setpc_b64
58 ; NO-D16-HI: ds_read_u16 v
59 define <2 x i16> @load_local_lo_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
61 %load = load i16, i16 addrspace(3)* %in
62 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 0
; The base vector is the constant <half 0.0, half 2.0>; lane 0 is replaced by a
; half loaded through an addrspace(3) pointer. The gfx900 checks expect the
; constant to be set up with v_mov_b32 before ds_read_u16_d16 targets the same
; register.
66 ; GCN-LABEL: {{^}}load_local_lo_v2f16_fpimm:
68 ; GFX900-NEXT: v_mov_b32_e32 v1, 2.0
69 ; GFX900-NEXT: ds_read_u16_d16 v1, v0
70 ; GFX900-NEXT: s_waitcnt
71 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
72 ; GFX900-NEXT: s_setpc_b64
74 ; NO-D16-HI: ds_read_u16 v
75 define <2 x half> @load_local_lo_v2f16_fpimm(half addrspace(3)* %in) #0 {
77 %load = load half, half addrspace(3)* %in
78 %build = insertelement <2 x half> <half 0.0, half 2.0>, half %load, i32 0
; The i32 argument %reg is reinterpreted as <2 x half>; lane 0 is then replaced
; by a half loaded through an addrspace(3) pointer and the vector is stored.
; The gfx900 checks expect ds_read_u16_d16 into v1 followed directly by the
; store of v1.
82 ; GCN-LABEL: {{^}}load_local_lo_v2f16_reghi_vreg:
84 ; GFX900-NEXT: ds_read_u16_d16 v1, v0
85 ; GFX900-NEXT: s_waitcnt
86 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
87 ; GFX900-NEXT: s_waitcnt
88 ; GFX900-NEXT: s_setpc_b64
90 ; NO-D16-HI: ds_read_u16 v
91 define void @load_local_lo_v2f16_reghi_vreg(half addrspace(3)* %in, i32 %reg) #0 {
93 %reg.bc = bitcast i32 %reg to <2 x half>
94 %load = load half, half addrspace(3)* %in
95 %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
96 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Lane 1 is the scalar half argument %reg and lane 0 is a half loaded through
; an addrspace(3) pointer. The gfx900 checks expect a plain ds_read_u16 plus
; v_and_b32 / v_lshl_or_b32 packing rather than a d16 load.
99 ; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg:
101 ; GFX900: ds_read_u16 v
102 ; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
103 ; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
104 ; GFX900: global_store_dword
106 ; NO-D16-HI: ds_read_u16 v
107 define void @load_local_lo_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
109 %load = load half, half addrspace(3)* %in
110 %build0 = insertelement <2 x half> undef, half %reg, i32 1
111 %build1 = insertelement <2 x half> %build0, half %load, i32 0
112 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; An i8 is loaded through an addrspace(3) pointer, zero-extended to i16 and
; inserted into lane 0 of the i32 argument viewed as <2 x i16>. The gfx900
; checks expect ds_read_u8_d16.
116 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_zexti8:
118 ; GFX900-NEXT: ds_read_u8_d16 v1, v0
119 ; GFX900-NEXT: s_waitcnt
120 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
121 ; GFX900-NEXT: s_waitcnt
122 ; GFX900-NEXT: s_setpc_b64
124 ; NO-D16-HI: ds_read_u8 v
125 define void @load_local_lo_v2i16_reghi_vreg_zexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
127 %reg.bc = bitcast i32 %reg to <2 x i16>
128 %load = load i8, i8 addrspace(3)* %in
129 %ext = zext i8 %load to i16
130 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
131 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Lane 1 is the i16 argument %reg; lane 0 is a zero-extended i8 loaded through
; an addrspace(3) pointer. The gfx900 checks expect a plain ds_read_u8 (no d16
; form) before the store.
135 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_zexti8:
137 ; GFX900: ds_read_u8 v
138 ; GFX900: global_store_dword
139 ; GFX900-NEXT: s_waitcnt
140 ; GFX900-NEXT: s_setpc_b64
142 ; NO-D16-HI: ds_read_u8 v
143 define void @load_local_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
145 %load = load i8, i8 addrspace(3)* %in
146 %ext = zext i8 %load to i16
147 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
148 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
149 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; An i8 is loaded through an addrspace(3) pointer, sign-extended to i16 and
; inserted into lane 0 of the i32 argument viewed as <2 x i16>. The gfx900
; checks expect ds_read_i8_d16.
153 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_sexti8:
155 ; GFX900-NEXT: ds_read_i8_d16 v1, v0
156 ; GFX900-NEXT: s_waitcnt
157 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
158 ; GFX900-NEXT: s_waitcnt
159 ; GFX900-NEXT: s_setpc_b64
161 ; NO-D16-HI: ds_read_i8 v
162 define void @load_local_lo_v2i16_reghi_vreg_sexti8(i8 addrspace(3)* %in, i32 %reg) #0 {
164 %reg.bc = bitcast i32 %reg to <2 x i16>
165 %load = load i8, i8 addrspace(3)* %in
166 %ext = sext i8 %load to i16
167 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
168 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Lane 1 is the i16 argument %reg; lane 0 is a sign-extended i8 loaded through
; an addrspace(3) pointer. The gfx900 checks expect a plain ds_read_i8 followed
; by v_and_b32 / v_lshl_or_b32 packing.
172 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reglo_vreg_sexti8:
174 ; GFX900: ds_read_i8 v
175 ; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
176 ; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
178 ; NO-D16-HI: ds_read_i8 v
179 define void @load_local_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
181 %load = load i8, i8 addrspace(3)* %in
182 %ext = sext i8 %load to i16
183 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
184 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 0
185 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-typed variant: the zero-extended i8 is bitcast from i16 to half before
; being inserted into lane 0; lane 1 is the half argument %reg. The gfx900
; checks expect a plain ds_read_u8 before the store.
189 ; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_zexti8:
191 ; GFX900: ds_read_u8 v
192 ; GFX900: global_store_dword
193 ; GFX900-NEXT: s_waitcnt
194 ; GFX900-NEXT: s_setpc_b64
196 ; NO-D16-HI: ds_read_u8 v
197 define void @load_local_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
199 %load = load i8, i8 addrspace(3)* %in
200 %ext = zext i8 %load to i16
201 %bitcast = bitcast i16 %ext to half
202 %build0 = insertelement <2 x half> undef, half %reg, i32 1
203 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
204 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Half-typed variant: the sign-extended i8 is bitcast from i16 to half before
; being inserted into lane 0; lane 1 is the half argument %reg. The gfx900
; checks expect ds_read_i8 followed by v_and_b32 / v_lshl_or_b32 packing.
208 ; GCN-LABEL: {{^}}load_local_lo_v2f16_reglo_vreg_sexti8:
210 ; GFX900: ds_read_i8 v
211 ; GFX900: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
212 ; GFX900: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16, v{{[0-9]+}}
214 ; NO-D16-HI: ds_read_i8 v
215 define void @load_local_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
217 %load = load i8, i8 addrspace(3)* %in
218 %ext = sext i8 %load to i16
219 %bitcast = bitcast i16 %ext to half
220 %build0 = insertelement <2 x half> undef, half %reg, i32 1
221 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 0
222 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; The loaded i16 has two uses: it is stored to the null addrspace(3) pointer
; and inserted into lane 0 of %reg. %elt1 is extracted but has no use in the
; visible lines. The gfx900 checks expect a plain ds_read_u16 and a v_bfi_b32
; merge rather than a d16 load.
226 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lo:
227 ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
228 ; GFX900: ds_read_u16 v0, v0
229 ; GFX900: v_mov_b32_e32 v3, 0
230 ; GFX900: v_mov_b32_e32 v2, 0xffff
231 ; GFX900: s_waitcnt lgkmcnt(0)
232 ; GFX900: ds_write_b16 v3, v0
233 ; GFX900: v_bfi_b32 v0, v2, v0, v1
234 ; GFX900: global_store_dword v[0:1], v0, off
235 ; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
236 ; GFX900: s_setpc_b64 s[30:31]
238 ; NO-D16-HI: ds_read_u16 v
239 define void @load_local_lo_v2i16_reghi_vreg_multi_use_lo(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
241 %load = load i16, i16 addrspace(3)* %in
242 %elt1 = extractelement <2 x i16> %reg, i32 1
243 store i16 %load, i16 addrspace(3)* null
244 %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
245 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Here the extra use is on the high element: lane 1 of %reg is extracted and
; stored to the null addrspace(3) pointer, while the loaded i16 is only used
; for lane 0. The gfx900 checks still expect ds_read_u16_d16, with the high
; half copied out by v_lshrrev_b32 beforehand.
249 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_hi:
250 ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
251 ; GFX900: v_lshrrev_b32_e32 v2, 16, v1
252 ; GFX900: ds_read_u16_d16 v1, v0
253 ; GFX900: v_mov_b32_e32 v0, 0
254 ; GFX900: ds_write_b16 v0, v2
255 ; GFX900: s_waitcnt lgkmcnt(1)
256 ; GFX900: global_store_dword v[0:1], v1, off
257 ; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
258 ; GFX900: s_setpc_b64 s[30:31]
260 ; NO-D16-HI: ds_read_u16 v
261 define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in, <2 x i16> %reg) #0 {
263 %load = load i16, i16 addrspace(3)* %in
264 %elt1 = extractelement <2 x i16> %reg, i32 1
265 store i16 %elt1, i16 addrspace(3)* null
266 %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
267 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Both halves have an extra use: the loaded i16 is stored to %out0 and lane 1
; of %reg is stored to %out1 (all three pointers are noalias). The gfx900
; checks expect a plain ds_read_u16, two ds_write_b16 and a v_bfi_b32 merge.
271 ; GCN-LABEL: {{^}}load_local_lo_v2i16_reghi_vreg_multi_use_lohi:
272 ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
273 ; GFX900: ds_read_u16 v0, v0
274 ; GFX900: v_lshrrev_b32_e32 v[[A_F16:[0-9]+]], 16, v1
275 ; GFX900: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0xffff
276 ; GFX900: s_waitcnt lgkmcnt(0)
277 ; GFX900: ds_write_b16 v2, v0
278 ; GFX900: ds_write_b16 v3, v[[A_F16]]
279 ; GFX900: v_bfi_b32 v0, v[[A_F32]], v0, v1
280 ; GFX900: global_store_dword v[0:1], v0, off
281 ; GFX900: s_waitcnt vmcnt(0) lgkmcnt(0)
282 ; GFX900: s_setpc_b64 s[30:31]
284 ; NO-D16-HI: ds_read_u16 v
285 define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noalias %in, <2 x i16> %reg, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
287 %load = load i16, i16 addrspace(3)* %in
288 %elt1 = extractelement <2 x i16> %reg, i32 1
289 store i16 %load, i16 addrspace(3)* %out0
290 store i16 %elt1, i16 addrspace(3)* %out1
291 %build1 = insertelement <2 x i16> %reg, i16 %load, i32 0
292 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(1) variant: an i16 is loaded at element index -2047 (byte offset
; -4094, matching the offset in the checks) and inserted into lane 0 of the
; i32 argument viewed as <2 x i16>. The gfx900 checks expect
; global_load_short_d16.
296 ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg:
298 ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
299 ; GFX900-NEXT: s_waitcnt
300 ; GFX900-NEXT: global_store_dword
301 ; GFX900-NEXT: s_waitcnt
302 ; GFX900-NEXT: s_setpc_b64
304 ; GFX906: global_load_ushort v0, v[0:1], off offset:-4094
306 define void @load_global_lo_v2i16_reglo_vreg(i16 addrspace(1)* %in, i32 %reg) #0 {
308 %reg.bc = bitcast i32 %reg to <2 x i16>
309 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
310 %load = load i16, i16 addrspace(1)* %gep
311 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
312 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(1) half variant: a half is loaded at element index -2047 (byte
; offset -4094) and inserted into lane 0 of the i32 argument viewed as
; <2 x half>. gfx900 expects global_load_short_d16; gfx906 expects a ushort
; load plus shift/and/or packing; fiji expects flat_load_ushort.
316 ; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg:
318 ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
319 ; GFX900-NEXT: s_waitcnt
320 ; GFX900-NEXT: global_store_dword
321 ; GFX900-NEXT: s_waitcnt
322 ; GFX900-NEXT: s_setpc_b64
324 ; GFX906: global_load_ushort v0, v[0:1], off offset:-4094
325 ; GFX906: v_lshrrev_b32
326 ; GFX906: v_and_b32_e32
327 ; GFX906: v_lshl_or_b32
329 ; GFX803: flat_load_ushort
330 define void @load_global_lo_v2f16_reglo_vreg(half addrspace(1)* %in, i32 %reg) #0 {
332 %reg.bc = bitcast i32 %reg to <2 x half>
333 %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
334 %load = load half, half addrspace(1)* %gep
335 %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
336 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(1) i8 variant: a byte is loaded at offset -4095, zero-extended to
; i16 and inserted into lane 0 of the i32 argument viewed as <2 x i16>. The
; gfx900 checks expect global_load_ubyte_d16.
340 ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_zexti8:
342 ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
343 ; GFX900-NEXT: s_waitcnt
344 ; GFX900-NEXT: global_store_dword
345 ; GFX900-NEXT: s_waitcnt
346 ; GFX900-NEXT: s_setpc_b64
348 ; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
351 ; GFX803: flat_load_ubyte
352 define void @load_global_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
354 %reg.bc = bitcast i32 %reg to <2 x i16>
355 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
356 %load = load i8, i8 addrspace(1)* %gep
357 %ext = zext i8 %load to i16
358 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
359 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(1) i8 variant: a byte is loaded at offset -4095, sign-extended to
; i16 and inserted into lane 0 of the i32 argument viewed as <2 x i16>. The
; gfx900 checks expect global_load_sbyte_d16.
363 ; GCN-LABEL: {{^}}load_global_lo_v2i16_reglo_vreg_sexti8:
365 ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
366 ; GFX900-NEXT: s_waitcnt
367 ; GFX900-NEXT: global_store_dword
368 ; GFX900-NEXT: s_waitcnt
369 ; GFX900-NEXT: s_setpc_b64
371 ; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
374 ; GFX803: flat_load_sbyte
375 define void @load_global_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
377 %reg.bc = bitcast i32 %reg to <2 x i16>
378 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
379 %load = load i8, i8 addrspace(1)* %gep
380 %ext = sext i8 %load to i16
381 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
382 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(1) i8-to-half variant: the zero-extended byte is bitcast from i16
; to half before being inserted into lane 0 of the i32 argument viewed as
; <2 x half>. The gfx900 checks expect global_load_ubyte_d16.
386 ; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_zexti8:
388 ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
389 ; GFX900-NEXT: s_waitcnt
390 ; GFX900-NEXT: global_store_dword
391 ; GFX900-NEXT: s_waitcnt
392 ; GFX900-NEXT: s_setpc_b64
394 ; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
395 ; GFX906: v_and_b32_e32
396 ; GFX906: v_lshl_or_b32
398 ; GFX803: flat_load_ubyte
399 define void @load_global_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
401 %reg.bc = bitcast i32 %reg to <2 x half>
402 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
403 %load = load i8, i8 addrspace(1)* %gep
404 %ext = zext i8 %load to i16
405 %bitcast = bitcast i16 %ext to half
406 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
407 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(1) i8-to-half variant: the sign-extended byte is bitcast from i16
; to half before being inserted into lane 0 of the i32 argument viewed as
; <2 x half>. The gfx900 checks expect global_load_sbyte_d16.
411 ; GCN-LABEL: {{^}}load_global_lo_v2f16_reglo_vreg_sexti8:
413 ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
414 ; GFX900-NEXT: s_waitcnt
415 ; GFX900-NEXT: global_store_dword
416 ; GFX900-NEXT: s_waitcnt
417 ; GFX900-NEXT: s_setpc_b64
419 ; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
420 ; GFX906: v_lshrrev_b32
422 ; GFX906: v_lshl_or_b32
424 ; GFX803: flat_load_sbyte
425 define void @load_global_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i32 %reg) #0 {
427 %reg.bc = bitcast i32 %reg to <2 x half>
428 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
429 %load = load i8, i8 addrspace(1)* %gep
430 %ext = sext i8 %load to i16
431 %bitcast = bitcast i16 %ext to half
432 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
433 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Default-address-space (i16*) variant: the loaded i16 replaces lane 0 of the
; i32 argument viewed as <2 x i16>. gfx900 expects flat_load_short_d16; fiji
; expects flat_load_ushort plus v_or_b32; gfx906 expects flat_load_ushort
; merged with v_bfi_b32 using a 0xffff mask.
437 ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reghi_vreg:
439 ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
440 ; GFX900-NEXT: s_waitcnt
441 ; GFX900-NEXT: global_store_dword v[0:1], v2
442 ; GFX900-NEXT: s_waitcnt
443 ; GFX900-NEXT: s_setpc_b64
445 ; GFX803: flat_load_ushort v{{[0-9]+}}
446 ; GFX803: v_or_b32_e32
448 ; GFX906: flat_load_ushort [[LOAD:v[0-9]+]]
449 ; GFX906: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
450 ; GFX906: v_bfi_b32 v{{[0-9]+}}, [[MASK]], [[LOAD]], v2
451 ; GFX906: global_store_dword
452 define void @load_flat_lo_v2i16_reghi_vreg(i16* %in, i32 %reg) #0 {
454 %reg.bc = bitcast i32 %reg to <2 x i16>
455 %load = load i16, i16* %in
456 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
457 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Default-address-space (half*) variant: the loaded half replaces lane 0 of the
; i32 argument viewed as <2 x half>. gfx900 expects flat_load_short_d16; the
; gfx906 checks pin the current shift / and / lshl_or sequence, which the
; existing FIXME marks as improvable.
461 ; GCN-LABEL: {{^}}load_flat_lo_v2f16_reghi_vreg:
463 ; GFX900-NEXT: flat_load_short_d16 v2, v[0:1]
464 ; GFX900-NEXT: s_waitcnt
465 ; GFX900-NEXT: global_store_dword v[0:1], v2
466 ; GFX900-NEXT: s_waitcnt
467 ; GFX900-NEXT: s_setpc_b64
469 ; GFX803: flat_load_ushort v{{[0-9]+}}
470 ; GFX803: v_or_b32_e32
472 ; FIXME: and should be removable
473 ; GFX906: flat_load_ushort [[LOAD:v[0-9]+]]
474 ; GFX906: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, v2
475 ; GFX906: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
476 ; GFX906: v_lshl_or_b32 [[LSHL_OR:v[0-9]+]], [[SHR]], 16, [[AND]]
477 ; GFX906: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[LSHL_OR]]
478 define void @load_flat_lo_v2f16_reghi_vreg(half* %in, i32 %reg) #0 {
480 %reg.bc = bitcast i32 %reg to <2 x half>
481 %load = load half, half* %in
482 %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
483 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Default-address-space i8 variant: the byte is zero-extended to i16 and
; inserted into lane 0 of the i32 argument viewed as <2 x i16>. gfx900 expects
; flat_load_ubyte_d16; fiji expects the halves to be combined with v_perm_b32
; using the selector 0x5040c00.
487 ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_zexti8:
489 ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
490 ; GFX900-NEXT: s_waitcnt
491 ; GFX900-NEXT: global_store_dword v[0:1], v2
492 ; GFX900-NEXT: s_waitcnt
493 ; GFX900-NEXT: s_setpc_b64
495 ; GFX803: flat_load_ubyte [[LO:v[0-9]+]]
496 ; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
497 ; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
498 ; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
499 ; GFX803: flat_store_dword v[0:1], [[RES]]
501 ; GFX906: flat_load_ubyte
503 define void @load_flat_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
505 %reg.bc = bitcast i32 %reg to <2 x i16>
506 %load = load i8, i8* %in
507 %ext = zext i8 %load to i16
508 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
509 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Default-address-space i8 variant: the byte is sign-extended to i16 and
; inserted into lane 0 of the i32 argument viewed as <2 x i16>. gfx900 expects
; flat_load_sbyte_d16; fiji expects flat_load_sbyte plus an SDWA v_or_b32.
513 ; GCN-LABEL: {{^}}load_flat_lo_v2i16_reglo_vreg_sexti8:
515 ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
516 ; GFX900-NEXT: s_waitcnt
517 ; GFX900-NEXT: global_store_dword v[0:1], v2
518 ; GFX900-NEXT: s_waitcnt
519 ; GFX900-NEXT: s_setpc_b64
521 ; GFX803: flat_load_sbyte v{{[0-9]+}}
522 ; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
524 ; GFX906: flat_load_sbyte
526 define void @load_flat_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
528 %reg.bc = bitcast i32 %reg to <2 x i16>
529 %load = load i8, i8* %in
530 %ext = sext i8 %load to i16
531 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
532 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Default-address-space i8-to-half variant: the zero-extended byte is bitcast
; from i16 to half and inserted into lane 0 of the i32 argument viewed as
; <2 x half>. gfx900 expects flat_load_ubyte_d16; fiji expects a v_perm_b32
; merge; gfx906 expects shift plus v_lshl_or_b32.
536 ; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_zexti8:
538 ; GFX900-NEXT: flat_load_ubyte_d16 v2, v[0:1]
539 ; GFX900-NEXT: s_waitcnt
540 ; GFX900-NEXT: global_store_dword v[0:1], v2
541 ; GFX900-NEXT: s_waitcnt
542 ; GFX900-NEXT: s_setpc_b64
544 ; GFX803: flat_load_ubyte [[LO:v[0-9]+]]
545 ; GFX803: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, v2
546 ; GFX803: s_mov_b32 [[MASK:s[0-9]+]], 0x5040c00
547 ; GFX803: v_perm_b32 [[RES:v[0-9]+]], [[HI]], [[LO]], [[MASK]]
548 ; GFX803: flat_store_dword v[0:1], [[RES]]
550 ; GFX906: flat_load_ubyte
551 ; GFX906: v_lshrrev_b32
553 ; GFX906: v_lshl_or_b32
554 define void @load_flat_lo_v2f16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
556 %reg.bc = bitcast i32 %reg to <2 x half>
557 %load = load i8, i8* %in
558 %ext = zext i8 %load to i16
559 %bitcast = bitcast i16 %ext to half
560 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
561 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Default-address-space i8-to-half variant: the sign-extended byte is bitcast
; from i16 to half and inserted into lane 0 of the i32 argument viewed as
; <2 x half>. gfx900 expects flat_load_sbyte_d16; fiji expects an SDWA
; v_or_b32; gfx906 expects shift plus v_lshl_or_b32.
565 ; GCN-LABEL: {{^}}load_flat_lo_v2f16_reglo_vreg_sexti8:
567 ; GFX900-NEXT: flat_load_sbyte_d16 v2, v[0:1]
568 ; GFX900-NEXT: s_waitcnt
569 ; GFX900-NEXT: global_store_dword v[0:1], v2
570 ; GFX900-NEXT: s_waitcnt
571 ; GFX900-NEXT: s_setpc_b64
573 ; GFX803: flat_load_sbyte v{{[0-9]+}}
574 ; GFX803: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
576 ; GFX906: flat_load_sbyte
577 ; GFX906: v_lshrrev_b32
579 ; GFX906: v_lshl_or_b32
580 define void @load_flat_lo_v2f16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
582 %reg.bc = bitcast i32 %reg to <2 x half>
583 %load = load i8, i8* %in
584 %ext = sext i8 %load to i16
585 %bitcast = bitcast i16 %ext to half
586 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
587 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(5) byval variant: an i16 is loaded at element index 2047 (byte
; offset 4094, matching the offset in the checks) and inserted into lane 0 of
; the i32 argument viewed as <2 x i16>. The gfx900 checks expect
; buffer_load_short_d16 addressed off s32.
591 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
593 ; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}}
594 ; GFX900-NEXT: s_waitcnt
595 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
596 ; GFX900-NEXT: s_waitcnt
597 ; GFX900-NEXT: s_setpc_b64
599 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
600 define void @load_private_lo_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i32 %reg) #0 {
602 %reg.bc = bitcast i32 %reg to <2 x i16>
603 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
604 %load = load i16, i16 addrspace(5)* %gep
605 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
606 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(5) byval variant where lane 1 comes from the scalar i16 argument
; %reg and lane 0 from the load at element index 2047. The gfx900 checks
; expect a plain buffer_load_ushort followed by v_lshl_or_b32 rather than a
; d16 load.
610 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
612 ; GFX900: buffer_load_ushort v1, off, s[0:3], s32 offset:4094{{$}}
613 ; GFX900-NEXT: s_waitcnt
615 ; GFX900: v_lshl_or_b32
617 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
618 ; GFX900-NEXT: s_waitcnt
619 ; GFX900-NEXT: s_setpc_b64
621 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
622 define void @load_private_lo_v2i16_reghi_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
624 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
625 %load = load i16, i16 addrspace(5)* %gep
626 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
627 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
628 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(5) byval half variant: a half is loaded at element index 2047
; (byte offset 4094) and inserted into lane 0 of the i32 argument viewed as
; <2 x half>. The gfx900 checks expect buffer_load_short_d16.
632 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
634 ; GFX900: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094{{$}}
635 ; GFX900-NEXT: s_waitcnt
636 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
637 ; GFX900-NEXT: s_waitcnt
638 ; GFX900-NEXT: s_setpc_b64
640 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
641 define void @load_private_lo_v2f16_reglo_vreg(half addrspace(5)* byval %in, i32 %reg) #0 {
643 %reg.bc = bitcast i32 %reg to <2 x half>
644 %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
645 %load = load half, half addrspace(5)* %gep
646 %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
647 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; The %in argument is unused; the volatile i16 load reads the constant
; addrspace(5) address 4094 (inttoptr). The gfx900 checks expect
; buffer_load_short_d16 addressed off s33 with offset 4094.
651 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff:
653 ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
654 ; GFX900-NEXT: s_waitcnt
655 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
656 ; GFX900-NEXT: s_waitcnt
657 ; GFX900-NEXT: s_setpc_b64
659 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
660 define void @load_private_lo_v2i16_reglo_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
662 %reg.bc = bitcast i32 %reg to <2 x i16>
663 %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
664 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
665 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; The visible IR lines and checks are the same as in
; load_private_lo_v2i16_reglo_vreg_nooff: a volatile i16 load from the constant
; addrspace(5) address 4094 inserted into lane 0 of the bitcast i32 argument.
669 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg_nooff:
671 ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
672 ; GFX900-NEXT: s_waitcnt
673 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
674 ; GFX900-NEXT: s_waitcnt
675 ; GFX900-NEXT: s_setpc_b64
677 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
678 define void @load_private_lo_v2i16_reghi_vreg_nooff(i16 addrspace(5)* %in, i32 %reg) #0 {
680 %reg.bc = bitcast i32 %reg to <2 x i16>
681 %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
682 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
683 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-typed constant-address variant: a volatile half load from addrspace(5)
; address 4094 replaces lane 0 of the i32 argument viewed as <2 x half>. The
; gfx900 checks expect buffer_load_short_d16 off s33.
687 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff:
689 ; GFX900-NEXT: buffer_load_short_d16 v1, off, s[0:3], s33 offset:4094{{$}}
690 ; GFX900-NEXT: s_waitcnt
691 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
692 ; GFX900-NEXT: s_waitcnt
693 ; GFX900-NEXT: s_setpc_b64
695 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
696 define void @load_private_lo_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, i32 %reg) #0 {
698 %reg.bc = bitcast i32 %reg to <2 x half>
699 %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
700 %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
701 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(5) byval i8 variant: the byte at offset 4095 is zero-extended to
; i16 and inserted into lane 0 of the i32 argument viewed as <2 x i16>. The
; gfx900 checks expect buffer_load_ubyte_d16.
705 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
707 ; GFX900: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095{{$}}
708 ; GFX900-NEXT: s_waitcnt
709 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
710 ; GFX900-NEXT: s_waitcnt
711 ; GFX900-NEXT: s_setpc_b64
713 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
714 define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
716 %reg.bc = bitcast i32 %reg to <2 x i16>
717 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
718 %load = load i8, i8 addrspace(5)* %gep
719 %ext = zext i8 %load to i16
720 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
721 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(5) byval i8 variant: the byte at offset 4095 is sign-extended to
; i16 and inserted into lane 0 of the i32 argument viewed as <2 x i16>. The
; gfx900 checks expect buffer_load_sbyte_d16.
725 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
727 ; GFX900: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095{{$}}
728 ; GFX900-NEXT: s_waitcnt
729 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
730 ; GFX900-NEXT: s_waitcnt
731 ; GFX900-NEXT: s_setpc_b64
733 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
734 define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i32 %reg) #0 {
736 %reg.bc = bitcast i32 %reg to <2 x i16>
737 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
738 %load = load i8, i8 addrspace(5)* %gep
739 %ext = sext i8 %load to i16
740 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
741 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Constant-address i8 variant: a volatile byte load from addrspace(5) address
; 4094 is zero-extended and inserted into lane 0 of the i32 argument viewed as
; <2 x i16>. The gfx900 checks expect buffer_load_ubyte_d16 off s33.
745 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_zexti8:
747 ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
748 ; GFX900-NEXT: s_waitcnt
749 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
750 ; GFX900-NEXT: s_waitcnt
751 ; GFX900-NEXT: s_setpc_b64
753 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
754 define void @load_private_lo_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
756 %reg.bc = bitcast i32 %reg to <2 x i16>
757 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
758 %ext = zext i8 %load to i16
759 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
760 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Constant-address i8 variant: a volatile byte load from addrspace(5) address
; 4094 is sign-extended and inserted into lane 0 of the i32 argument viewed as
; <2 x i16>. The gfx900 checks expect buffer_load_sbyte_d16 off s33.
764 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_nooff_sexti8:
766 ; GFX900-NEXT: buffer_load_sbyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
767 ; GFX900-NEXT: s_waitcnt
768 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
769 ; GFX900-NEXT: s_waitcnt
770 ; GFX900-NEXT: s_setpc_b64
772 ; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
773 define void @load_private_lo_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
775 %reg.bc = bitcast i32 %reg to <2 x i16>
776 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
777 %ext = sext i8 %load to i16
778 %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
779 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Constant-address i8-to-half variant: the zero-extended volatile byte load is
; bitcast from i16 to half before being inserted into lane 0 of the i32
; argument viewed as <2 x half>. The gfx900 checks expect
; buffer_load_ubyte_d16 off s33.
783 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_nooff_zexti8:
785 ; GFX900-NEXT: buffer_load_ubyte_d16 v1, off, s[0:3], s33 offset:4094{{$}}
786 ; GFX900-NEXT: s_waitcnt
787 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
788 ; GFX900-NEXT: s_waitcnt
789 ; GFX900-NEXT: s_setpc_b64
791 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
792 define void @load_private_lo_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i32 %reg) #0 {
794 %reg.bc = bitcast i32 %reg to <2 x half>
795 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
796 %ext = zext i8 %load to i16
797 %bc.ext = bitcast i16 %ext to half
798 %build1 = insertelement <2 x half> %reg.bc, half %bc.ext, i32 0
799 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(4) variant: an i16 is loaded at element index -2047 (byte offset
; -4094) and inserted into lane 0 of the i32 argument viewed as <2 x i16>. The
; gfx900 checks expect the same global_load_short_d16 form as the addrspace(1)
; tests.
803 ; GCN-LABEL: {{^}}load_constant_lo_v2i16_reglo_vreg:
805 ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
806 ; GFX900-NEXT: s_waitcnt
807 ; GFX900-NEXT: global_store_dword
808 ; GFX900-NEXT: s_waitcnt
809 ; GFX900-NEXT: s_setpc_b64
811 ; GFX803: flat_load_ushort
813 ; GFX906: global_load_ushort
814 define void @load_constant_lo_v2i16_reglo_vreg(i16 addrspace(4)* %in, i32 %reg) #0 {
816 %reg.bc = bitcast i32 %reg to <2 x i16>
817 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
818 %load = load i16, i16 addrspace(4)* %gep
819 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
820 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; addrspace(4) half variant: a half is loaded at element index -2047 (byte
; offset -4094) and inserted into lane 0 of the i32 argument viewed as
; <2 x half>. The label is anchored with {{^}} and a trailing colon, like every
; other label in this file, so it cannot match as a substring of a longer
; function name such as the _zexti8 / _sexti8 variants.
824 ; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg:
826 ; GFX900-NEXT: global_load_short_d16 v2, v[0:1], off offset:-4094
827 ; GFX900-NEXT: s_waitcnt
828 ; GFX900-NEXT: global_store_dword
829 ; GFX900-NEXT: s_waitcnt
830 ; GFX900-NEXT: s_setpc_b64
832 ; GFX803: flat_load_ushort
834 ; GFX906: global_load_ushort
835 define void @load_constant_lo_v2f16_reglo_vreg(half addrspace(4)* %in, i32 %reg) #0 {
837 %reg.bc = bitcast i32 %reg to <2 x half>
838 %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
839 %load = load half, half addrspace(4)* %gep
840 %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
841 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(4) i8-to-half variant: the byte at offset -4095 is zero-extended,
; bitcast from i16 to half and inserted into lane 0 of the i32 argument viewed
; as <2 x half>. The gfx900 checks expect global_load_ubyte_d16.
845 ; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_zexti8:
847 ; GFX900-NEXT: global_load_ubyte_d16 v2, v[0:1], off offset:-4095
848 ; GFX900-NEXT: s_waitcnt
849 ; GFX900-NEXT: global_store_dword
850 ; GFX900-NEXT: s_waitcnt
851 ; GFX900-NEXT: s_setpc_b64
853 ; GFX906: global_load_ubyte v0, v[0:1], off offset:-4095
854 ; GFX906: v_and_b32_e32
855 ; GFX906: v_lshl_or_b32
857 ; GFX803: flat_load_ubyte
858 define void @load_constant_lo_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
860 %reg.bc = bitcast i32 %reg to <2 x half>
861 %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
862 %load = load i8, i8 addrspace(4)* %gep
863 %ext = zext i8 %load to i16
864 %bitcast = bitcast i16 %ext to half
865 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
866 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; addrspace(4) i8-to-half variant: the byte at offset -4095 is sign-extended,
; bitcast from i16 to half and inserted into lane 0 of the i32 argument viewed
; as <2 x half>. The gfx900 checks expect global_load_sbyte_d16.
870 ; GCN-LABEL: {{^}}load_constant_lo_v2f16_reglo_vreg_sexti8:
872 ; GFX900-NEXT: global_load_sbyte_d16 v2, v[0:1], off offset:-4095
873 ; GFX900-NEXT: s_waitcnt
874 ; GFX900-NEXT: global_store_dword
875 ; GFX900-NEXT: s_waitcnt
876 ; GFX900-NEXT: s_setpc_b64
878 ; GFX906: global_load_sbyte v0, v[0:1], off offset:-4095
879 ; GFX906: v_lshrrev_b32
881 ; GFX906: v_lshl_or_b32
883 ; GFX803: flat_load_sbyte
884 define void @load_constant_lo_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i32 %reg) #0 {
886 %reg.bc = bitcast i32 %reg to <2 x half>
887 %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
888 %load = load i8, i8 addrspace(4)* %gep
889 %ext = sext i8 %load to i16
890 %bitcast = bitcast i16 %ext to half
891 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
892 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Two addrspace(5) allocas are used: a 40-byte [10 x i32] that receives a
; volatile store, and a [4096 x i16] from which element 2027 is loaded
; (volatile). The expected offset 4094 equals 40 + 2027 * 2, which is
; consistent with %obj1 being laid out directly after %obj0 (presumed layout;
; the frame assignment itself is not visible here).
896 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_to_offset:
897 ; GFX900: buffer_store_dword
898 ; GFX900-NEXT: buffer_load_short_d16 v0, off, s[0:3], s32 offset:4094
900 ; NO-D16-HI: buffer_load_ushort v
901 define void @load_private_lo_v2i16_reglo_vreg_to_offset(i32 %reg) #0 {
903 %obj0 = alloca [10 x i32], align 4, addrspace(5)
904 %obj1 = alloca [4096 x i16], align 2, addrspace(5)
905 %reg.bc = bitcast i32 %reg to <2 x i16>
906 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
907 store volatile i32 123, i32 addrspace(5)* %bc
908 %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
909 %load = load volatile i16, i16 addrspace(5)* %gep
910 %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
911 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Loads an i8 from an addrspace(5) alloca, sign-extends it to i16 and inserts
; it as element 0 of a <2 x i16> whose other element comes from %reg.
; Expected output, per the check lines below: the gfx900 run wants a
; buffer_store_dword immediately followed by a buffer_load_sbyte_d16 that uses
; the `off` address form with s32 and an immediate offset:4095; the gfx906 and
; fiji runs want a plain buffer_load_sbyte instead.
915 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8_to_offset:
916 ; GFX900: buffer_store_dword
917 ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
919 ; NO-D16-HI: buffer_load_sbyte v
920 define void @load_private_lo_v2i16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
; Two stack objects: a 40-byte array, then the byte array that is loaded from.
922 %obj0 = alloca [10 x i32], align 4, addrspace(5)
923 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
924 %reg.bc = bitcast i32 %reg to <2 x i16>
; Volatile store into %obj0 -- presumably so the first alloca cannot be
; removed; it is the buffer_store_dword the gfx900 check expects.
925 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
926 store volatile i32 123, i32 addrspace(5)* %bc
; Byte 4055 of %obj1. NOTE(review): if %obj0 occupies the first 40 bytes of
; the frame, that gives 40 + 4055 = 4095, matching the expected offset --
; confirm against the frame layout.
927 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
928 %load = load volatile i8, i8 addrspace(5)* %gep
929 %load.ext = sext i8 %load to i16
; Only element 0 is replaced; element 1 still holds the value taken from %reg.
930 %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
931 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Loads an i8 from an addrspace(5) alloca, zero-extends it to i16 and inserts
; it as element 0 of a <2 x i16> whose other element comes from %reg.
; Expected output, per the check lines below: the gfx900 run wants a
; buffer_store_dword immediately followed by a buffer_load_ubyte_d16 that uses
; the `off` address form with s32 and an immediate offset:4095; the gfx906 and
; fiji runs want a plain buffer_load_ubyte instead.
935 ; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8_to_offset:
936 ; GFX900: buffer_store_dword
937 ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
939 ; NO-D16-HI: buffer_load_ubyte v
940 define void @load_private_lo_v2i16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; Two stack objects: a 40-byte array, then the byte array that is loaded from.
942 %obj0 = alloca [10 x i32], align 4, addrspace(5)
943 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
944 %reg.bc = bitcast i32 %reg to <2 x i16>
; Volatile store into %obj0 -- presumably so the first alloca cannot be
; removed; it is the buffer_store_dword the gfx900 check expects.
945 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
946 store volatile i32 123, i32 addrspace(5)* %bc
; Byte 4055 of %obj1. NOTE(review): if %obj0 occupies the first 40 bytes of
; the frame, that gives 40 + 4055 = 4095, matching the expected offset --
; confirm against the frame layout.
947 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
948 %load = load volatile i8, i8 addrspace(5)* %gep
949 %load.ext = zext i8 %load to i16
; Only element 0 is replaced; element 1 still holds the value taken from %reg.
950 %build1 = insertelement <2 x i16> %reg.bc, i16 %load.ext, i32 0
951 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-vector counterpart of the sign-extending private byte load: an i8 from
; an addrspace(5) alloca is sign-extended to i16, retyped as half, and
; inserted as element 0 of a <2 x half> whose other element comes from %reg.
; Expected output, per the check lines below: the gfx900 run wants a
; buffer_store_dword immediately followed by a buffer_load_sbyte_d16 with the
; `off` address form, s32 and offset:4095; the gfx906 and fiji runs want a
; plain buffer_load_sbyte instead.
955 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_sexti8_to_offset:
956 ; GFX900: buffer_store_dword
957 ; GFX900-NEXT: buffer_load_sbyte_d16 v0, off, s[0:3], s32 offset:4095
959 ; NO-D16-HI: buffer_load_sbyte v
960 define void @load_private_lo_v2f16_reglo_vreg_sexti8_to_offset(i32 %reg) #0 {
; Two stack objects: a 40-byte array, then the byte array that is loaded from.
962 %obj0 = alloca [10 x i32], align 4, addrspace(5)
963 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
964 %reg.bc = bitcast i32 %reg to <2 x half>
; Volatile store into %obj0 -- presumably so the first alloca cannot be
; removed; it is the buffer_store_dword the gfx900 check expects.
965 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
966 store volatile i32 123, i32 addrspace(5)* %bc
; Byte 4055 of %obj1. NOTE(review): if %obj0 occupies the first 40 bytes of
; the frame, that gives 40 + 4055 = 4095, matching the expected offset --
; confirm against the frame layout.
967 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
968 %load = load volatile i8, i8 addrspace(5)* %gep
969 %load.ext = sext i8 %load to i16
; Same 16 bits, retyped as half so they can be inserted into the half vector.
970 %bitcast = bitcast i16 %load.ext to half
; Only element 0 is replaced; element 1 still holds the value taken from %reg.
971 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
972 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Half-vector counterpart of the zero-extending private byte load: an i8 from
; an addrspace(5) alloca is zero-extended to i16, retyped as half, and
; inserted as element 0 of a <2 x half> whose other element comes from %reg.
; Expected output, per the check lines below: the gfx900 run wants a
; buffer_store_dword immediately followed by a buffer_load_ubyte_d16 with the
; `off` address form, s32 and offset:4095; the gfx906 and fiji runs want a
; plain buffer_load_ubyte instead.
976 ; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg_zexti8_to_offset:
977 ; GFX900: buffer_store_dword
978 ; GFX900-NEXT: buffer_load_ubyte_d16 v0, off, s[0:3], s32 offset:4095
980 ; NO-D16-HI: buffer_load_ubyte v
981 define void @load_private_lo_v2f16_reglo_vreg_zexti8_to_offset(i32 %reg) #0 {
; Two stack objects: a 40-byte array, then the byte array that is loaded from.
983 %obj0 = alloca [10 x i32], align 4, addrspace(5)
984 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
985 %reg.bc = bitcast i32 %reg to <2 x half>
; Volatile store into %obj0 -- presumably so the first alloca cannot be
; removed; it is the buffer_store_dword the gfx900 check expects.
986 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
987 store volatile i32 123, i32 addrspace(5)* %bc
; Byte 4055 of %obj1. NOTE(review): if %obj0 occupies the first 40 bytes of
; the frame, that gives 40 + 4055 = 4095, matching the expected offset --
; confirm against the frame layout.
988 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
989 %load = load volatile i8, i8 addrspace(5)* %gep
990 %load.ext = zext i8 %load to i16
; Same 16 bits, retyped as half so they can be inserted into the half vector.
991 %bitcast = bitcast i16 %load.ext to half
; Only element 0 is replaced; element 1 still holds the value taken from %reg.
992 %build1 = insertelement <2 x half> %reg.bc, half %bitcast, i32 0
993 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Attribute group referenced as `#0` by the test functions in this file;
; it marks them nounwind.
997 attributes #0 = { nounwind }