1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
2 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
; Loads an i16 through an addrspace(3) ("local") pointer and inserts it as
; element 1 (the high half) of a <2 x i16> whose element 0 is undef.
; The gfx900 checks expect ds_read_u16_d16_hi to write v0 directly.
4 ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
6 ; GFX9-NEXT: ds_read_u16_d16_hi v0, v0
8 ; GFX9-NEXT: s_setpc_b64
11 define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
13 %load = load i16, i16 addrspace(3)* %in
14 %build = insertelement <2 x i16> undef, i16 %load, i32 1
; Same local i16 load, but element 0 of the vector comes from the %reg
; argument. The gfx900 checks expect the d16_hi load to target v1 and the
; packed result to be copied into v0 afterwards.
18 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
20 ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
21 ; GFX9-NEXT: s_waitcnt
22 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
23 ; GFX9-NEXT: s_setpc_b64
26 define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
28 %load = load i16, i16 addrspace(3)* %in
29 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
30 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
34 ; Show that we get reasonable regalloc without physreg constraints.
; Variant of the reglo test that stores the packed <2 x i16> to an undef
; addrspace(1) pointer instead of returning it, so the gfx900 checks expect
; no copy into a return register, only the d16_hi load and the dword store.
35 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
37 ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
38 ; GFX9-NEXT: s_waitcnt
39 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
40 ; GFX9-NEXT: s_waitcnt
41 ; GFX9-NEXT: s_setpc_b64
44 define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
46 %load = load i16, i16 addrspace(3)* %in
47 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
48 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
49 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Inserts the loaded local i16 into element 1 of a zeroinitializer vector.
; The gfx900 checks expect v1 to be zeroed first, then the d16_hi load to
; fill its high half, then a copy into v0.
53 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
55 ; GFX9-NEXT: v_mov_b32_e32 v1, 0
56 ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
57 ; GFX9-NEXT: s_waitcnt
58 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
59 ; GFX9-NEXT: s_setpc_b64
62 define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
64 %load = load i16, i16 addrspace(3)* %in
65 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
69 ; FIXME: Remove m0 initialization
; Scalar form of the zero-low pattern: the loaded i16 is zero-extended to i32
; and shifted left by 16. Both runs check for a plain 16-bit left shift; the
; gfx900 checks expect a regular ds_read_u16 rather than the d16_hi form.
70 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
72 ; GFX9-NEXT: ds_read_u16 v0, v0
73 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
74 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
75 ; GFX9-NEXT: s_setpc_b64
78 ; VI: v_lshlrev_b32_e32 v0, 16, v0
79 define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
81 %load = load i16, i16 addrspace(3)* %in
82 %zext = zext i16 %load to i32
83 %shift = shl i32 %zext, 16
; Half-precision counterpart of load_local_hi_v2i16_reglo_vreg: a local half
; load becomes element 1 of a <2 x half> whose element 0 is %reg. The gfx900
; checks expect the same ds_read_u16_d16_hi + global_store_dword sequence.
87 ; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
89 ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0
90 ; GFX9-NEXT: s_waitcnt
91 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
92 ; GFX9-NEXT: s_waitcnt
93 ; GFX9-NEXT: s_setpc_b64
96 define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
98 %load = load half, half addrspace(3)* %in
99 %build0 = insertelement <2 x half> undef, half %reg, i32 0
100 %build1 = insertelement <2 x half> %build0, half %load, i32 1
101 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Local i8 load zero-extended to i16 before being inserted as the high
; element. The gfx900 checks expect the extension to fold into the load as
; ds_read_u8_d16_hi.
105 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
107 ; GFX9-NEXT: ds_read_u8_d16_hi v1, v0
108 ; GFX9-NEXT: s_waitcnt
109 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
110 ; GFX9-NEXT: s_waitcnt
111 ; GFX9-NEXT: s_setpc_b64
114 define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
116 %load = load i8, i8 addrspace(3)* %in
117 %ext = zext i8 %load to i16
118 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
119 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
120 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Local i8 load sign-extended to i16 before being inserted as the high
; element. The gfx900 checks expect the extension to fold into the load as
; ds_read_i8_d16_hi.
124 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
126 ; GFX9-NEXT: ds_read_i8_d16_hi v1, v0
127 ; GFX9-NEXT: s_waitcnt
128 ; GFX9-NEXT: global_store_dword v[0:1], v1, off{{$}}
129 ; GFX9-NEXT: s_waitcnt
130 ; GFX9-NEXT: s_setpc_b64
133 define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
135 %load = load i8, i8 addrspace(3)* %in
136 %ext = sext i8 %load to i16
137 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
138 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
139 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Global (addrspace(1)) i16 load at a GEP of -2047 elements, inserted as the
; high element next to %reg. The gfx900 checks expect the GEP to fold into
; the instruction as the immediate offset:-4094 on global_load_short_d16_hi.
143 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
145 ; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
146 ; GFX9-NEXT: s_waitcnt
147 ; GFX9-NEXT: global_store_dword
148 ; GFX9-NEXT: s_waitcnt
149 ; GFX9-NEXT: s_setpc_b64
150 define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
152 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
153 %load = load i16, i16 addrspace(1)* %gep
154 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
155 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
156 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_global_hi_v2i16_reglo_vreg: global half
; load at a GEP of -2047 elements into the high element of a <2 x half>.
; The gfx900 checks expect global_load_short_d16_hi with offset:-4094.
160 ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
162 ; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
163 ; GFX9-NEXT: s_waitcnt
164 ; GFX9-NEXT: global_store_dword
165 ; GFX9-NEXT: s_waitcnt
166 ; GFX9-NEXT: s_setpc_b64
167 define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
169 %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
170 %load = load half, half addrspace(1)* %gep
171 %build0 = insertelement <2 x half> undef, half %reg, i32 0
172 %build1 = insertelement <2 x half> %build0, half %load, i32 1
173 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Global i8 load at a GEP of -4095 bytes, zero-extended to i16 and inserted
; as the high element. The gfx900 checks expect global_load_ubyte_d16_hi with
; the immediate offset:-4095.
177 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
179 ; GFX9-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
180 ; GFX9-NEXT: s_waitcnt
181 ; GFX9-NEXT: global_store_dword
182 ; GFX9-NEXT: s_waitcnt
183 ; GFX9-NEXT: s_setpc_b64
184 define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
186 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
187 %load = load i8, i8 addrspace(1)* %gep
188 %ext = zext i8 %load to i16
189 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
190 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
191 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Global i8 load at a GEP of -4095 bytes, sign-extended to i16 and inserted
; as the high element. The gfx900 checks expect global_load_sbyte_d16_hi with
; the immediate offset:-4095.
195 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
197 ; GFX9-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
198 ; GFX9-NEXT: s_waitcnt
199 ; GFX9-NEXT: global_store_dword
200 ; GFX9-NEXT: s_waitcnt
201 ; GFX9-NEXT: s_setpc_b64
202 define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
204 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
205 %load = load i8, i8 addrspace(1)* %gep
206 %ext = sext i8 %load to i16
207 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
208 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
209 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Flat (addrspace(4)) i16 load inserted as the high element of a <2 x i16>
; whose low element is %reg. The gfx900 checks expect flat_load_short_d16_hi;
; the fiji checks expect a plain flat_load_ushort followed by a 16-bit left
; shift. The label check is anchored to the start of the line, matching the
; other label checks in this file, so it can only match the function's own
; label.
213 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
215 ; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1]
216 ; GFX9-NEXT: s_waitcnt
217 ; GFX9-NEXT: global_store_dword v[0:1], v2
218 ; GFX9-NEXT: s_waitcnt
219 ; GFX9-NEXT: s_setpc_b64
221 ; VI: flat_load_ushort v{{[0-9]+}}
222 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
224 define void @load_flat_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
226 %load = load i16, i16 addrspace(4)* %in
227 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
228 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
229 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_flat_hi_v2i16_reglo_vreg. The gfx900
; checks expect flat_load_short_d16_hi; the fiji checks expect
; flat_load_ushort plus a 16-bit left shift.
233 ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
235 ; GFX9-NEXT: flat_load_short_d16_hi v2, v[0:1]
236 ; GFX9-NEXT: s_waitcnt
237 ; GFX9-NEXT: global_store_dword v[0:1], v2
238 ; GFX9-NEXT: s_waitcnt
239 ; GFX9-NEXT: s_setpc_b64
241 ; VI: flat_load_ushort v{{[0-9]+}}
242 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
244 define void @load_flat_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
246 %load = load half, half addrspace(4)* %in
247 %build0 = insertelement <2 x half> undef, half %reg, i32 0
248 %build1 = insertelement <2 x half> %build0, half %load, i32 1
249 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Flat i8 load zero-extended to i16 and inserted as the high element.
; The gfx900 checks expect flat_load_ubyte_d16_hi; the fiji checks expect
; flat_load_ubyte plus a 16-bit left shift.
253 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
255 ; GFX9-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
256 ; GFX9-NEXT: s_waitcnt
257 ; GFX9-NEXT: global_store_dword v[0:1], v2
258 ; GFX9-NEXT: s_waitcnt
259 ; GFX9-NEXT: s_setpc_b64
261 ; VI: flat_load_ubyte v{{[0-9]+}}
262 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
264 define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(4)* %in, i16 %reg) #0 {
266 %load = load i8, i8 addrspace(4)* %in
267 %ext = zext i8 %load to i16
268 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
269 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
270 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Flat i8 load sign-extended to i16 and inserted as the high element.
; The gfx900 checks expect flat_load_sbyte_d16_hi; the fiji checks expect
; flat_load_sbyte plus a 16-bit left shift.
274 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
276 ; GFX9-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
277 ; GFX9-NEXT: s_waitcnt
278 ; GFX9-NEXT: global_store_dword v[0:1], v2
279 ; GFX9-NEXT: s_waitcnt
280 ; GFX9-NEXT: s_setpc_b64
282 ; VI: flat_load_sbyte v{{[0-9]+}}
283 ; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
285 define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(4)* %in, i16 %reg) #0 {
287 %load = load i8, i8 addrspace(4)* %in
288 %ext = sext i8 %load to i16
289 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
290 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
291 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Private (default address space) i16 load at a GEP of 2045 elements from a
; byval argument, inserted as the high element next to %reg. Both runs expect
; an "off"-addressed buffer load with the immediate offset:4094; gfx900 uses
; the d16_hi form, fiji a plain buffer_load_ushort.
295 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
297 ; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
298 ; GFX9-NEXT: s_waitcnt
299 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
300 ; GFX9-NEXT: s_waitcnt
301 ; GFX9-NEXT: s_setpc_b64
303 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
304 define void @load_private_hi_v2i16_reglo_vreg(i16* byval %in, i16 %reg) #0 {
306 %gep = getelementptr inbounds i16, i16* %in, i64 2045
307 %load = load i16, i16* %gep
308 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
309 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
310 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_private_hi_v2i16_reglo_vreg: private
; half load at a GEP of 2045 elements from a byval argument. Both runs expect
; offset:4094; gfx900 uses buffer_load_short_d16_hi.
314 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
316 ; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
317 ; GFX9-NEXT: s_waitcnt
318 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
319 ; GFX9-NEXT: s_waitcnt
320 ; GFX9-NEXT: s_setpc_b64
322 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
323 define void @load_private_hi_v2f16_reglo_vreg(half* byval %in, half %reg) #0 {
325 %gep = getelementptr inbounds half, half* %in, i64 2045
326 %load = load half, half* %gep
327 %build0 = insertelement <2 x half> undef, half %reg, i32 0
328 %build1 = insertelement <2 x half> %build0, half %load, i32 1
329 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Volatile private i16 load from the constant address 4094 (inttoptr), with
; the %in argument unused. Both runs expect the constant to appear as the
; immediate offset:4094 with s4 as the scalar offset register.
333 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
335 ; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
337 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
338 ; GFX9-NEXT: s_waitcnt
339 ; GFX9-NEXT: s_setpc_b64
341 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
342 define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* byval %in, i16 %reg) #0 {
344 %load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
345 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
346 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
347 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of the nooff test: volatile private half load
; from the constant address 4094. Both runs expect offset:4094 relative to
; s4; gfx900 loads into the high half of v1 with buffer_load_short_d16_hi.
351 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
353 ; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
354 ; GFX9-NEXT: s_waitcnt
355 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
356 ; GFX9-NEXT: s_waitcnt
357 ; GFX9-NEXT: s_setpc_b64
359 ; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
360 define void @load_private_hi_v2f16_reglo_vreg_nooff(half* %in, half %reg) #0 {
362 %load = load volatile half, half* inttoptr (i32 4094 to half*)
363 %build0 = insertelement <2 x half> undef, half %reg, i32 0
364 %build1 = insertelement <2 x half> %build0, half %load, i32 1
365 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Private i8 load at a GEP of 4091 bytes from a byval argument, zero-extended
; to i16 and inserted as the high element. Both runs expect offset:4095;
; gfx900 uses buffer_load_ubyte_d16_hi.
369 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
371 ; GFX9: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
372 ; GFX9-NEXT: s_waitcnt
373 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
374 ; GFX9-NEXT: s_waitcnt
375 ; GFX9-NEXT: s_setpc_b64
377 ; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
378 define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* byval %in, i16 %reg) #0 {
380 %gep = getelementptr inbounds i8, i8* %in, i64 4091
381 %load = load i8, i8* %gep
382 %ext = zext i8 %load to i16
383 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
384 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
385 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Private i8 load at a GEP of 4091 bytes from a byval argument, sign-extended
; to i16 and inserted as the high element. Both runs expect offset:4095;
; gfx900 uses buffer_load_sbyte_d16_hi.
389 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
391 ; GFX9: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
392 ; GFX9-NEXT: s_waitcnt
393 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
394 ; GFX9-NEXT: s_waitcnt
395 ; GFX9-NEXT: s_setpc_b64
397 ; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
398 define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* byval %in, i16 %reg) #0 {
400 %gep = getelementptr inbounds i8, i8* %in, i64 4091
401 %load = load i8, i8* %gep
402 %ext = sext i8 %load to i16
403 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
404 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
405 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile private i8 load from the constant address 4094, zero-extended to
; i16 and inserted as the high element. Both runs expect offset:4094 relative
; to s4; gfx900 uses buffer_load_ubyte_d16_hi.
409 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
411 ; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
412 ; GFX9-NEXT: s_waitcnt
413 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
414 ; GFX9-NEXT: s_waitcnt
415 ; GFX9-NEXT: s_setpc_b64
417 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
418 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8* %in, i16 %reg) #0 {
420 %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
421 %ext = zext i8 %load to i16
422 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
423 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
424 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile private i8 load from the constant address 4094, sign-extended to
; i16 and inserted as the high element. Both runs expect offset:4094 relative
; to s4; gfx900 uses buffer_load_sbyte_d16_hi.
428 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
430 ; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
431 ; GFX9-NEXT: s_waitcnt
432 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
433 ; GFX9-NEXT: s_waitcnt
434 ; GFX9-NEXT: s_setpc_b64
436 ; VI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
437 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8* %in, i16 %reg) #0 {
439 %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
440 %ext = sext i8 %load to i16
441 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
442 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
443 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile private i8 load from the constant address 4094, zero-extended to
; i16 and then bitcast to half before being inserted as the high element of a
; <2 x half>. The gfx900 checks expect the bitcast not to block selection of
; buffer_load_ubyte_d16_hi.
447 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
449 ; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
450 ; GFX9-NEXT: s_waitcnt
451 ; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
452 ; GFX9-NEXT: s_waitcnt
453 ; GFX9-NEXT: s_setpc_b64
455 ; VI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
456 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8* %in, half %reg) #0 {
458 %load = load volatile i8, i8* inttoptr (i32 4094 to i8*)
459 %ext = zext i8 %load to i16
460 %bc.ext = bitcast i16 %ext to half
461 %build0 = insertelement <2 x half> undef, half %reg, i32 0
462 %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
463 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Constant (addrspace(2)) i16 load at a GEP of -2047 elements, inserted as
; the high element next to %reg. The gfx900 checks expect the same
; global_load_short_d16_hi with offset:-4094 as the global-address-space
; test; the fiji checks expect a flat_load_ushort.
467 ; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
469 ; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
470 ; GFX9-NEXT: s_waitcnt
471 ; GFX9-NEXT: global_store_dword
472 ; GFX9-NEXT: s_waitcnt
473 ; GFX9-NEXT: s_setpc_b64
475 ; VI: flat_load_ushort
476 define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(2)* %in, i16 %reg) #0 {
478 %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 -2047
479 %load = load i16, i16 addrspace(2)* %gep
480 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
481 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
482 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_constant_hi_v2i16_reglo_vreg: constant
; (addrspace(2)) half load at a GEP of -2047 elements into the high element
; of a <2 x half>. The gfx900 checks expect global_load_short_d16_hi with
; offset:-4094; the fiji checks expect a flat_load_ushort. The label check is
; anchored to the start of the line and ends with the label's colon, matching
; the other label checks in this file, so it cannot match a directive line
; that merely mentions the symbol name.
486 ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
488 ; GFX9-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
489 ; GFX9-NEXT: s_waitcnt
490 ; GFX9-NEXT: global_store_dword
491 ; GFX9-NEXT: s_waitcnt
492 ; GFX9-NEXT: s_setpc_b64
494 ; VI: flat_load_ushort
495 define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(2)* %in, half %reg) #0 {
497 %gep = getelementptr inbounds half, half addrspace(2)* %in, i64 -2047
498 %load = load half, half addrspace(2)* %gep
499 %build0 = insertelement <2 x half> undef, half %reg, i32 0
500 %build1 = insertelement <2 x half> %build0, half %load, i32 1
501 store <2 x half> %build1, <2 x half> addrspace(1)* undef
505 ; Local object gives a known offset, so the load must be converted from the
; offen addressing form to the immediate-offset form.
; Two stack objects are allocated; a volatile i32 store touches the first and
; the i16 load reads element 2025 of the second. The gfx900 checks expect the
; store, then buffer_load_short_d16_hi addressed with "off" and offset:4094.
508 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
509 ; GFX9: buffer_store_dword
510 ; GFX9-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
511 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
513 %obj0 = alloca [10 x i32], align 4
514 %obj1 = alloca [4096 x i16], align 2
515 %bc = bitcast [10 x i32]* %obj0 to i32*
516 store volatile i32 123, i32* %bc
517 %gep = getelementptr inbounds [4096 x i16], [4096 x i16]* %obj1, i32 0, i32 2025
518 %load = load i16, i16* %gep
519 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
520 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
521 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Stack-object variant with a sign-extended i8 load: byte 4051 of the second
; alloca is loaded, sign-extended to i16 and inserted as the high element.
; The gfx900 checks expect buffer_load_sbyte_d16_hi with offset:4095 right
; after the volatile store.
525 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
526 ; GFX9: buffer_store_dword
527 ; GFX9-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
528 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
530 %obj0 = alloca [10 x i32], align 4
531 %obj1 = alloca [4096 x i8], align 2
532 %bc = bitcast [10 x i32]* %obj0 to i32*
533 store volatile i32 123, i32* %bc
534 %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
535 %load = load i8, i8* %gep
536 %ext = sext i8 %load to i16
537 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
538 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
539 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Stack-object variant with a zero-extended i8 load: byte 4051 of the second
; alloca is loaded, zero-extended to i16 and inserted as the high element.
; The gfx900 checks expect buffer_load_ubyte_d16_hi with offset:4095 right
; after the volatile store.
543 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
544 ; GFX9: buffer_store_dword
545 ; GFX9-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
546 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
548 %obj0 = alloca [10 x i32], align 4
549 %obj1 = alloca [4096 x i8], align 2
550 %bc = bitcast [10 x i32]* %obj0 to i32*
551 store volatile i32 123, i32* %bc
552 %gep = getelementptr inbounds [4096 x i8], [4096 x i8]* %obj1, i32 0, i32 4051
553 %load = load i8, i8* %gep
554 %ext = zext i8 %load to i16
555 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
556 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
557 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
561 ; FIXME: Remove m0 init and waitcnt between reads
562 ; FIXME: Is there a cost to using the extload over not?
; Builds a <2 x i16> from two volatile local i16 loads at %in and %in + 1
; element. The gfx900 checks expect a plain ds_read_u16 for the low half and
; ds_read_u16_d16_hi with offset:2 into the same register for the high half.
563 ; GCN-LABEL: {{^}}load_local_v2i16_split:
565 ; GFX9-NEXT: ds_read_u16 v1, v0
566 ; GFX9-NEXT: s_waitcnt
567 ; GFX9-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
568 ; GFX9-NEXT: s_waitcnt
569 ; GFX9-NEXT: v_mov_b32_e32 v0, v1
570 ; GFX9-NEXT: s_setpc_b64
571 define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
573 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
574 %load0 = load volatile i16, i16 addrspace(3)* %in
575 %load1 = load volatile i16, i16 addrspace(3)* %gep
576 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
577 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
578 ret <2 x i16> %build1
581 ; FIXME: Remove waitcnt between reads
; Builds a <2 x i16> from two volatile global i16 loads at %in and %in + 1
; element. The gfx900 checks expect global_load_ushort for the low half and
; global_load_short_d16_hi into the same register for the high half.
582 ; GCN-LABEL: {{^}}load_global_v2i16_split:
584 ; GFX9-NEXT: global_load_ushort v2
585 ; GFX9-NEXT: s_waitcnt
586 ; GFX9-NEXT: global_load_short_d16_hi v2
587 ; GFX9-NEXT: s_waitcnt
588 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
589 ; GFX9-NEXT: s_setpc_b64
590 define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
592 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
593 %load0 = load volatile i16, i16 addrspace(1)* %in
594 %load1 = load volatile i16, i16 addrspace(1)* %gep
595 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
596 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
597 ret <2 x i16> %build1
600 ; FIXME: Remove waitcnt between reads
; Builds a <2 x i16> from two volatile flat i16 loads at %in and %in + 1
; element. The gfx900 checks expect flat_load_ushort for the low half and
; flat_load_short_d16_hi into the same register for the high half.
601 ; GCN-LABEL: {{^}}load_flat_v2i16_split:
603 ; GFX9-NEXT: flat_load_ushort v2
604 ; GFX9-NEXT: s_waitcnt
605 ; GFX9-NEXT: flat_load_short_d16_hi v2
606 ; GFX9-NEXT: s_waitcnt
607 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
608 ; GFX9-NEXT: s_setpc_b64
609 define <2 x i16> @load_flat_v2i16_split(i16 addrspace(4)* %in) #0 {
611 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
612 %load0 = load volatile i16, i16 addrspace(4)* %in
613 %load1 = load volatile i16, i16 addrspace(4)* %gep
614 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
615 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
616 ret <2 x i16> %build1
619 ; FIXME: Remove waitcnt between reads
; Builds a <2 x i16> from two volatile constant-address-space i16 loads at
; %in and %in + 1 element. The gfx900 checks expect the same global_load
; pair as the global split test (ushort, then short_d16_hi into v2).
620 ; GCN-LABEL: {{^}}load_constant_v2i16_split:
622 ; GFX9-NEXT: global_load_ushort v2
623 ; GFX9-NEXT: s_waitcnt
624 ; GFX9-NEXT: global_load_short_d16_hi v2
625 ; GFX9-NEXT: s_waitcnt
626 ; GFX9-NEXT: v_mov_b32_e32 v0, v2
627 ; GFX9-NEXT: s_setpc_b64
628 define <2 x i16> @load_constant_v2i16_split(i16 addrspace(2)* %in) #0 {
630 %gep = getelementptr inbounds i16, i16 addrspace(2)* %in, i64 1
631 %load0 = load volatile i16, i16 addrspace(2)* %in
632 %load1 = load volatile i16, i16 addrspace(2)* %gep
633 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
634 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
635 ret <2 x i16> %build1
638 ; FIXME: Remove m0 init and waitcnt between reads
639 ; FIXME: Is there a cost to using the extload over not?
; Builds a <2 x i16> from two volatile private i16 loads at a byval argument
; and the element after it. The gfx900 checks expect buffer_load_ushort at
; offset:4 for the low half and buffer_load_short_d16_hi at offset:6 into the
; same v0 for the high half, with no extra copy before the return.
640 ; GCN-LABEL: {{^}}load_private_v2i16_split:
642 ; GFX9: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
643 ; GFX9-NEXT: s_waitcnt
644 ; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
645 ; GFX9-NEXT: s_waitcnt
646 ; GFX9-NEXT: s_setpc_b64
647 define <2 x i16> @load_private_v2i16_split(i16* byval %in) #0 {
649 %gep = getelementptr inbounds i16, i16* %in, i32 1
650 %load0 = load volatile i16, i16* %in
651 %load1 = load volatile i16, i16* %gep
652 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
653 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
654 ret <2 x i16> %build1
657 attributes #0 = { nounwind }