1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
; Loads an i16 through an addrspace(3) pointer and inserts it as element 1 of
; a <2 x i16> whose element 0 is undef. The gfx900 run expects one
; ds_read_u16_d16_hi that writes v0 in place; the gfx906 and fiji runs only
; expect a plain ds_read_u16.
5 ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
7 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
8 ; GFX900-NEXT: s_waitcnt
9 ; GFX900-NEXT: s_setpc_b64
11 ; NO-D16-HI: ds_read_u16 v
12 define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
14 %load = load i16, i16 addrspace(3)* %in
15 %build = insertelement <2 x i16> undef, i16 %load, i32 1
; Same addrspace(3) i16 load, but element 0 of the result is the %reg
; argument. The gfx900 run expects the d16_hi load to target v1 and the result
; to then be copied into v0.
19 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
21 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
22 ; GFX900-NEXT: s_waitcnt
23 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
24 ; GFX900-NEXT: s_setpc_b64
26 ; NO-D16-HI: ds_read_u16 v
27 define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
29 %load = load i16, i16 addrspace(3)* %in
30 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
31 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
35 ; Show that we get reasonable regalloc without physreg constraints.
; The built <2 x i16> (%reg in element 0, loaded i16 in element 1) is stored
; through an undef addrspace(1) pointer instead of being returned. The gfx900
; run expects the d16_hi load into v1 to feed global_store_dword directly,
; with no v_mov in between.
36 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
38 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
39 ; GFX900-NEXT: s_waitcnt
40 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
41 ; GFX900-NEXT: s_waitcnt
42 ; GFX900-NEXT: s_setpc_b64
44 ; NO-D16-HI: ds_read_u16 v
45 define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
47 %load = load i16, i16 addrspace(3)* %in
48 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
49 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
50 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Inserts the loaded i16 into element 1 of a zeroinitializer vector. The
; gfx900 run expects v1 to be set to 0 first, then the d16_hi load into v1,
; then a copy of v1 into v0.
54 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
56 ; GFX900-NEXT: v_mov_b32_e32 v1, 0
57 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
58 ; GFX900-NEXT: s_waitcnt
59 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
60 ; GFX900-NEXT: s_setpc_b64
62 ; NO-D16-HI: ds_read_u16 v
63 define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
65 %load = load i16, i16 addrspace(3)* %in
66 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
70 ; FIXME: Remove m0 initialization
; Scalar form of the zero-low case: the loaded i16 is zero-extended to i32
; and shifted left by 16. Every run, including gfx900, expects a plain
; ds_read_u16; the gfx900 and NO-D16-HI checks also expect v_lshlrev_b32 by 16.
71 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
73 ; GFX900-NEXT: ds_read_u16 v0, v0
74 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
75 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
76 ; GFX900-NEXT: s_setpc_b64
78 ; NO-D16-HI: ds_read_u16 v
79 ; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
80 define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
82 %load = load i16, i16 addrspace(3)* %in
83 %zext = zext i16 %load to i32
84 %shift = shl i32 %zext, 16
; Half-precision counterpart of load_local_hi_v2i16_reglo_vreg: a half is
; loaded through an addrspace(3) pointer into element 1 of a <2 x half> and
; the vector is stored. The gfx900 checks are the same as in the i16 test.
88 ; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
90 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
91 ; GFX900-NEXT: s_waitcnt
92 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
93 ; GFX900-NEXT: s_waitcnt
94 ; GFX900-NEXT: s_setpc_b64
96 ; NO-D16-HI: ds_read_u16 v
97 define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
99 %load = load half, half addrspace(3)* %in
100 %build0 = insertelement <2 x half> undef, half %reg, i32 0
101 %build1 = insertelement <2 x half> %build0, half %load, i32 1
102 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Loads an i8 through an addrspace(3) pointer, zero-extends it to i16 and
; places it in element 1. The gfx900 run expects the extension to be folded
; into ds_read_u8_d16_hi; the other runs expect ds_read_u8.
106 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
108 ; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
109 ; GFX900-NEXT: s_waitcnt
110 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
111 ; GFX900-NEXT: s_waitcnt
112 ; GFX900-NEXT: s_setpc_b64
114 ; NO-D16-HI: ds_read_u8 v
115 define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
117 %load = load i8, i8 addrspace(3)* %in
118 %ext = zext i8 %load to i16
119 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
120 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
121 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Sign-extending variant of the i8 test: the loaded i8 is sext'ed to i16
; before insertion into element 1. The gfx900 run expects ds_read_i8_d16_hi;
; the other runs expect ds_read_i8.
125 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
127 ; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
128 ; GFX900-NEXT: s_waitcnt
129 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
130 ; GFX900-NEXT: s_waitcnt
131 ; GFX900-NEXT: s_setpc_b64
133 ; NO-D16-HI: ds_read_i8 v
134 define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
136 %load = load i8, i8 addrspace(3)* %in
137 %ext = sext i8 %load to i16
138 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
139 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
140 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Loads an i16 through an addrspace(1) pointer at element index -2047
; (-4094 bytes) and inserts it into element 1. The gfx900 run expects the
; byte offset to be folded into global_load_short_d16_hi as offset:-4094.
144 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
146 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
147 ; GFX900-NEXT: s_waitcnt
148 ; GFX900-NEXT: global_store_dword
149 ; GFX900-NEXT: s_waitcnt
150 ; GFX900-NEXT: s_setpc_b64
151 define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
153 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
154 %load = load i16, i16 addrspace(1)* %gep
155 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
156 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
157 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_global_hi_v2i16_reglo_vreg: same
; -2047 element offset, and the same gfx900 expectation of
; global_load_short_d16_hi with offset:-4094.
161 ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
163 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
164 ; GFX900-NEXT: s_waitcnt
165 ; GFX900-NEXT: global_store_dword
166 ; GFX900-NEXT: s_waitcnt
167 ; GFX900-NEXT: s_setpc_b64
168 define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
170 %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
171 %load = load half, half addrspace(1)* %gep
172 %build0 = insertelement <2 x half> undef, half %reg, i32 0
173 %build1 = insertelement <2 x half> %build0, half %load, i32 1
174 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Loads an i8 through an addrspace(1) pointer at byte offset -4095,
; zero-extends it to i16 and inserts it into element 1. The gfx900 run expects
; global_load_ubyte_d16_hi with offset:-4095.
178 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
180 ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
181 ; GFX900-NEXT: s_waitcnt
182 ; GFX900-NEXT: global_store_dword
183 ; GFX900-NEXT: s_waitcnt
184 ; GFX900-NEXT: s_setpc_b64
185 define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
187 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
188 %load = load i8, i8 addrspace(1)* %gep
189 %ext = zext i8 %load to i16
190 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
191 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
192 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Sign-extending variant of the addrspace(1) i8 test: same -4095 byte offset,
; with sext instead of zext. The gfx900 run expects global_load_sbyte_d16_hi
; with offset:-4095.
196 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
198 ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
199 ; GFX900-NEXT: s_waitcnt
200 ; GFX900-NEXT: global_store_dword
201 ; GFX900-NEXT: s_waitcnt
202 ; GFX900-NEXT: s_setpc_b64
203 define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
205 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
206 %load = load i8, i8 addrspace(1)* %gep
207 %ext = sext i8 %load to i16
208 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
209 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
210 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Loads an i16 through a pointer with no address-space qualifier and inserts
; it into element 1 of a <2 x i16> whose element 0 is %reg; the vector is then
; stored. The gfx900 run expects flat_load_short_d16_hi. The gfx906 and fiji
; runs expect flat_load_ushort followed by an explicit pack: shift plus
; v_or_b32_sdwa on fiji, v_lshl_or_b32 on gfx906.
; The label pattern is anchored with {{^}} like the other labels in this file
; so it can only match at the start of a line.
214 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
216 ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
217 ; GFX900-NEXT: s_waitcnt
218 ; GFX900-NEXT: global_store_dword v[0:1], v2
219 ; GFX900-NEXT: s_waitcnt
220 ; GFX900-NEXT: s_setpc_b64
222 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
223 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
224 ; GFX803: v_or_b32_sdwa
225 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
226 define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
228 %load = load i16, i16* %in
229 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
230 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
231 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_flat_hi_v2i16_reglo_vreg: a half is
; loaded through an unqualified pointer into element 1. The expectations match
; the i16 test: flat_load_short_d16_hi on gfx900, flat_load_ushort plus an
; explicit pack on gfx906 and fiji.
235 ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
237 ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
238 ; GFX900-NEXT: s_waitcnt
239 ; GFX900-NEXT: global_store_dword v[0:1], v2
240 ; GFX900-NEXT: s_waitcnt
241 ; GFX900-NEXT: s_setpc_b64
243 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
244 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
245 ; GFX803: v_or_b32_sdwa
246 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
247 define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
249 %load = load half, half* %in
250 %build0 = insertelement <2 x half> undef, half %reg, i32 0
251 %build1 = insertelement <2 x half> %build0, half %load, i32 1
252 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Loads an i8 through an unqualified pointer, zero-extends it to i16 and
; inserts it into element 1. The gfx900 run expects flat_load_ubyte_d16_hi;
; the gfx906 and fiji runs expect flat_load_ubyte plus an explicit pack.
256 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
258 ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
259 ; GFX900-NEXT: s_waitcnt
260 ; GFX900-NEXT: global_store_dword v[0:1], v2
261 ; GFX900-NEXT: s_waitcnt
262 ; GFX900-NEXT: s_setpc_b64
264 ; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
265 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
266 ; GFX803: v_or_b32_sdwa
267 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
268 define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
270 %load = load i8, i8* %in
271 %ext = zext i8 %load to i16
272 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
273 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
274 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Sign-extending variant of the unqualified-pointer i8 test. The gfx900 run
; expects flat_load_sbyte_d16_hi; the gfx906 and fiji runs expect
; flat_load_sbyte plus an explicit pack.
278 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
280 ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
281 ; GFX900-NEXT: s_waitcnt
282 ; GFX900-NEXT: global_store_dword v[0:1], v2
283 ; GFX900-NEXT: s_waitcnt
284 ; GFX900-NEXT: s_setpc_b64
286 ; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
287 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
288 ; GFX803: v_or_b32_sdwa
289 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
290 define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
292 %load = load i8, i8* %in
293 %ext = sext i8 %load to i16
294 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
295 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
296 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Loads an i16 at element index 2045 of a byval addrspace(5) argument and
; inserts it into element 1. The gfx900 run expects buffer_load_short_d16_hi
; with no address VGPR (off), s5 as the offset register and immediate
; offset:4094; the other runs expect buffer_load_ushort at the same address.
300 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
302 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
303 ; GFX900-NEXT: s_waitcnt
304 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
305 ; GFX900-NEXT: s_waitcnt
306 ; GFX900-NEXT: s_setpc_b64
308 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
309 define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
311 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2045
312 %load = load i16, i16 addrspace(5)* %gep
313 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
314 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
315 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_private_hi_v2i16_reglo_vreg: same byval
; addrspace(5) argument and element index 2045, with the same expected
; buffer_load_short_d16_hi at offset:4094 on gfx900.
319 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
321 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
322 ; GFX900-NEXT: s_waitcnt
323 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
324 ; GFX900-NEXT: s_waitcnt
325 ; GFX900-NEXT: s_setpc_b64
327 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
328 define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
330 %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2045
331 %load = load half, half addrspace(5)* %gep
332 %build0 = insertelement <2 x half> undef, half %reg, i32 0
333 %build1 = insertelement <2 x half> %build0, half %load, i32 1
334 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Volatile i16 load from the constant addrspace(5) address 4094 (built with
; inttoptr); the %in argument is not used by the load. The gfx900 run expects
; buffer_load_short_d16_hi with s4 as the offset register and offset:4094.
; NOTE(review): %in is byval here but not in the f16 nooff test, which is
; presumably why this test checks v0 while that one checks v1 -- confirm.
338 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
340 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
342 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
343 ; GFX900-NEXT: s_waitcnt
344 ; GFX900-NEXT: s_setpc_b64
346 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
347 define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
349 %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
350 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
351 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
352 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile half load from the constant addrspace(5) address 4094 (built with
; inttoptr); the %in argument is not used by the load. The gfx900 run expects
; buffer_load_short_d16_hi into v1 with s4 as the offset register and
; offset:4094.
356 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
358 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
359 ; GFX900-NEXT: s_waitcnt
360 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
361 ; GFX900-NEXT: s_waitcnt
362 ; GFX900-NEXT: s_setpc_b64
364 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
365 define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
367 %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
368 %build0 = insertelement <2 x half> undef, half %reg, i32 0
369 %build1 = insertelement <2 x half> %build0, half %load, i32 1
370 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Loads an i8 at byte index 4091 of a byval addrspace(5) argument,
; zero-extends it to i16 and inserts it into element 1. The gfx900 run expects
; buffer_load_ubyte_d16_hi with offset:4095; the other runs expect
; buffer_load_ubyte at the same address.
374 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
376 ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
377 ; GFX900-NEXT: s_waitcnt
378 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
379 ; GFX900-NEXT: s_waitcnt
380 ; GFX900-NEXT: s_setpc_b64
382 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
383 define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
385 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
386 %load = load i8, i8 addrspace(5)* %gep
387 %ext = zext i8 %load to i16
388 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
389 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
390 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Sign-extending variant of the byval addrspace(5) i8 test: same byte index
; 4091, with sext instead of zext. The gfx900 run expects
; buffer_load_sbyte_d16_hi with offset:4095.
394 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
396 ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
397 ; GFX900-NEXT: s_waitcnt
398 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
399 ; GFX900-NEXT: s_waitcnt
400 ; GFX900-NEXT: s_setpc_b64
402 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
403 define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
405 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4091
406 %load = load i8, i8 addrspace(5)* %gep
407 %ext = sext i8 %load to i16
408 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
409 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
410 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile i8 load from the constant addrspace(5) address 4094,
; zero-extended to i16 and inserted into element 1; %in is not used by the
; load. The gfx900 run expects buffer_load_ubyte_d16_hi into v1 with
; offset:4094.
414 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
416 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
417 ; GFX900-NEXT: s_waitcnt
418 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
419 ; GFX900-NEXT: s_waitcnt
420 ; GFX900-NEXT: s_setpc_b64
422 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
423 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
425 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
426 %ext = zext i8 %load to i16
427 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
428 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
429 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile i8 load from the constant addrspace(5) address 4094,
; sign-extended to i16 and inserted into element 1; %in is not used by the
; load. The gfx900 run expects buffer_load_sbyte_d16_hi into v1 with
; offset:4094.
433 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
435 ; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
436 ; GFX900-NEXT: s_waitcnt
437 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
438 ; GFX900-NEXT: s_waitcnt
439 ; GFX900-NEXT: s_setpc_b64
441 ; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s4 offset:4094{{$}}
442 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
444 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
445 %ext = sext i8 %load to i16
446 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
447 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
448 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Volatile i8 load from the constant addrspace(5) address 4094,
; zero-extended to i16 and then bitcast to half before insertion into element
; 1 of a <2 x half>. The gfx900 run still expects buffer_load_ubyte_d16_hi,
; i.e. the bitcast is expected not to block the d16-hi selection.
452 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
454 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
455 ; GFX900-NEXT: s_waitcnt
456 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
457 ; GFX900-NEXT: s_waitcnt
458 ; GFX900-NEXT: s_setpc_b64
460 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s4 offset:4094{{$}}
461 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
463 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
464 %ext = zext i8 %load to i16
465 %bc.ext = bitcast i16 %ext to half
466 %build0 = insertelement <2 x half> undef, half %reg, i32 0
467 %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
468 store <2 x half> %build1, <2 x half> addrspace(1)* undef
; Loads an i16 through an addrspace(4) pointer at element index -2047
; (-4094 bytes) and inserts it into element 1. The gfx900 run expects
; global_load_short_d16_hi with offset:-4094; fiji expects flat_load_ushort
; and gfx906 expects global_load_ushort.
472 ; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
474 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
475 ; GFX900-NEXT: s_waitcnt
476 ; GFX900-NEXT: global_store_dword
477 ; GFX900-NEXT: s_waitcnt
478 ; GFX900-NEXT: s_setpc_b64
480 ; GFX803: flat_load_ushort
481 ; GFX906: global_load_ushort
482 define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
484 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
485 %load = load i16, i16 addrspace(4)* %gep
486 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
487 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
488 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Half-precision counterpart of load_constant_hi_v2i16_reglo_vreg: a half is
; loaded through an addrspace(4) pointer at element index -2047 (-4094 bytes)
; and inserted into element 1. The gfx900 run expects
; global_load_short_d16_hi with offset:-4094; fiji expects flat_load_ushort
; and gfx906 expects global_load_ushort.
; The label pattern is anchored with {{^}} and ends with ':' like the other
; labels in this file, so it can only match the function's own label line.
492 ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
494 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
495 ; GFX900-NEXT: s_waitcnt
496 ; GFX900-NEXT: global_store_dword
497 ; GFX900-NEXT: s_waitcnt
498 ; GFX900-NEXT: s_setpc_b64
500 ; GFX803: flat_load_ushort
501 ; GFX906: global_load_ushort
502 define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
504 %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
505 %load = load half, half addrspace(4)* %gep
506 %build0 = insertelement <2 x half> undef, half %reg, i32 0
507 %build1 = insertelement <2 x half> %build0, half %load, i32 1
508 store <2 x half> %build1, <2 x half> addrspace(1)* undef
512 ; Local object gives known offset, so requires converting from offen
; Two addrspace(5) allocas are created: a volatile store of 123 goes to the
; first, and an i16 is loaded from index 2025 of the second. The gfx900 run
; expects a buffer_store_dword immediately followed by
; buffer_load_short_d16_hi with no address VGPR (off) and offset:4094.
515 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
516 ; GFX900: buffer_store_dword
517 ; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4094
518 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
520 %obj0 = alloca [10 x i32], align 4, addrspace(5)
521 %obj1 = alloca [4096 x i16], align 2, addrspace(5)
522 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
523 store volatile i32 123, i32 addrspace(5)* %bc
524 %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2025
525 %load = load i16, i16 addrspace(5)* %gep
526 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
527 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
528 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; i8 sign-extending variant of the alloca test: an i8 is loaded from index
; 4051 of the second alloca and sext'ed to i16. The gfx900 run expects
; buffer_store_dword immediately followed by buffer_load_sbyte_d16_hi with no
; address VGPR (off) and offset:4095.
532 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
533 ; GFX900: buffer_store_dword
534 ; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
535 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
537 %obj0 = alloca [10 x i32], align 4, addrspace(5)
538 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
539 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
540 store volatile i32 123, i32 addrspace(5)* %bc
541 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
542 %load = load i8, i8 addrspace(5)* %gep
543 %ext = sext i8 %load to i16
544 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
545 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
546 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; i8 zero-extending variant of the alloca test: an i8 is loaded from index
; 4051 of the second alloca and zext'ed to i16. The gfx900 run expects
; buffer_store_dword immediately followed by buffer_load_ubyte_d16_hi with no
; address VGPR (off) and offset:4095.
550 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
551 ; GFX900: buffer_store_dword
552 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s5 offset:4095
553 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
555 %obj0 = alloca [10 x i32], align 4, addrspace(5)
556 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
557 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
558 store volatile i32 123, i32 addrspace(5)* %bc
559 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4051
560 %load = load i8, i8 addrspace(5)* %gep
561 %ext = zext i8 %load to i16
562 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
563 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
564 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
568 ; FIXME: Remove m0 init and waitcnt between reads
569 ; FIXME: Is there a cost to using the extload over not?
; Builds a <2 x i16> from two volatile i16 loads at adjacent addrspace(3)
; addresses (%in and %in + 1 element). The gfx900 run expects ds_read_u16 for
; element 0, then ds_read_u16_d16_hi at offset:2 into the same register for
; element 1, with an s_waitcnt between the two reads.
570 ; GCN-LABEL: {{^}}load_local_v2i16_split:
572 ; GFX900-NEXT: ds_read_u16 v1, v0
573 ; GFX900-NEXT: s_waitcnt
574 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
575 ; GFX900-NEXT: s_waitcnt
576 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
577 ; GFX900-NEXT: s_setpc_b64
578 define <2 x i16> @load_local_v2i16_split(i16 addrspace(3)* %in) #0 {
580 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
581 %load0 = load volatile i16, i16 addrspace(3)* %in
582 %load1 = load volatile i16, i16 addrspace(3)* %gep
583 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
584 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
585 ret <2 x i16> %build1
588 ; FIXME: Remove waitcnt between reads
; Builds a <2 x i16> from two volatile i16 loads at adjacent addrspace(1)
; addresses. The gfx900 run expects global_load_ushort for element 0, then
; global_load_short_d16_hi into the same register (v2) for element 1.
589 ; GCN-LABEL: {{^}}load_global_v2i16_split:
591 ; GFX900-NEXT: global_load_ushort v2
592 ; GFX900-NEXT: s_waitcnt
593 ; GFX900-NEXT: global_load_short_d16_hi v2
594 ; GFX900-NEXT: s_waitcnt
595 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
596 ; GFX900-NEXT: s_setpc_b64
597 define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
599 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
600 %load0 = load volatile i16, i16 addrspace(1)* %in
601 %load1 = load volatile i16, i16 addrspace(1)* %gep
602 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
603 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
604 ret <2 x i16> %build1
607 ; FIXME: Remove waitcnt between reads
; Builds a <2 x i16> from two volatile i16 loads through adjacent
; unqualified pointers. The gfx900 run expects flat_load_ushort for element 0,
; then flat_load_short_d16_hi into the same register (v2) for element 1.
608 ; GCN-LABEL: {{^}}load_flat_v2i16_split:
610 ; GFX900-NEXT: flat_load_ushort v2
611 ; GFX900-NEXT: s_waitcnt
612 ; GFX900-NEXT: flat_load_short_d16_hi v2
613 ; GFX900-NEXT: s_waitcnt
614 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
615 ; GFX900-NEXT: s_setpc_b64
616 define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
618 %gep = getelementptr inbounds i16, i16* %in, i64 1
619 %load0 = load volatile i16, i16* %in
620 %load1 = load volatile i16, i16* %gep
621 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
622 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
623 ret <2 x i16> %build1
626 ; FIXME: Remove waitcnt between reads
; Builds a <2 x i16> from two volatile i16 loads at adjacent addrspace(4)
; addresses. The gfx900 run expects global_load_ushort for element 0, then
; global_load_short_d16_hi into the same register (v2) for element 1.
627 ; GCN-LABEL: {{^}}load_constant_v2i16_split:
629 ; GFX900-NEXT: global_load_ushort v2
630 ; GFX900-NEXT: s_waitcnt
631 ; GFX900-NEXT: global_load_short_d16_hi v2
632 ; GFX900-NEXT: s_waitcnt
633 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
634 ; GFX900-NEXT: s_setpc_b64
635 define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
637 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
638 %load0 = load volatile i16, i16 addrspace(4)* %in
639 %load1 = load volatile i16, i16 addrspace(4)* %gep
640 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
641 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
642 ret <2 x i16> %build1
645 ; FIXME: Remove m0 init and waitcnt between reads
646 ; FIXME: Is there a cost to using the extload over not?
; Builds a <2 x i16> from two volatile i16 loads at adjacent elements of a
; byval addrspace(5) argument. The gfx900 run expects buffer_load_ushort at
; offset:4 for element 0, then buffer_load_short_d16_hi at offset:6 into the
; same register (v0) for element 1.
647 ; GCN-LABEL: {{^}}load_private_v2i16_split:
649 ; GFX900: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
650 ; GFX900-NEXT: s_waitcnt
651 ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
652 ; GFX900-NEXT: s_waitcnt
653 ; GFX900-NEXT: s_setpc_b64
654 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
656 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
657 %load0 = load volatile i16, i16 addrspace(5)* %in
658 %load1 = load volatile i16, i16 addrspace(5)* %gep
659 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
660 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
661 ret <2 x i16> %build1
664 attributes #0 = { nounwind }