1 ; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx906 -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX906,NO-D16-HI %s
3 ; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sroa=0 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX803,NO-D16-HI %s
5 ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lo:
6 ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7 ; GFX900-NEXT: ds_read_u16 v2, v0
8 ; GFX900-NEXT: v_mov_b32_e32 v3, 0
9 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
10 ; GFX900-NEXT: v_mov_b32_e32 v1, v2
11 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
12 ; GFX900-NEXT: ds_write_b16 v3, v2
13 ; GFX900-NEXT: s_waitcnt lgkmcnt(1)
14 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
15 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
16 ; GFX900-NEXT: s_setpc_b64 s[30:31]
17 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lo(i16 addrspace(3)* noalias %in) #0 {
19 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
20 %load.lo = load i16, i16 addrspace(3)* %in
21 %load.hi = load i16, i16 addrspace(3)* %gep
22 store i16 %load.lo, i16 addrspace(3)* null
23 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
24 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
28 ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_hi:
29 ; GFX900: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30 ; GFX900-NEXT: ds_read_u16 v1, v0
31 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
32 ; GFX900-NEXT: v_mov_b32_e32 v2, 0
33 ; GFX900-NEXT: s_waitcnt lgkmcnt(1)
34 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v1
35 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
36 ; GFX900-NEXT: ds_write_b16 v2, v0
37 ; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
38 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
39 ; GFX900-NEXT: s_setpc_b64 s[30:31]
40 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_hi(i16 addrspace(3)* noalias %in) #0 {
42 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
43 %load.lo = load i16, i16 addrspace(3)* %in
44 %load.hi = load i16, i16 addrspace(3)* %gep
45 store i16 %load.hi, i16 addrspace(3)* null
46 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
47 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
51 ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_multi_use_lohi:
52 ; GFX900: ds_read_u16 v3, v0
53 ; GFX900-NEXT: ds_read_u16 v0, v0 offset:16
54 ; GFX900-NEXT: s_waitcnt lgkmcnt(1)
55 ; GFX900-NEXT: ds_write_b16 v1, v3
56 ; GFX900-NEXT: s_waitcnt lgkmcnt(1)
57 ; GFX900-NEXT: ds_write_b16 v2, v0
58 ; GFX900-NEXT: v_and_b32_e32 v1, 0xffff, v3
59 ; GFX900-NEXT: v_lshl_or_b32 v0, v0, 16, v1
60 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
61 ; GFX900-NEXT: s_setpc_b64 s[30:31]
62 define <2 x i16> @load_local_lo_hi_v2i16_multi_use_lohi(i16 addrspace(3)* noalias %in, i16 addrspace(3)* noalias %out0, i16 addrspace(3)* noalias %out1) #0 {
64 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
65 %load.lo = load i16, i16 addrspace(3)* %in
66 %load.hi = load i16, i16 addrspace(3)* %gep
67 store i16 %load.lo, i16 addrspace(3)* %out0
68 store i16 %load.hi, i16 addrspace(3)* %out1
69 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
70 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
74 ; GCN-LABEL: {{^}}load_local_hi_v2i16_undeflo:
76 ; GFX900-NEXT: ds_read_u16_d16_hi v0, v0
77 ; GFX900-NEXT: s_waitcnt
78 ; GFX900-NEXT: s_setpc_b64
80 ; NO-D16-HI: ds_read_u16 v
81 define <2 x i16> @load_local_hi_v2i16_undeflo(i16 addrspace(3)* %in) #0 {
83 %load = load i16, i16 addrspace(3)* %in
84 %build = insertelement <2 x i16> undef, i16 %load, i32 1
88 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo:
90 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
91 ; GFX900-NEXT: s_waitcnt
92 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
93 ; GFX900-NEXT: s_setpc_b64
95 ; NO-D16-HI: ds_read_u16 v
96 define <2 x i16> @load_local_hi_v2i16_reglo(i16 addrspace(3)* %in, i16 %reg) #0 {
98 %load = load i16, i16 addrspace(3)* %in
99 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
100 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
101 ret <2 x i16> %build1
104 ; Show that we get reasonable regalloc without physreg constraints.
105 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg:
107 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
108 ; GFX900-NEXT: s_waitcnt
109 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
110 ; GFX900-NEXT: s_waitcnt
111 ; GFX900-NEXT: s_setpc_b64
113 ; NO-D16-HI: ds_read_u16 v
114 define void @load_local_hi_v2i16_reglo_vreg(i16 addrspace(3)* %in, i16 %reg) #0 {
116 %load = load i16, i16 addrspace(3)* %in
117 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
118 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
119 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
123 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo:
125 ; GFX900-NEXT: v_mov_b32_e32 v1, 0
126 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
127 ; GFX900-NEXT: s_waitcnt
128 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
129 ; GFX900-NEXT: s_setpc_b64
131 ; NO-D16-HI: ds_read_u16 v
132 define <2 x i16> @load_local_hi_v2i16_zerolo(i16 addrspace(3)* %in) #0 {
134 %load = load i16, i16 addrspace(3)* %in
135 %build = insertelement <2 x i16> zeroinitializer, i16 %load, i32 1
139 ; FIXME: Remove m0 initialization
140 ; GCN-LABEL: {{^}}load_local_hi_v2i16_zerolo_shift:
142 ; GFX900-NEXT: ds_read_u16 v0, v0
143 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
144 ; GFX900-NEXT: v_lshlrev_b32_e32 v0, 16, v0
145 ; GFX900-NEXT: s_setpc_b64
147 ; NO-D16-HI: ds_read_u16 v
148 ; NO-D16-HI: v_lshlrev_b32_e32 v0, 16, v0
149 define i32 @load_local_hi_v2i16_zerolo_shift(i16 addrspace(3)* %in) #0 {
151 %load = load i16, i16 addrspace(3)* %in
152 %zext = zext i16 %load to i32
153 %shift = shl i32 %zext, 16
157 ; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg:
159 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0
160 ; GFX900-NEXT: s_waitcnt
161 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
162 ; GFX900-NEXT: s_waitcnt
163 ; GFX900-NEXT: s_setpc_b64
165 ; NO-D16-HI: ds_read_u16 v
166 define void @load_local_hi_v2f16_reglo_vreg(half addrspace(3)* %in, half %reg) #0 {
168 %load = load half, half addrspace(3)* %in
169 %build0 = insertelement <2 x half> undef, half %reg, i32 0
170 %build1 = insertelement <2 x half> %build0, half %load, i32 1
171 store <2 x half> %build1, <2 x half> addrspace(1)* undef
175 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_zexti8:
177 ; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
178 ; GFX900-NEXT: s_waitcnt
179 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
180 ; GFX900-NEXT: s_waitcnt
181 ; GFX900-NEXT: s_setpc_b64
183 ; NO-D16-HI: ds_read_u8 v
184 define void @load_local_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
186 %load = load i8, i8 addrspace(3)* %in
187 %ext = zext i8 %load to i16
188 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
189 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
190 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
194 ; GCN-LABEL: {{^}}load_local_hi_v2i16_reglo_vreg_sexti8:
196 ; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
197 ; GFX900-NEXT: s_waitcnt
198 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
199 ; GFX900-NEXT: s_waitcnt
200 ; GFX900-NEXT: s_setpc_b64
202 ; NO-D16-HI: ds_read_i8 v
203 define void @load_local_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(3)* %in, i16 %reg) #0 {
205 %load = load i8, i8 addrspace(3)* %in
206 %ext = sext i8 %load to i16
207 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
208 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
209 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
213 ; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_zexti8:
215 ; GFX900-NEXT: ds_read_u8_d16_hi v1, v0
216 ; GFX900-NEXT: s_waitcnt
217 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
218 ; GFX900-NEXT: s_waitcnt
219 ; GFX900-NEXT: s_setpc_b64
221 ; NO-D16-HI: ds_read_u8 v
222 define void @load_local_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(3)* %in, half %reg) #0 {
224 %load = load i8, i8 addrspace(3)* %in
225 %ext = zext i8 %load to i16
226 %bitcast = bitcast i16 %ext to half
228 %build0 = insertelement <2 x half> undef, half %reg, i32 0
229 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
230 store <2 x half> %build1, <2 x half> addrspace(1)* undef
234 ; GCN-LABEL: {{^}}load_local_hi_v2f16_reglo_vreg_sexti8:
236 ; GFX900-NEXT: ds_read_i8_d16_hi v1, v0
237 ; GFX900-NEXT: s_waitcnt
238 ; GFX900-NEXT: global_store_dword v[0:1], v1, off{{$}}
239 ; GFX900-NEXT: s_waitcnt
240 ; GFX900-NEXT: s_setpc_b64
242 ; NO-D16-HI: ds_read_i8 v
243 define void @load_local_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(3)* %in, half %reg) #0 {
245 %load = load i8, i8 addrspace(3)* %in
246 %ext = sext i8 %load to i16
247 %bitcast = bitcast i16 %ext to half
249 %build0 = insertelement <2 x half> undef, half %reg, i32 0
250 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
251 store <2 x half> %build1, <2 x half> addrspace(1)* undef
255 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg:
257 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
258 ; GFX900-NEXT: s_waitcnt
259 ; GFX900-NEXT: global_store_dword
260 ; GFX900-NEXT: s_waitcnt
261 ; GFX900-NEXT: s_setpc_b64
262 define void @load_global_hi_v2i16_reglo_vreg(i16 addrspace(1)* %in, i16 %reg) #0 {
264 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 -2047
265 %load = load i16, i16 addrspace(1)* %gep
266 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
267 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
268 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
272 ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg:
274 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
275 ; GFX900-NEXT: s_waitcnt
276 ; GFX900-NEXT: global_store_dword
277 ; GFX900-NEXT: s_waitcnt
278 ; GFX900-NEXT: s_setpc_b64
279 define void @load_global_hi_v2f16_reglo_vreg(half addrspace(1)* %in, half %reg) #0 {
281 %gep = getelementptr inbounds half, half addrspace(1)* %in, i64 -2047
282 %load = load half, half addrspace(1)* %gep
283 %build0 = insertelement <2 x half> undef, half %reg, i32 0
284 %build1 = insertelement <2 x half> %build0, half %load, i32 1
285 store <2 x half> %build1, <2 x half> addrspace(1)* undef
289 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_zexti8:
291 ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
292 ; GFX900-NEXT: s_waitcnt
293 ; GFX900-NEXT: global_store_dword
294 ; GFX900-NEXT: s_waitcnt
295 ; GFX900-NEXT: s_setpc_b64
296 define void @load_global_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
298 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
299 %load = load i8, i8 addrspace(1)* %gep
300 %ext = zext i8 %load to i16
301 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
302 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
303 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
307 ; GCN-LABEL: {{^}}load_global_hi_v2i16_reglo_vreg_sexti8:
309 ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
310 ; GFX900-NEXT: s_waitcnt
311 ; GFX900-NEXT: global_store_dword
312 ; GFX900-NEXT: s_waitcnt
313 ; GFX900-NEXT: s_setpc_b64
314 define void @load_global_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(1)* %in, i16 %reg) #0 {
316 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
317 %load = load i8, i8 addrspace(1)* %gep
318 %ext = sext i8 %load to i16
319 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
320 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
321 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
325 ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_sexti8:
327 ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
328 ; GFX900-NEXT: s_waitcnt
329 ; GFX900-NEXT: global_store_dword
330 ; GFX900-NEXT: s_waitcnt
331 ; GFX900-NEXT: s_setpc_b64
332 define void @load_global_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(1)* %in, half %reg) #0 {
334 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
335 %load = load i8, i8 addrspace(1)* %gep
336 %ext = sext i8 %load to i16
337 %bitcast = bitcast i16 %ext to half
338 %build0 = insertelement <2 x half> undef, half %reg, i32 0
339 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
340 store <2 x half> %build1, <2 x half> addrspace(1)* undef
344 ; GCN-LABEL: {{^}}load_global_hi_v2f16_reglo_vreg_zexti8:
346 ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
347 ; GFX900-NEXT: s_waitcnt
348 ; GFX900-NEXT: global_store_dword
349 ; GFX900-NEXT: s_waitcnt
350 ; GFX900-NEXT: s_setpc_b64
351 define void @load_global_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(1)* %in, half %reg) #0 {
353 %gep = getelementptr inbounds i8, i8 addrspace(1)* %in, i64 -4095
354 %load = load i8, i8 addrspace(1)* %gep
355 %ext = zext i8 %load to i16
356 %bitcast = bitcast i16 %ext to half
357 %build0 = insertelement <2 x half> undef, half %reg, i32 0
358 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
359 store <2 x half> %build1, <2 x half> addrspace(1)* undef
363 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg:
365 ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
366 ; GFX900-NEXT: s_waitcnt
367 ; GFX900-NEXT: global_store_dword v[0:1], v2
368 ; GFX900-NEXT: s_waitcnt
369 ; GFX900-NEXT: s_setpc_b64
371 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
372 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
373 ; GFX803: v_or_b32_sdwa
374 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
375 define void @load_flat_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
377 %load = load i16, i16* %in
378 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
379 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
380 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
384 ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg:
386 ; GFX900-NEXT: flat_load_short_d16_hi v2, v[0:1]
387 ; GFX900-NEXT: s_waitcnt
388 ; GFX900-NEXT: global_store_dword v[0:1], v2
389 ; GFX900-NEXT: s_waitcnt
390 ; GFX900-NEXT: s_setpc_b64
392 ; NO-D16-HI: flat_load_ushort v{{[0-9]+}}
393 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
394 ; GFX803: v_or_b32_sdwa
395 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
396 define void @load_flat_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
398 %load = load half, half* %in
399 %build0 = insertelement <2 x half> undef, half %reg, i32 0
400 %build1 = insertelement <2 x half> %build0, half %load, i32 1
401 store <2 x half> %build1, <2 x half> addrspace(1)* undef
405 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_zexti8:
407 ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
408 ; GFX900-NEXT: s_waitcnt
409 ; GFX900-NEXT: global_store_dword v[0:1], v2
410 ; GFX900-NEXT: s_waitcnt
411 ; GFX900-NEXT: s_setpc_b64
413 ; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
414 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
415 ; GFX803: v_or_b32_sdwa
416 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
417 define void @load_flat_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
419 %load = load i8, i8* %in
420 %ext = zext i8 %load to i16
421 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
422 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
423 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
427 ; GCN-LABEL: {{^}}load_flat_hi_v2i16_reglo_vreg_sexti8:
429 ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
430 ; GFX900-NEXT: s_waitcnt
431 ; GFX900-NEXT: global_store_dword v[0:1], v2
432 ; GFX900-NEXT: s_waitcnt
433 ; GFX900-NEXT: s_setpc_b64
435 ; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
436 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
437 ; GFX803: v_or_b32_sdwa
438 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
439 define void @load_flat_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
441 %load = load i8, i8* %in
442 %ext = sext i8 %load to i16
443 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
444 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
445 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
449 ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_zexti8:
451 ; GFX900-NEXT: flat_load_ubyte_d16_hi v2, v[0:1]
452 ; GFX900-NEXT: s_waitcnt
453 ; GFX900-NEXT: global_store_dword v[0:1], v2
454 ; GFX900-NEXT: s_waitcnt
455 ; GFX900-NEXT: s_setpc_b64
457 ; NO-D16-HI: flat_load_ubyte v{{[0-9]+}}
458 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
459 ; GFX803: v_or_b32_sdwa
460 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
461 define void @load_flat_hi_v2f16_reglo_vreg_zexti8(i8* %in, half %reg) #0 {
463 %load = load i8, i8* %in
464 %ext = zext i8 %load to i16
465 %bitcast = bitcast i16 %ext to half
466 %build0 = insertelement <2 x half> undef, half %reg, i32 0
467 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
468 store <2 x half> %build1, <2 x half> addrspace(1)* undef
472 ; GCN-LABEL: {{^}}load_flat_hi_v2f16_reglo_vreg_sexti8:
474 ; GFX900-NEXT: flat_load_sbyte_d16_hi v2, v[0:1]
475 ; GFX900-NEXT: s_waitcnt
476 ; GFX900-NEXT: global_store_dword v[0:1], v2
477 ; GFX900-NEXT: s_waitcnt
478 ; GFX900-NEXT: s_setpc_b64
480 ; NO-D16-HI: flat_load_sbyte v{{[0-9]+}}
481 ; GFX803: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
482 ; GFX803: v_or_b32_sdwa
483 ; GFX906: v_lshl_or_b32 v{{[0-9]+}}, v{{[0-9]+}}, 16,
484 define void @load_flat_hi_v2f16_reglo_vreg_sexti8(i8* %in, half %reg) #0 {
486 %load = load i8, i8* %in
487 %ext = sext i8 %load to i16
488 %bitcast = bitcast i16 %ext to half
489 %build0 = insertelement <2 x half> undef, half %reg, i32 0
490 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
491 store <2 x half> %build1, <2 x half> addrspace(1)* undef
495 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
497 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
498 ; GFX900-NEXT: s_waitcnt
499 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
500 ; GFX900-NEXT: s_waitcnt
501 ; GFX900-NEXT: s_setpc_b64
503 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
504 define void @load_private_hi_v2i16_reglo_vreg(i16 addrspace(5)* byval %in, i16 %reg) #0 {
506 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i64 2047
507 %load = load i16, i16 addrspace(5)* %gep
508 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
509 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
510 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
514 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
516 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:4094{{$}}
517 ; GFX900-NEXT: s_waitcnt
518 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
519 ; GFX900-NEXT: s_waitcnt
520 ; GFX900-NEXT: s_setpc_b64
522 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s32 offset:4094{{$}}
523 define void @load_private_hi_v2f16_reglo_vreg(half addrspace(5)* byval %in, half %reg) #0 {
525 %gep = getelementptr inbounds half, half addrspace(5)* %in, i64 2047
526 %load = load half, half addrspace(5)* %gep
527 %build0 = insertelement <2 x half> undef, half %reg, i32 0
528 %build1 = insertelement <2 x half> %build0, half %load, i32 1
529 store <2 x half> %build1, <2 x half> addrspace(1)* undef
533 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
535 ; GFX900: buffer_load_short_d16_hi v0, off, s[0:3], s33 offset:4094{{$}}
537 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
538 ; GFX900-NEXT: s_waitcnt
539 ; GFX900-NEXT: s_setpc_b64
541 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
542 define void @load_private_hi_v2i16_reglo_vreg_nooff(i16 addrspace(5)* byval %in, i16 %reg) #0 {
544 %load = load volatile i16, i16 addrspace(5)* inttoptr (i32 4094 to i16 addrspace(5)*)
545 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
546 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
547 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
551 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff:
553 ; GFX900-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
554 ; GFX900-NEXT: s_waitcnt
555 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
556 ; GFX900-NEXT: s_waitcnt
557 ; GFX900-NEXT: s_setpc_b64
559 ; NO-D16-HI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s33 offset:4094{{$}}
560 define void @load_private_hi_v2f16_reglo_vreg_nooff(half addrspace(5)* %in, half %reg) #0 {
562 %load = load volatile half, half addrspace(5)* inttoptr (i32 4094 to half addrspace(5)*)
563 %build0 = insertelement <2 x half> undef, half %reg, i32 0
564 %build1 = insertelement <2 x half> %build0, half %load, i32 1
565 store <2 x half> %build1, <2 x half> addrspace(1)* undef
569 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
571 ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
572 ; GFX900-NEXT: s_waitcnt
573 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
574 ; GFX900-NEXT: s_waitcnt
575 ; GFX900-NEXT: s_setpc_b64
577 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
578 define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
580 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
581 %load = load i8, i8 addrspace(5)* %gep
582 %ext = zext i8 %load to i16
583 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
584 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
585 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
589 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_zexti8:
591 ; GFX900: buffer_load_ubyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
592 ; GFX900-NEXT: s_waitcnt
593 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
594 ; GFX900-NEXT: s_waitcnt
595 ; GFX900-NEXT: s_setpc_b64
597 ; NO-D16-HI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
598 define void @load_private_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
600 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
601 %load = load i8, i8 addrspace(5)* %gep
602 %ext = zext i8 %load to i16
603 %bitcast = bitcast i16 %ext to half
604 %build0 = insertelement <2 x half> undef, half %reg, i32 0
605 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
606 store <2 x half> %build1, <2 x half> addrspace(1)* undef
610 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_sexti8:
612 ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
613 ; GFX900-NEXT: s_waitcnt
614 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
615 ; GFX900-NEXT: s_waitcnt
616 ; GFX900-NEXT: s_setpc_b64
618 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
619 define void @load_private_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, half %reg) #0 {
621 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
622 %load = load i8, i8 addrspace(5)* %gep
623 %ext = sext i8 %load to i16
624 %bitcast = bitcast i16 %ext to half
625 %build0 = insertelement <2 x half> undef, half %reg, i32 0
626 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
627 store <2 x half> %build1, <2 x half> addrspace(1)* undef
631 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
633 ; GFX900: buffer_load_sbyte_d16_hi v0, off, s[0:3], s32 offset:4095{{$}}
634 ; GFX900-NEXT: s_waitcnt
635 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
636 ; GFX900-NEXT: s_waitcnt
637 ; GFX900-NEXT: s_setpc_b64
639 ; NO-D16-HI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s32 offset:4095{{$}}
640 define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8 addrspace(5)* byval %in, i16 %reg) #0 {
642 %gep = getelementptr inbounds i8, i8 addrspace(5)* %in, i64 4095
643 %load = load i8, i8 addrspace(5)* %gep
644 %ext = sext i8 %load to i16
645 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
646 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
647 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
651 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_zexti8:
653 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
654 ; GFX900-NEXT: s_waitcnt
655 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
656 ; GFX900-NEXT: s_waitcnt
657 ; GFX900-NEXT: s_setpc_b64
659 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
660 define void @load_private_hi_v2i16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
662 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
663 %ext = zext i8 %load to i16
664 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
665 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
666 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
670 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff_sexti8:
672 ; GFX900-NEXT: buffer_load_sbyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
673 ; GFX900-NEXT: s_waitcnt
674 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
675 ; GFX900-NEXT: s_waitcnt
676 ; GFX900-NEXT: s_setpc_b64
678 ; NO-D16-HI: buffer_load_sbyte v0, off, s[0:3], s33 offset:4094{{$}}
679 define void @load_private_hi_v2i16_reglo_vreg_nooff_sexti8(i8 addrspace(5)* %in, i16 %reg) #0 {
681 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
682 %ext = sext i8 %load to i16
683 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
684 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
685 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
689 ; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg_nooff_zexti8:
691 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v1, off, s[0:3], s33 offset:4094{{$}}
692 ; GFX900-NEXT: s_waitcnt
693 ; GFX900-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
694 ; GFX900-NEXT: s_waitcnt
695 ; GFX900-NEXT: s_setpc_b64
697 ; NO-D16-HI: buffer_load_ubyte v0, off, s[0:3], s33 offset:4094{{$}}
698 define void @load_private_hi_v2f16_reglo_vreg_nooff_zexti8(i8 addrspace(5)* %in, half %reg) #0 {
700 %load = load volatile i8, i8 addrspace(5)* inttoptr (i32 4094 to i8 addrspace(5)*)
701 %ext = zext i8 %load to i16
702 %bc.ext = bitcast i16 %ext to half
703 %build0 = insertelement <2 x half> undef, half %reg, i32 0
704 %build1 = insertelement <2 x half> %build0, half %bc.ext, i32 1
705 store <2 x half> %build1, <2 x half> addrspace(1)* undef
709 ; GCN-LABEL: {{^}}load_constant_hi_v2i16_reglo_vreg:
711 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
712 ; GFX900-NEXT: s_waitcnt
713 ; GFX900-NEXT: global_store_dword
714 ; GFX900-NEXT: s_waitcnt
715 ; GFX900-NEXT: s_setpc_b64
717 ; GFX803: flat_load_ushort
718 ; GFX906: global_load_ushort
719 define void @load_constant_hi_v2i16_reglo_vreg(i16 addrspace(4)* %in, i16 %reg) #0 {
721 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 -2047
722 %load = load i16, i16 addrspace(4)* %gep
723 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
724 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
725 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
729 ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg:
731 ; GFX900-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:-4094
732 ; GFX900-NEXT: s_waitcnt
733 ; GFX900-NEXT: global_store_dword
734 ; GFX900-NEXT: s_waitcnt
735 ; GFX900-NEXT: s_setpc_b64
737 ; GFX803: flat_load_ushort
738 ; GFX906: global_load_ushort
739 define void @load_constant_hi_v2f16_reglo_vreg(half addrspace(4)* %in, half %reg) #0 {
741 %gep = getelementptr inbounds half, half addrspace(4)* %in, i64 -2047
742 %load = load half, half addrspace(4)* %gep
743 %build0 = insertelement <2 x half> undef, half %reg, i32 0
744 %build1 = insertelement <2 x half> %build0, half %load, i32 1
745 store <2 x half> %build1, <2 x half> addrspace(1)* undef
749 ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_sexti8:
751 ; GFX900-NEXT: global_load_sbyte_d16_hi v2, v[0:1], off offset:-4095
752 ; GFX900-NEXT: s_waitcnt
753 ; GFX900-NEXT: global_store_dword
754 ; GFX900-NEXT: s_waitcnt
755 ; GFX900-NEXT: s_setpc_b64
756 define void @load_constant_hi_v2f16_reglo_vreg_sexti8(i8 addrspace(4)* %in, half %reg) #0 {
758 %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
759 %load = load i8, i8 addrspace(4)* %gep
760 %ext = sext i8 %load to i16
761 %bitcast = bitcast i16 %ext to half
762 %build0 = insertelement <2 x half> undef, half %reg, i32 0
763 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
764 store <2 x half> %build1, <2 x half> addrspace(1)* undef
768 ; GCN-LABEL: {{^}}load_constant_hi_v2f16_reglo_vreg_zexti8:
770 ; GFX900-NEXT: global_load_ubyte_d16_hi v2, v[0:1], off offset:-4095
771 ; GFX900-NEXT: s_waitcnt
772 ; GFX900-NEXT: global_store_dword
773 ; GFX900-NEXT: s_waitcnt
774 ; GFX900-NEXT: s_setpc_b64
775 define void @load_constant_hi_v2f16_reglo_vreg_zexti8(i8 addrspace(4)* %in, half %reg) #0 {
777 %gep = getelementptr inbounds i8, i8 addrspace(4)* %in, i64 -4095
778 %load = load i8, i8 addrspace(4)* %gep
779 %ext = zext i8 %load to i16
780 %bitcast = bitcast i16 %ext to half
781 %build0 = insertelement <2 x half> undef, half %reg, i32 0
782 %build1 = insertelement <2 x half> %build0, half %bitcast, i32 1
783 store <2 x half> %build1, <2 x half> addrspace(1)* undef
787 ; Local object gives known offset, so requires converting from the offen form to the offset form.
790 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_to_offset:
791 ; GFX900: buffer_store_dword
792 ; GFX900-NEXT: buffer_load_short_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4094
793 define void @load_private_hi_v2i16_reglo_vreg_to_offset(i16 %reg) #0 {
795 %obj0 = alloca [10 x i32], align 4, addrspace(5)
796 %obj1 = alloca [4096 x i16], align 2, addrspace(5)
797 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
798 store volatile i32 123, i32 addrspace(5)* %bc
799 %gep = getelementptr inbounds [4096 x i16], [4096 x i16] addrspace(5)* %obj1, i32 0, i32 2027
800 %load = load i16, i16 addrspace(5)* %gep
801 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
802 %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
803 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
807 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8_to_offset:
808 ; GFX900: buffer_store_dword
809 ; GFX900-NEXT: buffer_load_sbyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
810 define void @load_private_hi_v2i16_reglo_vreg_sexti8_to_offset(i16 %reg) #0 {
812 %obj0 = alloca [10 x i32], align 4, addrspace(5)
813 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
814 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
815 store volatile i32 123, i32 addrspace(5)* %bc
816 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
817 %load = load i8, i8 addrspace(5)* %gep
818 %ext = sext i8 %load to i16
819 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
820 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
821 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
825 ; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8_to_offset:
826 ; GFX900: buffer_store_dword
827 ; GFX900-NEXT: buffer_load_ubyte_d16_hi v{{[0-9]+}}, off, s[0:3], s32 offset:4095
828 define void @load_private_hi_v2i16_reglo_vreg_zexti8_to_offset(i16 %reg) #0 {
830 %obj0 = alloca [10 x i32], align 4, addrspace(5)
831 %obj1 = alloca [4096 x i8], align 2, addrspace(5)
832 %bc = bitcast [10 x i32] addrspace(5)* %obj0 to i32 addrspace(5)*
833 store volatile i32 123, i32 addrspace(5)* %bc
834 %gep = getelementptr inbounds [4096 x i8], [4096 x i8] addrspace(5)* %obj1, i32 0, i32 4055
835 %load = load i8, i8 addrspace(5)* %gep
836 %ext = zext i8 %load to i16
837 %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
838 %build1 = insertelement <2 x i16> %build0, i16 %ext, i32 1
839 store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
; Two volatile i16 loads from addrspace(3), at %in and at the next i16 element,
; are combined into the returned <2 x i16> (first load in element 0, second in
; element 1).
; The gfx900 checks expect a plain u16 read followed by a d16_hi read at
; offset:2 into the same register, with a waitcnt after each read.
843 ; FIXME: Remove m0 init and waitcnt between reads
844 ; FIXME: Is there a cost to using the extload over not?
845 ; GCN-LABEL: {{^}}load_local_v2i16_split_multi_chain:
847 ; GFX900-NEXT: ds_read_u16 v1, v0
848 ; GFX900-NEXT: s_waitcnt
849 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:2
850 ; GFX900-NEXT: s_waitcnt
851 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
852 ; GFX900-NEXT: s_setpc_b64
853 define <2 x i16> @load_local_v2i16_split_multi_chain(i16 addrspace(3)* %in) #0 {
855 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
856 %load0 = load volatile i16, i16 addrspace(3)* %in
857 %load1 = load volatile i16, i16 addrspace(3)* %gep
858 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
859 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
860 ret <2 x i16> %build1
; Two non-volatile i16 loads from addrspace(3), at %in and at i16 element 8,
; with no intervening memory operation; the results form elements 0 and 1 of
; the returned <2 x i16>.
; The gfx900 checks expect the second load to be a d16_hi read at offset:16
; into the register written by the first read. The targets checked under the
; no-d16-hi prefix are only expected to emit two plain u16 reads.
863 ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_samechain:
864 ; GFX900: ds_read_u16 v1, v0
865 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
866 ; GFX900-NEXT: ds_read_u16_d16_hi v1, v0 offset:16
867 ; GFX900-NEXT: s_waitcnt lgkmcnt(0)
868 ; GFX900-NEXT: v_mov_b32_e32 v0, v1
869 ; GFX900-NEXT: s_setpc_b64
871 ; NO-D16-HI: ds_read_u16
872 ; NO-D16-HI: ds_read_u16
873 define <2 x i16> @load_local_lo_hi_v2i16_samechain(i16 addrspace(3)* %in) #0 {
875 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
876 %load.lo = load i16, i16 addrspace(3)* %in
877 %load.hi = load i16, i16 addrspace(3)* %gep
878 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
879 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
880 ret <2 x i16> %build1
; Broadcast case: a single i16 load from addrspace(3) supplies both elements of
; the returned <2 x i16>, so only one u16 read is expected on every target.
; The two packing checks are tied to the gfx900 run line; they previously used
; a bare GFX9 prefix that no RUN line enables, so they were never evaluated.
; %gep is not referenced by any later instruction in this function.
884 ; GCN-LABEL: {{^}}load_local_v2i16_broadcast:
885 ; GCN: ds_read_u16 [[LOAD:v[0-9]+]]
887 ; GFX900: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, [[LOAD]]
888 ; GFX900: v_lshl_or_b32 v0, [[LOAD]], 16, [[AND]]
889 define <2 x i16> @load_local_v2i16_broadcast(i16 addrspace(3)* %in) #0 {
891 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 1
892 %load0 = load i16, i16 addrspace(3)* %in
893 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
894 %build1 = insertelement <2 x i16> %build0, i16 %load0, i32 1
895 ret <2 x i16> %build1
; Same lo/hi load pair as the samechain test (addrspace(3), %in and i16
; element 8), but an i16 store of 123 through the second pointer argument
; %may.alias sits between the two loads.
; The checks expect the write to stay between the two reads; on gfx900 the
; second read is still expected to be a d16_hi read into the first read's
; register.
898 ; GCN-LABEL: {{^}}load_local_lo_hi_v2i16_side_effect:
899 ; GFX900: ds_read_u16 [[LOAD0:v[0-9]+]], v0
900 ; GFX900: ds_write_b16
901 ; GFX900: ds_read_u16_d16_hi [[LOAD0]], v0 offset:16
903 ; NO-D16-HI: ds_read_u16
904 ; NO-D16-HI: ds_write_b16
905 ; NO-D16-HI: ds_read_u16
906 define <2 x i16> @load_local_lo_hi_v2i16_side_effect(i16 addrspace(3)* %in, i16 addrspace(3)* %may.alias) #0 {
908 %gep = getelementptr inbounds i16, i16 addrspace(3)* %in, i32 8
909 %load.lo = load i16, i16 addrspace(3)* %in
910 store i16 123, i16 addrspace(3)* %may.alias
911 %load.hi = load i16, i16 addrspace(3)* %gep
912 %build0 = insertelement <2 x i16> undef, i16 %load.lo, i32 0
913 %build1 = insertelement <2 x i16> %build0, i16 %load.hi, i32 1
914 ret <2 x i16> %build1
; Two volatile i16 loads from addrspace(1), at %in and at the next i16 element,
; packed into the returned <2 x i16>.
; The gfx900 checks expect a ushort global load followed by a short d16_hi
; global load into the same register (v2), each followed by a waitcnt, and
; then a move of the result into v0.
917 ; FIXME: Remove waitcnt between reads
918 ; GCN-LABEL: {{^}}load_global_v2i16_split:
920 ; GFX900-NEXT: global_load_ushort v2
921 ; GFX900-NEXT: s_waitcnt
922 ; GFX900-NEXT: global_load_short_d16_hi v2
923 ; GFX900-NEXT: s_waitcnt
924 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
925 ; GFX900-NEXT: s_setpc_b64
926 define <2 x i16> @load_global_v2i16_split(i16 addrspace(1)* %in) #0 {
928 %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 1
929 %load0 = load volatile i16, i16 addrspace(1)* %in
930 %load1 = load volatile i16, i16 addrspace(1)* %gep
931 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
932 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
933 ret <2 x i16> %build1
; Two volatile i16 loads through a pointer with no explicit address space
; (i16*), at %in and at the next i16 element, packed into the returned
; <2 x i16>.
; The gfx900 checks expect a ushort flat load followed by a short d16_hi flat
; load into the same register (v2), each followed by a waitcnt.
936 ; FIXME: Remove waitcnt between reads
937 ; GCN-LABEL: {{^}}load_flat_v2i16_split:
939 ; GFX900-NEXT: flat_load_ushort v2
940 ; GFX900-NEXT: s_waitcnt
941 ; GFX900-NEXT: flat_load_short_d16_hi v2
942 ; GFX900-NEXT: s_waitcnt
943 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
944 ; GFX900-NEXT: s_setpc_b64
945 define <2 x i16> @load_flat_v2i16_split(i16* %in) #0 {
947 %gep = getelementptr inbounds i16, i16* %in, i64 1
948 %load0 = load volatile i16, i16* %in
949 %load1 = load volatile i16, i16* %gep
950 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
951 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
952 ret <2 x i16> %build1
; Two volatile i16 loads from addrspace(4), at %in and at the next i16 element,
; packed into the returned <2 x i16>.
; The gfx900 checks expect the same instruction sequence as the addrspace(1)
; split test: a ushort global load, then a short d16_hi global load into the
; same register (v2), each followed by a waitcnt.
955 ; FIXME: Remove waitcnt between reads
956 ; GCN-LABEL: {{^}}load_constant_v2i16_split:
958 ; GFX900-NEXT: global_load_ushort v2
959 ; GFX900-NEXT: s_waitcnt
960 ; GFX900-NEXT: global_load_short_d16_hi v2
961 ; GFX900-NEXT: s_waitcnt
962 ; GFX900-NEXT: v_mov_b32_e32 v0, v2
963 ; GFX900-NEXT: s_setpc_b64
964 define <2 x i16> @load_constant_v2i16_split(i16 addrspace(4)* %in) #0 {
966 %gep = getelementptr inbounds i16, i16 addrspace(4)* %in, i64 1
967 %load0 = load volatile i16, i16 addrspace(4)* %in
968 %load1 = load volatile i16, i16 addrspace(4)* %gep
969 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
970 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
971 ret <2 x i16> %build1
; Two volatile i16 loads from a byval addrspace(5) pointer argument, at %in and
; at the next i16 element, packed into the returned <2 x i16>.
; The gfx900 checks expect a ushort buffer load with no offset followed by a
; short d16_hi buffer load at offset:2, both writing v0, each followed by a
; waitcnt.
; NOTE(review): the "m0 init" FIXME below looks copied from the addrspace(3)
; test; none of the checked instructions here mention m0 - confirm whether it
; still applies.
974 ; FIXME: Remove m0 init and waitcnt between reads
975 ; FIXME: Is there a cost to using the extload over not?
976 ; GCN-LABEL: {{^}}load_private_v2i16_split:
978 ; GFX900: buffer_load_ushort v0, off, s[0:3], s32{{$}}
979 ; GFX900-NEXT: s_waitcnt
980 ; GFX900-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s32 offset:2
981 ; GFX900-NEXT: s_waitcnt
982 ; GFX900-NEXT: s_setpc_b64
983 define <2 x i16> @load_private_v2i16_split(i16 addrspace(5)* byval %in) #0 {
985 %gep = getelementptr inbounds i16, i16 addrspace(5)* %in, i32 1
986 %load0 = load volatile i16, i16 addrspace(5)* %in
987 %load1 = load volatile i16, i16 addrspace(5)* %gep
988 %build0 = insertelement <2 x i16> undef, i16 %load0, i32 0
989 %build1 = insertelement <2 x i16> %build0, i16 %load1, i32 1
990 ret <2 x i16> %build1
; Attribute group #0, referenced by the test functions above: nounwind only.
993 attributes #0 = { nounwind }