1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
3 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
4 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
6 ; Testing for ds_read/write_b128
7 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
10 ; FUNC-LABEL: {{^}}local_load_i8:
12 ; SICIVI: s_mov_b32 m0
16 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %ld = load i8, i8 addrspace(3)* %in
  store i8 %ld, i8 addrspace(3)* %out
  ret void
}
24 ; FUNC-LABEL: {{^}}local_load_v2i8:
26 ; SICIVI: s_mov_b32 m0
30 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
  store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
  ret void
}
38 ; FUNC-LABEL: {{^}}local_load_v3i8:
define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
  store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
  ret void
}
50 ; FUNC-LABEL: {{^}}local_load_v4i8:
define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
  store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
  ret void
}
62 ; FUNC-LABEL: {{^}}local_load_v8i8:
define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
  store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
  ret void
}
75 ; FUNC-LABEL: {{^}}local_load_v16i8:
77 ; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
78 ; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}}
define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
  ret void
}
91 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
94 ; SICIVI: s_mov_b32 m0
97 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %a to i32
  store i32 %ext, i32 addrspace(3)* %out
  ret void
}
105 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
108 ; SICIVI: s_mov_b32 m0
111 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %ld = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %ld to i32
  store i32 %ext, i32 addrspace(3)* %out
  ret void
}
120 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
122 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = zext <1 x i8> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
  ret void
}
130 ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
133 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = sext <1 x i8> %load to <1 x i32>
  store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
  ret void
}
142 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
146 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = zext <2 x i8> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
  ret void
}
154 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
157 ; SICIVI: s_mov_b32 m0
159 ; FIXME: Need to optimize this sequence to avoid extra shift on VI.
160 ; t23: i16 = srl t39, Constant:i32<8>
161 ; t31: i32 = any_extend t23
162 ; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
164 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
165 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
167 ; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
168 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
169 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
171 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = sext <2 x i8> %load to <2 x i32>
  store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
  ret void
}
181 ; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
185 ; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
186 ; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
187 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
188 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
  %ext = zext <3 x i8> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
  ret void
}
199 ; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
202 ; SICIVI: s_mov_b32 m0
210 ; GCN-DAG: ds_write_b64
211 ; GCN-DAG: ds_write_b32
define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
  %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
  %ext = sext <3 x i8> %ld to <3 x i32>
  store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
  ret void
}
225 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
228 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = zext <4 x i8> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
  ret void
}
242 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
245 ; SICIVI: s_mov_b32 m0
248 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = sext <4 x i8> %load to <4 x i32>
  store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
  ret void
}
260 ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
261 ; SICIVI: s_mov_b32 m0
264 ; EG-DAG: LDS_READ_RET
265 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = zext <8 x i8> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
  ret void
}
279 ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
280 ; SICIVI: s_mov_b32 m0
283 ; EG-DAG: LDS_READ_RET
284 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = sext <8 x i8> %load to <8 x i32>
  store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
  ret void
}
300 ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
301 ; SICIVI: s_mov_b32 m0
304 ; EG-DAG: LDS_READ_RET
305 ; EG-DAG: LDS_READ_RET
306 ; EG-DAG: LDS_READ_RET
307 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = zext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
  ret void
}
327 ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
328 ; SICIVI: s_mov_b32 m0
331 ; EG-DAG: LDS_READ_RET
332 ; EG-DAG: LDS_READ_RET
333 ; EG-DAG: LDS_READ_RET
334 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = sext <16 x i8> %load to <16 x i32>
  store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
  ret void
}
358 ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
359 ; SICIVI: s_mov_b32 m0
362 ; EG-DAG: LDS_READ_RET
363 ; EG-DAG: LDS_READ_RET
364 ; EG-DAG: LDS_READ_RET
365 ; EG-DAG: LDS_READ_RET
366 ; EG-DAG: LDS_READ_RET
367 ; EG-DAG: LDS_READ_RET
368 ; EG-DAG: LDS_READ_RET
369 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = zext <32 x i8> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
  ret void
}
377 ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
378 ; SICIVI: s_mov_b32 m0
381 ; EG-DAG: LDS_READ_RET
382 ; EG-DAG: LDS_READ_RET
383 ; EG-DAG: LDS_READ_RET
384 ; EG-DAG: LDS_READ_RET
385 ; EG-DAG: LDS_READ_RET
386 ; EG-DAG: LDS_READ_RET
387 ; EG-DAG: LDS_READ_RET
388 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = sext <32 x i8> %load to <32 x i32>
  store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
  ret void
}
396 ; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
397 ; SICIVI: s_mov_b32 m0
400 ; EG-DAG: LDS_READ_RET
401 ; EG-DAG: LDS_READ_RET
402 ; EG-DAG: LDS_READ_RET
403 ; EG-DAG: LDS_READ_RET
404 ; EG-DAG: LDS_READ_RET
405 ; EG-DAG: LDS_READ_RET
406 ; EG-DAG: LDS_READ_RET
407 ; EG-DAG: LDS_READ_RET
408 ; EG-DAG: LDS_READ_RET
409 ; EG-DAG: LDS_READ_RET
410 ; EG-DAG: LDS_READ_RET
411 ; EG-DAG: LDS_READ_RET
412 ; EG-DAG: LDS_READ_RET
413 ; EG-DAG: LDS_READ_RET
414 ; EG-DAG: LDS_READ_RET
415 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
  %ext = zext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
  ret void
}
423 ; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
424 ; SICIVI: s_mov_b32 m0
427 ; EG-DAG: LDS_READ_RET
428 ; EG-DAG: LDS_READ_RET
429 ; EG-DAG: LDS_READ_RET
430 ; EG-DAG: LDS_READ_RET
431 ; EG-DAG: LDS_READ_RET
432 ; EG-DAG: LDS_READ_RET
433 ; EG-DAG: LDS_READ_RET
434 ; EG-DAG: LDS_READ_RET
435 ; EG-DAG: LDS_READ_RET
436 ; EG-DAG: LDS_READ_RET
437 ; EG-DAG: LDS_READ_RET
438 ; EG-DAG: LDS_READ_RET
439 ; EG-DAG: LDS_READ_RET
440 ; EG-DAG: LDS_READ_RET
441 ; EG-DAG: LDS_READ_RET
442 ; EG-DAG: LDS_READ_RET
define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
  %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
  %ext = sext <64 x i8> %load to <64 x i32>
  store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
  ret void
}
450 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
451 ; SICIVI: s_mov_b32 m0
454 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
455 ; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
456 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
458 ; EG: LDS_UBYTE_READ_RET
459 ; EG: MOV {{.*}}, literal
define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %a to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}
468 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
469 ; SICIVI: s_mov_b32 m0
472 ; GCN: ds_read_i8 v[[LO:[0-9]+]],
473 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
475 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
477 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %a to i64
  store i64 %ext, i64 addrspace(3)* %out
  ret void
}
488 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
489 ; SICIVI: s_mov_b32 m0
492 ; EG: LDS_UBYTE_READ_RET
493 ; EG: MOV {{.*}}, literal
define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = zext <1 x i8> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}
503 ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
504 ; SICIVI: s_mov_b32 m0
507 ; EG: LDS_UBYTE_READ_RET
define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = sext <1 x i8> %load to <1 x i64>
  store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
  ret void
}
518 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
519 ; SICIVI: s_mov_b32 m0
522 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = zext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}
530 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
531 ; SICIVI: s_mov_b32 m0
534 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = sext <2 x i8> %load to <2 x i64>
  store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
  ret void
}
544 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
545 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = zext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}
556 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
557 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = sext <4 x i8> %load to <4 x i64>
  store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
  ret void
}
568 ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
569 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = zext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}
581 ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
582 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = sext <8 x i8> %load to <8 x i64>
  store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
  ret void
}
603 ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
604 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = zext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}
618 ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
619 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = sext <16 x i8> %load to <16 x i64>
  store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
  ret void
}
633 ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
634 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = zext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}
652 ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
653 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = sext <32 x i8> %load to <32 x i64>
  store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
  ret void
}
671 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
672 ; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
673 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
674 ; %ext = zext <64 x i8> %load to <64 x i64>
675 ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
679 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
680 ; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
681 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
682 ; %ext = sext <64 x i8> %load to <64 x i64>
683 ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
687 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
688 ; SICIVI: s_mov_b32 m0
690 ; GCN: ds_read_u8 v[[VAL:[0-9]+]],
; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
693 ; EG: LDS_UBYTE_READ_RET
694 ; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = zext i8 %a to i16
  store i16 %ext, i16 addrspace(3)* %out
  ret void
}
702 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
703 ; SICIVI: s_mov_b32 m0
705 ; GCN: ds_read_i8 v[[VAL:[0-9]+]],
706 ; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
708 ; EG: LDS_UBYTE_READ_RET
710 ; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
  %a = load i8, i8 addrspace(3)* %in
  %ext = sext i8 %a to i16
  store i16 %ext, i16 addrspace(3)* %out
  ret void
}
718 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
719 ; SICIVI: s_mov_b32 m0
722 ; EG: LDS_UBYTE_READ_RET
723 ; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = zext <1 x i8> %load to <1 x i16>
  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
  ret void
}
731 ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
732 ; SICIVI: s_mov_b32 m0
735 ; EG: LDS_UBYTE_READ_RET
737 ; EG: LDS_SHORT_WRITE
define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
  %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
  %ext = sext <1 x i8> %load to <1 x i16>
  store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
  ret void
}
745 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
746 ; SICIVI: s_mov_b32 m0
749 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = zext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
  ret void
}
758 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
759 ; SICIVI: s_mov_b32 m0
762 ; EG: LDS_USHORT_READ_RET
define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
  %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
  %ext = sext <2 x i8> %load to <2 x i16>
  store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
  ret void
}
773 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
774 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = zext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
  ret void
}
787 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
788 ; SICIVI: s_mov_b32 m0
792 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
  %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
  %ext = sext <4 x i8> %load to <4 x i16>
  store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
  ret void
}
806 ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
807 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = zext <8 x i8> %load to <8 x i16>
  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
  ret void
}
823 ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
824 ; SICIVI: s_mov_b32 m0
829 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
  %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
  %ext = sext <8 x i8> %load to <8 x i16>
  store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
  ret void
}
849 ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
850 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = zext <16 x i8> %load to <16 x i16>
  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
  ret void
}
872 ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
873 ; SICIVI: s_mov_b32 m0
880 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
  %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
  %ext = sext <16 x i8> %load to <16 x i16>
  store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
  ret void
}
912 ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
913 ; SICIVI: s_mov_b32 m0
define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = zext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
  ret void
}
947 ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
948 ; SICIVI: s_mov_b32 m0
959 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
  %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
  %ext = sext <32 x i8> %load to <32 x i16>
  store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
  ret void
}
1011 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
1012 ; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
1013 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
1014 ; %ext = zext <64 x i8> %load to <64 x i16>
1015 ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
1019 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
1020 ; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
1021 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
1022 ; %ext = sext <64 x i8> %load to <64 x i16>
1023 ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
1027 ; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
1028 ; FUNC-LABEL: {{^}}local_v16i8_to_128:
1030 ; SI-NOT: ds_read_b128
1031 ; SI-NOT: ds_write_b128
1033 ; CIVI: ds_read_b128
1034 ; CIVI: ds_write_b128
define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) {
  %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16
  store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 16
  ret void
}
1046 attributes #0 = { nounwind }