1 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=tonga -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,VI,SICIVI,FUNC %s
3 ; RUN: llc -march=amdgcn -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-enable-ds128 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,FUNC %s
4 ; RUN: llc -march=r600 -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
6 ; Testing for ds_read/write_b128
7 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
; Kernel test: load one i8 through the addrspace(3) pointer %in and store it unchanged to %out.
10 ; FUNC-LABEL: {{^}}local_load_i8:
12 ; SICIVI: s_mov_b32 m0
16 ; EG: LDS_UBYTE_READ_RET
17 define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
19 %ld = load i8, i8 addrspace(3)* %in
20 store i8 %ld, i8 addrspace(3)* %out
; Kernel test: load a <2 x i8> vector through the addrspace(3) pointer %in and store it unchanged to %out.
24 ; FUNC-LABEL: {{^}}local_load_v2i8:
26 ; SICIVI: s_mov_b32 m0
30 ; EG: LDS_USHORT_READ_RET
31 define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
33 %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
34 store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
; Kernel test: load a <3 x i8> vector (odd element count) through %in and store it unchanged to %out.
38 ; FUNC-LABEL: {{^}}local_load_v3i8:
43 define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
45 %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
46 store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
; Kernel test: load a <4 x i8> vector through the addrspace(3) pointer %in and store it unchanged to %out.
50 ; FUNC-LABEL: {{^}}local_load_v4i8:
55 define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
57 %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
58 store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
; Kernel test: load a <8 x i8> vector through the addrspace(3) pointer %in and store it unchanged to %out.
62 ; FUNC-LABEL: {{^}}local_load_v8i8:
68 define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
70 %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
71 store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
; Kernel test: load a <16 x i8> vector through %in and store it unchanged to %out.
; The checks capture the first and last register of the ds_read2_b64 result (LO/HI) and
; require the ds_write2_b64 data operands to start at LO and end at HI.
75 ; FUNC-LABEL: {{^}}local_load_v16i8:
77 ; GCN: ds_read2_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
78 ; GCN: ds_write2_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:{{[0-9]+}}], v[{{[0-9]+}}:[[HI]]{{\]}} offset1:1{{$}}
84 define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
86 %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
87 store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
; Kernel test: load an i8 through %in, zero-extend it to i32 and store the result to %out.
91 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i32:
94 ; SICIVI: s_mov_b32 m0
97 ; EG: LDS_UBYTE_READ_RET
98 define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
99 %a = load i8, i8 addrspace(3)* %in
100 %ext = zext i8 %a to i32
101 store i32 %ext, i32 addrspace(3)* %out
; Kernel test: load an i8 through %in, sign-extend it to i32 and store the result to %out.
105 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i32:
108 ; SICIVI: s_mov_b32 m0
111 ; EG: LDS_UBYTE_READ_RET
113 define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
114 %ld = load i8, i8 addrspace(3)* %in
115 %ext = sext i8 %ld to i32
116 store i32 %ext, i32 addrspace(3)* %out
; Kernel test: single-element vector variant; load <1 x i8>, zero-extend to <1 x i32>, store to %out.
120 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
122 ; EG: LDS_UBYTE_READ_RET
123 define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
124 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
125 %ext = zext <1 x i8> %load to <1 x i32>
126 store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
; Kernel test: single-element vector variant; load <1 x i8>, sign-extend to <1 x i32>, store to %out.
130 ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i32:
133 ; EG: LDS_UBYTE_READ_RET
135 define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
136 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
137 %ext = sext <1 x i8> %load to <1 x i32>
138 store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
; Kernel test: load <2 x i8> through %in, zero-extend each element to i32 and store the <2 x i32> to %out.
142 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i32:
146 ; EG: LDS_USHORT_READ_RET
147 define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
148 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
149 %ext = zext <2 x i8> %load to <2 x i32>
150 store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
; Kernel test: load <2 x i8> through %in, sign-extend each element to i32 and store the <2 x i32> to %out.
; The SI run expects two v_bfe_i32 extracts (bit offsets 8 and 0); the VI run currently expects a
; 16-bit right shift feeding a second v_bfe_i32, which the FIXME below records as suboptimal.
154 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i32:
157 ; SICIVI: s_mov_b32 m0
159 ; FIXME: Need to optimize this sequence to avoid extra shift on VI.
160 ; t23: i16 = srl t39, Constant:i32<8>
161 ; t31: i32 = any_extend t23
162 ; t33: i32 = sign_extend_inreg t31, ValueType:ch:i8
164 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
165 ; SI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
167 ; VI-DAG: v_lshrrev_b16_e32 [[SHIFT:v[0-9]+]], 8, v{{[0-9]+}}
168 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
169 ; VI-DAG: v_bfe_i32 v{{[0-9]+}}, [[SHIFT]], 0, 8
171 ; EG: LDS_USHORT_READ_RET
174 define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
175 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
176 %ext = sext <2 x i8> %load to <2 x i32>
177 store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
; Kernel test: load <3 x i8> through %in, zero-extend each element to i32 and store the <3 x i32> to %out.
; The checks look for the per-byte extraction instructions (bit-field extracts, a 16-bit shift, a 0xff mask).
181 ; FUNC-LABEL: {{^}}local_zextload_v3i8_to_v3i32:
185 ; SI-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 8, 8
186 ; VI-DAG: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, {{v[0-9]+}}
187 ; GCN-DAG: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, 16, 8
188 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
191 define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
193 %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
194 %ext = zext <3 x i8> %ld to <3 x i32>
195 store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
; Kernel test: load <3 x i8> through %in, sign-extend each element to i32 and store the <3 x i32> to %out.
; The store checks differ per run: a b64 plus a b32 write for the SI prefix, a single b96 write otherwise.
199 ; FUNC-LABEL: {{^}}local_sextload_v3i8_to_v3i32:
202 ; SICIVI: s_mov_b32 m0
210 ; SI-DAG: ds_write_b64
211 ; SI-DAG: ds_write_b32
212 ; CIVI-DAG: ds_write_b96
213 ; GFX9-DAG: ds_write_b96
219 define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
221 %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
222 %ext = sext <3 x i8> %ld to <3 x i32>
223 store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
; Kernel test: load <4 x i8> through %in, zero-extend each element to i32 and store the <4 x i32> to %out.
227 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i32:
230 ; SICIVI: s_mov_b32 m0
237 define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
238 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
239 %ext = zext <4 x i8> %load to <4 x i32>
240 store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
; Kernel test: load <4 x i8> through %in, sign-extend each element to i32 and store the <4 x i32> to %out.
244 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i32:
247 ; SICIVI: s_mov_b32 m0
250 ; EG-DAG: LDS_READ_RET
255 define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
256 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
257 %ext = sext <4 x i8> %load to <4 x i32>
258 store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
; Kernel test: load <8 x i8> through %in, zero-extend each element to i32 and store the <8 x i32> to %out.
; The r600 run expects two LDS_READ_RET instructions.
262 ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i32:
263 ; SICIVI: s_mov_b32 m0
266 ; EG-DAG: LDS_READ_RET
267 ; EG-DAG: LDS_READ_RET
274 define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
275 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
276 %ext = zext <8 x i8> %load to <8 x i32>
277 store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
; Kernel test: load <8 x i8> through %in, sign-extend each element to i32 and store the <8 x i32> to %out.
; The r600 run expects two LDS_READ_RET instructions.
281 ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i32:
282 ; SICIVI: s_mov_b32 m0
285 ; EG-DAG: LDS_READ_RET
286 ; EG-DAG: LDS_READ_RET
295 define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
296 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
297 %ext = sext <8 x i8> %load to <8 x i32>
298 store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
; Kernel test: load <16 x i8> through %in, zero-extend each element to i32 and store the <16 x i32> to %out.
; The r600 run expects four LDS_READ_RET instructions.
302 ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i32:
303 ; SICIVI: s_mov_b32 m0
306 ; EG-DAG: LDS_READ_RET
307 ; EG-DAG: LDS_READ_RET
308 ; EG-DAG: LDS_READ_RET
309 ; EG-DAG: LDS_READ_RET
322 define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
323 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
324 %ext = zext <16 x i8> %load to <16 x i32>
325 store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
; Kernel test: load <16 x i8> through %in, sign-extend each element to i32 and store the <16 x i32> to %out.
; The r600 run expects four LDS_READ_RET instructions.
329 ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i32:
330 ; SICIVI: s_mov_b32 m0
333 ; EG-DAG: LDS_READ_RET
334 ; EG-DAG: LDS_READ_RET
335 ; EG-DAG: LDS_READ_RET
336 ; EG-DAG: LDS_READ_RET
353 define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
354 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
355 %ext = sext <16 x i8> %load to <16 x i32>
356 store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
; Kernel test: load <32 x i8> through %in, zero-extend each element to i32 and store the <32 x i32> to %out.
; The r600 run expects eight LDS_READ_RET instructions.
360 ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i32:
361 ; SICIVI: s_mov_b32 m0
364 ; EG-DAG: LDS_READ_RET
365 ; EG-DAG: LDS_READ_RET
366 ; EG-DAG: LDS_READ_RET
367 ; EG-DAG: LDS_READ_RET
368 ; EG-DAG: LDS_READ_RET
369 ; EG-DAG: LDS_READ_RET
370 ; EG-DAG: LDS_READ_RET
371 ; EG-DAG: LDS_READ_RET
372 define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
373 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
374 %ext = zext <32 x i8> %load to <32 x i32>
375 store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
; Kernel test: load <32 x i8> through %in, sign-extend each element to i32 and store the <32 x i32> to %out.
; The r600 run expects eight LDS_READ_RET instructions.
379 ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i32:
380 ; SICIVI: s_mov_b32 m0
383 ; EG-DAG: LDS_READ_RET
384 ; EG-DAG: LDS_READ_RET
385 ; EG-DAG: LDS_READ_RET
386 ; EG-DAG: LDS_READ_RET
387 ; EG-DAG: LDS_READ_RET
388 ; EG-DAG: LDS_READ_RET
389 ; EG-DAG: LDS_READ_RET
390 ; EG-DAG: LDS_READ_RET
391 define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
392 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
393 %ext = sext <32 x i8> %load to <32 x i32>
394 store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
; Kernel test: load <64 x i8> through %in, zero-extend each element to i32 and store the <64 x i32> to %out.
; The r600 run expects sixteen LDS_READ_RET instructions.
398 ; FUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i32:
399 ; SICIVI: s_mov_b32 m0
402 ; EG-DAG: LDS_READ_RET
403 ; EG-DAG: LDS_READ_RET
404 ; EG-DAG: LDS_READ_RET
405 ; EG-DAG: LDS_READ_RET
406 ; EG-DAG: LDS_READ_RET
407 ; EG-DAG: LDS_READ_RET
408 ; EG-DAG: LDS_READ_RET
409 ; EG-DAG: LDS_READ_RET
410 ; EG-DAG: LDS_READ_RET
411 ; EG-DAG: LDS_READ_RET
412 ; EG-DAG: LDS_READ_RET
413 ; EG-DAG: LDS_READ_RET
414 ; EG-DAG: LDS_READ_RET
415 ; EG-DAG: LDS_READ_RET
416 ; EG-DAG: LDS_READ_RET
417 ; EG-DAG: LDS_READ_RET
418 define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
419 %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
420 %ext = zext <64 x i8> %load to <64 x i32>
421 store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
; Kernel test: load <64 x i8> through %in, sign-extend each element to i32 and store the <64 x i32> to %out.
; The r600 run expects sixteen LDS_READ_RET instructions.
425 ; FUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i32:
426 ; SICIVI: s_mov_b32 m0
429 ; EG-DAG: LDS_READ_RET
430 ; EG-DAG: LDS_READ_RET
431 ; EG-DAG: LDS_READ_RET
432 ; EG-DAG: LDS_READ_RET
433 ; EG-DAG: LDS_READ_RET
434 ; EG-DAG: LDS_READ_RET
435 ; EG-DAG: LDS_READ_RET
436 ; EG-DAG: LDS_READ_RET
437 ; EG-DAG: LDS_READ_RET
438 ; EG-DAG: LDS_READ_RET
439 ; EG-DAG: LDS_READ_RET
440 ; EG-DAG: LDS_READ_RET
441 ; EG-DAG: LDS_READ_RET
442 ; EG-DAG: LDS_READ_RET
443 ; EG-DAG: LDS_READ_RET
444 ; EG-DAG: LDS_READ_RET
445 define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
446 %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
447 %ext = sext <64 x i8> %load to <64 x i32>
448 store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
; Kernel test: load an i8 through %in, zero-extend it to i64 and store the result to %out.
; The checks expect the high half to be a register set to 0 and the low half to be the
; ds_read_u8 result, with both forming the register pair written by ds_write_b64.
452 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i64:
453 ; SICIVI: s_mov_b32 m0
456 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
457 ; GCN-DAG: ds_read_u8 v[[LO:[0-9]+]],
458 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
460 ; EG: LDS_UBYTE_READ_RET
461 ; EG: MOV {{.*}}, literal
463 define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
464 %a = load i8, i8 addrspace(3)* %in
465 %ext = zext i8 %a to i64
466 store i64 %ext, i64 addrspace(3)* %out
; Kernel test: load an i8 through %in, sign-extend it to i64 and store the result to %out.
; The checks expect the high half to be produced by an arithmetic shift right by 31 of the
; ds_read_i8 result, and the LO:HI pair to be the data written by ds_write_b64.
470 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i64:
471 ; SICIVI: s_mov_b32 m0
474 ; GCN: ds_read_i8 v[[LO:[0-9]+]],
475 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
477 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
479 ; EG: LDS_UBYTE_READ_RET
483 define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
484 %a = load i8, i8 addrspace(3)* %in
485 %ext = sext i8 %a to i64
486 store i64 %ext, i64 addrspace(3)* %out
; Kernel test: single-element vector variant; load <1 x i8>, zero-extend to <1 x i64>, store to %out.
490 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i64:
491 ; SICIVI: s_mov_b32 m0
494 ; EG: LDS_UBYTE_READ_RET
495 ; EG: MOV {{.*}}, literal
498 define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
499 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
500 %ext = zext <1 x i8> %load to <1 x i64>
501 store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
; Kernel test: single-element vector variant; load <1 x i8>, sign-extend to <1 x i64>, store to %out.
505 ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i64:
506 ; SICIVI: s_mov_b32 m0
509 ; EG: LDS_UBYTE_READ_RET
513 define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
514 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
515 %ext = sext <1 x i8> %load to <1 x i64>
516 store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
; Kernel test: load <2 x i8> through %in, zero-extend each element to i64 and store the <2 x i64> to %out.
520 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
521 ; SICIVI: s_mov_b32 m0
524 ; EG: LDS_USHORT_READ_RET
525 define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
526 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
527 %ext = zext <2 x i8> %load to <2 x i64>
528 store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
; Kernel test: load <2 x i8> through %in, sign-extend each element to i64 and store the <2 x i64> to %out.
532 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i64:
533 ; SICIVI: s_mov_b32 m0
536 ; EG: LDS_USHORT_READ_RET
539 define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
540 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
541 %ext = sext <2 x i8> %load to <2 x i64>
542 store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
; Kernel test: load <4 x i8> through %in, zero-extend each element to i64 and store the <4 x i64> to %out.
546 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
547 ; SICIVI: s_mov_b32 m0
551 define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
552 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
553 %ext = zext <4 x i8> %load to <4 x i64>
554 store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
; Kernel test: load <4 x i8> through %in, sign-extend each element to i64 and store the <4 x i64> to %out.
558 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
559 ; SICIVI: s_mov_b32 m0
563 define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
564 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
565 %ext = sext <4 x i8> %load to <4 x i64>
566 store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
; Kernel test: load <8 x i8> through %in, zero-extend each element to i64 and store the <8 x i64> to %out.
570 ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i64:
571 ; SICIVI: s_mov_b32 m0
576 define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
577 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
578 %ext = zext <8 x i8> %load to <8 x i64>
579 store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
; Kernel test: load <8 x i8> through %in, sign-extend each element to i64 and store the <8 x i64> to %out.
583 ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i64:
584 ; SICIVI: s_mov_b32 m0
598 define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
599 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
600 %ext = sext <8 x i8> %load to <8 x i64>
601 store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
; Kernel test: load <16 x i8> through %in, zero-extend each element to i64 and store the <16 x i64> to %out.
605 ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i64:
606 ; SICIVI: s_mov_b32 m0
613 define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
614 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
615 %ext = zext <16 x i8> %load to <16 x i64>
616 store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
; Kernel test: load <16 x i8> through %in, sign-extend each element to i64 and store the <16 x i64> to %out.
620 ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i64:
621 ; SICIVI: s_mov_b32 m0
628 define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
629 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
630 %ext = sext <16 x i8> %load to <16 x i64>
631 store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
; Kernel test: load <32 x i8> through %in, zero-extend each element to i64 and store the <32 x i64> to %out.
635 ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i64:
636 ; SICIVI: s_mov_b32 m0
647 define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
648 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
649 %ext = zext <32 x i8> %load to <32 x i64>
650 store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
; Kernel test: load <32 x i8> through %in, sign-extend each element to i64 and store the <32 x i64> to %out.
654 ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i64:
655 ; SICIVI: s_mov_b32 m0
666 define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
667 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
668 %ext = sext <32 x i8> %load to <32 x i64>
669 store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
673 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
674 ; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
675 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
676 ; %ext = zext <64 x i8> %load to <64 x i64>
677 ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
681 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
682 ; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
683 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
684 ; %ext = sext <64 x i8> %load to <64 x i64>
685 ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
; Kernel test: load an i8 through %in, zero-extend it to i16 and store the result to %out.
; The store check reuses the VAL capture from the load check instead of redefining it, so it
; verifies that the data operand of ds_write_b16 is the register ds_read_u8 produced. The
; address operand is matched by a wildcard, as in local_sextload_i8_to_i16.
689 ; FUNC-LABEL: {{^}}local_zextload_i8_to_i16:
690 ; SICIVI: s_mov_b32 m0
692 ; GCN: ds_read_u8 v[[VAL:[0-9]+]],
693 ; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
695 ; EG: LDS_UBYTE_READ_RET
696 ; EG: LDS_SHORT_WRITE
697 define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
698 %a = load i8, i8 addrspace(3)* %in
699 %ext = zext i8 %a to i16
700 store i16 %ext, i16 addrspace(3)* %out
; Kernel test: load an i8 through %in, sign-extend it to i16 and store the result to %out.
; The checks require the register produced by ds_read_i8 (VAL) to be the data operand of ds_write_b16.
704 ; FUNC-LABEL: {{^}}local_sextload_i8_to_i16:
705 ; SICIVI: s_mov_b32 m0
707 ; GCN: ds_read_i8 v[[VAL:[0-9]+]],
708 ; GCN: ds_write_b16 v{{[0-9]+}}, v[[VAL]]
710 ; EG: LDS_UBYTE_READ_RET
712 ; EG: LDS_SHORT_WRITE
713 define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
714 %a = load i8, i8 addrspace(3)* %in
715 %ext = sext i8 %a to i16
716 store i16 %ext, i16 addrspace(3)* %out
; Kernel test: single-element vector variant; load <1 x i8>, zero-extend to <1 x i16>, store to %out.
720 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i16:
721 ; SICIVI: s_mov_b32 m0
724 ; EG: LDS_UBYTE_READ_RET
725 ; EG: LDS_SHORT_WRITE
726 define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
727 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
728 %ext = zext <1 x i8> %load to <1 x i16>
729 store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
; Kernel test: single-element vector variant; load <1 x i8>, sign-extend to <1 x i16>, store to %out.
733 ; FUNC-LABEL: {{^}}local_sextload_v1i8_to_v1i16:
734 ; SICIVI: s_mov_b32 m0
737 ; EG: LDS_UBYTE_READ_RET
739 ; EG: LDS_SHORT_WRITE
740 define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
741 %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
742 %ext = sext <1 x i8> %load to <1 x i16>
743 store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
; Kernel test: load <2 x i8> through %in, zero-extend each element to i16 and store the <2 x i16> to %out.
747 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i16:
748 ; SICIVI: s_mov_b32 m0
751 ; EG: LDS_USHORT_READ_RET
753 define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
754 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
755 %ext = zext <2 x i8> %load to <2 x i16>
756 store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
; Kernel test: load <2 x i8> through %in, sign-extend each element to i16 and store the <2 x i16> to %out.
760 ; FUNC-LABEL: {{^}}local_sextload_v2i8_to_v2i16:
761 ; SICIVI: s_mov_b32 m0
764 ; EG: LDS_USHORT_READ_RET
768 define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
769 %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
770 %ext = sext <2 x i8> %load to <2 x i16>
771 store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
; Kernel test: load <4 x i8> through %in, zero-extend each element to i16 and store the <4 x i16> to %out.
775 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i16:
776 ; SICIVI: s_mov_b32 m0
782 define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
783 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
784 %ext = zext <4 x i8> %load to <4 x i16>
785 store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
; Kernel test: load <4 x i8> through %in, sign-extend each element to i16 and store the <4 x i16> to %out.
789 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i16:
790 ; SICIVI: s_mov_b32 m0
794 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
801 define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
802 %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
803 %ext = sext <4 x i8> %load to <4 x i16>
804 store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
; Kernel test: load <8 x i8> through %in, zero-extend each element to i16 and store the <8 x i16> to %out.
808 ; FUNC-LABEL: {{^}}local_zextload_v8i8_to_v8i16:
809 ; SICIVI: s_mov_b32 m0
818 define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
819 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
820 %ext = zext <8 x i8> %load to <8 x i16>
821 store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
; Kernel test: load <8 x i8> through %in, sign-extend each element to i16 and store the <8 x i16> to %out.
825 ; FUNC-LABEL: {{^}}local_sextload_v8i8_to_v8i16:
826 ; SICIVI: s_mov_b32 m0
831 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
844 define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
845 %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
846 %ext = sext <8 x i8> %load to <8 x i16>
847 store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
; Kernel test: load <16 x i8> through %in, zero-extend each element to i16 and store the <16 x i16> to %out.
851 ; FUNC-LABEL: {{^}}local_zextload_v16i8_to_v16i16:
852 ; SICIVI: s_mov_b32 m0
867 define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
868 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
869 %ext = zext <16 x i8> %load to <16 x i16>
870 store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
; Kernel test: load <16 x i8> through %in, sign-extend each element to i16 and store the <16 x i16> to %out.
874 ; FUNC-LABEL: {{^}}local_sextload_v16i8_to_v16i16:
875 ; SICIVI: s_mov_b32 m0
882 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
907 define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
908 %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
909 %ext = sext <16 x i8> %load to <16 x i16>
910 store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
; Kernel test: load <32 x i8> through %in, zero-extend each element to i16 and store the <32 x i16> to %out.
914 ; FUNC-LABEL: {{^}}local_zextload_v32i8_to_v32i16:
915 ; SICIVI: s_mov_b32 m0
942 define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
943 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
944 %ext = zext <32 x i8> %load to <32 x i16>
945 store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
; Kernel test: load <32 x i8> through %in, sign-extend each element to i16 and store the <32 x i16> to %out.
949 ; FUNC-LABEL: {{^}}local_sextload_v32i8_to_v32i16:
950 ; SICIVI: s_mov_b32 m0
961 ; TODO: these do LSHR + BFE_INT, instead of just BFE_INT/ASHR
1006 define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
1007 %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
1008 %ext = sext <32 x i8> %load to <32 x i16>
1009 store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
1013 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
1014 ; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
1015 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
1016 ; %ext = zext <64 x i8> %load to <64 x i16>
1017 ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
1021 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
1022 ; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
1023 ; %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
1024 ; %ext = sext <64 x i8> %load to <64 x i16>
1025 ; store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
; Kernel test: copy a <16 x i8> from %in to %out with both the load and the store marked align 16.
; Unlike the other kernels in this file, this one does not reference attribute group #0.
; The SI run must not contain 128-bit ds instructions; the runs using the CIVI prefix
; (the two +enable-ds128 command lines at the top of the file) must contain them.
1029 ; Tests if ds_read/write_b128 gets generated for the 16 byte aligned load.
1030 ; FUNC-LABEL: {{^}}local_v16i8_to_128:
1032 ; SI-NOT: ds_read_b128
1033 ; SI-NOT: ds_write_b128
1035 ; CIVI: ds_read_b128
1036 ; CIVI: ds_write_b128
1042 define amdgpu_kernel void @local_v16i8_to_128(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) {
1043 %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 16
1044 store <16 x i8> %ld, <16 x i8> addrspace(3)* %out, align 16
; Attribute group #0, referenced by the kernels above that end their signature with `#0`; marks them nounwind.
1048 attributes #0 = { nounwind }