1 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SI,SICIVI,FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,SICIVI,GFX89,FUNC %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GFX9,GFX89,FUNC %s
4 ; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
6 ; Test generation of ds_read_b128/ds_write_b128 when -mattr=+enable-ds128 is set.
7 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+enable-ds128 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=CIVI,FUNC %s
10 ; FUNC-LABEL: {{^}}local_load_i16:
12 ; SICIVI: s_mov_b32 m0
14 ; GCN: ds_read_u16 v{{[0-9]+}}
16 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
17 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
18 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
19 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
20 ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
21 define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
23 %ld = load i16, i16 addrspace(3)* %in
24 store i16 %ld, i16 addrspace(3)* %out
28 ; FUNC-LABEL: {{^}}local_load_v2i16:
30 ; SICIVI: s_mov_b32 m0
34 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
35 ; EG: LDS_READ_RET {{.*}} [[FROM]]
36 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
37 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
38 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
39 define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
41 %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
42 store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
46 ; FUNC-LABEL: {{^}}local_load_v3i16:
48 ; SICIVI: s_mov_b32 m0
51 ; GCN-DAG: ds_write_b32
52 ; GCN-DAG: ds_write_b16
54 ; EG-DAG: LDS_USHORT_READ_RET
55 ; EG-DAG: LDS_USHORT_READ_RET
56 define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
58 %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
59 store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
63 ; FUNC-LABEL: {{^}}local_load_v4i16:
65 ; SICIVI: s_mov_b32 m0
71 define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
73 %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
74 store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
78 ; FUNC-LABEL: {{^}}local_load_v8i16:
80 ; SICIVI: s_mov_b32 m0
82 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
88 define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
90 %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
91 store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
95 ; FUNC-LABEL: {{^}}local_load_v16i16:
97 ; SICIVI: s_mov_b32 m0
99 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
100 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
112 define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
114 %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
115 store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
119 ; FUNC-LABEL: {{^}}local_zextload_i16_to_i32:
121 ; SICIVI: s_mov_b32 m0
126 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
127 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
128 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
129 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
130 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
131 define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
132 %a = load i16, i16 addrspace(3)* %in
133 %ext = zext i16 %a to i32
134 store i32 %ext, i32 addrspace(3)* %out
138 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i32:
142 ; SICIVI: s_mov_b32 m0
146 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
147 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
148 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
149 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
150 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
152 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
153 define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
154 %a = load i16, i16 addrspace(3)* %in
155 %ext = sext i16 %a to i32
156 store i32 %ext, i32 addrspace(3)* %out
160 ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i32:
162 ; SICIVI: s_mov_b32 m0
166 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
167 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
168 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
169 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
170 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
171 define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
172 %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
173 %ext = zext <1 x i16> %load to <1 x i32>
174 store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
178 ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i32:
180 ; SICIVI: s_mov_b32 m0
184 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
185 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
186 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
187 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
188 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
190 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
191 define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
192 %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
193 %ext = sext <1 x i16> %load to <1 x i32>
194 store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
198 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i32:
201 ; SICIVI: s_mov_b32 m0
206 define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
207 %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
208 %ext = zext <2 x i16> %load to <2 x i32>
209 store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
213 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i32:
216 ; SICIVI: s_mov_b32 m0
223 define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
224 %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
225 %ext = sext <2 x i16> %load to <2 x i32>
226 store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
230 ; FUNC-LABEL: {{^}}local_local_zextload_v3i16_to_v3i32:
232 ; SICIVI: s_mov_b32 m0
235 ; GCN-DAG: ds_write_b32
236 ; GCN-DAG: ds_write_b64
238 ; EG: LDS_USHORT_READ_RET
239 ; EG: LDS_USHORT_READ_RET
240 ; EG: LDS_USHORT_READ_RET
241 define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
243 %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
244 %ext = zext <3 x i16> %ld to <3 x i32>
245 store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
249 ; FUNC-LABEL: {{^}}local_local_sextload_v3i16_to_v3i32:
251 ; SICIVI: s_mov_b32 m0
254 ; GCN-DAG: ds_write_b32
255 ; GCN-DAG: ds_write_b64
257 ; EG: LDS_USHORT_READ_RET
258 ; EG: LDS_USHORT_READ_RET
259 ; EG: LDS_USHORT_READ_RET
263 define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
265 %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
266 %ext = sext <3 x i16> %ld to <3 x i32>
267 store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
271 ; FUNC-LABEL: {{^}}local_local_zextload_v4i16_to_v4i32:
274 ; SICIVI: s_mov_b32 m0
280 define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
281 %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
282 %ext = zext <4 x i16> %load to <4 x i32>
283 store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
287 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i32:
290 ; SICIVI: s_mov_b32 m0
300 define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
301 %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
302 %ext = sext <4 x i16> %load to <4 x i32>
303 store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
307 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i32:
309 ; SICIVI: s_mov_b32 m0
311 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
317 define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
318 %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
319 %ext = zext <8 x i16> %load to <8 x i32>
320 store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
324 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i32:
326 ; SICIVI: s_mov_b32 m0
328 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
342 define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
343 %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
344 %ext = sext <8 x i16> %load to <8 x i32>
345 store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
349 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
351 ; SICIVI: s_mov_b32 m0
353 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
354 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
369 define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
370 %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
371 %ext = zext <16 x i16> %load to <16 x i32>
372 store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
376 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
378 ; SICIVI: s_mov_b32 m0
381 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
382 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
408 define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
409 %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
410 %ext = sext <16 x i16> %load to <16 x i32>
411 store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
415 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i32:
417 ; SICIVI: s_mov_b32 m0
419 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
420 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
421 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
422 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
440 define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
441 %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
442 %ext = zext <32 x i16> %load to <32 x i32>
443 store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
447 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
449 ; SICIVI: s_mov_b32 m0
451 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
452 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
453 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
454 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
455 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
456 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
457 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
458 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
459 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
460 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
461 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
462 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
480 define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
481 %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
482 %ext = sext <32 x i16> %load to <32 x i32>
483 store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
487 ; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
489 ; SICIVI: s_mov_b32 m0
491 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
492 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
493 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
494 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
495 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
496 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
497 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
498 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
499 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
500 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
501 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
502 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
503 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
504 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
505 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
506 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
507 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
508 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
509 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
510 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
511 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
512 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
513 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
514 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
548 define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
549 %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
550 %ext = zext <64 x i16> %load to <64 x i32>
551 store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
555 ; FUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i32:
557 ; SICIVI: s_mov_b32 m0
591 define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
592 %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
593 %ext = sext <64 x i16> %load to <64 x i32>
594 store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
598 ; FUNC-LABEL: {{^}}local_zextload_i16_to_i64:
600 ; SICIVI: s_mov_b32 m0
602 ; GCN-DAG: ds_read_u16 v[[LO:[0-9]+]],
603 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
605 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
607 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
608 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
609 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
610 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
612 define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
613 %a = load i16, i16 addrspace(3)* %in
614 %ext = zext i16 %a to i64
615 store i64 %ext, i64 addrspace(3)* %out
619 ; FUNC-LABEL: {{^}}local_sextload_i16_to_i64:
621 ; SICIVI: s_mov_b32 m0
623 ; FIXME: Need to optimize this sequence to avoid an extra shift.
624 ; t25: i32,ch = load<LD2[%in(addrspace=3)], anyext from i16> t12, t10, undef:i32
625 ; t28: i64 = any_extend t25
626 ; t30: i64 = sign_extend_inreg t28, ValueType:ch:i16
627 ; SI: ds_read_i16 v[[LO:[0-9]+]],
628 ; GFX89: ds_read_u16 v[[ULO:[0-9]+]]
629 ; GFX89: v_bfe_i32 v[[LO:[0-9]+]], v[[ULO]], 0, 16
630 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
632 ; GCN: ds_write_b64 v{{[0-9]+}}, v{{\[}}[[LO]]:[[HI]]]
634 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
635 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
636 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
637 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
638 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
641 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
642 define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
643 %a = load i16, i16 addrspace(3)* %in
644 %ext = sext i16 %a to i64
645 store i64 %ext, i64 addrspace(3)* %out
649 ; FUNC-LABEL: {{^}}local_zextload_v1i16_to_v1i64:
651 ; SICIVI: s_mov_b32 m0
654 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
655 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
656 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
657 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
659 define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
660 %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
661 %ext = zext <1 x i16> %load to <1 x i64>
662 store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
666 ; FUNC-LABEL: {{^}}local_sextload_v1i16_to_v1i64:
668 ; SICIVI: s_mov_b32 m0
671 ; EG: MOV {{[* ]*}}[[FROM:T[0-9]+\.[XYZW]]], KC0[2].Z
672 ; EG: LDS_USHORT_READ_RET {{.*}} [[FROM]]
673 ; EG-DAG: MOV {{[* ]*}}[[TMP:T[0-9]+\.[XYZW]]], OQAP
674 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
675 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
678 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
679 define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
680 %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
681 %ext = sext <1 x i16> %load to <1 x i64>
682 store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
686 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
688 ; SICIVI: s_mov_b32 m0
692 define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
693 %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
694 %ext = zext <2 x i16> %load to <2 x i64>
695 store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
699 ; FUNC-LABEL: {{^}}local_sextload_v2i16_to_v2i64:
701 ; SICIVI: s_mov_b32 m0
707 define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
708 %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
709 %ext = sext <2 x i16> %load to <2 x i64>
710 store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
714 ; FUNC-LABEL: {{^}}local_zextload_v4i16_to_v4i64:
716 ; SICIVI: s_mov_b32 m0
721 define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
722 %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
723 %ext = zext <4 x i16> %load to <4 x i64>
724 store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
728 ; FUNC-LABEL: {{^}}local_sextload_v4i16_to_v4i64:
730 ; SICIVI: s_mov_b32 m0
739 define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
740 %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
741 %ext = sext <4 x i16> %load to <4 x i64>
742 store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
746 ; FUNC-LABEL: {{^}}local_zextload_v8i16_to_v8i64:
748 ; SICIVI: s_mov_b32 m0
755 define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
756 %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
757 %ext = zext <8 x i16> %load to <8 x i64>
758 store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
762 ; FUNC-LABEL: {{^}}local_sextload_v8i16_to_v8i64:
764 ; SICIVI: s_mov_b32 m0
779 define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
780 %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
781 %ext = sext <8 x i16> %load to <8 x i64>
782 store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
786 ; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i64:
788 ; SICIVI: s_mov_b32 m0
799 define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
800 %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
801 %ext = zext <16 x i16> %load to <16 x i64>
802 store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
806 ; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i64:
808 ; SICIVI: s_mov_b32 m0
835 define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
836 %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
837 %ext = sext <16 x i16> %load to <16 x i64>
838 store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
842 ; FUNC-LABEL: {{^}}local_zextload_v32i16_to_v32i64:
844 ; SICIVI: s_mov_b32 m0
863 define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
864 %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
865 %ext = zext <32 x i16> %load to <32 x i64>
866 store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
870 ; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i64:
872 ; SICIVI: s_mov_b32 m0
923 define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
924 %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
925 %ext = sext <32 x i16> %load to <32 x i64>
926 store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
930 ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
931 ; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
932 ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
933 ; %ext = zext <64 x i16> %load to <64 x i64>
934 ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
938 ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
939 ; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
940 ; %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
941 ; %ext = sext <64 x i16> %load to <64 x i64>
942 ; store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
946 ; Tests whether ds_read_b128/ds_write_b128 are generated for a 16-byte-aligned load and store.
947 ; FUNC-LABEL: {{^}}local_v8i16_to_128:
949 ; SI-NOT: ds_read_b128
950 ; SI-NOT: ds_write_b128
953 ; CIVI: ds_write_b128
959 define amdgpu_kernel void @local_v8i16_to_128(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
960 %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in, align 16
961 store <8 x i16> %ld, <8 x i16> addrspace(3)* %out, align 16
965 attributes #0 = { nounwind }