1 ; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
2 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-sdwa-peephole=0 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
4 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
5 declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
7 ; FUNC-LABEL: {{^}}test_copy_v4i8:
8 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
9 ; GCN: buffer_store_dword [[REG]]
; Copy one <4 x i8> (loaded via a per-thread gep) as a single dword.
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
19 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2:
20 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
21 ; GCN: buffer_store_dword [[REG]]
22 ; GCN: buffer_store_dword [[REG]]
; Copy one loaded <4 x i8> dword to two outputs.
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
ret void
}
33 ; FUNC-LABEL: {{^}}test_copy_v4i8_x3:
34 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
35 ; GCN: buffer_store_dword [[REG]]
36 ; GCN: buffer_store_dword [[REG]]
37 ; GCN: buffer_store_dword [[REG]]
; Copy one loaded <4 x i8> dword to three outputs.
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
ret void
}
49 ; FUNC-LABEL: {{^}}test_copy_v4i8_x4:
50 ; GCN: {{buffer|flat}}_load_dword [[REG:v[0-9]+]]
51 ; GCN: buffer_store_dword [[REG]]
52 ; GCN: buffer_store_dword [[REG]]
53 ; GCN: buffer_store_dword [[REG]]
54 ; GCN: buffer_store_dword [[REG]]
; Copy one loaded <4 x i8> dword to four outputs.
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out3, align 4
ret void
}
67 ; FUNC-LABEL: {{^}}test_copy_v4i8_extra_use:
68 ; GCN: {{buffer|flat}}_load_dword
69 ; GCN-DAG: v_lshrrev_b32
72 ; GCN-DAG: buffer_store_dword
73 ; GCN-DAG: buffer_store_dword
; The loaded vector has an extra use (a per-element add), so unpacking
; of the bytes is expected in addition to the straight copy.
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %gep, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
ret void
}
86 ; FIXME: Need to handle non-uniform case for function below (load without gep).
87 ; FUNC-LABEL: {{^}}test_copy_v4i8_x2_extra_use:
88 ; GCN: {{buffer|flat}}_load_dword
89 ; GCN-DAG: v_lshrrev_b32
94 ; GCN-DAG: {{buffer|flat}}_store_dword
95 ; GCN: {{buffer|flat}}_store_dword
96 ; GCN: {{buffer|flat}}_store_dword
; Same as the extra-use case but the original value is stored twice.
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %add, <4 x i8> addrspace(1)* %out1, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out2, align 4
ret void
}
109 ; FUNC-LABEL: {{^}}test_copy_v3i8_align4:
110 ; GCN: {{buffer|flat}}_load_dword
111 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
112 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; <3 x i8> with 4-byte alignment: expect a dword load, then short+byte stores.
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <3 x i8>, <3 x i8> addrspace(1)* %gep, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
}
122 ; FUNC-LABEL: {{^}}test_copy_v3i8_align2:
123 ; GCN-DAG: {{buffer|flat}}_load_ushort v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
124 ; GCN-DAG: {{buffer|flat}}_load_ubyte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
125 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
126 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; <3 x i8> with 2-byte alignment: expect short+byte loads and stores.
define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
ret void
}
134 ; FUNC-LABEL: {{^}}test_copy_v3i8_align1:
135 ; GCN: {{buffer|flat}}_load_ubyte
136 ; GCN: {{buffer|flat}}_load_ubyte
137 ; GCN: {{buffer|flat}}_load_ubyte
139 ; GCN: buffer_store_byte
140 ; GCN: buffer_store_byte
141 ; GCN: buffer_store_byte
; <3 x i8> with 1-byte alignment: expect three byte loads and three byte stores.
define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
ret void
}
149 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_load:
150 ; GCN: {{buffer|flat}}_load_dword
151 ; GCN: buffer_store_dword
; Volatile load must stay a single dword access; the copy remains one
; load/store pair.
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
}
159 ; FUNC-LABEL: {{^}}test_copy_v4i8_volatile_store:
160 ; GCN: {{buffer|flat}}_load_ubyte
161 ; GCN: {{buffer|flat}}_load_ubyte
162 ; GCN: {{buffer|flat}}_load_ubyte
163 ; GCN: {{buffer|flat}}_load_ubyte
164 ; GCN: buffer_store_byte
165 ; GCN: buffer_store_byte
166 ; GCN: buffer_store_byte
167 ; GCN: buffer_store_byte
169 define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
170 %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
171 store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4