1 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
2 ; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
3 ; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
4 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
5 ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
6 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
7 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
8 ; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
; <2 x i32> load and store through an addrspace(3) pointer that is only
; 4-byte aligned. Every llc invocation at the top of the file is expected to
; emit the two-address 32-bit DS load and store shown below, in any order.
10 ; GCN-LABEL: test_local_misaligned_v2:
11 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
12 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
13 define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
; Address the i32 element selected by the workitem id.
15 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
16 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
17 %load = load <2 x i32>, ptr addrspace(3) %gep, align 4
18 %v1 = extractelement <2 x i32> %load, i32 0
19 %v2 = extractelement <2 x i32> %load, i32 1
; Swap the lanes before storing back, i.e. [e0, e1] becomes [e1, e0].
; The chain starts from poison rather than the deprecated undef; both lanes
; are overwritten, so the stored value does not depend on the placeholder.
20 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
21 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
22 store <2 x i32> %v4, ptr addrspace(3) %gep, align 4
; <4 x i32> load and store through an addrspace(3) pointer that is only
; 4-byte aligned. Every llc invocation at the top of the file is expected to
; emit two two-address 32-bit DS loads and two matching stores, in any order.
26 ; GCN-LABEL: test_local_misaligned_v4:
27 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
28 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
29 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
30 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
31 define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
; Address the i32 element selected by the workitem id.
33 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
34 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
35 %load = load <4 x i32>, ptr addrspace(3) %gep, align 4
36 %v1 = extractelement <4 x i32> %load, i32 0
37 %v2 = extractelement <4 x i32> %load, i32 1
38 %v3 = extractelement <4 x i32> %load, i32 2
39 %v4 = extractelement <4 x i32> %load, i32 3
; Reverse the lanes before storing back: [e0, e1, e2, e3] becomes
; [e3, e2, e1, e0]. The chain starts from poison rather than the deprecated
; undef; all four lanes are overwritten, so the result is unaffected.
40 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
41 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
42 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
43 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
44 store <4 x i32> %v8, ptr addrspace(3) %gep, align 4
; <3 x i32> load and store through an addrspace(3) pointer that is only
; 4-byte aligned. Every llc invocation at the top of the file is expected to
; emit one two-address 32-bit DS access plus one single 32-bit DS access for
; the load, and the same pair for the store, in any order.
48 ; GCN-LABEL: test_local_misaligned_v3:
49 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
50 ; GCN-DAG: ds_{{read|load}}_b32
51 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
52 ; GCN-DAG: ds_{{write|store}}_b32
53 define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
; Address the i32 element selected by the workitem id.
55 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
56 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
57 %load = load <3 x i32>, ptr addrspace(3) %gep, align 4
58 %v1 = extractelement <3 x i32> %load, i32 0
59 %v2 = extractelement <3 x i32> %load, i32 1
60 %v3 = extractelement <3 x i32> %load, i32 2
; Rotate the lanes before storing back: [e0, e1, e2] becomes [e2, e0, e1].
; The chain starts from poison rather than the deprecated undef; all three
; lanes are overwritten, so the result is unaffected.
61 %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
62 %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
63 %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
64 store <3 x i32> %v7, ptr addrspace(3) %gep, align 4
; <2 x i32> load and store through a default-address-space pointer that is
; only 4-byte aligned. Runs using the VECT prefix expect a single 64-bit flat
; load and store; runs using the SPLIT prefix expect two 32-bit flat loads
; and two 32-bit flat stores.
68 ; GCN-LABEL: test_flat_misaligned_v2:
69 ; VECT-DAG: flat_load_{{dwordx2|b64}} v
70 ; VECT-DAG: flat_store_{{dwordx2|b64}} v
71 ; SPLIT-DAG: flat_load_{{dword|b32}} v
72 ; SPLIT-DAG: flat_load_{{dword|b32}} v
73 ; SPLIT-DAG: flat_store_{{dword|b32}} v
74 ; SPLIT-DAG: flat_store_{{dword|b32}} v
75 define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) {
; Address the i32 element selected by the workitem id.
77 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
78 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
79 %load = load <2 x i32>, ptr %gep, align 4
80 %v1 = extractelement <2 x i32> %load, i32 0
81 %v2 = extractelement <2 x i32> %load, i32 1
; Swap the lanes before storing back, i.e. [e0, e1] becomes [e1, e0].
; The chain starts from poison rather than the deprecated undef; both lanes
; are overwritten, so the stored value does not depend on the placeholder.
82 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
83 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
84 store <2 x i32> %v4, ptr %gep, align 4
; <4 x i32> load and store through a default-address-space pointer that is
; only 4-byte aligned. Runs using the VECT prefix expect a single 128-bit
; flat load and store; runs using the SPLIT prefix expect four 32-bit flat
; loads and four 32-bit flat stores.
88 ; GCN-LABEL: test_flat_misaligned_v4:
89 ; VECT-DAG: flat_load_{{dwordx4|b128}} v
90 ; VECT-DAG: flat_store_{{dwordx4|b128}} v
91 ; SPLIT-DAG: flat_load_{{dword|b32}} v
92 ; SPLIT-DAG: flat_load_{{dword|b32}} v
93 ; SPLIT-DAG: flat_load_{{dword|b32}} v
94 ; SPLIT-DAG: flat_load_{{dword|b32}} v
95 ; SPLIT-DAG: flat_store_{{dword|b32}} v
96 ; SPLIT-DAG: flat_store_{{dword|b32}} v
97 ; SPLIT-DAG: flat_store_{{dword|b32}} v
98 ; SPLIT-DAG: flat_store_{{dword|b32}} v
99 define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
; Address the i32 element selected by the workitem id.
101 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
102 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
103 %load = load <4 x i32>, ptr %gep, align 4
104 %v1 = extractelement <4 x i32> %load, i32 0
105 %v2 = extractelement <4 x i32> %load, i32 1
106 %v3 = extractelement <4 x i32> %load, i32 2
107 %v4 = extractelement <4 x i32> %load, i32 3
; Reverse the lanes before storing back: [e0, e1, e2, e3] becomes
; [e3, e2, e1, e0]. The chain starts from poison rather than the deprecated
; undef; all four lanes are overwritten, so the result is unaffected.
108 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
109 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
110 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
111 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
112 store <4 x i32> %v8, ptr %gep, align 4
; <3 x i32> load and store through a default-address-space pointer that is
; only 4-byte aligned. Runs using the VECT prefix expect a single 96-bit flat
; load and store; runs using the SPLIT prefix expect three 32-bit flat loads
; and three 32-bit flat stores.
116 ; GCN-LABEL: test_flat_misaligned_v3:
117 ; VECT-DAG: flat_load_{{dwordx3|b96}} v
118 ; VECT-DAG: flat_store_{{dwordx3|b96}} v
119 ; SPLIT-DAG: flat_load_{{dword|b32}} v
120 ; SPLIT-DAG: flat_load_{{dword|b32}} v
121 ; SPLIT-DAG: flat_load_{{dword|b32}} v
122 ; SPLIT-DAG: flat_store_{{dword|b32}} v
123 ; SPLIT-DAG: flat_store_{{dword|b32}} v
124 ; SPLIT-DAG: flat_store_{{dword|b32}} v
125 define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) {
; Address the i32 element selected by the workitem id.
127 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
128 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
129 %load = load <3 x i32>, ptr %gep, align 4
130 %v1 = extractelement <3 x i32> %load, i32 0
131 %v2 = extractelement <3 x i32> %load, i32 1
132 %v3 = extractelement <3 x i32> %load, i32 2
; Rotate the lanes before storing back: [e0, e1, e2] becomes [e2, e0, e1].
; The chain starts from poison rather than the deprecated undef; all three
; lanes are overwritten, so the result is unaffected.
133 %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
134 %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
135 %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
136 store <3 x i32> %v7, ptr %gep, align 4
; <2 x i32> load and store through an addrspace(3) pointer with 8-byte
; alignment. Every llc invocation at the top of the file is expected to emit
; a single 64-bit DS load and a single 64-bit DS store.
140 ; GCN-LABEL: test_local_aligned_v2:
141 ; GCN-DAG: ds_{{read|load}}_b64
142 ; GCN-DAG: ds_{{write|store}}_b64
143 define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) {
; Address the i32 element selected by the workitem id.
145 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
146 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
147 %load = load <2 x i32>, ptr addrspace(3) %gep, align 8
148 %v1 = extractelement <2 x i32> %load, i32 0
149 %v2 = extractelement <2 x i32> %load, i32 1
; Swap the lanes before storing back, i.e. [e0, e1] becomes [e1, e0].
; The chain starts from poison rather than the deprecated undef; both lanes
; are overwritten, so the stored value does not depend on the placeholder.
150 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
151 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
152 store <2 x i32> %v4, ptr addrspace(3) %gep, align 8
; <3 x i32> load and store through an addrspace(3) pointer with 16-byte
; alignment. Every llc invocation at the top of the file is expected to emit
; a single 96-bit DS load and a single 96-bit DS store.
156 ; GCN-LABEL: test_local_aligned_v3:
157 ; GCN-DAG: ds_{{read|load}}_b96
158 ; GCN-DAG: ds_{{write|store}}_b96
159 define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
; Address the i32 element selected by the workitem id.
161 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
162 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
163 %load = load <3 x i32>, ptr addrspace(3) %gep, align 16
164 %v1 = extractelement <3 x i32> %load, i32 0
165 %v2 = extractelement <3 x i32> %load, i32 1
166 %v3 = extractelement <3 x i32> %load, i32 2
; Rotate the lanes before storing back: [e0, e1, e2] becomes [e2, e0, e1].
; The chain starts from poison rather than the deprecated undef; all three
; lanes are overwritten, so the result is unaffected.
167 %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
168 %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
169 %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
170 store <3 x i32> %v7, ptr addrspace(3) %gep, align 16
; <2 x i32> load and store through a default-address-space pointer with
; 8-byte alignment. Every llc invocation at the top of the file is expected
; to emit a single 64-bit flat load and a single 64-bit flat store.
174 ; GCN-LABEL: test_flat_aligned_v2:
175 ; GCN-DAG: flat_load_{{dwordx2|b64}} v
176 ; GCN-DAG: flat_store_{{dwordx2|b64}} v
177 define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
; Address the i32 element selected by the workitem id.
179 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
180 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
181 %load = load <2 x i32>, ptr %gep, align 8
182 %v1 = extractelement <2 x i32> %load, i32 0
183 %v2 = extractelement <2 x i32> %load, i32 1
; Swap the lanes before storing back, i.e. [e0, e1] becomes [e1, e0].
; The chain starts from poison rather than the deprecated undef; both lanes
; are overwritten, so the stored value does not depend on the placeholder.
184 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
185 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
186 store <2 x i32> %v4, ptr %gep, align 8
; <4 x i32> load and store through a default-address-space pointer with
; 16-byte alignment. Every llc invocation at the top of the file is expected
; to emit a single 128-bit flat load and a single 128-bit flat store.
190 ; GCN-LABEL: test_flat_aligned_v4:
191 ; GCN-DAG: flat_load_{{dwordx4|b128}} v
192 ; GCN-DAG: flat_store_{{dwordx4|b128}} v
193 define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) {
; Address the i32 element selected by the workitem id.
195 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
196 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
197 %load = load <4 x i32>, ptr %gep, align 16
198 %v1 = extractelement <4 x i32> %load, i32 0
199 %v2 = extractelement <4 x i32> %load, i32 1
200 %v3 = extractelement <4 x i32> %load, i32 2
201 %v4 = extractelement <4 x i32> %load, i32 3
; Reverse the lanes before storing back: [e0, e1, e2, e3] becomes
; [e3, e2, e1, e0]. The chain starts from poison rather than the deprecated
; undef; all four lanes are overwritten, so the result is unaffected.
202 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
203 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
204 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
205 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
206 store <4 x i32> %v8, ptr %gep, align 16
; <4 x i32> load and store through an addrspace(3) pointer with 8-byte
; alignment. The expectations are spelled out separately for the ALIGNED and
; UNALIGNED prefixes but are identical: a two-address 64-bit DS load and a
; two-address 64-bit DS store.
210 ; GCN-LABEL: test_local_v4_aligned8:
211 ; ALIGNED-DAG: ds_{{read2|load_2addr}}_b64
212 ; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64
213 ; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
214 ; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
215 define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) {
; Address the i32 element selected by the workitem id.
217 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
218 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
219 %load = load <4 x i32>, ptr addrspace(3) %gep, align 8
220 %v1 = extractelement <4 x i32> %load, i32 0
221 %v2 = extractelement <4 x i32> %load, i32 1
222 %v3 = extractelement <4 x i32> %load, i32 2
223 %v4 = extractelement <4 x i32> %load, i32 3
; Reverse the lanes before storing back: [e0, e1, e2, e3] becomes
; [e3, e2, e1, e0]. The chain starts from poison rather than the deprecated
; undef; all four lanes are overwritten, so the result is unaffected.
224 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
225 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
226 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
227 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
228 store <4 x i32> %v8, ptr addrspace(3) %gep, align 8
; <4 x i32> load and store through a default-address-space pointer with
; 8-byte alignment. Runs using the VECT prefix expect a single 128-bit flat
; load and store; runs using the SPLIT prefix expect two 64-bit flat loads
; and two 64-bit flat stores.
232 ; GCN-LABEL: test_flat_v4_aligned8:
233 ; VECT-DAG: flat_load_{{dwordx4|b128}} v
234 ; VECT-DAG: flat_store_{{dwordx4|b128}} v
235 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
236 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
237 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
238 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
239 define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) {
; Address the i32 element selected by the workitem id.
241 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
242 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
243 %load = load <4 x i32>, ptr %gep, align 8
244 %v1 = extractelement <4 x i32> %load, i32 0
245 %v2 = extractelement <4 x i32> %load, i32 1
246 %v3 = extractelement <4 x i32> %load, i32 2
247 %v4 = extractelement <4 x i32> %load, i32 3
; Reverse the lanes before storing back: [e0, e1, e2, e3] becomes
; [e3, e2, e1, e0]. The chain starts from poison rather than the deprecated
; undef; all four lanes are overwritten, so the result is unaffected.
248 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
249 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
250 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
251 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
252 store <4 x i32> %v8, ptr %gep, align 8
; Intrinsic called by each kernel in this file; its i32 result is used as the
; getelementptr index that selects the element to load and store.
256 declare i32 @llvm.amdgcn.workitem.id.x()