; Checks which load/store instructions are selected for <2 x i32>, <3 x i32>
; and <4 x i32> accesses through local (addrspace(3)) and flat pointers at
; several alignments, across the gfx10 and gfx11 configurations listed below.
;
; FileCheck prefixes used by the run lines:
;   GCN        every run line.
;   ALIGNED    run lines without +unaligned-access-mode.
;   UNALIGNED  run lines with +unaligned-access-mode.
;   SPLIT      gfx1010, gfx1011 and gfx1012 without +cumode.
;   VECT       gfx1010 with +cumode, and every gfx1100 run line.
1 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
2 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
3 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,SPLIT %s
4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
6 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
7 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
8 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode -early-live-intervals < %s | FileCheck -check-prefixes=GCN,ALIGNED,VECT %s
9 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -mattr=+cumode,+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,UNALIGNED,VECT %s
; <2 x i32> load and store through a local (addrspace(3)) pointer that is only
; 4-byte aligned. The two elements are swapped and written back to the same
; address. Every run line (GCN prefix) expects one two-address 32-bit DS read
; and one two-address 32-bit DS write.
; The insertelement chain starts from poison; both lanes are overwritten, so
; the stored value is fully defined.
11 ; GCN-LABEL: test_local_misaligned_v2:
12 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
13 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
14 define amdgpu_kernel void @test_local_misaligned_v2(ptr addrspace(3) %arg) {
16 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
17 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
18 %load = load <2 x i32>, ptr addrspace(3) %gep, align 4
19 %v1 = extractelement <2 x i32> %load, i32 0
20 %v2 = extractelement <2 x i32> %load, i32 1
21 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
22 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
23 store <2 x i32> %v4, ptr addrspace(3) %gep, align 4
; <4 x i32> load and store through a local (addrspace(3)) pointer that is only
; 4-byte aligned. The element order is reversed and the vector is written back
; to the same address. Every run line (GCN prefix) expects two two-address
; 32-bit DS reads and two two-address 32-bit DS writes.
; The insertelement chain starts from poison; all four lanes are overwritten,
; so the stored value is fully defined.
27 ; GCN-LABEL: test_local_misaligned_v4:
28 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
29 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
30 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
31 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
32 define amdgpu_kernel void @test_local_misaligned_v4(ptr addrspace(3) %arg) {
34 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
35 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
36 %load = load <4 x i32>, ptr addrspace(3) %gep, align 4
37 %v1 = extractelement <4 x i32> %load, i32 0
38 %v2 = extractelement <4 x i32> %load, i32 1
39 %v3 = extractelement <4 x i32> %load, i32 2
40 %v4 = extractelement <4 x i32> %load, i32 3
41 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
42 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
43 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
44 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
45 store <4 x i32> %v8, ptr addrspace(3) %gep, align 4
; <3 x i32> load and store through a local (addrspace(3)) pointer that is only
; 4-byte aligned. The elements are rotated (old lane 2 goes to lane 0, lane 0
; to lane 1, lane 1 to lane 2) and written back to the same address. Every run
; line (GCN prefix) expects a two-address 32-bit DS read plus a single 32-bit
; DS read, and the matching pair of DS writes.
; The insertelement chain starts from poison; all three lanes are overwritten,
; so the stored value is fully defined.
49 ; GCN-LABEL: test_local_misaligned_v3:
50 ; GCN-DAG: ds_{{read2|load_2addr}}_b32
51 ; GCN-DAG: ds_{{read|load}}_b32
52 ; GCN-DAG: ds_{{write2|store_2addr}}_b32
53 ; GCN-DAG: ds_{{write|store}}_b32
54 define amdgpu_kernel void @test_local_misaligned_v3(ptr addrspace(3) %arg) {
56 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
57 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
58 %load = load <3 x i32>, ptr addrspace(3) %gep, align 4
59 %v1 = extractelement <3 x i32> %load, i32 0
60 %v2 = extractelement <3 x i32> %load, i32 1
61 %v3 = extractelement <3 x i32> %load, i32 2
62 %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
63 %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
64 %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
65 store <3 x i32> %v7, ptr addrspace(3) %gep, align 4
; <2 x i32> load and store through a flat pointer that is only 4-byte aligned.
; The two elements are swapped and written back to the same address.
; Run lines using the VECT prefix expect a single 64-bit flat load and store;
; run lines using the SPLIT prefix expect two 32-bit flat loads and two 32-bit
; flat stores.
; The insertelement chain starts from poison; both lanes are overwritten, so
; the stored value is fully defined.
69 ; GCN-LABEL: test_flat_misaligned_v2:
70 ; VECT-DAG: flat_load_{{dwordx2|b64}} v
71 ; VECT-DAG: flat_store_{{dwordx2|b64}} v
72 ; SPLIT-DAG: flat_load_{{dword|b32}} v
73 ; SPLIT-DAG: flat_load_{{dword|b32}} v
74 ; SPLIT-DAG: flat_store_{{dword|b32}} v
75 ; SPLIT-DAG: flat_store_{{dword|b32}} v
76 define amdgpu_kernel void @test_flat_misaligned_v2(ptr %arg) {
78 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
79 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
80 %load = load <2 x i32>, ptr %gep, align 4
81 %v1 = extractelement <2 x i32> %load, i32 0
82 %v2 = extractelement <2 x i32> %load, i32 1
83 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
84 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
85 store <2 x i32> %v4, ptr %gep, align 4
; <4 x i32> load and store through a flat pointer that is only 4-byte aligned.
; The element order is reversed and the vector is written back to the same
; address. Run lines using the VECT prefix expect a single 128-bit flat load
; and store; run lines using the SPLIT prefix expect four 32-bit flat loads and
; four 32-bit flat stores.
; The insertelement chain starts from poison; all four lanes are overwritten,
; so the stored value is fully defined.
89 ; GCN-LABEL: test_flat_misaligned_v4:
90 ; VECT-DAG: flat_load_{{dwordx4|b128}} v
91 ; VECT-DAG: flat_store_{{dwordx4|b128}} v
92 ; SPLIT-DAG: flat_load_{{dword|b32}} v
93 ; SPLIT-DAG: flat_load_{{dword|b32}} v
94 ; SPLIT-DAG: flat_load_{{dword|b32}} v
95 ; SPLIT-DAG: flat_load_{{dword|b32}} v
96 ; SPLIT-DAG: flat_store_{{dword|b32}} v
97 ; SPLIT-DAG: flat_store_{{dword|b32}} v
98 ; SPLIT-DAG: flat_store_{{dword|b32}} v
99 ; SPLIT-DAG: flat_store_{{dword|b32}} v
100 define amdgpu_kernel void @test_flat_misaligned_v4(ptr %arg) {
102 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
103 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
104 %load = load <4 x i32>, ptr %gep, align 4
105 %v1 = extractelement <4 x i32> %load, i32 0
106 %v2 = extractelement <4 x i32> %load, i32 1
107 %v3 = extractelement <4 x i32> %load, i32 2
108 %v4 = extractelement <4 x i32> %load, i32 3
109 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
110 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
111 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
112 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
113 store <4 x i32> %v8, ptr %gep, align 4
; <3 x i32> load and store through a flat pointer that is only 4-byte aligned.
; The elements are rotated (old lane 2 goes to lane 0, lane 0 to lane 1, lane 1
; to lane 2) and written back to the same address. Run lines using the VECT
; prefix expect a single 96-bit flat load and store; run lines using the SPLIT
; prefix expect three 32-bit flat loads and three 32-bit flat stores.
; The insertelement chain starts from poison; all three lanes are overwritten,
; so the stored value is fully defined.
117 ; GCN-LABEL: test_flat_misaligned_v3:
118 ; VECT-DAG: flat_load_{{dwordx3|b96}} v
119 ; VECT-DAG: flat_store_{{dwordx3|b96}} v
120 ; SPLIT-DAG: flat_load_{{dword|b32}} v
121 ; SPLIT-DAG: flat_load_{{dword|b32}} v
122 ; SPLIT-DAG: flat_load_{{dword|b32}} v
123 ; SPLIT-DAG: flat_store_{{dword|b32}} v
124 ; SPLIT-DAG: flat_store_{{dword|b32}} v
125 ; SPLIT-DAG: flat_store_{{dword|b32}} v
126 define amdgpu_kernel void @test_flat_misaligned_v3(ptr %arg) {
128 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
129 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
130 %load = load <3 x i32>, ptr %gep, align 4
131 %v1 = extractelement <3 x i32> %load, i32 0
132 %v2 = extractelement <3 x i32> %load, i32 1
133 %v3 = extractelement <3 x i32> %load, i32 2
134 %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
135 %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
136 %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
137 store <3 x i32> %v7, ptr %gep, align 4
; <2 x i32> load and store through a local (addrspace(3)) pointer with 8-byte
; alignment. The two elements are swapped and written back to the same address.
; Every run line (GCN prefix) expects a single 64-bit DS read and a single
; 64-bit DS write.
; The insertelement chain starts from poison; both lanes are overwritten, so
; the stored value is fully defined.
141 ; GCN-LABEL: test_local_aligned_v2:
142 ; GCN-DAG: ds_{{read|load}}_b64
143 ; GCN-DAG: ds_{{write|store}}_b64
144 define amdgpu_kernel void @test_local_aligned_v2(ptr addrspace(3) %arg) {
146 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
147 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
148 %load = load <2 x i32>, ptr addrspace(3) %gep, align 8
149 %v1 = extractelement <2 x i32> %load, i32 0
150 %v2 = extractelement <2 x i32> %load, i32 1
151 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
152 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
153 store <2 x i32> %v4, ptr addrspace(3) %gep, align 8
; <3 x i32> load and store through a local (addrspace(3)) pointer with 16-byte
; alignment. The elements are rotated (old lane 2 goes to lane 0, lane 0 to
; lane 1, lane 1 to lane 2) and written back to the same address. Every run
; line (GCN prefix) expects a single 96-bit DS read and a single 96-bit DS
; write.
; The insertelement chain starts from poison; all three lanes are overwritten,
; so the stored value is fully defined.
157 ; GCN-LABEL: test_local_aligned_v3:
158 ; GCN-DAG: ds_{{read|load}}_b96
159 ; GCN-DAG: ds_{{write|store}}_b96
160 define amdgpu_kernel void @test_local_aligned_v3(ptr addrspace(3) %arg) {
162 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
163 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
164 %load = load <3 x i32>, ptr addrspace(3) %gep, align 16
165 %v1 = extractelement <3 x i32> %load, i32 0
166 %v2 = extractelement <3 x i32> %load, i32 1
167 %v3 = extractelement <3 x i32> %load, i32 2
168 %v5 = insertelement <3 x i32> poison, i32 %v3, i32 0
169 %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
170 %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
171 store <3 x i32> %v7, ptr addrspace(3) %gep, align 16
; <2 x i32> load and store through a flat pointer with 8-byte alignment. The
; two elements are swapped and written back to the same address. Every run
; line (GCN prefix) expects a single 64-bit flat load and a single 64-bit flat
; store.
; The insertelement chain starts from poison; both lanes are overwritten, so
; the stored value is fully defined.
175 ; GCN-LABEL: test_flat_aligned_v2:
176 ; GCN-DAG: flat_load_{{dwordx2|b64}} v
177 ; GCN-DAG: flat_store_{{dwordx2|b64}} v
178 define amdgpu_kernel void @test_flat_aligned_v2(ptr %arg) {
180 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
181 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
182 %load = load <2 x i32>, ptr %gep, align 8
183 %v1 = extractelement <2 x i32> %load, i32 0
184 %v2 = extractelement <2 x i32> %load, i32 1
185 %v3 = insertelement <2 x i32> poison, i32 %v2, i32 0
186 %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
187 store <2 x i32> %v4, ptr %gep, align 8
; <4 x i32> load and store through a flat pointer with 16-byte alignment. The
; element order is reversed and the vector is written back to the same
; address. Every run line (GCN prefix) expects a single 128-bit flat load and
; a single 128-bit flat store.
; The insertelement chain starts from poison; all four lanes are overwritten,
; so the stored value is fully defined.
191 ; GCN-LABEL: test_flat_aligned_v4:
192 ; GCN-DAG: flat_load_{{dwordx4|b128}} v
193 ; GCN-DAG: flat_store_{{dwordx4|b128}} v
194 define amdgpu_kernel void @test_flat_aligned_v4(ptr %arg) {
196 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
197 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
198 %load = load <4 x i32>, ptr %gep, align 16
199 %v1 = extractelement <4 x i32> %load, i32 0
200 %v2 = extractelement <4 x i32> %load, i32 1
201 %v3 = extractelement <4 x i32> %load, i32 2
202 %v4 = extractelement <4 x i32> %load, i32 3
203 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
204 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
205 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
206 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
207 store <4 x i32> %v8, ptr %gep, align 16
; <4 x i32> load and store through a local (addrspace(3)) pointer with 8-byte
; alignment. The element order is reversed and the vector is written back to
; the same address. The ALIGNED and UNALIGNED prefixes carry identical
; expectations here: a two-address 64-bit DS read and a two-address 64-bit DS
; write.
; The insertelement chain starts from poison; all four lanes are overwritten,
; so the stored value is fully defined.
211 ; GCN-LABEL: test_local_v4_aligned8:
212 ; ALIGNED-DAG: ds_{{read2|load_2addr}}_b64
213 ; ALIGNED-DAG: ds_{{write2|store_2addr}}_b64
214 ; UNALIGNED-DAG: ds_{{read2|load_2addr}}_b64
215 ; UNALIGNED-DAG: ds_{{write2|store_2addr}}_b64
216 define amdgpu_kernel void @test_local_v4_aligned8(ptr addrspace(3) %arg) {
218 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
219 %gep = getelementptr inbounds i32, ptr addrspace(3) %arg, i32 %lid
220 %load = load <4 x i32>, ptr addrspace(3) %gep, align 8
221 %v1 = extractelement <4 x i32> %load, i32 0
222 %v2 = extractelement <4 x i32> %load, i32 1
223 %v3 = extractelement <4 x i32> %load, i32 2
224 %v4 = extractelement <4 x i32> %load, i32 3
225 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
226 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
227 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
228 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
229 store <4 x i32> %v8, ptr addrspace(3) %gep, align 8
; <4 x i32> load and store through a flat pointer with 8-byte alignment. The
; element order is reversed and the vector is written back to the same
; address. Run lines using the VECT prefix expect a single 128-bit flat load
; and store; run lines using the SPLIT prefix expect two 64-bit flat loads and
; two 64-bit flat stores.
; The insertelement chain starts from poison; all four lanes are overwritten,
; so the stored value is fully defined.
233 ; GCN-LABEL: test_flat_v4_aligned8:
234 ; VECT-DAG: flat_load_{{dwordx4|b128}} v
235 ; VECT-DAG: flat_store_{{dwordx4|b128}} v
236 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
237 ; SPLIT-DAG: flat_load_{{dwordx2|b64}} v
238 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
239 ; SPLIT-DAG: flat_store_{{dwordx2|b64}} v
240 define amdgpu_kernel void @test_flat_v4_aligned8(ptr %arg) {
242 %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
243 %gep = getelementptr inbounds i32, ptr %arg, i32 %lid
244 %load = load <4 x i32>, ptr %gep, align 8
245 %v1 = extractelement <4 x i32> %load, i32 0
246 %v2 = extractelement <4 x i32> %load, i32 1
247 %v3 = extractelement <4 x i32> %load, i32 2
248 %v4 = extractelement <4 x i32> %load, i32 3
249 %v5 = insertelement <4 x i32> poison, i32 %v4, i32 0
250 %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
251 %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
252 %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
253 store <4 x i32> %v8, ptr %gep, align 8
; Work-item id intrinsic (x dimension); each kernel in this file uses its
; result as the i32 element index applied to %arg.
257 declare i32 @llvm.amdgcn.workitem.id.x()