; Checks instruction selection for vector loads and stores to LDS and flat
; memory at several alignments on gfx10.1 targets. Invocations without +cumode
; are matched against the SPLIT checks; the +cumode invocation is matched
; against the VECT checks. gfx1011 is expected to behave like gfx1010 and
; gfx1012 here, so it uses the SPLIT checks as well.
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SPLIT %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -mattr=+cumode < %s | FileCheck -check-prefixes=GCN,VECT %s
; <2 x i32> access to LDS (addrspace(3)) with only 4-byte alignment.
; Every configuration expects one ds_read2_b32 and one ds_write2_b32.
; GCN-LABEL: test_local_misaligned_v2:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 4
  ; Swap the two elements before writing the vector back.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 4
  ret void
}
; <4 x i32> access to LDS (addrspace(3)) with only 4-byte alignment.
; Every configuration expects two ds_read2_b32 and two ds_write2_b32.
; GCN-LABEL: test_local_misaligned_v4:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write2_b32
define amdgpu_kernel void @test_local_misaligned_v4(i32 addrspace(3)* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 4
  ; Reverse the element order before writing the vector back.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 4
  ret void
}
; <3 x i32> access to LDS (addrspace(3)) with only 4-byte alignment.
; Every configuration expects a ds_read2_b32 plus a ds_read_b32 for the load
; and a ds_write2_b32 plus a ds_write_b32 for the store.
; GCN-LABEL: test_local_misaligned_v3:
; GCN-DAG: ds_read2_b32
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write2_b32
; GCN-DAG: ds_write_b32
define amdgpu_kernel void @test_local_misaligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 4
  ; Rotate the elements (2 -> 0, 0 -> 1, 1 -> 2) before writing back.
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 4
  ret void
}
; <2 x i32> access through a flat pointer with only 4-byte alignment.
; Configurations using the VECT checks expect a single dwordx2 load and store;
; configurations using the SPLIT checks expect two dword loads and two dword
; stores.
; GCN-LABEL: test_flat_misaligned_v2:
; VECT-DAG: flat_load_dwordx2 v
; VECT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v2(i32* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 4
  ; Swap the two elements before writing the vector back.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 4
  ret void
}
; <4 x i32> access through a flat pointer with only 4-byte alignment.
; Configurations using the VECT checks expect a single dwordx4 load and store;
; configurations using the SPLIT checks expect four dword loads and four dword
; stores.
; GCN-LABEL: test_flat_misaligned_v4:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_load_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
; SPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v4(i32* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 4
  ; Reverse the element order before writing the vector back.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 4
  ret void
}
; <3 x i32> access through a flat pointer with only 4-byte alignment.
; The instruction checks below carry an 'x' in front of their prefix, so
; FileCheck does not match them; only the label is currently verified.
; TODO: Reinstate the test below once v3i32/v3f32 is reinstated.
; GCN-LABEL: test_flat_misaligned_v3:
; xVECT-DAG: flat_load_dwordx3 v
; xVECT-DAG: flat_store_dwordx3 v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_load_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
; xSPLIT-DAG: flat_store_dword v
define amdgpu_kernel void @test_flat_misaligned_v3(i32* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <3 x i32>*
  %load = load <3 x i32>, <3 x i32>* %ptr, align 4
  ; Rotate the elements (2 -> 0, 0 -> 1, 1 -> 2) before writing back.
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32>* %ptr, align 4
  ret void
}
; <2 x i32> access to LDS (addrspace(3)) with natural 8-byte alignment.
; Every configuration expects a single ds_read_b64 and ds_write_b64.
; GCN-LABEL: test_local_aligned_v2:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_write_b64
define amdgpu_kernel void @test_local_aligned_v2(i32 addrspace(3)* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <2 x i32> addrspace(3)*
  %load = load <2 x i32>, <2 x i32> addrspace(3)* %ptr, align 8
  ; Swap the two elements before writing the vector back.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32> addrspace(3)* %ptr, align 8
  ret void
}
; <3 x i32> access to LDS (addrspace(3)) with 16-byte alignment.
; Every configuration expects a ds_read_b64 plus a ds_read_b32 for the load
; and a ds_write_b64 plus a ds_write_b32 for the store.
; GCN-LABEL: test_local_aligned_v3:
; GCN-DAG: ds_read_b64
; GCN-DAG: ds_read_b32
; GCN-DAG: ds_write_b64
; GCN-DAG: ds_write_b32
define amdgpu_kernel void @test_local_aligned_v3(i32 addrspace(3)* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <3 x i32> addrspace(3)*
  %load = load <3 x i32>, <3 x i32> addrspace(3)* %ptr, align 16
  ; Rotate the elements (2 -> 0, 0 -> 1, 1 -> 2) before writing back.
  %v1 = extractelement <3 x i32> %load, i32 0
  %v2 = extractelement <3 x i32> %load, i32 1
  %v3 = extractelement <3 x i32> %load, i32 2
  %v5 = insertelement <3 x i32> undef, i32 %v3, i32 0
  %v6 = insertelement <3 x i32> %v5, i32 %v1, i32 1
  %v7 = insertelement <3 x i32> %v6, i32 %v2, i32 2
  store <3 x i32> %v7, <3 x i32> addrspace(3)* %ptr, align 16
  ret void
}
; <2 x i32> access through a flat pointer with natural 8-byte alignment.
; Every configuration expects a single dwordx2 load and store.
; GCN-LABEL: test_flat_aligned_v2:
; GCN-DAG: flat_load_dwordx2 v
; GCN-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_aligned_v2(i32* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <2 x i32>*
  %load = load <2 x i32>, <2 x i32>* %ptr, align 8
  ; Swap the two elements before writing the vector back.
  %v1 = extractelement <2 x i32> %load, i32 0
  %v2 = extractelement <2 x i32> %load, i32 1
  %v3 = insertelement <2 x i32> undef, i32 %v2, i32 0
  %v4 = insertelement <2 x i32> %v3, i32 %v1, i32 1
  store <2 x i32> %v4, <2 x i32>* %ptr, align 8
  ret void
}
; <4 x i32> access through a flat pointer with natural 16-byte alignment.
; Every configuration expects a single dwordx4 load and store.
; GCN-LABEL: test_flat_aligned_v4:
; GCN-DAG: flat_load_dwordx4 v
; GCN-DAG: flat_store_dwordx4 v
define amdgpu_kernel void @test_flat_aligned_v4(i32* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 16
  ; Reverse the element order before writing the vector back.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 16
  ret void
}
; <4 x i32> access to LDS (addrspace(3)) with 8-byte alignment, i.e. less than
; the 16-byte size of the vector. Every configuration expects a single
; ds_read2_b64 and ds_write2_b64.
; GCN-LABEL: test_local_v4_aligned8:
; GCN-DAG: ds_read2_b64
; GCN-DAG: ds_write2_b64
define amdgpu_kernel void @test_local_v4_aligned8(i32 addrspace(3)* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32 addrspace(3)* %arg, i32 %lid
  %ptr = bitcast i32 addrspace(3)* %gep to <4 x i32> addrspace(3)*
  %load = load <4 x i32>, <4 x i32> addrspace(3)* %ptr, align 8
  ; Reverse the element order before writing the vector back.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32> addrspace(3)* %ptr, align 8
  ret void
}
; <4 x i32> access through a flat pointer with 8-byte alignment, i.e. less
; than the 16-byte size of the vector. Configurations using the VECT checks
; expect a single dwordx4 load and store; configurations using the SPLIT
; checks expect two dwordx2 loads and two dwordx2 stores.
; GCN-LABEL: test_flat_v4_aligned8:
; VECT-DAG: flat_load_dwordx4 v
; VECT-DAG: flat_store_dwordx4 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_load_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
; SPLIT-DAG: flat_store_dwordx2 v
define amdgpu_kernel void @test_flat_v4_aligned8(i32* %arg) {
bb:
  ; Address the vector at %arg + workitem.id.x dwords.
  %lid = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds i32, i32* %arg, i32 %lid
  %ptr = bitcast i32* %gep to <4 x i32>*
  %load = load <4 x i32>, <4 x i32>* %ptr, align 8
  ; Reverse the element order before writing the vector back.
  %v1 = extractelement <4 x i32> %load, i32 0
  %v2 = extractelement <4 x i32> %load, i32 1
  %v3 = extractelement <4 x i32> %load, i32 2
  %v4 = extractelement <4 x i32> %load, i32 3
  %v5 = insertelement <4 x i32> undef, i32 %v4, i32 0
  %v6 = insertelement <4 x i32> %v5, i32 %v3, i32 1
  %v7 = insertelement <4 x i32> %v6, i32 %v2, i32 2
  %v8 = insertelement <4 x i32> %v7, i32 %v1, i32 3
  store <4 x i32> %v8, <4 x i32>* %ptr, align 8
  ret void
}
266 declare i32 @llvm.amdgcn.workitem.id.x()