1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; Two configurations of the same pass are tested with both pass-manager
; spellings: ALIGNED disables unaligned access; UNALIGNED additionally
; enables unaligned scratch access. Both cap private element size at 16.
2 ; RUN: opt -S -passes=load-store-vectorizer --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,ALIGNED %s
3 ; RUN: opt -S -passes=load-store-vectorizer --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,UNALIGNED %s
4 ; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=-unaligned-access-mode,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,ALIGNED %s
5 ; RUN: opt -S -passes='function(load-store-vectorizer)' --mcpu=hawaii -mattr=+unaligned-access-mode,+unaligned-scratch-access,+max-private-element-size-16 < %s | FileCheck -check-prefixes=CHECK,UNALIGNED %s
7 target triple = "amdgcn--"
8 target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
; Two adjacent align-1 i8 loads at an unknown offset into an align-1 alloca:
; ALIGNED keeps them scalar; UNALIGNED merges them into one <2 x i8> load.
10 define amdgpu_kernel void @load_unknown_offset_align1_i8(ptr addrspace(1) noalias %out, i32 %offset) #0 {
11 ; ALIGNED-LABEL: @load_unknown_offset_align1_i8(
12 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
13 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
14 ; ALIGNED-NEXT: [[VAL0:%.*]] = load i8, ptr addrspace(5) [[PTR0]], align 1
15 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[PTR0]], i32 1
16 ; ALIGNED-NEXT: [[VAL1:%.*]] = load i8, ptr addrspace(5) [[PTR1]], align 1
17 ; ALIGNED-NEXT: [[ADD:%.*]] = add i8 [[VAL0]], [[VAL1]]
18 ; ALIGNED-NEXT: store i8 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 1
19 ; ALIGNED-NEXT: ret void
21 ; UNALIGNED-LABEL: @load_unknown_offset_align1_i8(
22 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
23 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
24 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr addrspace(5) [[PTR0]], align 1
25 ; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
26 ; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
27 ; UNALIGNED-NEXT: [[ADD:%.*]] = add i8 [[VAL01]], [[VAL12]]
28 ; UNALIGNED-NEXT: store i8 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 1
29 ; UNALIGNED-NEXT: ret void
31 %alloca = alloca [128 x i8], align 1, addrspace(5)
32 %ptr0 = getelementptr inbounds [128 x i8], ptr addrspace(5) %alloca, i32 0, i32 %offset
33 %val0 = load i8, ptr addrspace(5) %ptr0, align 1
34 %ptr1 = getelementptr inbounds i8, ptr addrspace(5) %ptr0, i32 1
35 %val1 = load i8, ptr addrspace(5) %ptr1, align 1
36 %add = add i8 %val0, %val1
37 store i8 %add, ptr addrspace(1) %out
; Same as the i8 case, with i16 elements: ALIGNED keeps the two align-1 i16
; loads scalar; UNALIGNED merges them into one <2 x i16> load at align 1.
41 define amdgpu_kernel void @load_unknown_offset_align1_i16(ptr addrspace(1) noalias %out, i32 %offset) #0 {
42 ; ALIGNED-LABEL: @load_unknown_offset_align1_i16(
43 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
44 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
45 ; ALIGNED-NEXT: [[VAL0:%.*]] = load i16, ptr addrspace(5) [[PTR0]], align 1
46 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[PTR0]], i32 1
47 ; ALIGNED-NEXT: [[VAL1:%.*]] = load i16, ptr addrspace(5) [[PTR1]], align 1
48 ; ALIGNED-NEXT: [[ADD:%.*]] = add i16 [[VAL0]], [[VAL1]]
49 ; ALIGNED-NEXT: store i16 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 2
50 ; ALIGNED-NEXT: ret void
52 ; UNALIGNED-LABEL: @load_unknown_offset_align1_i16(
53 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
54 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
55 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[PTR0]], align 1
56 ; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
57 ; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
58 ; UNALIGNED-NEXT: [[ADD:%.*]] = add i16 [[VAL01]], [[VAL12]]
59 ; UNALIGNED-NEXT: store i16 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 2
60 ; UNALIGNED-NEXT: ret void
62 %alloca = alloca [128 x i16], align 1, addrspace(5)
63 %ptr0 = getelementptr inbounds [128 x i16], ptr addrspace(5) %alloca, i32 0, i32 %offset
64 %val0 = load i16, ptr addrspace(5) %ptr0, align 1
65 %ptr1 = getelementptr inbounds i16, ptr addrspace(5) %ptr0, i32 1
66 %val1 = load i16, ptr addrspace(5) %ptr1, align 1
67 %add = add i16 %val0, %val1
68 store i16 %add, ptr addrspace(1) %out
72 ; FIXME: Although the offset is unknown here, we know it is a multiple
73 ; of the element size, so should still be align 4
; Today the UNALIGNED merged <2 x i32> load is emitted with align 1, and
; ALIGNED leaves the two align-1 i32 loads scalar.
74 define amdgpu_kernel void @load_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
75 ; ALIGNED-LABEL: @load_unknown_offset_align1_i32(
76 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
77 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
78 ; ALIGNED-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(5) [[PTR0]], align 1
79 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[PTR0]], i32 1
80 ; ALIGNED-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(5) [[PTR1]], align 1
81 ; ALIGNED-NEXT: [[ADD:%.*]] = add i32 [[VAL0]], [[VAL1]]
82 ; ALIGNED-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
83 ; ALIGNED-NEXT: ret void
85 ; UNALIGNED-LABEL: @load_unknown_offset_align1_i32(
86 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
87 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
88 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[PTR0]], align 1
89 ; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
90 ; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
91 ; UNALIGNED-NEXT: [[ADD:%.*]] = add i32 [[VAL01]], [[VAL12]]
92 ; UNALIGNED-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
93 ; UNALIGNED-NEXT: ret void
95 %alloca = alloca [128 x i32], align 1, addrspace(5)
96 %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
97 %val0 = load i32, ptr addrspace(5) %ptr0, align 1
98 %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
99 %val1 = load i32, ptr addrspace(5) %ptr1, align 1
100 %add = add i32 %val0, %val1
101 store i32 %add, ptr addrspace(1) %out
105 ; Make sure alloca alignment isn't decreased
; The alloca stays align 16 under both prefixes; under UNALIGNED the merged
; <2 x i32> load is emitted with align 4.
106 define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
107 ; ALIGNED-LABEL: @load_alloca16_unknown_offset_align1_i32(
108 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 16, addrspace(5)
109 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
110 ; ALIGNED-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(5) [[PTR0]], align 1
111 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[PTR0]], i32 1
112 ; ALIGNED-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(5) [[PTR1]], align 1
113 ; ALIGNED-NEXT: [[ADD:%.*]] = add i32 [[VAL0]], [[VAL1]]
114 ; ALIGNED-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
115 ; ALIGNED-NEXT: ret void
117 ; UNALIGNED-LABEL: @load_alloca16_unknown_offset_align1_i32(
118 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 16, addrspace(5)
119 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
120 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[PTR0]], align 4
121 ; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0
122 ; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1
123 ; UNALIGNED-NEXT: [[ADD:%.*]] = add i32 [[VAL01]], [[VAL12]]
124 ; UNALIGNED-NEXT: store i32 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 4
125 ; UNALIGNED-NEXT: ret void
127 %alloca = alloca [128 x i32], align 16, addrspace(5)
128 %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
129 %val0 = load i32, ptr addrspace(5) %ptr0, align 1
130 %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
131 %val1 = load i32, ptr addrspace(5) %ptr1, align 1
132 %add = add i32 %val0, %val1
133 store i32 %add, ptr addrspace(1) %out
; Store counterpart of the i8 load test: ALIGNED keeps the two align-1 i8
; stores scalar; UNALIGNED merges them into one <2 x i8> store.
137 define amdgpu_kernel void @store_unknown_offset_align1_i8(ptr addrspace(1) noalias %out, i32 %offset) #0 {
138 ; ALIGNED-LABEL: @store_unknown_offset_align1_i8(
139 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
140 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
141 ; ALIGNED-NEXT: store i8 9, ptr addrspace(5) [[PTR0]], align 1
142 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[PTR0]], i32 1
143 ; ALIGNED-NEXT: store i8 10, ptr addrspace(5) [[PTR1]], align 1
144 ; ALIGNED-NEXT: ret void
146 ; UNALIGNED-LABEL: @store_unknown_offset_align1_i8(
147 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 1, addrspace(5)
148 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
149 ; UNALIGNED-NEXT: store <2 x i8> <i8 9, i8 10>, ptr addrspace(5) [[PTR0]], align 1
150 ; UNALIGNED-NEXT: ret void
152 %alloca = alloca [128 x i8], align 1, addrspace(5)
153 %ptr0 = getelementptr inbounds [128 x i8], ptr addrspace(5) %alloca, i32 0, i32 %offset
154 store i8 9, ptr addrspace(5) %ptr0, align 1
155 %ptr1 = getelementptr inbounds i8, ptr addrspace(5) %ptr0, i32 1
156 store i8 10, ptr addrspace(5) %ptr1, align 1
; Store counterpart of the i16 load test: ALIGNED keeps the two align-1 i16
; stores scalar; UNALIGNED merges them into one <2 x i16> store.
160 define amdgpu_kernel void @store_unknown_offset_align1_i16(ptr addrspace(1) noalias %out, i32 %offset) #0 {
161 ; ALIGNED-LABEL: @store_unknown_offset_align1_i16(
162 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
163 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
164 ; ALIGNED-NEXT: store i16 9, ptr addrspace(5) [[PTR0]], align 1
165 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i16, ptr addrspace(5) [[PTR0]], i32 1
166 ; ALIGNED-NEXT: store i16 10, ptr addrspace(5) [[PTR1]], align 1
167 ; ALIGNED-NEXT: ret void
169 ; UNALIGNED-LABEL: @store_unknown_offset_align1_i16(
170 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i16], align 1, addrspace(5)
171 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i16], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
172 ; UNALIGNED-NEXT: store <2 x i16> <i16 9, i16 10>, ptr addrspace(5) [[PTR0]], align 1
173 ; UNALIGNED-NEXT: ret void
175 %alloca = alloca [128 x i16], align 1, addrspace(5)
176 %ptr0 = getelementptr inbounds [128 x i16], ptr addrspace(5) %alloca, i32 0, i32 %offset
177 store i16 9, ptr addrspace(5) %ptr0, align 1
178 %ptr1 = getelementptr inbounds i16, ptr addrspace(5) %ptr0, i32 1
179 store i16 10, ptr addrspace(5) %ptr1, align 1
183 ; FIXME: Although the offset is unknown here, we know it is a multiple
184 ; of the element size, so it still should be align 4.
; Today the UNALIGNED merged <2 x i32> store is emitted with align 1, and
; ALIGNED leaves the two align-1 i32 stores scalar.
186 define amdgpu_kernel void @store_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
187 ; ALIGNED-LABEL: @store_unknown_offset_align1_i32(
188 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
189 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
190 ; ALIGNED-NEXT: store i32 9, ptr addrspace(5) [[PTR0]], align 1
191 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[PTR0]], i32 1
192 ; ALIGNED-NEXT: store i32 10, ptr addrspace(5) [[PTR1]], align 1
193 ; ALIGNED-NEXT: ret void
195 ; UNALIGNED-LABEL: @store_unknown_offset_align1_i32(
196 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 1, addrspace(5)
197 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
198 ; UNALIGNED-NEXT: store <2 x i32> <i32 9, i32 10>, ptr addrspace(5) [[PTR0]], align 1
199 ; UNALIGNED-NEXT: ret void
201 %alloca = alloca [128 x i32], align 1, addrspace(5)
202 %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
203 store i32 9, ptr addrspace(5) %ptr0, align 1
204 %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
205 store i32 10, ptr addrspace(5) %ptr1, align 1
; Under both prefixes the four consecutive i32 stores merge into one
; <4 x i32> store, and the alloca's alignment is raised from 1 to 4.
209 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
210 ; CHECK-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
211 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 4, addrspace(5)
212 ; CHECK-NEXT: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, ptr addrspace(5) [[ALLOCA]], align 4
213 ; CHECK-NEXT: ret void
215 %alloca = alloca [8 x i32], align 1, addrspace(5)
216 %out.gep.1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
217 %out.gep.2 = getelementptr i32, ptr addrspace(5) %alloca, i32 2
218 %out.gep.3 = getelementptr i32, ptr addrspace(5) %alloca, i32 3
220 store i32 9, ptr addrspace(5) %alloca, align 1
221 store i32 1, ptr addrspace(5) %out.gep.1, align 1
222 store i32 23, ptr addrspace(5) %out.gep.2, align 1
223 store i32 19, ptr addrspace(5) %out.gep.3, align 1
; Under both prefixes the four consecutive i8 stores merge into one
; <4 x i8> store, and the alloca's alignment is raised from 1 to 4.
227 define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
228 ; CHECK-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
229 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 4, addrspace(5)
230 ; CHECK-NEXT: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, ptr addrspace(5) [[ALLOCA]], align 4
231 ; CHECK-NEXT: ret void
233 %alloca = alloca [8 x i8], align 1, addrspace(5)
234 %out.gep.1 = getelementptr i8, ptr addrspace(5) %alloca, i8 1
235 %out.gep.2 = getelementptr i8, ptr addrspace(5) %alloca, i8 2
236 %out.gep.3 = getelementptr i8, ptr addrspace(5) %alloca, i8 3
238 store i8 9, ptr addrspace(5) %alloca, align 1
239 store i8 1, ptr addrspace(5) %out.gep.1, align 1
240 store i8 23, ptr addrspace(5) %out.gep.2, align 1
241 store i8 19, ptr addrspace(5) %out.gep.3, align 1
; Under both prefixes the four consecutive i32 loads merge into one
; <4 x i32> load, and the alloca's alignment is raised from 1 to 4.
245 define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
246 ; CHECK-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
247 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i32], align 4, addrspace(5)
248 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(5) [[ALLOCA]], align 4
249 ; CHECK-NEXT: [[LOAD01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0
250 ; CHECK-NEXT: [[LOAD12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
251 ; CHECK-NEXT: [[LOAD23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2
252 ; CHECK-NEXT: [[LOAD34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
253 ; CHECK-NEXT: ret void
255 %alloca = alloca [8 x i32], align 1, addrspace(5)
256 %out.gep.1 = getelementptr i32, ptr addrspace(5) %alloca, i32 1
257 %out.gep.2 = getelementptr i32, ptr addrspace(5) %alloca, i32 2
258 %out.gep.3 = getelementptr i32, ptr addrspace(5) %alloca, i32 3
260 %load0 = load i32, ptr addrspace(5) %alloca, align 1
261 %load1 = load i32, ptr addrspace(5) %out.gep.1, align 1
262 %load2 = load i32, ptr addrspace(5) %out.gep.2, align 1
263 %load3 = load i32, ptr addrspace(5) %out.gep.3, align 1
; Under both prefixes the four consecutive i8 loads merge into one
; <4 x i8> load, and the alloca's alignment is raised from 1 to 4.
267 define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
268 ; CHECK-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
269 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [8 x i8], align 4, addrspace(5)
270 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr addrspace(5) [[ALLOCA]], align 4
271 ; CHECK-NEXT: [[LOAD01:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
272 ; CHECK-NEXT: [[LOAD12:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
273 ; CHECK-NEXT: [[LOAD23:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
274 ; CHECK-NEXT: [[LOAD34:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
275 ; CHECK-NEXT: ret void
277 %alloca = alloca [8 x i8], align 1, addrspace(5)
278 %out.gep.1 = getelementptr i8, ptr addrspace(5) %alloca, i8 1
279 %out.gep.2 = getelementptr i8, ptr addrspace(5) %alloca, i8 2
280 %out.gep.3 = getelementptr i8, ptr addrspace(5) %alloca, i8 3
282 %load0 = load i8, ptr addrspace(5) %alloca, align 1
283 %load1 = load i8, ptr addrspace(5) %out.gep.1, align 1
284 %load2 = load i8, ptr addrspace(5) %out.gep.2, align 1
285 %load3 = load i8, ptr addrspace(5) %out.gep.3, align 1
289 ; Make sure we don't think the alignment will increase if the base address isn't an alloca
; Base is an argument pointer: ALIGNED keeps the align-2 i16 stores scalar;
; UNALIGNED merges them into a <2 x i16> store at the original align 2.
290 define void @private_store_2xi16_align2_not_alloca(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
291 ; ALIGNED-LABEL: @private_store_2xi16_align2_not_alloca(
292 ; ALIGNED-NEXT: [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1
293 ; ALIGNED-NEXT: store i16 1, ptr addrspace(5) [[R]], align 2
294 ; ALIGNED-NEXT: store i16 2, ptr addrspace(5) [[GEP_R]], align 2
295 ; ALIGNED-NEXT: ret void
297 ; UNALIGNED-LABEL: @private_store_2xi16_align2_not_alloca(
298 ; UNALIGNED-NEXT: store <2 x i16> <i16 1, i16 2>, ptr addrspace(5) [[R:%.*]], align 2
299 ; UNALIGNED-NEXT: ret void
301 %gep.r = getelementptr i16, ptr addrspace(5) %r, i32 1
302 store i16 1, ptr addrspace(5) %r, align 2
303 store i16 2, ptr addrspace(5) %gep.r, align 2
; Same as the align-2 case but with align-1 stores: ALIGNED keeps them
; scalar; UNALIGNED merges into a <2 x i16> store at the original align 1.
307 define void @private_store_2xi16_align1_not_alloca(ptr addrspace(5) %p, ptr addrspace(5) %r) #0 {
308 ; ALIGNED-LABEL: @private_store_2xi16_align1_not_alloca(
309 ; ALIGNED-NEXT: [[GEP_R:%.*]] = getelementptr i16, ptr addrspace(5) [[R:%.*]], i32 1
310 ; ALIGNED-NEXT: store i16 1, ptr addrspace(5) [[R]], align 1
311 ; ALIGNED-NEXT: store i16 2, ptr addrspace(5) [[GEP_R]], align 1
312 ; ALIGNED-NEXT: ret void
314 ; UNALIGNED-LABEL: @private_store_2xi16_align1_not_alloca(
315 ; UNALIGNED-NEXT: store <2 x i16> <i16 1, i16 2>, ptr addrspace(5) [[R:%.*]], align 1
316 ; UNALIGNED-NEXT: ret void
318 %gep.r = getelementptr i16, ptr addrspace(5) %r, i32 1
319 store i16 1, ptr addrspace(5) %r, align 1
320 store i16 2, ptr addrspace(5) %gep.r, align 1
; Load counterpart with a non-alloca base: ALIGNED keeps the two align-2 i16
; loads scalar; UNALIGNED merges them into a <2 x i16> load at align 2.
324 define i32 @private_load_2xi16_align2_not_alloca(ptr addrspace(5) %p) #0 {
325 ; ALIGNED-LABEL: @private_load_2xi16_align2_not_alloca(
326 ; ALIGNED-NEXT: [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1
327 ; ALIGNED-NEXT: [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 2
328 ; ALIGNED-NEXT: [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 2
329 ; ALIGNED-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
330 ; ALIGNED-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
331 ; ALIGNED-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
332 ; ALIGNED-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
333 ; ALIGNED-NEXT: ret i32 [[OR]]
335 ; UNALIGNED-LABEL: @private_load_2xi16_align2_not_alloca(
336 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[P:%.*]], align 2
337 ; UNALIGNED-NEXT: [[P_01:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
338 ; UNALIGNED-NEXT: [[P_12:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
339 ; UNALIGNED-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_01]] to i32
340 ; UNALIGNED-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_12]] to i32
341 ; UNALIGNED-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
342 ; UNALIGNED-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
343 ; UNALIGNED-NEXT: ret i32 [[OR]]
345 %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
346 %p.0 = load i16, ptr addrspace(5) %p, align 2
347 %p.1 = load i16, ptr addrspace(5) %gep.p, align 2
348 %zext.0 = zext i16 %p.0 to i32
349 %zext.1 = zext i16 %p.1 to i32
350 %shl.1 = shl i32 %zext.1, 16
351 %or = or i32 %zext.0, %shl.1
; Same as the align-2 load case but with align-1 loads: ALIGNED keeps them
; scalar; UNALIGNED merges into a <2 x i16> load at the original align 1.
355 define i32 @private_load_2xi16_align1_not_alloca(ptr addrspace(5) %p) #0 {
356 ; ALIGNED-LABEL: @private_load_2xi16_align1_not_alloca(
357 ; ALIGNED-NEXT: [[GEP_P:%.*]] = getelementptr i16, ptr addrspace(5) [[P:%.*]], i64 1
358 ; ALIGNED-NEXT: [[P_0:%.*]] = load i16, ptr addrspace(5) [[P]], align 1
359 ; ALIGNED-NEXT: [[P_1:%.*]] = load i16, ptr addrspace(5) [[GEP_P]], align 1
360 ; ALIGNED-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_0]] to i32
361 ; ALIGNED-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_1]] to i32
362 ; ALIGNED-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
363 ; ALIGNED-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
364 ; ALIGNED-NEXT: ret i32 [[OR]]
366 ; UNALIGNED-LABEL: @private_load_2xi16_align1_not_alloca(
367 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(5) [[P:%.*]], align 1
368 ; UNALIGNED-NEXT: [[P_01:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0
369 ; UNALIGNED-NEXT: [[P_12:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1
370 ; UNALIGNED-NEXT: [[ZEXT_0:%.*]] = zext i16 [[P_01]] to i32
371 ; UNALIGNED-NEXT: [[ZEXT_1:%.*]] = zext i16 [[P_12]] to i32
372 ; UNALIGNED-NEXT: [[SHL_1:%.*]] = shl i32 [[ZEXT_1]], 16
373 ; UNALIGNED-NEXT: [[OR:%.*]] = or i32 [[ZEXT_0]], [[SHL_1]]
374 ; UNALIGNED-NEXT: ret i32 [[OR]]
376 %gep.p = getelementptr i16, ptr addrspace(5) %p, i64 1
377 %p.0 = load i16, ptr addrspace(5) %p, align 1
378 %p.1 = load i16, ptr addrspace(5) %gep.p, align 1
379 %zext.0 = zext i16 %p.0 to i32
380 %zext.1 = zext i16 %p.1 to i32
381 %shl.1 = shl i32 %zext.1, 16
382 %or = or i32 %zext.0, %shl.1
; i8 variant of the align-16 alloca test: the alloca keeps align 16 under
; both prefixes; UNALIGNED still emits the merged <2 x i8> load with align 1
; since the variable offset gives no better guarantee for i8 elements.
386 define void @load_alloca16_unknown_offset_align1_i8(ptr addrspace(1) noalias %out, i32 %offset) #0 {
387 ; ALIGNED-LABEL: @load_alloca16_unknown_offset_align1_i8(
388 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 16, addrspace(5)
389 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
390 ; ALIGNED-NEXT: [[VAL0:%.*]] = load i8, ptr addrspace(5) [[PTR0]], align 1
391 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[PTR0]], i32 1
392 ; ALIGNED-NEXT: [[VAL1:%.*]] = load i8, ptr addrspace(5) [[PTR1]], align 1
393 ; ALIGNED-NEXT: [[ADD:%.*]] = add i8 [[VAL0]], [[VAL1]]
394 ; ALIGNED-NEXT: store i8 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 1
395 ; ALIGNED-NEXT: ret void
397 ; UNALIGNED-LABEL: @load_alloca16_unknown_offset_align1_i8(
398 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i8], align 16, addrspace(5)
399 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i8], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
400 ; UNALIGNED-NEXT: [[TMP1:%.*]] = load <2 x i8>, ptr addrspace(5) [[PTR0]], align 1
401 ; UNALIGNED-NEXT: [[VAL01:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
402 ; UNALIGNED-NEXT: [[VAL12:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
403 ; UNALIGNED-NEXT: [[ADD:%.*]] = add i8 [[VAL01]], [[VAL12]]
404 ; UNALIGNED-NEXT: store i8 [[ADD]], ptr addrspace(1) [[OUT:%.*]], align 1
405 ; UNALIGNED-NEXT: ret void
407 %alloca = alloca [128 x i8], align 16, addrspace(5)
408 %ptr0 = getelementptr inbounds [128 x i8], ptr addrspace(5) %alloca, i32 0, i32 %offset
409 %val0 = load i8, ptr addrspace(5) %ptr0, align 1
410 %ptr1 = getelementptr inbounds i8, ptr addrspace(5) %ptr0, i32 1
411 %val1 = load i8, ptr addrspace(5) %ptr1, align 1
412 %add = add i8 %val0, %val1
413 store i8 %add, ptr addrspace(1) %out
; Store counterpart of the align-16 alloca i32 test: the alloca keeps
; align 16; UNALIGNED emits the merged <2 x i32> store with align 4.
417 define void @store_alloca16_unknown_offset_align1_i32(ptr addrspace(1) noalias %out, i32 %offset) #0 {
418 ; ALIGNED-LABEL: @store_alloca16_unknown_offset_align1_i32(
419 ; ALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 16, addrspace(5)
420 ; ALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
421 ; ALIGNED-NEXT: store i32 9, ptr addrspace(5) [[PTR0]], align 1
422 ; ALIGNED-NEXT: [[PTR1:%.*]] = getelementptr inbounds i32, ptr addrspace(5) [[PTR0]], i32 1
423 ; ALIGNED-NEXT: store i32 10, ptr addrspace(5) [[PTR1]], align 1
424 ; ALIGNED-NEXT: ret void
426 ; UNALIGNED-LABEL: @store_alloca16_unknown_offset_align1_i32(
427 ; UNALIGNED-NEXT: [[ALLOCA:%.*]] = alloca [128 x i32], align 16, addrspace(5)
428 ; UNALIGNED-NEXT: [[PTR0:%.*]] = getelementptr inbounds [128 x i32], ptr addrspace(5) [[ALLOCA]], i32 0, i32 [[OFFSET:%.*]]
429 ; UNALIGNED-NEXT: store <2 x i32> <i32 9, i32 10>, ptr addrspace(5) [[PTR0]], align 4
430 ; UNALIGNED-NEXT: ret void
432 %alloca = alloca [128 x i32], align 16, addrspace(5)
433 %ptr0 = getelementptr inbounds [128 x i32], ptr addrspace(5) %alloca, i32 0, i32 %offset
434 store i32 9, ptr addrspace(5) %ptr0, align 1
435 %ptr1 = getelementptr inbounds i32, ptr addrspace(5) %ptr0, i32 1
436 store i32 10, ptr addrspace(5) %ptr1, align 1
; Attribute group referenced as #0 on the functions above.
440 attributes #0 = { nounwind }