; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -o - -S -passes=load-store-vectorizer,dce %s | FileCheck %s

; Make sure LoadStoreVectorizer vectorizes the loads below.
; In order to prove that the vectorization is safe, it tries to
; match nested adds and find an expression that adds a constant
; value to an existing index and the result doesn't overflow.

target triple = "x86_64--"
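
; As a rough sketch of the pattern being matched (commentary only; the
; FileCheck assertions below are what the test actually verifies): the
; four loads in @ld_v4i8_add_nsw index %src at %v1 + (%v0 + c) for
; c in {-1, 0, 1, 2}. Because every add carries the nsw flag, sign
; extension distributes over the adds, so the four addresses are provably
; consecutive and can be folded into one wide load, hypothetically:
;
;   %base = add nsw i32 %v1, %v0           ; anchor index
;   %lo = add nsw i32 %base, -1            ; lowest of the four indices
;   %ext = sext i32 %lo to i64
;   %p = getelementptr inbounds i8, ptr %src, i64 %ext
;   %wide = load <4 x i8>, ptr %p, align 1 ; all four bytes at once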
define void @ld_v4i8_add_nsw(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add nsw i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

; Apply different operand orders for the nested add sequences
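; For example (illustrative only, not checked), the matcher should treat
;   %tmp10 = add nsw i32 %v1, %tmp9
; and
;   %tmp10 = add nsw i32 %tmp9, %v1
; as the same index expression, since add is commutative.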
define void @ld_v4i8_add_nsw_operand_orders(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_nsw_operand_orders(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP41:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP82:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP133:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP184:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP41]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP82]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP133]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP184]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add nsw i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add nsw i32 %v0, %v1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add nsw i32 %tmp9, %v1
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add nsw i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
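
; In the next test the adds carry no nsw flag, so safety must come from
; known bits: %v0 and %v1 are multiples of 4, hence the low two bits of
; %v1 + %v0 are zero. Adding 1 or 2 cannot carry out of those bits, which
; gives sext(a + c) == sext(a) + c for c in {1, 2}, and the loads at
; offsets 0, 1 and 2 merge into a <3 x i8> load. Subtracting 1 borrows
; through the known-zero bits and could wrap, so the first load stays
; scalar.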
define void @ld_v4i8_add_known_bits(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP:%.*]] = add i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
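
; Same known-bits reasoning, but the outlying element sits at offset +3
; rather than -1, so all four offsets {0, 1, 2, 3} are provably
; consecutive and a single <4 x i8> load suffices; the +3 element is
; extracted from lane 3 and reinserted at lane 0 to preserve the original
; element order of the stored vector.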
define void @ld_v4i8_add_known_bits1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 4
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 4
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}
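
; Known bits can also be established by llvm.assume: multiplying by 3
; proves nothing about the low bits by itself, but the assumes pin
; (%v0 & 3) == 0 and (%v1 & 3) == 0, recovering the same low-bits-zero
; reasoning as in the multiples-of-4 tests above.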
define void @ld_v4i8_add_known_bits_by_assume(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_known_bits_by_assume(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 3
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 3
  %v1 = mul i32 %ind1, 3
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

declare void @llvm.assume(i1)

define void @ld_v4i8_add_assume_on_arg(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <3 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <3 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <3 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

define void @ld_v4i8_add_assume_on_arg1(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_assume_on_arg1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[AND_I:%.*]] = and i32 [[V0:%.*]], 3
; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[AND_I]], 0
; CHECK-NEXT:    [[AND_I_1:%.*]] = and i32 [[V1:%.*]], 3
; CHECK-NEXT:    [[CMP_I_1:%.*]] = icmp eq i32 [[AND_I_1]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I_1]])
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <4 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP132:%.*]] = extractelement <4 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP183:%.*]] = extractelement <4 x i8> [[TMP1]], i32 2
; CHECK-NEXT:    [[TMP44:%.*]] = extractelement <4 x i8> [[TMP1]], i32 3
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP44]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP132]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP183]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %and.i = and i32 %v0, 3
  %cmp.i = icmp eq i32 %and.i, 0
  %and.i.1 = and i32 %v1, 3
  %cmp.i.1 = icmp eq i32 %and.i.1, 0
  call void @llvm.assume(i1 %cmp.i)
  call void @llvm.assume(i1 %cmp.i.1)
  %tmp = add nsw i32 %v0, 3
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}

; Address computations are partly separated by control flow, with
; llvm.assume placed in the second basic block.
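; The assume is executed in bb.loads ahead of both loads, so the fact
; %v1 == 0 may be used at both addresses; combined with %v0 being a
; multiple of 4, %tmp5 and %tmp5 + 1 are provably consecutive and the two
; loads merge into one <2 x i8> load.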
define void @ld_v2i8_add_different_contexts(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  br label %bb.skip

bb.skip:
  ret void
}

; Same as ld_v2i8_add_different_contexts but with llvm.assume placed between loads
define void @ld_v2i8_add_different_contexts1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_different_contexts1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[V1]], 0
; CHECK-NEXT:    br i1 [[BIT_COND]], label [[BB_LOADS:%.*]], label [[BB_SKIP:%.*]]
; CHECK:       bb.loads:
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    br label [[BB_SKIP]]
; CHECK:       bb.skip:
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %bit_cond = icmp eq i32 %v1, 0
  br i1 %bit_cond, label %bb.loads, label %bb.skip

bb.loads:
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  br label %bb.skip

bb.skip:
  ret void
}

; llvm.assume is placed between loads in a single basic block
define void @ld_v2i8_add_context(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_context(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  ret void
}

; Placing llvm.assume after all the loads and stores in the basic block still works
define void @ld_v2i8_add_context1(i32 %ind0, i32 %ind1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v2i8_add_context1(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[V0:%.*]] = mul i32 [[IND0:%.*]], 4
; CHECK-NEXT:    [[V1:%.*]] = mul i32 [[IND1:%.*]], 3
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <2 x i8> [[TMP1]], i32 0
; CHECK-NEXT:    [[TMP42:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <2 x i8> undef, i8 [[TMP42]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <2 x i8> [[TMP19]], i8 [[TMP81]], i32 1
; CHECK-NEXT:    store <2 x i8> [[TMP20]], ptr [[DST:%.*]]
; CHECK-NEXT:    [[BIT_COND:%.*]] = icmp eq i32 [[TMP5]], 0
; CHECK-NEXT:    call void @llvm.assume(i1 [[BIT_COND]])
; CHECK-NEXT:    ret void
;
bb:
  %v0 = mul i32 %ind0, 4
  %v1 = mul i32 %ind1, 3
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp = add nsw i32 %v0, 1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp19 = insertelement <2 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <2 x i8> %tmp19, i8 %tmp8, i32 1
  store <2 x i8> %tmp20, ptr %dst
  %bit_cond = icmp eq i32 %tmp5, 0
  call void @llvm.assume(i1 %bit_cond)
  ret void
}

; Make sure we don't vectorize the loads below because the source of the
; sext instructions doesn't have the nsw flag or known bits that would
; allow the vectorization.
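; For instance (hypothetical values, commentary only), if %v1 + %v0 were
; 2147483647 (INT32_MAX), then:
;   sext(%v1 + %v0) + 1   == 2147483648
;   sext(%v1 + (%v0 + 1)) == sext(INT32_MIN) == -2147483648
; so the two addresses are not provably adjacent and all loads stay
; scalar.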
define void @ld_v4i8_add_not_safe(i32 %v0, i32 %v1, ptr %src, ptr %dst) {
; CHECK-LABEL: @ld_v4i8_add_not_safe(
; CHECK-NEXT:  bb:
; CHECK-NEXT:    [[TMP:%.*]] = add nsw i32 [[V0:%.*]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[V1:%.*]], [[TMP]]
; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC:%.*]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = load i8, ptr [[TMP3]], align 1
; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[V1]], [[V0]]
; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[TMP5]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = load i8, ptr [[TMP7]], align 1
; CHECK-NEXT:    [[TMP9:%.*]] = add nsw i32 [[V0]], 1
; CHECK-NEXT:    [[TMP10:%.*]] = add i32 [[V1]], [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = sext i32 [[TMP10]] to i64
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP11]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i8, ptr [[TMP12]], align 1
; CHECK-NEXT:    [[TMP14:%.*]] = add nsw i32 [[V0]], 2
; CHECK-NEXT:    [[TMP15:%.*]] = add i32 [[V1]], [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP16]]
; CHECK-NEXT:    [[TMP18:%.*]] = load i8, ptr [[TMP17]], align 1
; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i8> undef, i8 [[TMP4]], i32 0
; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i8> [[TMP19]], i8 [[TMP8]], i32 1
; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <4 x i8> [[TMP20]], i8 [[TMP13]], i32 2
; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <4 x i8> [[TMP21]], i8 [[TMP18]], i32 3
; CHECK-NEXT:    store <4 x i8> [[TMP22]], ptr [[DST:%.*]]
; CHECK-NEXT:    ret void
;
bb:
  %tmp = add nsw i32 %v0, -1
  %tmp1 = add i32 %v1, %tmp
  %tmp2 = sext i32 %tmp1 to i64
  %tmp3 = getelementptr inbounds i8, ptr %src, i64 %tmp2
  %tmp4 = load i8, ptr %tmp3, align 1
  %tmp5 = add i32 %v1, %v0
  %tmp6 = sext i32 %tmp5 to i64
  %tmp7 = getelementptr inbounds i8, ptr %src, i64 %tmp6
  %tmp8 = load i8, ptr %tmp7, align 1
  %tmp9 = add nsw i32 %v0, 1
  %tmp10 = add i32 %v1, %tmp9
  %tmp11 = sext i32 %tmp10 to i64
  %tmp12 = getelementptr inbounds i8, ptr %src, i64 %tmp11
  %tmp13 = load i8, ptr %tmp12, align 1
  %tmp14 = add nsw i32 %v0, 2
  %tmp15 = add i32 %v1, %tmp14
  %tmp16 = sext i32 %tmp15 to i64
  %tmp17 = getelementptr inbounds i8, ptr %src, i64 %tmp16
  %tmp18 = load i8, ptr %tmp17, align 1
  %tmp19 = insertelement <4 x i8> undef, i8 %tmp4, i32 0
  %tmp20 = insertelement <4 x i8> %tmp19, i8 %tmp8, i32 1
  %tmp21 = insertelement <4 x i8> %tmp20, i8 %tmp13, i32 2
  %tmp22 = insertelement <4 x i8> %tmp21, i8 %tmp18, i32 3
  store <4 x i8> %tmp22, ptr %dst
  ret void
}