1 ; RUN: opt < %s -passes=loop-vectorize -S | FileCheck %s --check-prefixes=COMMON,DEFAULT
2 ; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
3 ; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-PREFER
4 ; RUN: opt < %s -passes=loop-vectorize -tail-predication=enabled -S | FileCheck %s --check-prefixes=COMMON,CHECK-TF,CHECK-ENABLE-TP
6 target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
7 target triple = "thumbv8.1m.main-arm-unknown-eabihf"
9 ; This IR corresponds to this type of C-code:
11 ; void f(char *a, char *b, char *c, int N) {
; Counting-down while-loop protected by an `N > 0` guard in the entry block.
; The counter starts at %N and steps by -1; the three byte pointers each advance
; by one element per iteration. Every RUN line expects a vector body; the
; tail-folding RUN lines additionally expect 16-lane masked loads and a masked
; store predicated on llvm.get.active.lane.mask.
16 define dso_local void @sgt_loopguard(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
17 ; COMMON-LABEL: @sgt_loopguard(
18 ; COMMON: vector.body:
20 ; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
21 ; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %N)
22 ; CHECK-TF: llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask
23 ; CHECK-TF: llvm.masked.load.v16i8.p0(ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask
24 ; CHECK-TF: llvm.masked.store.v16i8.p0(<16 x i8> %{{.*}}, ptr %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
; Loop guard: the loop is only entered when N > 0.
26 %cmp5 = icmp sgt i32 %N, 0
27 br i1 %cmp5, label %while.body.preheader, label %while.end
; Down-counter and one running pointer per array.
33 %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
34 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
35 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
36 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
37 %dec = add nsw i32 %N.addr.09, -1
38 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
39 %0 = load i8, ptr %a.addr.06, align 1
40 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
41 %1 = load i8, ptr %b.addr.07, align 1
43 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
44 store i8 %add, ptr %c.addr.08, align 1
; Latch: keep looping while the pre-decrement counter is greater than 1.
45 %cmp = icmp sgt i32 %N.addr.09, 1
46 br i1 %cmp, label %while.body, label %while.end.loopexit
55 ; No loop-guard: we need one for this to be valid.
; Same counting-down loop shape as the guarded variant, but the loop phis take
; their initial values directly from %entry, i.e. no `N > 0` compare is visible
; before the loop. The checks still expect a vector body, and masked loads and
; a masked store on the tail-folding RUN lines.
57 define dso_local void @sgt_no_loopguard(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
58 ; COMMON-LABEL: @sgt_no_loopguard(
59 ; COMMON: vector.body:
60 ; CHECK-TF: masked.load
61 ; CHECK-TF: masked.load
62 ; CHECK-TF: masked.store
; Counter starts at %N straight from the entry block.
67 %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %entry ]
68 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %entry ]
69 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %entry ]
70 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %entry ]
71 %dec = add nsw i32 %N.addr.09, -1
72 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
73 %0 = load i8, ptr %a.addr.06, align 1
74 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
75 %1 = load i8, ptr %b.addr.07, align 1
77 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
78 store i8 %add, ptr %c.addr.08, align 1
; Latch: keep looping while the pre-decrement counter is greater than 1.
79 %cmp = icmp sgt i32 %N.addr.09, 1
80 br i1 %cmp, label %while.body, label %while.end.loopexit
; Unguarded counting-down loop in which the latch compare %cmp has a second use
; besides the branch: it also selects between the two loaded bytes, and that
; selected byte is added into the value stored to c. The checks expect a vector
; body, and masked loads and a masked store on the tail-folding RUN lines.
89 define dso_local void @sgt_extra_use_cmp(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
90 ; COMMON-LABEL: @sgt_extra_use_cmp(
91 ; COMMON: vector.body:
92 ; CHECK-TF: masked.load
93 ; CHECK-TF: masked.load
94 ; CHECK-TF: masked.store
99 %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %entry ]
100 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %entry ]
101 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %entry ]
102 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %entry ]
103 %dec = add nsw i32 %N.addr.09, -1
104 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
105 %0 = load i8, ptr %a.addr.06, align 1
106 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
107 %1 = load i8, ptr %b.addr.07, align 1
109 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
; The exit condition is computed before the store so it can feed the select.
110 %cmp = icmp sgt i32 %N.addr.09, 1
111 %select = select i1 %cmp, i8 %0, i8 %1
112 %add2 = add i8 %add, %select
113 store i8 %add2, ptr %c.addr.08, align 1
114 br i1 %cmp, label %while.body, label %while.end.loopexit
; Guarded counting-down loop whose counter starts at the constant 2049 rather
; than at %N (the `N > 0` guard in the entry block is still present). The
; checks expect a vector body, and masked loads and a masked store on the
; tail-folding RUN lines.
123 define dso_local void @sgt_const_tripcount(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
124 ; COMMON-LABEL: @sgt_const_tripcount(
125 ; COMMON: vector.body:
126 ; CHECK-TF: masked.load
127 ; CHECK-TF: masked.load
128 ; CHECK-TF: masked.store
130 %cmp5 = icmp sgt i32 %N, 0
131 br i1 %cmp5, label %while.body.preheader, label %while.end
133 while.body.preheader:
; Constant start value: the counter runs down from 2049.
137 %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2049, %while.body.preheader ]
138 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
139 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
140 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
141 %dec = add nsw i32 %N.addr.09, -1
142 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
143 %0 = load i8, ptr %a.addr.06, align 1
144 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
145 %1 = load i8, ptr %b.addr.07, align 1
147 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
148 store i8 %add, ptr %c.addr.08, align 1
149 %cmp = icmp sgt i32 %N.addr.09, 1
150 br i1 %cmp, label %while.body, label %while.end.loopexit
; Unguarded loop whose counter starts at the constant 0. With the latch test
; `counter > 1` evaluated on the pre-decrement value, the first evaluation is
; `0 > 1`, so the back edge is not taken and the body runs once. No RUN line
; expects a vector body here.
; NOTE(review): presumably the single-iteration trip count is why no vector
; body is produced - confirm against the vectorizer's debug output.
159 define dso_local void @sgt_no_guard_0_startval(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
160 ; COMMON-LABEL: @sgt_no_guard_0_startval(
161 ; COMMON-NOT: vector.body:
; Counter starts at 0 straight from the entry block.
166 %N.addr.09 = phi i32 [ %dec, %while.body ], [ 0, %entry ]
167 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %entry ]
168 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %entry ]
169 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %entry]
170 %dec = add nsw i32 %N.addr.09, -1
171 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
172 %0 = load i8, ptr %a.addr.06, align 1
173 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
174 %1 = load i8, ptr %b.addr.07, align 1
176 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
177 store i8 %add, ptr %c.addr.08, align 1
178 %cmp = icmp sgt i32 %N.addr.09, 1
179 br i1 %cmp, label %while.body, label %while.end.loopexit
; Guarded counting-down loop whose counter steps by -2 per iteration instead of
; -1, while the byte pointers still advance by one element. The checks expect a
; vector body, and masked loads and a masked store on the tail-folding RUN
; lines.
188 define dso_local void @sgt_step_minus_two(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
189 ; COMMON-LABEL: @sgt_step_minus_two(
190 ; COMMON: vector.body:
191 ; CHECK-TF: masked.load
192 ; CHECK-TF: masked.load
193 ; CHECK-TF: masked.store
195 %cmp5 = icmp sgt i32 %N, 0
196 br i1 %cmp5, label %while.body.preheader, label %while.end
198 while.body.preheader:
202 %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
203 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
204 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
205 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
; The counter step is -2 here; this is the point of the test.
206 %dec = add nsw i32 %N.addr.09, -2
207 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
208 %0 = load i8, ptr %a.addr.06, align 1
209 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
210 %1 = load i8, ptr %b.addr.07, align 1
212 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
213 store i8 %add, ptr %c.addr.08, align 1
214 %cmp = icmp sgt i32 %N.addr.09, 1
215 br i1 %cmp, label %while.body, label %while.end.loopexit
; Guarded loop whose counter step is the run-time argument %S rather than a
; constant. No RUN line expects a vector body for this function.
224 define dso_local void @sgt_step_not_constant(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N, i32 %S) local_unnamed_addr #0 {
225 ; COMMON-LABEL: @sgt_step_not_constant(
226 ; COMMON-NOT: vector.body:
228 %cmp5 = icmp sgt i32 %N, 0
229 br i1 %cmp5, label %while.body.preheader, label %while.end
231 while.body.preheader:
235 %N.addr.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
236 %c.addr.08 = phi ptr [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ]
237 %b.addr.07 = phi ptr [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ]
238 %a.addr.06 = phi ptr [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ]
; Variable step: the counter is advanced by %S, which is unknown at compile time.
239 %dec = add nsw i32 %N.addr.09, %S
240 %incdec.ptr = getelementptr inbounds i8, ptr %a.addr.06, i32 1
241 %0 = load i8, ptr %a.addr.06, align 1
242 %incdec.ptr1 = getelementptr inbounds i8, ptr %b.addr.07, i32 1
243 %1 = load i8, ptr %b.addr.07, align 1
245 %incdec.ptr4 = getelementptr inbounds i8, ptr %c.addr.08, i32 1
246 store i8 %add, ptr %c.addr.08, align 1
247 %cmp = icmp sgt i32 %N.addr.09, 1
248 br i1 %cmp, label %while.body, label %while.end.loopexit
; Counting-down loop that uses equality compares instead of signed-greater-than:
; the entry guard skips the loop when N == 0, and the latch exits when the
; post-decrement counter reaches 0. The decrement carries no nsw flag. Every
; RUN line expects a vector body; nothing is checked about masked operations.
257 define dso_local void @icmp_eq(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) #0 {
258 ; COMMON-LABEL: @icmp_eq
259 ; COMMON: vector.body:
; Guard with inverted branch targets: a true result (N == 0) skips the loop.
261 %cmp6 = icmp eq i32 %N, 0
262 br i1 %cmp6, label %while.end, label %while.body.preheader
264 while.body.preheader:
268 %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
269 %C.addr.09 = phi ptr [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ]
270 %B.addr.08 = phi ptr [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ]
271 %A.addr.07 = phi ptr [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ]
272 %incdec.ptr = getelementptr inbounds i8, ptr %A.addr.07, i32 1
273 %0 = load i8, ptr %A.addr.07, align 1
274 %incdec.ptr1 = getelementptr inbounds i8, ptr %B.addr.08, i32 1
275 %1 = load i8, ptr %B.addr.08, align 1
277 %incdec.ptr4 = getelementptr inbounds i8, ptr %C.addr.09, i32 1
278 store i8 %add, ptr %C.addr.09, align 1
; Unlike the sgt tests, the latch compares the decremented value, not the phi.
279 %dec = add i32 %N.addr.010, -1
280 %cmp = icmp eq i32 %dec, 0
281 br i1 %cmp, label %while.end.loopexit, label %while.body
290 ; This IR corresponds to this type of C-code:
292 ; void f(char *a, char *b, char * __restrict c, int N) {
293 ; #pragma clang loop vectorize_width(16)
294 ; for (int i = N; i>0; i--)
295 ; c[i] = a[i] + b[i];
; Guarded counting-down for-loop that indexes a, b and c directly with the
; induction variable (which starts at %N) instead of bumping pointers. The back
; edge carries !llvm.loop !1, which sets llvm.loop.vectorize.width to 16.
; Every RUN line expects a vector body. The RUN lines that pass
; -prefer-predicate-over-epilogue expect masked loads and a masked store; the
; RUN line with only -tail-predication=enabled expects none.
298 define dso_local void @sgt_for_loop(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
299 ; COMMON-LABEL: @sgt_for_loop(
300 ; COMMON: vector.body:
301 ; CHECK-PREFER: masked.load
302 ; CHECK-PREFER: masked.load
303 ; CHECK-PREFER: masked.store
305 ; TODO: if tail-predication is requested, tail-folding isn't triggered because
306 ; the profitability check returns "Different strides found, can't tail-predicate",
309 ; CHECK-ENABLE-TP-NOT: masked.load
310 ; CHECK-ENABLE-TP-NOT: masked.load
311 ; CHECK-ENABLE-TP-NOT: masked.store
314 %cmp5 = icmp sgt i32 %N, 0
315 br i1 %cmp5, label %for.body.preheader, label %for.end
; The induction variable doubles as the array index for all three accesses.
321 %i.011 = phi i32 [ %dec, %for.body ], [ %N, %for.body.preheader ]
322 %arrayidx = getelementptr inbounds i8, ptr %a, i32 %i.011
323 %0 = load i8, ptr %arrayidx, align 1
324 %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 %i.011
325 %1 = load i8, ptr %arrayidx1, align 1
327 %arrayidx4 = getelementptr inbounds i8, ptr %c, i32 %i.011
328 store i8 %add, ptr %arrayidx4, align 1
329 %dec = add nsw i32 %i.011, -1
330 %cmp = icmp sgt i32 %i.011, 1
331 br i1 %cmp, label %for.body, label %for.end, !llvm.loop !1
; Variant of the indexed counting-down for-loop with a 64-bit induction
; variable: %N is zero-extended to i64 for the counter and the counter is
; truncated back to i32 to form each array index. The back edge carries
; !llvm.loop !1 (vectorize width 16). Every RUN line expects a vector body. The
; RUN lines that pass -prefer-predicate-over-epilogue expect masked loads and a
; masked store; the RUN line with only -tail-predication=enabled expects none.
337 define dso_local void @sgt_for_loop_i64(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
338 ; COMMON-LABEL: @sgt_for_loop_i64(
339 ; COMMON: vector.body:
341 ; CHECK-PREFER: masked.load
342 ; CHECK-PREFER: masked.load
343 ; CHECK-PREFER: masked.store
345 ; With only -tail-predication=enabled (the RUN line that does not pass
346 ; -prefer-predicate-over-epilogue), the target hook returns
346 ; "preferPredicateOverEpilogue: hardware-loop is not profitable."
347 ; so here we don't expect the tail-folding. TODO: look into this.
349 ; CHECK-ENABLE-TP-NOT: masked.load
350 ; CHECK-ENABLE-TP-NOT: masked.load
351 ; CHECK-ENABLE-TP-NOT: masked.store
354 %cmp14 = icmp sgt i32 %N, 0
355 br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
; Widen the 32-bit bound to form the 64-bit counter's start value.
358 %conv16 = zext i32 %N to i64
361 for.cond.cleanup.loopexit:
362 br label %for.cond.cleanup
368 %i.015 = phi i64 [ %dec, %for.body ], [ %conv16, %for.body.preheader ]
; Pointers are 32-bit on this target (p:32:32), so the index is narrowed to i32.
369 %idxprom = trunc i64 %i.015 to i32
370 %arrayidx = getelementptr inbounds i8, ptr %a, i32 %idxprom
371 %0 = load i8, ptr %arrayidx, align 1
372 %arrayidx4 = getelementptr inbounds i8, ptr %b, i32 %idxprom
373 %1 = load i8, ptr %arrayidx4, align 1
375 %arrayidx8 = getelementptr inbounds i8, ptr %c, i32 %idxprom
376 store i8 %add, ptr %arrayidx8, align 1
377 %dec = add nsw i64 %i.015, -1
378 %cmp = icmp sgt i64 %i.015, 1
379 br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !1
382 ; This IR corresponds to this nested-loop:
384 ; for (int i = 0; i<N; i++)
385 ; for (int j = i+1; j>0; j--)
386 ; c[j] = a[j] + b[j];
388 ; While the inner loop looks similar to the previous examples, we can't
389 ; transform the inner loop, because isGuarded returns
390 ; false for it.
; Two-level loop nest: the outer loop counts %i.022 up from 0 until %add (i + 1)
; equals %N, and the inner loop counts %j.020 down from %add with the usual
; `j > 1` latch test, adding a[j] and b[j] into c[j]. The default RUN line
; expects no vector body, and the tail-folding RUN lines expect no masked loads
; or stores.
392 define dso_local void @sgt_nested_loop(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
393 ; COMMON-LABEL: @sgt_nested_loop(
394 ; DEFAULT-NOT: vector.body:
395 ; CHECK-TF-NOT: masked.load
396 ; CHECK-TF-NOT: masked.load
397 ; CHECK-TF-NOT: masked.store
; Guard for the outer loop only.
401 %cmp21 = icmp sgt i32 %N, 0
402 br i1 %cmp21, label %for.body.preheader, label %for.cond.cleanup
; Outer-loop latch: stop once i + 1 reaches N.
408 %exitcond = icmp eq i32 %add, %N
409 br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
411 for.cond.cleanup.loopexit:
412 br label %for.cond.cleanup
418 %i.022 = phi i32 [ %add, %for.cond.loopexit ], [ 0, %for.body.preheader ]
; %add serves both as the next outer index and as the inner loop's start value.
419 %add = add nuw nsw i32 %i.022, 1
422 for.body4: ; preds = %for.body, %for.body4
423 %j.020 = phi i32 [ %add, %for.body ], [ %dec, %for.body4 ]
424 %arrayidx = getelementptr inbounds i8, ptr %a, i32 %j.020
425 %0 = load i8, ptr %arrayidx, align 1
426 %arrayidx5 = getelementptr inbounds i8, ptr %b, i32 %j.020
427 %1 = load i8, ptr %arrayidx5, align 1
428 %add7 = add i8 %1, %0
429 %arrayidx9 = getelementptr inbounds i8, ptr %c, i32 %j.020
430 store i8 %add7, ptr %arrayidx9, align 1
431 %dec = add nsw i32 %j.020, -1
432 %cmp2 = icmp sgt i32 %j.020, 1
433 br i1 %cmp2, label %for.body4, label %for.cond.loopexit
436 attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
438 !1 = distinct !{!1, !2}
439 !2 = !{!"llvm.loop.vectorize.width", i32 16}