1 ; RUN: opt < %s -tiny-trip-count-interleave-threshold=32 -p loop-vectorize -S -pass-remarks=loop-vectorize -disable-output 2>&1 | FileCheck %s
2 ; TODO: remove -tiny-trip-count-interleave-threshold once the interleave threshold is removed
; Target triple for this module: AArch64 Linux.
4 target triple = "aarch64-linux-gnu"
; Two-field struct of i8 values; every loop in this file reads both fields of
; each %pair element it visits.
6 %pair = type { i8, i8 }
8 ; For this loop with known TC of 32, when the auto-vectorizer chooses VF 16, it should choose
9 ; IC 2, since VF * IC = 32 divides the trip count exactly and no remainder loop needs to run.
10 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; For each index %i, loads both i8 fields of the %pair element %p[%i], adds
; them, and stores the i8 sum to byte %i of %q. Both pointers are noalias.
; %i starts at 0 and steps by 1; the loop exits once %i.next equals the
; constant 32, so the body executes 32 times.
11 define void @loop_with_tc_32(ptr noalias %p, ptr noalias %q) {
16 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load field 0 of element %i.
17 %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
18 %tmp1 = load i8, ptr %tmp0, align 1
; Load field 1 of the same element.
19 %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
20 %tmp3 = load i8, ptr %tmp2, align 1
21 %add = add i8 %tmp1, %tmp3
; Store the sum to %q at byte offset %i.
22 %qi = getelementptr i8, ptr %q, i64 %i
23 store i8 %add, ptr %qi, align 1
24 %i.next = add nuw nsw i64 %i, 1
; Compile-time-constant exit bound: the trip count is known to be 32.
25 %cond = icmp eq i64 %i.next, 32
26 br i1 %cond, label %for.end, label %for.body
32 ; TODO: For this loop with known TC of 33, when the auto-vectorizer chooses VF 16, it should choose
33 ; IC 1 since there may be a remainder loop that needs to run after the vector loop.
34 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; For each index %i, loads both i8 fields of the %pair element %p[%i], adds
; them, and stores the i8 sum to byte %i of %q. Both pointers are noalias.
; %i starts at 0 and steps by 1; the loop exits once %i.next equals the
; constant 33, so the body executes 33 times.
35 define void @loop_with_tc_33(ptr noalias %p, ptr noalias %q) {
40 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load field 0 of element %i.
41 %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
42 %tmp1 = load i8, ptr %tmp0, align 1
; Load field 1 of the same element.
43 %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
44 %tmp3 = load i8, ptr %tmp2, align 1
45 %add = add i8 %tmp1, %tmp3
; Store the sum to %q at byte offset %i.
46 %qi = getelementptr i8, ptr %q, i64 %i
47 store i8 %add, ptr %qi, align 1
48 %i.next = add nuw nsw i64 %i, 1
; Compile-time-constant exit bound: the trip count is known to be 33, which is
; not a multiple of 16.
49 %cond = icmp eq i64 %i.next, 33
50 br i1 %cond, label %for.end, label %for.body
56 ; For a loop with unknown trip count but a profile showing an approx TC estimate of 32, when the
57 ; auto-vectorizer chooses VF 16, it should choose IC 2 since chances are high that the remainder loop
; will not need to run.
59 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; For each index %i, loads both i8 fields of the %pair element %p[%i], adds
; them, and stores the i8 sum to byte %i of %q. Both pointers are noalias.
; %i starts at 0 and steps by 1; the loop exits once %i.next equals the
; runtime argument %n, so the trip count is not a compile-time constant.
60 define void @loop_with_profile_tc_32(ptr noalias %p, ptr noalias %q, i64 %n) {
65 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load field 0 of element %i.
66 %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
67 %tmp1 = load i8, ptr %tmp0, align 1
; Load field 1 of the same element.
68 %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
69 %tmp3 = load i8, ptr %tmp2, align 1
70 %add = add i8 %tmp1, %tmp3
; Store the sum to %q at byte offset %i.
71 %qi = getelementptr i8, ptr %q, i64 %i
72 store i8 %add, ptr %qi, align 1
73 %i.next = add nuw nsw i64 %i, 1
74 %cond = icmp eq i64 %i.next, %n
; !prof !0 carries branch_weights 1 and 31, listed in successor order:
; weight 1 on the exit edge (%for.end) and 31 on the back edge (%for.body).
75 br i1 %cond, label %for.end, label %for.body, !prof !0
81 ; TODO: For a loop with unknown trip count but a profile showing an approx TC estimate of 33,
82 ; when the auto-vectorizer chooses VF 16, it should choose IC 1 since chances are high that the
83 ; remainder loop will need to run
84 ; CHECK: remark: <unknown>:0:0: vectorized loop (vectorization width: 16, interleaved count: 2)
; For each index %i, loads both i8 fields of the %pair element %p[%i], adds
; them, and stores the i8 sum to byte %i of %q. Both pointers are noalias.
; %i starts at 0 and steps by 1; the loop exits once %i.next equals the
; runtime argument %n, so the trip count is not a compile-time constant.
85 define void @loop_with_profile_tc_33(ptr noalias %p, ptr noalias %q, i64 %n) {
90 %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
; Load field 0 of element %i.
91 %tmp0 = getelementptr %pair, ptr %p, i64 %i, i32 0
92 %tmp1 = load i8, ptr %tmp0, align 1
; Load field 1 of the same element.
93 %tmp2 = getelementptr %pair, ptr %p, i64 %i, i32 1
94 %tmp3 = load i8, ptr %tmp2, align 1
95 %add = add i8 %tmp1, %tmp3
; Store the sum to %q at byte offset %i.
96 %qi = getelementptr i8, ptr %q, i64 %i
97 store i8 %add, ptr %qi, align 1
98 %i.next = add nuw nsw i64 %i, 1
99 %cond = icmp eq i64 %i.next, %n
; !prof !1 carries branch_weights 1 and 32, listed in successor order:
; weight 1 on the exit edge (%for.end) and 32 on the back edge (%for.body).
100 br i1 %cond, label %for.end, label %for.body, !prof !1
; Branch weights are listed in successor order. The loop branches that use
; these nodes have successors (%for.end, %for.body), so the first weight
; belongs to the exit edge and the second to the back edge.
; !0: exit 1, back edge 31 -- attached to the branch in @loop_with_profile_tc_32.
106 !0 = !{!"branch_weights", i32 1, i32 31}
; !1: exit 1, back edge 32 -- attached to the branch in @loop_with_profile_tc_33.
107 !1 = !{!"branch_weights", i32 1, i32 32}