1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
; @foo: positive test. The get.active.lane.mask element count (32003) is
; consistent with the 8001 hardware-loop iterations at VF 4
; (ceil(32003/4) == 8001), so the CHECK lines below expect the lane mask to
; be replaced by @llvm.arm.mve.vctp32 driven by an element counter that
; starts at 32003 and is decremented by 4 each iteration.
4 define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
7 ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
8 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
10 ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
11 ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
12 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
13 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
14 ; CHECK-NEXT: [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
15 ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
16 ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
17 ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
18 ; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
19 ; CHECK-NEXT: [[TMP3]] = sub i32 [[TMP1]], 4
20 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
21 ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
22 ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
23 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
24 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
25 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
26 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
27 ; CHECK-NEXT: [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
28 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
29 ; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
30 ; CHECK: for.cond.cleanup:
31 ; CHECK-NEXT: ret void
34 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
38 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
39 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
40 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
41 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
42 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
43 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
44 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
45 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
; Element count 32003 is consistent with the IV step (4) and the 8001
; loop iterations, which is what makes this lane mask convertible.
46 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
47 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
48 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
49 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
50 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
51 %index.next = add i32 %index, 4
52 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
53 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
54 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
55 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
56 %4 = icmp ne i32 %3, 0
57 br i1 %4, label %vector.body, label %for.cond.cleanup
63 ; Silly test case: the loop count is constant and a multiple of the vectorisation
64 ; factor. So, the vectoriser should not produce masked loads/stores and there's
65 ; nothing to tail-predicate here, just checking.
; The CHECK lines expect the unmasked loads/stores and the low-overhead-loop
; intrinsics to pass through unchanged.
66 define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
69 ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
70 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
72 ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
73 ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
74 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
75 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
76 ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
77 ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
78 ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
79 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV10]], align 4
80 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV1113]], align 4
81 ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
82 ; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[LSR_IV1416]], align 4
83 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
84 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
85 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
86 ; CHECK-NEXT: [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
87 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
88 ; CHECK-NEXT: br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
89 ; CHECK: for.cond.cleanup:
90 ; CHECK-NEXT: ret void
93 %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
97 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
98 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
99 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
100 %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
101 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
102 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
103 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
; No masks anywhere in the body: plain vector loads/stores only.
104 %wide.load = load <4 x i32>, <4 x i32>* %lsr.iv10, align 4
105 %wide.load9 = load <4 x i32>, <4 x i32>* %lsr.iv1113, align 4
106 %1 = add nsw <4 x i32> %wide.load9, %wide.load
107 store <4 x i32> %1, <4 x i32>* %lsr.iv1416, align 4
108 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
109 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
110 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
111 %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
112 %3 = icmp ne i32 %2, 0
113 br i1 %3, label %vector.body, label %for.cond.cleanup
119 ; Check that the icmp is a ult: here the mask is computed with a ugt compare
; instead, so the pass must not turn it into a VCTP intrinsic.
; @foo3: negative test. The mask comes from an icmp ugt rather than the
; expected ult lane-mask pattern, so the CHECK lines keep the icmp/masked
; operations as-is and contain no vctp call.
120 define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
121 ; CHECK-LABEL: @foo3(
123 ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
124 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
125 ; CHECK: vector.body:
126 ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
127 ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
128 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
129 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
130 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
131 ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
132 ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
133 ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
134 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
135 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
136 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
137 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], <i32 32002, i32 32002, i32 32002, i32 32002>
138 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
139 ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
140 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
141 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
142 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
143 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
144 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
145 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
146 ; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
147 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
148 ; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
149 ; CHECK: for.cond.cleanup:
150 ; CHECK-NEXT: ret void
153 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
154 br label %vector.body
157 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
158 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
159 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
160 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
161 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
162 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
163 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
164 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
165 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
166 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
167 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; ugt instead of ult: this is not the tail-fold mask pattern.
170 %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
172 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
173 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
174 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
175 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
176 %index.next = add i32 %index, 4
177 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
178 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
179 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
180 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
181 %4 = icmp ne i32 %3, 0
182 br i1 %4, label %vector.body, label %for.cond.cleanup
; @foo5: negative test. The compare is a ult, but its right-hand side is not
; a splat (<0, 3200, 32002, 32002>), so it is not a lane-mask pattern; the
; CHECK lines keep the icmp and masked operations unchanged, with no vctp.
188 define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
189 ; CHECK-LABEL: @foo5(
191 ; CHECK-NEXT: [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
192 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
193 ; CHECK: vector.body:
194 ; CHECK-NEXT: [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
195 ; CHECK-NEXT: [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
196 ; CHECK-NEXT: [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
197 ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
198 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
199 ; CHECK-NEXT: [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
200 ; CHECK-NEXT: [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
201 ; CHECK-NEXT: [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
202 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
203 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
204 ; CHECK-NEXT: [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
205 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
206 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
207 ; CHECK-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
208 ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
209 ; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
210 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
211 ; CHECK-NEXT: [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
212 ; CHECK-NEXT: [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
213 ; CHECK-NEXT: [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
214 ; CHECK-NEXT: [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
215 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
216 ; CHECK-NEXT: br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
217 ; CHECK: for.cond.cleanup:
218 ; CHECK-NEXT: ret void
221 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
222 br label %vector.body
225 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
226 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
227 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
228 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
229 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
230 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
231 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
232 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
233 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
234 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
235 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
; ult, but against a non-splat vector: not a valid lane-mask bound.
236 %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
237 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
238 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
239 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
240 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
241 %index.next = add i32 %index, 4
242 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
243 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
244 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
245 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
246 %4 = icmp ne i32 %3, 0
247 br i1 %4, label %vector.body, label %for.cond.cleanup
253 ; CHECK-LABEL: @inconsistent_tripcounts(
254 ; CHECK: vector.body:
255 ; CHECK-NOT: @llvm.arm.mve.vctp32
256 ; CHECK: @llvm.get.active.lane.mask
; Negative test: the lane-mask element count (UINT_MAX) is inconsistent with
; the 8001 hardware-loop iterations, so the lane mask must be kept and no
; vctp emitted (see CHECK-NOT above).
259 define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
261 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
262 br label %vector.body
265 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
266 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
267 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
268 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
269 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
270 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
271 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
272 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
273 ; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
274 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
275 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
276 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
277 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
278 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
279 %index.next = add i32 %index, 4
280 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
281 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
282 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
283 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
284 %4 = icmp ne i32 %3, 0
285 br i1 %4, label %vector.body, label %for.cond.cleanup
291 ; CHECK-LABEL: @overflow_in_sub(
292 ; CHECK: vector.body:
293 ; CHECK-NOT: @llvm.arm.mve.vctp32
294 ; CHECK: @llvm.get.active.lane.mask
; Negative test: 1073741824 (2^30) iterations at 4 elements each would need a
; 2^32 element count, which does not fit in i32 — presumably this is the
; overflow the function name refers to; the lane mask must not be converted.
297 define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
299 %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
300 br label %vector.body
303 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
304 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
305 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
306 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
307 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
308 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
309 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
310 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
311 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
312 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
313 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
314 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
315 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
316 %index.next = add i32 %index, 4
317 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
318 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
319 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
320 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
321 %4 = icmp ne i32 %3, 0
322 br i1 %4, label %vector.body, label %for.cond.cleanup
329 ; CHECK-LABEL: @IV_not_an_induction(
330 ; CHECK: vector.body:
331 ; CHECK-NOT: @llvm.arm.mve.vctp32
332 ; CHECK: @llvm.get.active.lane.mask
; Negative test: the lane mask's index operand is the loop-invariant argument
; %N rather than the loop's induction variable %index, so no vctp is emitted.
335 define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
337 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
338 br label %vector.body
341 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
342 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
343 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
344 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
345 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
346 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
347 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
348 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
349 ; The first operand %N is a function argument, not an induction variable:
350 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
351 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
352 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
353 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
354 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
355 %index.next = add i32 %index, 4
356 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
357 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
358 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
359 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
360 %4 = icmp ne i32 %3, 0
361 br i1 %4, label %vector.body, label %for.cond.cleanup
367 ; CHECK-LABEL: @IV_wrong_step(
368 ; CHECK: vector.body:
369 ; CHECK-NOT: @llvm.arm.mve.vctp32
370 ; CHECK: @llvm.get.active.lane.mask
; Negative test: the index IV steps by 3 instead of the vectorisation factor
; (4), so the lane mask does not describe whole vector iterations.
373 define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
375 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
376 br label %vector.body
379 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
380 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
381 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
382 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
383 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
384 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
385 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
386 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
387 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
388 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
389 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
390 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
391 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
393 ; %index is incremented with 3 and not 4, which is the vectorisation factor
394 ; that we expect here:
395 %index.next = add i32 %index, 3
397 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
398 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
399 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
400 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
401 %4 = icmp ne i32 %3, 0
402 br i1 %4, label %vector.body, label %for.cond.cleanup
408 ; CHECK-LABEL: @IV_step_not_constant(
409 ; CHECK: vector.body:
410 ; CHECK-NOT: @llvm.arm.mve.vctp32
411 ; CHECK: @llvm.get.active.lane.mask
; Negative test: the index IV is incremented by the runtime value %N, not a
; constant step, so the lane mask cannot be matched to a vctp pattern.
414 define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
416 %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
417 br label %vector.body
420 %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
421 %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
422 %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
423 %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
424 %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
425 %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
426 %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
427 %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
428 %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
429 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
430 %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
431 %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
432 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
434 ; %index is incremented with some runtime value, i.e. not a constant:
435 %index.next = add i32 %index, %N
437 %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
438 %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
439 %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
440 %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
441 %4 = icmp ne i32 %3, 0
442 br i1 %4, label %vector.body, label %for.cond.cleanup
448 ; CHECK-LABEL: @outerloop_phi(
449 ; CHECK: vector.body:
450 ; CHECK-NOT: @llvm.arm.mve.vctp32
451 ; CHECK: @llvm.get.active.lane.mask
; Negative test with a loop nest: the inner-loop lane mask is fed by the
; OUTER loop's induction variable %j.025, not the inner %index, so the inner
; loop must not be tail-predicated.
454 define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
456 %cmp24 = icmp eq i32 %N, 0
457 br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader
459 vector.ph.preheader: ; preds = %entry
462 vector.ph: ; preds = %vector.ph.preheader, %for.cond.cleanup3
463 %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
464 %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
465 %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
466 %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
467 %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
468 br label %vector.body
470 vector.body: ; preds = %vector.body, %vector.ph
471 %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
472 %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
473 %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
474 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
475 %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
476 %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
477 %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
478 %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
479 ; It's using %j.025, the induction variable from its outer loop:
480 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
481 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
482 %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
483 %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
484 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask)
485 %index.next = add i32 %index, 4
486 %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4
487 %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4
488 %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4
489 %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
490 %3 = icmp ne i32 %2, 0
491 br i1 %3, label %vector.body, label %for.cond.cleanup3
493 for.cond.cleanup: ; preds = %for.cond.cleanup3, %entry
496 for.cond.cleanup3: ; preds = %vector.body
497 %inc11 = add nuw i32 %j.025, 1
498 %scevgep = getelementptr i32, i32* %lsr.iv, i32 1
499 %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 1
500 %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1
501 %exitcond26 = icmp eq i32 %inc11, %N
502 br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
; Declarations of the intrinsics used by the tests above.
506 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
507 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
508 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
509 declare i32 @llvm.start.loop.iterations.i32(i32)
510 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)