; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
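
; The tail-predication pass is expected to replace the
; @llvm.get.active.lane.mask calls below with @llvm.arm.mve.vctp* intrinsics
; and to keep the masked loads/stores predicated on the vctp result; the
; CHECK lines verify this for each element size, and the later tests cover
; loops that must be rejected.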

; CHECK-LABEL: mul_v16i8
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0(<16 x i8> {{.*}}, ptr {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i8, ptr %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, ptr %b, i32 %index
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0(ptr %tmp3, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, ptr %c, i32 %index
  tail call void @llvm.masked.store.v16i8.p0(<16 x i8> %mul, ptr %tmp6, i32 4, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v8i16
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0(<8 x i16> {{.*}}, ptr {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i16, ptr %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, ptr %b, i32 %index
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0(ptr %tmp3, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, ptr %c, i32 %index
  tail call void @llvm.masked.store.v8i16.p0(<8 x i16> %mul, ptr %tmp6, i32 4, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v4i32
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %mul, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: split_vector
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %combine, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; One of the loads now uses the ult predicate.
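; Note that the loop is still tail-predicated: the mismatched load below keeps
; its original %wrong predicate, while the other load and the store are
; rewritten to use the vctp-based mask.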
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The store now uses the ult predicate.
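; Again the loop itself is still converted: only the mismatched store keeps
; the %wrong predicate, while both loads are rewritten to use the vctp-based
; mask.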
; CHECK-LABEL: mismatch_store_pred
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0(<4 x i32> {{.*}}, ptr {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; TODO: Multiple intrinsics not yet supported.
; This is currently rejected, because if the vector body is unrolled, the step
; is not what we expect:
;
;   Step value 16 doesn't match vector width 4
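;
; That is, the four lane masks below each cover only 4 lanes, while %index
; advances by 16 per iteration, so a single vctp cannot stand in for all of
; them.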

; CHECK-LABEL: interleave4
; CHECK: vector.body:
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
define dso_local void @interleave4(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %v0 = add i32 %N, 15
  %v1 = lshr i32 %v0, 4
  %v2 = shl nuw i32 %v1, 4
  %v3 = add i32 %v2, -16
  %v4 = lshr i32 %v3, 4
  %v5 = add nuw nsw i32 %v4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %scevgep = getelementptr i32, ptr %A, i32 8
  %scevgep30 = getelementptr i32, ptr %C, i32 8
  %scevgep37 = getelementptr i32, ptr %B, i32 8
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv38 = phi ptr [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
  %lsr.iv31 = phi ptr [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %v7 = add i32 %index, 4
  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
  %v8 = add i32 %index, 8
  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
  %v9 = add i32 %index, 12
  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
  %scevgep42 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 -2
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep43 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 -1
  %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %lsr.iv38, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep41 = getelementptr <4 x i32>, ptr %lsr.iv38, i32 1
  %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %scevgep34 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 -2
  %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep35 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 -1
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %lsr.iv31, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep36 = getelementptr <4 x i32>, ptr %lsr.iv31, i32 1
  %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
  %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
  %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
  %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
  %scevgep27 = getelementptr <4 x i32>, ptr %lsr.iv, i32 -2
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v10, ptr %scevgep27, i32 4, <4 x i1> %active.lane.mask)
  %scevgep28 = getelementptr <4 x i32>, ptr %lsr.iv, i32 -1
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v11, ptr %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v12, ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask16)
  %scevgep29 = getelementptr <4 x i32>, ptr %lsr.iv, i32 1
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %v13, ptr %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
  %scevgep25 = getelementptr i32, ptr %lsr.iv, i32 16
  %scevgep32 = getelementptr i32, ptr %lsr.iv31, i32 16
  %scevgep39 = getelementptr i32, ptr %lsr.iv38, i32 16
  %v14 = add i32 %v9, 4
  %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
  %v16 = icmp ne i32 %v15, 0
  br i1 %v16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
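
; The element count passed to get.active.lane.mask below is the constant 42
; rather than the loop tripcount, so the loop is not transformed and the
; intrinsic must survive, which is what the CHECK line verifies.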

; CHECK-LABEL: const_expected_in_set_loop
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
define dso_local void @const_expected_in_set_loop(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
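
; Here the element count passed to get.active.lane.mask is %index itself,
; which is not loop-invariant, so no vctp can be formed and the intrinsic
; survives.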

; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
define dso_local void @tripcount_arg_not_invariant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]

  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
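
; The lane-mask induction variable below starts at 1 rather than 0, so it
; cannot be matched to a vctp and the loop is left alone.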

; CHECK-LABEL: addrec_base_not_zero
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
define dso_local void @addrec_base_not_zero(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi ptr [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %vector.ph ]

  ; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv14, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %7, ptr %lsr.iv17, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, ptr %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0(ptr, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0(<16 x i8>, ptr, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0(ptr, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0(<8 x i16>, ptr, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0(<2 x i64>, ptr, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0(ptr, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)