; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
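
; Check that the MVE tail-predication pass converts the vectorizer's
; icmp-ule lane masks into calls to the @llvm.arm.vctp* intrinsics on the
; number of elements still to be processed, and that the masked loads and
; stores are rewritten to use the VCTP-generated predicate.
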
; CHECK-LABEL: mul_v16i8
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp13 is the hardware-loop iteration count, i.e. ceil(N / 16).
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, i8* %a, i32 %index
  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i8* %tmp to <16 x i8>*
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
  %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
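
; The same multiply at 8 x i16: the expected intrinsic is now vctp16, with
; the element count reduced by 8 per iteration.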
; CHECK-LABEL: mul_v8i16
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index
  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
  %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %tmp1)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
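
; And at 4 x i32: the expected intrinsic is vctp32, consuming 4 elements
; per iteration.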
; CHECK-LABEL: mul_v4i32
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
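
; A plain copy through a masked load and store at 2 x i64, expected to use
; vctp64 with two elements consumed per iteration.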
; CHECK-LABEL: copy_v2i64
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 1
  %tmp9 = lshr i32 %tmp8, 1
  %tmp10 = shl nuw i32 %tmp9, 1
  %tmp11 = add i32 %tmp10, -2
  %tmp12 = lshr i32 %tmp11, 1
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <2 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <2 x i32> %broadcast.splatinsert10, <2 x i32> undef, <2 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
  %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
  %tmp1 = icmp ule <2 x i32> %induction, %broadcast.splat11
  %tmp = getelementptr inbounds i64, i64* %a, i32 %index
  %tmp2 = bitcast i64* %tmp to <2 x i64>*
  %wide.masked.load = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %tmp2, i32 4, <2 x i1> %tmp1, <2 x i64> undef)
  %tmp3 = getelementptr inbounds i64, i64* %b, i32 %index
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %wide.masked.load, <2 x i64>* %tmp7, i32 4, <2 x i1> %tmp1)
  %index.next = add i32 %index, 2
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
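
; The loaded vectors are split into low and high halves with shufflevectors
; and recombined before the store. The loop should still be tail-predicated,
; with every masked memory operation rewritten to use the vctp predicate.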
; CHECK-LABEL: split_vector
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; One of the loads now uses the ult predicate.
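; That mask is not equivalent to the loop predicate, so it should be left
; untouched while the matching uses are still rewritten to the vctp.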
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The store now uses the ult predicate.
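; As above, but with the mismatch on the store: the loads should get the
; vctp predicate while the store keeps %wrong.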
; CHECK-LABEL: mismatch_store_pred
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)