; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
; The following functions should all fail to become tail-predicated.
; CHECK-NOT: call i32 @llvm.arm.vctp
; trip.count.minus.1 has been inserted into element 1, not 0.
; The preheader splat then reads lane 0, which the insert never wrote, so the
; compare limit is not a splat of %N - 1. The loop must stay un-predicated
; (no llvm.arm.vctp call may be emitted).
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deviation under test: the scalar goes into lane 1 instead of lane 0.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The insert isn't using an undef for operand 0.
; The preheader insertelement starts from the constant vector <1, 1, 1, 1>
; instead of undef. The loop must stay un-predicated (no llvm.arm.vctp call
; may be emitted).
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deviation under test: operand 0 is a defined constant vector, not undef.
  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The shuffle uses a defined value for operand 1.
; The preheader shufflevector takes the constant vector <1, 1, 1, 1> as its
; second operand instead of undef. The loop must stay un-predicated (no
; llvm.arm.vctp call may be emitted).
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deviation under test: operand 1 of the shuffle is a defined constant.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The shuffle uses a non zero value for operand 2.
; The preheader shuffle mask is <1, 1, 1, 1>, so it replicates lane 1 of the
; insert result rather than lane 0, where %N - 1 was placed. The loop must stay
; un-predicated (no llvm.arm.vctp call may be emitted).
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deviation under test: the mask is all-ones instead of zeroinitializer.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The value splatted in the preheader is %N - 2 rather than %N - 1.
; The loop must stay un-predicated (no llvm.arm.vctp call may be emitted).
; NOTE(review): the insertelement below also targets lane 1 instead of lane 0,
; so this function deviates in two ways - confirm whether that is intentional.
define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Deviation under test: the limit is %N - 2.
  %trip.count.minus.2 = add i32 %N, -2
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; index has been inserted at element 1, not 0.
; Inside the loop, %index is written to lane 1 while the following splat reads
; lane 0, so the induction vector is not built from a splat of %index. The loop
; must stay un-predicated (no llvm.arm.vctp call may be emitted).
define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  ; Deviation under test: %index goes into lane 1 instead of lane 0.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The value splatted in the loop body is %index + 1, not %index itself.
; The loop must stay un-predicated (no llvm.arm.vctp call may be emitted).
define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  ; Deviation under test: the splat source is offset from the induction phi.
  %incorrect = add i32 %index, 1
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Now using ult, not ule for the vector icmp
; The lane mask is formed with a strict unsigned less-than against the splat of
; %N - 1. The loop must stay un-predicated (no llvm.arm.vctp call may be
; emitted).
define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Deviation under test: predicate is ult rather than ule.
  %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The add in the body uses 1, 2, 3, 4
; The per-lane offsets added to the splat of %index are <1, 2, 3, 4> instead of
; <0, 1, 2, 3>. The loop must stay un-predicated (no llvm.arm.vctp call may be
; emitted).
define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deviation under test: lane offsets start at 1 instead of 0.
  %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Using a variable for the loop body broadcast.
; The per-lane offsets added to the splat of %index come from the %offsets
; argument rather than the constant <0, 1, 2, 3>. The loop must stay
; un-predicated (no llvm.arm.vctp call may be emitted).
define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deviation under test: lane offsets are a runtime value.
  %induction = add <4 x i32> %broadcast.splat, %offsets
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; adding 5, instead of 4, to index.
; The scalar induction variable advances by 5 per iteration although each
; iteration processes a <4 x i32> vector. The loop must stay un-predicated (no
; llvm.arm.vctp call may be emitted).
define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp8..%tmp13 derive the iteration count passed to set.loop.iterations.
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when %N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  ; Deviation under test: the step is 5, not the vector width of 4.
  %index.next = add i32 %index, 5
  ; Decrement the loop counter and keep iterating while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Intrinsics called by the test functions in this file: the masked vector
; load/store pair and the two hardware-loop intrinsics (set the iteration
; count, decrement the loop counter).
; NOTE(review): attribute groups #1-#3 are referenced here; their definitions
; are not visible in this part of the file - confirm they exist or are unneeded.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
declare void @llvm.set.loop.iterations.i32(i32) #3
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3