1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
4 ; The following functions should all fail to become tail-predicated.
5 ; CHECK-NOT: call i32 @llvm.arm.vctp
; trip.count.minus.1 has been inserted into element 1, not 0.
; The preheader shuffle still selects lane 0, so the vector compared against
; %induction is not a broadcast of %trip.count.minus.1.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deliberate deviation under test: lane index 1 instead of 0.
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The insert isn't using an undef for operand 0.
; The preheader insertelement starts from the constant vector <1, 1, 1, 1>
; instead of undef.
define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  ; Deliberate deviation under test: operand 0 is a defined constant vector.
  %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The shuffle uses a defined value for operand 1.
; The preheader shufflevector takes <1, 1, 1, 1> as its second vector operand
; instead of undef.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deliberate deviation under test: second shuffle operand is a constant.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The shuffle uses a non zero value for operand 2.
; The preheader shuffle mask is <1, 1, 1, 1>, so it replicates lane 1 of the
; insertelement result rather than lane 0 where %trip.count.minus.1 was placed.
define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  ; Deliberate deviation under test: mask selects lane 1, not zeroinitializer.
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The value splatted in the preheader is %N - 2 rather than %N - 1.
; NOTE(review): the preheader insertelement below also targets lane 1, so this
; function departs from the sibling tests' preheader in two places, not one.
; Confirm whether the lane index was meant to be 0.
define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Deliberate deviation under test: N - 2 instead of N - 1.
  %trip.count.minus.2 = add i32 %N, -2
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; index has been inserted at element 1, not 0.
; The loop-body shuffle still selects lane 0, so %broadcast.splat is not a
; broadcast of %index.
define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1, used as the upper bound in the lane-mask compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Deliberate deviation under test: lane index 1 instead of 0.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The scalar splatted in the loop body is %index + 1 (%incorrect), not %index.
define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1, used as the upper bound in the lane-mask compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Deliberate deviation under test: the splat source is offset by one.
  %incorrect = add i32 %index, 1
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  ; Addresses are still computed from the unmodified %index.
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Now using ult, not ule for the vector icmp.
; With a bound of N - 1, ult yields a different lane mask from the ule compare
; used by the other functions in this file.
define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1, used as the upper bound in the lane-mask compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Deliberate deviation under test: strict unsigned less-than predicate.
  %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; The add in the body uses 1, 2, 3, 4.
; The per-lane offsets added to the index splat are <1, 2, 3, 4> rather than
; <0, 1, 2, 3>.
define void @wrong_body_broadcast_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1, used as the upper bound in the lane-mask compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deliberate deviation under test: lane offsets start at 1, not 0.
  %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Using a variable for the loop body broadcast.
; The per-lane offsets added to the index splat come from the %offsets
; argument instead of the constant <0, 1, 2, 3>.
define void @wrong_body_broadcast_splat_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1, used as the upper bound in the lane-mask compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  ; Deliberate deviation under test: lane offsets are a runtime value.
  %induction = add <4 x i32> %broadcast.splat, %offsets
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
; Adding 5, instead of 4, to index.
; The scalar induction step no longer matches the four lanes processed per
; iteration.
define void @wrong_index_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; Iteration count handed to @llvm.start.loop.iterations:
  ;   ((((N + 3) >> 2) << 2) - 4) >> 2) + 1
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  ; Skip the loop entirely when N == 0.
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  ; Splat of N - 1, used as the upper bound in the lane-mask compare.
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  ; Per-lane element indices: splat(%index) + <0, 1, 2, 3>.
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  ; Lane mask shared by both masked loads and the masked store.
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  ; c[index..index+3] = b[...] * a[...] on the active lanes.
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  ; Deliberate deviation under test: step of 5 instead of 4.
  %index.next = add i32 %index, 5
  ; Decrement the remaining-iteration counter and loop while it is non-zero.
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
469 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1
470 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
471 declare i32 @llvm.start.loop.iterations.i32(i32) #3
472 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3