1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
4 ; The following functions should all fail to become tail-predicated.
5 ; CHECK-NOT: call i32 @llvm.arm.vctp
7 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. trip.count.minus.1 has been inserted into element 1, not 0.
8 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
9 entry:
10 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
11 %tmp8 = add i32 %N, 3
12 %tmp9 = lshr i32 %tmp8, 2
13 %tmp10 = shl nuw i32 %tmp9, 2
14 %tmp11 = add i32 %tmp10, -4
15 %tmp12 = lshr i32 %tmp11, 2
16 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
17 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
19 vector.ph: ; preds = %entry
20 %trip.count.minus.1 = add i32 %N, -1
21 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1 ; deviation under test: lane 1, not lane 0
22 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
23 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
24 br label %vector.body
26 vector.body: ; preds = %vector.body, %vector.ph
27 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
28 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
29 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
30 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
31 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
32 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
33 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
34 %tmp2 = bitcast i32* %tmp to <4 x i32>*
35 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
36 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
37 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
38 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
39 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
40 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
41 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
42 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
43 %index.next = add i32 %index, 4
44 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
45 %tmp16 = icmp ne i32 %tmp15, 0
46 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
48 for.cond.cleanup: ; preds = %vector.body, %entry
49 ret void
50 }
52 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. The insert isn't using an undef for operand 0.
53 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
54 entry:
55 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
56 %tmp8 = add i32 %N, 3
57 %tmp9 = lshr i32 %tmp8, 2
58 %tmp10 = shl nuw i32 %tmp9, 2
59 %tmp11 = add i32 %tmp10, -4
60 %tmp12 = lshr i32 %tmp11, 2
61 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
62 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
64 vector.ph: ; preds = %entry
65 %trip.count.minus.1 = add i32 %N, -1
66 %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0 ; deviation under test: base vector is <1, 1, 1, 1>, not undef
67 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
68 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
69 br label %vector.body
71 vector.body: ; preds = %vector.body, %vector.ph
72 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
73 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
74 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
75 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
76 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
77 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
78 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
79 %tmp2 = bitcast i32* %tmp to <4 x i32>*
80 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
81 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
82 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
83 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
84 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
85 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
86 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
87 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
88 %index.next = add i32 %index, 4
89 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
90 %tmp16 = icmp ne i32 %tmp15, 0
91 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
93 for.cond.cleanup: ; preds = %vector.body, %entry
94 ret void
95 }
97 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. The shuffle uses a defined value for operand 1.
98 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
99 entry:
100 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
101 %tmp8 = add i32 %N, 3
102 %tmp9 = lshr i32 %tmp8, 2
103 %tmp10 = shl nuw i32 %tmp9, 2
104 %tmp11 = add i32 %tmp10, -4
105 %tmp12 = lshr i32 %tmp11, 2
106 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
107 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
109 vector.ph: ; preds = %entry
110 %trip.count.minus.1 = add i32 %N, -1
111 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
112 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer ; deviation under test: second shuffle operand is <1, 1, 1, 1>, not undef
113 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
114 br label %vector.body
116 vector.body: ; preds = %vector.body, %vector.ph
117 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
118 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
119 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
120 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
121 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
122 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
123 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
124 %tmp2 = bitcast i32* %tmp to <4 x i32>*
125 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
126 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
127 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
128 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
129 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
130 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
131 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
132 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
133 %index.next = add i32 %index, 4
134 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
135 %tmp16 = icmp ne i32 %tmp15, 0
136 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
138 for.cond.cleanup: ; preds = %vector.body, %entry
139 ret void
140 }
142 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. The shuffle uses a non-zero value for operand 2.
143 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
144 entry:
145 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
146 %tmp8 = add i32 %N, 3
147 %tmp9 = lshr i32 %tmp8, 2
148 %tmp10 = shl nuw i32 %tmp9, 2
149 %tmp11 = add i32 %tmp10, -4
150 %tmp12 = lshr i32 %tmp11, 2
151 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
152 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
154 vector.ph: ; preds = %entry
155 %trip.count.minus.1 = add i32 %N, -1
156 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
157 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; deviation under test: mask selects lane 1 everywhere, not zeroinitializer
158 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
159 br label %vector.body
161 vector.body: ; preds = %vector.body, %vector.ph
162 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
163 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
164 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
165 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
166 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
167 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
168 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
169 %tmp2 = bitcast i32* %tmp to <4 x i32>*
170 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
171 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
172 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
173 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
174 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
175 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
176 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
177 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
178 %index.next = add i32 %index, 4
179 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
180 %tmp16 = icmp ne i32 %tmp15, 0
181 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
183 for.cond.cleanup: ; preds = %vector.body, %entry
184 ret void
185 }
187 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. The splat compared against the lane indices is built from %N - 2 rather than %N - 1.
188 define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
189 entry:
190 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
191 %tmp8 = add i32 %N, 3
192 %tmp9 = lshr i32 %tmp8, 2
193 %tmp10 = shl nuw i32 %tmp9, 2
194 %tmp11 = add i32 %tmp10, -4
195 %tmp12 = lshr i32 %tmp11, 2
196 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
197 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
199 vector.ph: ; preds = %entry
200 %trip.count.minus.2 = add i32 %N, -2 ; deviation under test: N - 2 instead of N - 1
201 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 1 ; NOTE(review): this insert also targets lane 1, so the case departs from the lane-0 form in two ways at once - confirm that is intended
202 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
203 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
204 br label %vector.body
206 vector.body: ; preds = %vector.body, %vector.ph
207 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
208 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
209 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
210 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
211 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
212 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
213 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
214 %tmp2 = bitcast i32* %tmp to <4 x i32>*
215 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
216 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
217 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
218 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
219 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
220 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
221 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
222 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
223 %index.next = add i32 %index, 4
224 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
225 %tmp16 = icmp ne i32 %tmp15, 0
226 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
228 for.cond.cleanup: ; preds = %vector.body, %entry
229 ret void
230 }
232 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. index has been inserted at element 1, not 0.
233 define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
234 entry:
235 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
236 %tmp8 = add i32 %N, 3
237 %tmp9 = lshr i32 %tmp8, 2
238 %tmp10 = shl nuw i32 %tmp9, 2
239 %tmp11 = add i32 %tmp10, -4
240 %tmp12 = lshr i32 %tmp11, 2
241 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
242 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
244 vector.ph: ; preds = %entry
245 %trip.count.minus.1 = add i32 %N, -1
246 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
247 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
248 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
249 br label %vector.body
251 vector.body: ; preds = %vector.body, %vector.ph
252 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
253 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
254 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1 ; deviation under test: %index goes into lane 1, not lane 0
255 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
256 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
257 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
258 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
259 %tmp2 = bitcast i32* %tmp to <4 x i32>*
260 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
261 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
262 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
263 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
264 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
265 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
266 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
267 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
268 %index.next = add i32 %index, 4
269 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
270 %tmp16 = icmp ne i32 %tmp15, 0
271 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
273 for.cond.cleanup: ; preds = %vector.body, %entry
274 ret void
275 }
276 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. The value splatted in the loop body is %index + 1 rather than %index.
277 define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
278 entry:
279 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
280 %tmp8 = add i32 %N, 3
281 %tmp9 = lshr i32 %tmp8, 2
282 %tmp10 = shl nuw i32 %tmp9, 2
283 %tmp11 = add i32 %tmp10, -4
284 %tmp12 = lshr i32 %tmp11, 2
285 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
286 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
288 vector.ph: ; preds = %entry
289 %trip.count.minus.1 = add i32 %N, -1
290 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
291 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
292 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
293 br label %vector.body
295 vector.body: ; preds = %vector.body, %vector.ph
296 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
297 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
298 %incorrect = add i32 %index, 1 ; deviation under test: %index + 1 is splatted instead of %index
299 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
300 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
301 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
302 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
303 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
304 %tmp2 = bitcast i32* %tmp to <4 x i32>*
305 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
306 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
307 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
308 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
309 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
310 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
311 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
312 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
313 %index.next = add i32 %index, 4
314 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
315 %tmp16 = icmp ne i32 %tmp15, 0
316 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
318 for.cond.cleanup: ; preds = %vector.body, %entry
319 ret void
320 }
322 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. Now using ult, not ule, for the vector icmp.
323 define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
324 entry:
325 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
326 %tmp8 = add i32 %N, 3
327 %tmp9 = lshr i32 %tmp8, 2
328 %tmp10 = shl nuw i32 %tmp9, 2
329 %tmp11 = add i32 %tmp10, -4
330 %tmp12 = lshr i32 %tmp11, 2
331 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
332 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
334 vector.ph: ; preds = %entry
335 %trip.count.minus.1 = add i32 %N, -1
336 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
337 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
338 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
339 br label %vector.body
341 vector.body: ; preds = %vector.body, %vector.ph
342 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
343 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
344 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
345 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
346 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
347 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
348 %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11 ; deviation under test: ult instead of ule; this mask feeds both loads and the store
349 %tmp2 = bitcast i32* %tmp to <4 x i32>*
350 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
351 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
352 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
353 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
354 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
355 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
356 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
357 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
358 %index.next = add i32 %index, 4
359 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
360 %tmp16 = icmp ne i32 %tmp15, 0
361 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
363 for.cond.cleanup: ; preds = %vector.body, %entry
364 ret void
365 }
367 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. The add in the body uses 1, 2, 3, 4.
368 define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
369 entry:
370 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
371 %tmp8 = add i32 %N, 3
372 %tmp9 = lshr i32 %tmp8, 2
373 %tmp10 = shl nuw i32 %tmp9, 2
374 %tmp11 = add i32 %tmp10, -4
375 %tmp12 = lshr i32 %tmp11, 2
376 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
377 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
379 vector.ph: ; preds = %entry
380 %trip.count.minus.1 = add i32 %N, -1
381 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
382 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
383 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
384 br label %vector.body
386 vector.body: ; preds = %vector.body, %vector.ph
387 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
388 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
389 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
390 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
391 %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4> ; deviation under test: <1, 2, 3, 4> instead of <0, 1, 2, 3>
392 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
393 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
394 %tmp2 = bitcast i32* %tmp to <4 x i32>*
395 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
396 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
397 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
398 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
399 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
400 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
401 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
402 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
403 %index.next = add i32 %index, 4
404 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
405 %tmp16 = icmp ne i32 %tmp15, 0
406 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
408 for.cond.cleanup: ; preds = %vector.body, %entry
409 ret void
410 }
412 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. Using a variable (%offsets) for the loop body broadcast.
413 define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
414 entry:
415 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
416 %tmp8 = add i32 %N, 3
417 %tmp9 = lshr i32 %tmp8, 2
418 %tmp10 = shl nuw i32 %tmp9, 2
419 %tmp11 = add i32 %tmp10, -4
420 %tmp12 = lshr i32 %tmp11, 2
421 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
422 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
424 vector.ph: ; preds = %entry
425 %trip.count.minus.1 = add i32 %N, -1
426 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
427 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
428 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
429 br label %vector.body
431 vector.body: ; preds = %vector.body, %vector.ph
432 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
433 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
434 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
435 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
436 %induction = add <4 x i32> %broadcast.splat, %offsets ; deviation under test: lane offsets come from the %offsets argument, not a constant vector
437 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
438 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
439 %tmp2 = bitcast i32* %tmp to <4 x i32>*
440 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
441 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
442 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
443 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
444 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
445 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
446 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
447 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
448 %index.next = add i32 %index, 4
449 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
450 %tmp16 = icmp ne i32 %tmp15, 0
451 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
453 for.cond.cleanup: ; preds = %vector.body, %entry
454 ret void
455 }
457 ; Negative test (expected not to be tail-predicated): masked 4-lane loop computing c[i] = a[i] * b[i]. Adding 5, instead of 4, to index.
458 define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
459 entry:
460 %cmp8 = icmp eq i32 %N, 0 ; N == 0 branches straight to the exit block
461 %tmp8 = add i32 %N, 3
462 %tmp9 = lshr i32 %tmp8, 2
463 %tmp10 = shl nuw i32 %tmp9, 2
464 %tmp11 = add i32 %tmp10, -4
465 %tmp12 = lshr i32 %tmp11, 2
466 %tmp13 = add nuw nsw i32 %tmp12, 1 ; value handed to @llvm.start.loop.iterations in vector.ph
467 br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
469 vector.ph: ; preds = %entry
470 %trip.count.minus.1 = add i32 %N, -1
471 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
472 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
473 %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
474 br label %vector.body
476 vector.body: ; preds = %vector.body, %vector.ph
477 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
478 %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
479 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
480 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
481 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
482 %tmp = getelementptr inbounds i32, i32* %a, i32 %index
483 %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask shared by both loads and the store
484 %tmp2 = bitcast i32* %tmp to <4 x i32>*
485 %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
486 %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
487 %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
488 %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
489 %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
490 %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
491 %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
492 tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
493 %index.next = add i32 %index, 5 ; deviation under test: step of 5 although each iteration handles 4 lanes
494 %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) ; one fewer iteration remaining
495 %tmp16 = icmp ne i32 %tmp15, 0
496 br i1 %tmp16, label %vector.body, label %for.cond.cleanup
498 for.cond.cleanup: ; preds = %vector.body, %entry
499 ret void
500 }
502 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 ; masked <4 x i32> load; used for the %a and %b reads in each vector.body
503 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 ; masked <4 x i32> store; used for the %c writes in each vector.body
504 declare i32 @llvm.start.loop.iterations.i32(i32) #3 ; called once in each vector.ph with the precomputed iteration count
505 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 ; called once per vector.body iteration; its result drives the back-edge compare