1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false -enable-arm-maskedldst=true %s -o - | FileCheck %s
; Reduction kernel: i32 acc += zext(b[i]) * zext(a) over N i8 elements.
; The FileCheck assertions below expect tail-folding into a 4-wide masked
; load (vpt + vldrbt.u32) inside an ARM low-overhead loop (dls / le),
; finished by a predicated select (vpsel) and horizontal add (vaddv.u32).
4 define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
5 ; CHECK-LABEL: test_acc_scalar_char:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: cmp r2, #0
9 ; CHECK-NEXT: moveq r0, #0
11 ; CHECK-NEXT: push {r7, lr}
12 ; CHECK-NEXT: vpush {d8, d9}
13 ; CHECK-NEXT: adds r3, r2, #3
14 ; CHECK-NEXT: subs r2, #1
15 ; CHECK-NEXT: bic r3, r3, #3
16 ; CHECK-NEXT: vdup.32 q1, r2
17 ; CHECK-NEXT: sub.w r12, r3, #4
18 ; CHECK-NEXT: movs r3, #1
19 ; CHECK-NEXT: vmov.i32 q0, #0x0
20 ; CHECK-NEXT: movs r2, #0
21 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
22 ; CHECK-NEXT: adr r3, .LCPI0_0
23 ; CHECK-NEXT: vldrw.u32 q2, [r3]
24 ; CHECK-NEXT: dls lr, lr
25 ; CHECK-NEXT: .LBB0_1: @ %vector.body
26 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
27 ; CHECK-NEXT: vadd.i32 q4, q2, r2
28 ; CHECK-NEXT: adds r3, r1, r2
29 ; CHECK-NEXT: adds r2, #4
30 ; CHECK-NEXT: vpt.u32 cs, q1, q4
31 ; CHECK-NEXT: vldrbt.u32 q4, [r3]
32 ; CHECK-NEXT: vmov q3, q0
33 ; CHECK-NEXT: vmla.u32 q0, q4, r0
34 ; CHECK-NEXT: le lr, .LBB0_1
35 ; CHECK-NEXT: @ %bb.2: @ %middle.block
36 ; CHECK-NEXT: vpsel q0, q0, q3
37 ; CHECK-NEXT: vaddv.u32 r0, q0
38 ; CHECK-NEXT: vpop {d8, d9}
39 ; CHECK-NEXT: pop {r7, pc}
40 ; CHECK-NEXT: .p2align 4
41 ; CHECK-NEXT: @ %bb.3:
42 ; CHECK-NEXT: .LCPI0_0:
43 ; CHECK-NEXT: .long 0 @ 0x0
44 ; CHECK-NEXT: .long 1 @ 0x1
45 ; CHECK-NEXT: .long 2 @ 0x2
46 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): several interstitial lines (the 'entry:' label, a
; 'br label %vector.body' terminator for %vector.ph, the final 'ret' and
; closing brace) appear to have been dropped by text extraction -- the gaps
; in the embedded numbering confirm this. Restore from the upstream test.
; Entry block: early-out returning 0 when N == 0.
48 %cmp7 = icmp eq i32 %N, 0
49 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Vector preheader: round N up to a multiple of 4, splat the trip count
; minus one (for the lane mask) and the zero-extended scalar %a.
51 vector.ph: ; preds = %entry
52 %conv = zext i8 %a to i32
53 %n.rnd.up = add i32 %N, 3
54 %n.vec = and i32 %n.rnd.up, -4
55 %trip.count.minus.1 = add i32 %N, -1
56 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
57 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
58 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
59 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
; Tail-folded loop: per-lane mask %1 = (index+lane <= N-1) guards the
; masked load; inactive lanes load undef but are discarded in middle.block.
62 vector.body: ; preds = %vector.body, %vector.ph
63 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
64 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
65 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
66 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
67 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
68 %0 = getelementptr inbounds i8, i8* %b, i32 %index
69 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
70 %2 = bitcast i8* %0 to <4 x i8>*
71 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
72 %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
73 %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
74 %5 = add nuw nsw <4 x i32> %4, %vec.phi
75 %index.next = add i32 %index, 4
76 %6 = icmp eq i32 %index.next, %n.vec
77 br i1 %6, label %middle.block, label %vector.body
; Keep only the lanes that were active on the final iteration, then reduce.
79 middle.block: ; preds = %vector.body
80 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
81 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
82 br label %for.cond.cleanup
84 for.cond.cleanup: ; preds = %middle.block, %entry
85 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += sext(b[i]) * sext(a) over N i16 elements.
; Same tail-folded shape as the char variant, but the masked load widens
; signed halfwords (vldrht.s32) and the pointer is advanced by 8 bytes.
89 define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
90 ; CHECK-LABEL: test_acc_scalar_short:
91 ; CHECK: @ %bb.0: @ %entry
92 ; CHECK-NEXT: cmp r2, #0
94 ; CHECK-NEXT: moveq r0, #0
96 ; CHECK-NEXT: push {r7, lr}
97 ; CHECK-NEXT: vpush {d8, d9}
98 ; CHECK-NEXT: adds r3, r2, #3
99 ; CHECK-NEXT: subs r2, #1
100 ; CHECK-NEXT: bic r3, r3, #3
101 ; CHECK-NEXT: vdup.32 q1, r2
102 ; CHECK-NEXT: sub.w r12, r3, #4
103 ; CHECK-NEXT: movs r3, #1
104 ; CHECK-NEXT: vmov.i32 q0, #0x0
105 ; CHECK-NEXT: movs r2, #0
106 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
107 ; CHECK-NEXT: adr r3, .LCPI1_0
108 ; CHECK-NEXT: vldrw.u32 q2, [r3]
109 ; CHECK-NEXT: dls lr, lr
110 ; CHECK-NEXT: .LBB1_1: @ %vector.body
111 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
112 ; CHECK-NEXT: vadd.i32 q4, q2, r2
113 ; CHECK-NEXT: adds r2, #4
114 ; CHECK-NEXT: vpt.u32 cs, q1, q4
115 ; CHECK-NEXT: vldrht.s32 q4, [r1]
116 ; CHECK-NEXT: adds r1, #8
117 ; CHECK-NEXT: vmov q3, q0
118 ; CHECK-NEXT: vmla.u32 q0, q4, r0
119 ; CHECK-NEXT: le lr, .LBB1_1
120 ; CHECK-NEXT: @ %bb.2: @ %middle.block
121 ; CHECK-NEXT: vpsel q0, q0, q3
122 ; CHECK-NEXT: vaddv.u32 r0, q0
123 ; CHECK-NEXT: vpop {d8, d9}
124 ; CHECK-NEXT: pop {r7, pc}
125 ; CHECK-NEXT: .p2align 4
126 ; CHECK-NEXT: @ %bb.3:
127 ; CHECK-NEXT: .LCPI1_0:
128 ; CHECK-NEXT: .long 0 @ 0x0
129 ; CHECK-NEXT: .long 1 @ 0x1
130 ; CHECK-NEXT: .long 2 @ 0x2
131 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): the 'entry:' label, some branch terminators and the final
; 'ret'/closing brace were dropped by text extraction (numbering gaps);
; restore from the upstream test before regenerating assertions.
; Entry block: early-out returning 0 when N == 0.
133 %cmp7 = icmp eq i32 %N, 0
134 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Preheader: round the trip count, splat N-1 (mask bound) and sext(%a).
136 vector.ph: ; preds = %entry
137 %conv = sext i16 %a to i32
138 %n.rnd.up = add i32 %N, 3
139 %n.vec = and i32 %n.rnd.up, -4
140 %trip.count.minus.1 = add i32 %N, -1
141 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
142 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
143 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
144 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
145 br label %vector.body
; Tail-folded loop: lane mask %1 guards the masked i16 load; the products
; are sign-extended multiplies accumulated into %vec.phi.
147 vector.body: ; preds = %vector.body, %vector.ph
148 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
149 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
150 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
151 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
152 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
153 %0 = getelementptr inbounds i16, i16* %b, i32 %index
154 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
155 %2 = bitcast i16* %0 to <4 x i16>*
156 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
157 %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
158 %4 = mul nsw <4 x i32> %broadcast.splat13, %3
159 %5 = add nsw <4 x i32> %4, %vec.phi
160 %index.next = add i32 %index, 4
161 %6 = icmp eq i32 %index.next, %n.vec
162 br i1 %6, label %middle.block, label %vector.body
; Discard lanes masked off on the last iteration, then horizontal add.
164 middle.block: ; preds = %vector.body
165 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
166 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
167 br label %for.cond.cleanup
169 for.cond.cleanup: ; preds = %middle.block, %entry
170 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += zext(b[i]) * zext(a) over N i8 elements.
; IR is identical in structure to test_acc_scalar_char; kept as a separate
; function so the unsigned-char path gets its own FileCheck coverage.
174 define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
175 ; CHECK-LABEL: test_acc_scalar_uchar:
176 ; CHECK: @ %bb.0: @ %entry
177 ; CHECK-NEXT: cmp r2, #0
179 ; CHECK-NEXT: moveq r0, #0
180 ; CHECK-NEXT: bxeq lr
181 ; CHECK-NEXT: push {r7, lr}
182 ; CHECK-NEXT: vpush {d8, d9}
183 ; CHECK-NEXT: adds r3, r2, #3
184 ; CHECK-NEXT: subs r2, #1
185 ; CHECK-NEXT: bic r3, r3, #3
186 ; CHECK-NEXT: vdup.32 q1, r2
187 ; CHECK-NEXT: sub.w r12, r3, #4
188 ; CHECK-NEXT: movs r3, #1
189 ; CHECK-NEXT: vmov.i32 q0, #0x0
190 ; CHECK-NEXT: movs r2, #0
191 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
192 ; CHECK-NEXT: adr r3, .LCPI2_0
193 ; CHECK-NEXT: vldrw.u32 q2, [r3]
194 ; CHECK-NEXT: dls lr, lr
195 ; CHECK-NEXT: .LBB2_1: @ %vector.body
196 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
197 ; CHECK-NEXT: vadd.i32 q4, q2, r2
198 ; CHECK-NEXT: adds r3, r1, r2
199 ; CHECK-NEXT: adds r2, #4
200 ; CHECK-NEXT: vpt.u32 cs, q1, q4
201 ; CHECK-NEXT: vldrbt.u32 q4, [r3]
202 ; CHECK-NEXT: vmov q3, q0
203 ; CHECK-NEXT: vmla.u32 q0, q4, r0
204 ; CHECK-NEXT: le lr, .LBB2_1
205 ; CHECK-NEXT: @ %bb.2: @ %middle.block
206 ; CHECK-NEXT: vpsel q0, q0, q3
207 ; CHECK-NEXT: vaddv.u32 r0, q0
208 ; CHECK-NEXT: vpop {d8, d9}
209 ; CHECK-NEXT: pop {r7, pc}
210 ; CHECK-NEXT: .p2align 4
211 ; CHECK-NEXT: @ %bb.3:
212 ; CHECK-NEXT: .LCPI2_0:
213 ; CHECK-NEXT: .long 0 @ 0x0
214 ; CHECK-NEXT: .long 1 @ 0x1
215 ; CHECK-NEXT: .long 2 @ 0x2
216 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): 'entry:' label, some terminators and the trailing
; 'ret'/closing brace were lost in extraction (numbering gaps); restore
; them from the upstream test.
; Entry block: early-out returning 0 when N == 0.
218 %cmp7 = icmp eq i32 %N, 0
219 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count to a multiple of 4; splat N-1 and zext(%a).
221 vector.ph: ; preds = %entry
222 %conv = zext i8 %a to i32
223 %n.rnd.up = add i32 %N, 3
224 %n.vec = and i32 %n.rnd.up, -4
225 %trip.count.minus.1 = add i32 %N, -1
226 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
227 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
228 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
229 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
230 br label %vector.body
; Tail-folded 4-wide masked multiply-accumulate loop.
232 vector.body: ; preds = %vector.body, %vector.ph
233 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
234 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
235 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
236 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
237 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
238 %0 = getelementptr inbounds i8, i8* %b, i32 %index
239 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
240 %2 = bitcast i8* %0 to <4 x i8>*
241 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
242 %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
243 %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
244 %5 = add nuw nsw <4 x i32> %4, %vec.phi
245 %index.next = add i32 %index, 4
246 %6 = icmp eq i32 %index.next, %n.vec
247 br i1 %6, label %middle.block, label %vector.body
; Mask off the tail lanes from the final iteration, then reduce to scalar.
249 middle.block: ; preds = %vector.body
250 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
251 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
252 br label %for.cond.cleanup
254 for.cond.cleanup: ; preds = %middle.block, %entry
255 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += zext(b[i]) * sext(a) over N i16 elements.
; Differs from test_acc_scalar_short only in the load extension: the
; masked load is zero-extended (vldrht.u32) while %a is still sext'd.
259 define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
260 ; CHECK-LABEL: test_acc_scalar_ushort:
261 ; CHECK: @ %bb.0: @ %entry
262 ; CHECK-NEXT: cmp r2, #0
264 ; CHECK-NEXT: moveq r0, #0
265 ; CHECK-NEXT: bxeq lr
266 ; CHECK-NEXT: push {r7, lr}
267 ; CHECK-NEXT: vpush {d8, d9}
268 ; CHECK-NEXT: adds r3, r2, #3
269 ; CHECK-NEXT: subs r2, #1
270 ; CHECK-NEXT: bic r3, r3, #3
271 ; CHECK-NEXT: vdup.32 q1, r2
272 ; CHECK-NEXT: sub.w r12, r3, #4
273 ; CHECK-NEXT: movs r3, #1
274 ; CHECK-NEXT: vmov.i32 q0, #0x0
275 ; CHECK-NEXT: movs r2, #0
276 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
277 ; CHECK-NEXT: adr r3, .LCPI3_0
278 ; CHECK-NEXT: vldrw.u32 q2, [r3]
279 ; CHECK-NEXT: dls lr, lr
280 ; CHECK-NEXT: .LBB3_1: @ %vector.body
281 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
282 ; CHECK-NEXT: vadd.i32 q4, q2, r2
283 ; CHECK-NEXT: adds r2, #4
284 ; CHECK-NEXT: vpt.u32 cs, q1, q4
285 ; CHECK-NEXT: vldrht.u32 q4, [r1]
286 ; CHECK-NEXT: adds r1, #8
287 ; CHECK-NEXT: vmov q3, q0
288 ; CHECK-NEXT: vmla.u32 q0, q4, r0
289 ; CHECK-NEXT: le lr, .LBB3_1
290 ; CHECK-NEXT: @ %bb.2: @ %middle.block
291 ; CHECK-NEXT: vpsel q0, q0, q3
292 ; CHECK-NEXT: vaddv.u32 r0, q0
293 ; CHECK-NEXT: vpop {d8, d9}
294 ; CHECK-NEXT: pop {r7, pc}
295 ; CHECK-NEXT: .p2align 4
296 ; CHECK-NEXT: @ %bb.3:
297 ; CHECK-NEXT: .LCPI3_0:
298 ; CHECK-NEXT: .long 0 @ 0x0
299 ; CHECK-NEXT: .long 1 @ 0x1
300 ; CHECK-NEXT: .long 2 @ 0x2
301 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): 'entry:' label, some terminators and the trailing
; 'ret'/closing brace were lost in extraction (numbering gaps); restore
; them from the upstream test.
; Entry block: early-out returning 0 when N == 0.
303 %cmp7 = icmp eq i32 %N, 0
304 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count; splat N-1 (mask bound) and sext(%a).
306 vector.ph: ; preds = %entry
307 %conv = sext i16 %a to i32
308 %n.rnd.up = add i32 %N, 3
309 %n.vec = and i32 %n.rnd.up, -4
310 %trip.count.minus.1 = add i32 %N, -1
311 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
312 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
313 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
314 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
315 br label %vector.body
; Tail-folded loop: masked i16 load is zero-extended before the multiply.
317 vector.body: ; preds = %vector.body, %vector.ph
318 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
319 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
320 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
321 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
322 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
323 %0 = getelementptr inbounds i16, i16* %b, i32 %index
324 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
325 %2 = bitcast i16* %0 to <4 x i16>*
326 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
327 %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
328 %4 = mul nsw <4 x i32> %broadcast.splat13, %3
329 %5 = add nsw <4 x i32> %4, %vec.phi
330 %index.next = add i32 %index, 4
331 %6 = icmp eq i32 %index.next, %n.vec
332 br i1 %6, label %middle.block, label %vector.body
; Mask off the tail lanes from the final iteration, then reduce to scalar.
334 middle.block: ; preds = %vector.body
335 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
336 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
337 br label %for.cond.cleanup
339 for.cond.cleanup: ; preds = %middle.block, %entry
340 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; Reduction kernel: i32 acc += b[i] * a over N i32 elements (no widening).
; Unlike the narrow-type variants above, the assertions here expect the
; tail-predication pass to kick in fully: the lane mask is produced by
; vctp.32 on the remaining element count rather than an induction compare
; against a constant-pool vector.
344 define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly %b, i32 %N) {
345 ; CHECK-LABEL: test_acc_scalar_int:
346 ; CHECK: @ %bb.0: @ %entry
347 ; CHECK-NEXT: cmp r2, #0
349 ; CHECK-NEXT: moveq r0, #0
350 ; CHECK-NEXT: bxeq lr
351 ; CHECK-NEXT: push {r7, lr}
352 ; CHECK-NEXT: adds r3, r2, #3
353 ; CHECK-NEXT: vmov.i32 q0, #0x0
354 ; CHECK-NEXT: bic r3, r3, #3
355 ; CHECK-NEXT: sub.w r12, r3, #4
356 ; CHECK-NEXT: movs r3, #1
357 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
358 ; CHECK-NEXT: dls lr, lr
359 ; CHECK-NEXT: .LBB4_1: @ %vector.body
360 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
361 ; CHECK-NEXT: vctp.32 r2
363 ; CHECK-NEXT: vldrwt.u32 q2, [r1]
364 ; CHECK-NEXT: mov r3, r2
365 ; CHECK-NEXT: adds r1, #16
366 ; CHECK-NEXT: subs r2, #4
367 ; CHECK-NEXT: vmov q1, q0
368 ; CHECK-NEXT: vmla.u32 q0, q2, r0
369 ; CHECK-NEXT: le lr, .LBB4_1
370 ; CHECK-NEXT: @ %bb.2: @ %middle.block
371 ; CHECK-NEXT: vctp.32 r3
372 ; CHECK-NEXT: vpsel q0, q0, q1
373 ; CHECK-NEXT: vaddv.u32 r0, q0
374 ; CHECK-NEXT: pop {r7, pc}
; NOTE(review): text extraction dropped a few lines here -- among them one
; assertion line between the 'vctp.32 r2' and 'vldrwt.u32' checks (the
; embedded numbering skips 362), the 'entry:' label, and the trailing
; 'ret'/closing brace. Restore from the upstream test.
; Entry block: early-out returning 0 when N == 0.
376 %cmp6 = icmp eq i32 %N, 0
377 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count to a multiple of 4; splat N-1 and %a.
379 vector.ph: ; preds = %entry
380 %n.rnd.up = add i32 %N, 3
381 %n.vec = and i32 %n.rnd.up, -4
382 %trip.count.minus.1 = add i32 %N, -1
383 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
384 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
385 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0
386 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
387 br label %vector.body
; Tail-folded loop: masked i32 load, multiply by splat(%a), accumulate.
389 vector.body: ; preds = %vector.body, %vector.ph
390 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
391 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
392 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
393 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
394 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
395 %0 = getelementptr inbounds i32, i32* %b, i32 %index
396 %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
397 %2 = bitcast i32* %0 to <4 x i32>*
398 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
399 %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12
400 %4 = add nsw <4 x i32> %3, %vec.phi
401 %index.next = add i32 %index, 4
402 %5 = icmp eq i32 %index.next, %n.vec
403 br i1 %5, label %middle.block, label %vector.body
; Mask off the tail lanes from the final iteration, then reduce to scalar.
405 middle.block: ; preds = %vector.body
406 %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi
407 %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6)
408 br label %for.cond.cleanup
410 for.cond.cleanup: ; preds = %middle.block, %entry
411 %res.0.lcssa = phi i32 [ 0, %entry ], [ %7, %middle.block ]
; Elementwise kernel: res[i] = zext(a[i]) * zext(b[i]) + zext(c) for N i8
; elements with i32 stores. Because %res may alias %a/%b, the IR contains
; runtime overlap checks: the no-conflict path is the tail-folded masked
; vector loop, while the conflict path is a 4x-unrolled scalar loop plus a
; remainder epilogue (seen in the assertions as smlabb sequences and a
; wls/le epilogue loop).
415 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) {
416 ; CHECK-LABEL: test_vec_mul_scalar_add_char:
417 ; CHECK: @ %bb.0: @ %entry
418 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
419 ; CHECK-NEXT: ldr r7, [sp, #28]
420 ; CHECK-NEXT: cmp r7, #0
421 ; CHECK-NEXT: beq.w .LBB5_12
422 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
423 ; CHECK-NEXT: add.w r4, r3, r7, lsl #2
424 ; CHECK-NEXT: adds r5, r1, r7
425 ; CHECK-NEXT: cmp r4, r1
426 ; CHECK-NEXT: add.w r6, r0, r7
427 ; CHECK-NEXT: cset r12, hi
428 ; CHECK-NEXT: cmp r5, r3
429 ; CHECK-NEXT: cset r5, hi
430 ; CHECK-NEXT: cmp r4, r0
431 ; CHECK-NEXT: cset r4, hi
432 ; CHECK-NEXT: cmp r6, r3
433 ; CHECK-NEXT: cset r6, hi
434 ; CHECK-NEXT: ands r6, r4
435 ; CHECK-NEXT: lsls r6, r6, #31
437 ; CHECK-NEXT: andeq.w r6, r5, r12
438 ; CHECK-NEXT: lslseq.w r6, r6, #31
439 ; CHECK-NEXT: beq .LBB5_4
440 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
441 ; CHECK-NEXT: subs r6, r7, #1
442 ; CHECK-NEXT: and lr, r7, #3
443 ; CHECK-NEXT: cmp r6, #3
444 ; CHECK-NEXT: bhs .LBB5_6
445 ; CHECK-NEXT: @ %bb.3:
446 ; CHECK-NEXT: movs r7, #0
447 ; CHECK-NEXT: b .LBB5_9
448 ; CHECK-NEXT: .LBB5_4: @ %vector.ph
449 ; CHECK-NEXT: adds r6, r7, #3
450 ; CHECK-NEXT: movs r5, #1
451 ; CHECK-NEXT: bic r6, r6, #3
452 ; CHECK-NEXT: subs r7, #1
453 ; CHECK-NEXT: subs r6, #4
454 ; CHECK-NEXT: vdup.32 q0, r7
455 ; CHECK-NEXT: movs r7, #0
456 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
457 ; CHECK-NEXT: adr r6, .LCPI5_0
458 ; CHECK-NEXT: vldrw.u32 q1, [r6]
459 ; CHECK-NEXT: dls lr, lr
460 ; CHECK-NEXT: .LBB5_5: @ %vector.body
461 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
462 ; CHECK-NEXT: vadd.i32 q2, q1, r7
463 ; CHECK-NEXT: adds r4, r0, r7
464 ; CHECK-NEXT: vpt.u32 cs, q0, q2
465 ; CHECK-NEXT: vldrbt.u32 q2, [r4]
466 ; CHECK-NEXT: adds r4, r1, r7
468 ; CHECK-NEXT: vldrbt.u32 q3, [r4]
469 ; CHECK-NEXT: vmul.i32 q2, q3, q2
470 ; CHECK-NEXT: vadd.i32 q2, q2, r2
472 ; CHECK-NEXT: vstrwt.32 q2, [r3]
473 ; CHECK-NEXT: adds r3, #16
474 ; CHECK-NEXT: adds r7, #4
475 ; CHECK-NEXT: le lr, .LBB5_5
476 ; CHECK-NEXT: b .LBB5_12
477 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
478 ; CHECK-NEXT: sub.w r12, lr, r7
479 ; CHECK-NEXT: subs r4, r1, #3
480 ; CHECK-NEXT: subs r5, r0, #3
481 ; CHECK-NEXT: sub.w r7, r3, #16
482 ; CHECK-NEXT: mov.w r9, #0
483 ; CHECK-NEXT: .LBB5_7: @ %for.body
484 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
485 ; CHECK-NEXT: ldrb.w r8, [r5, #3]
486 ; CHECK-NEXT: sub.w r9, r9, #4
487 ; CHECK-NEXT: ldrb r6, [r4, #3]
488 ; CHECK-NEXT: cmp r12, r9
489 ; CHECK-NEXT: smlabb r6, r6, r8, r2
490 ; CHECK-NEXT: str r6, [r7, #16]!
491 ; CHECK-NEXT: ldrb r8, [r5, #4]!
492 ; CHECK-NEXT: ldrb r6, [r4, #4]!
493 ; CHECK-NEXT: smlabb r6, r6, r8, r2
494 ; CHECK-NEXT: str r6, [r7, #4]
495 ; CHECK-NEXT: ldrb.w r8, [r5, #1]
496 ; CHECK-NEXT: ldrb r6, [r4, #1]
497 ; CHECK-NEXT: smlabb r6, r6, r8, r2
498 ; CHECK-NEXT: str r6, [r7, #8]
499 ; CHECK-NEXT: ldrb.w r8, [r5, #2]
500 ; CHECK-NEXT: ldrb r6, [r4, #2]
501 ; CHECK-NEXT: smlabb r6, r6, r8, r2
502 ; CHECK-NEXT: str r6, [r7, #12]
503 ; CHECK-NEXT: bne .LBB5_7
504 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit
505 ; CHECK-NEXT: rsb.w r7, r9, #0
506 ; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup.loopexit.unr-lcssa
507 ; CHECK-NEXT: wls lr, lr, .LBB5_12
508 ; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader
509 ; CHECK-NEXT: subs r7, #1
510 ; CHECK-NEXT: add r0, r7
511 ; CHECK-NEXT: add r1, r7
512 ; CHECK-NEXT: add.w r3, r3, r7, lsl #2
513 ; CHECK-NEXT: .LBB5_11: @ %for.body.epil
514 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
515 ; CHECK-NEXT: ldrb r7, [r0, #1]!
516 ; CHECK-NEXT: ldrb r6, [r1, #1]!
517 ; CHECK-NEXT: smlabb r7, r6, r7, r2
518 ; CHECK-NEXT: str r7, [r3, #4]!
519 ; CHECK-NEXT: le lr, .LBB5_11
520 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup
521 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
522 ; CHECK-NEXT: .p2align 4
523 ; CHECK-NEXT: @ %bb.13:
524 ; CHECK-NEXT: .LCPI5_0:
525 ; CHECK-NEXT: .long 0 @ 0x0
526 ; CHECK-NEXT: .long 1 @ 0x1
527 ; CHECK-NEXT: .long 2 @ 0x2
528 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): extraction dropped several lines in this function (gaps at
; embedded numbers 436, 467, 471, 550, 557, 616, etc.), including the
; 'entry:' label, a definition of %0 in for.body.preheader (used by the
; icmp on the next line), block terminators, 'ret void' lines and the
; closing brace. Restore from the upstream test before editing further.
; Entry block: guard N == 0, otherwise run the runtime alias checks.
530 %res12 = bitcast i32* %res to i8*
531 %cmp10 = icmp eq i32 %N, 0
532 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph
; Runtime overlap checks between res[0..N) and a[0..N) / b[0..N):
; any conflict forces the scalar fallback, otherwise the vector loop runs.
534 for.body.lr.ph: ; preds = %entry
535 %conv3 = zext i8 %c to i32
536 %scevgep = getelementptr i32, i32* %res, i32 %N
537 %scevgep13 = bitcast i32* %scevgep to i8*
538 %scevgep14 = getelementptr i8, i8* %a, i32 %N
539 %scevgep15 = getelementptr i8, i8* %b, i32 %N
540 %bound0 = icmp ugt i8* %scevgep14, %res12
541 %bound1 = icmp ugt i8* %scevgep13, %a
542 %found.conflict = and i1 %bound0, %bound1
543 %bound016 = icmp ugt i8* %scevgep15, %res12
544 %bound117 = icmp ugt i8* %scevgep13, %b
545 %found.conflict18 = and i1 %bound016, %bound117
546 %conflict.rdx = or i1 %found.conflict, %found.conflict18
547 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
; Scalar fallback setup: split N into 4x-unrolled body + %xtraiter tail.
; NOTE(review): %0 (compared below) is defined on a dropped line.
549 for.body.preheader: ; preds = %for.body.lr.ph
551 %xtraiter = and i32 %N, 3
552 %1 = icmp ult i32 %0, 3
553 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
555 for.body.preheader.new: ; preds = %for.body.preheader
556 %unroll_iter = sub i32 %N, %xtraiter
; Vector preheader (no-alias path): round trip count, splat N-1 and c.
559 vector.ph: ; preds = %for.body.lr.ph
560 %n.rnd.up = add i32 %N, 3
561 %n.vec = and i32 %n.rnd.up, -4
562 %trip.count.minus.1 = add i32 %N, -1
563 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
564 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
565 %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
566 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
567 br label %vector.body
; Tail-folded vector loop: two masked i8 loads, widen, multiply, add the
; splat of c, masked store of the i32 results.
569 vector.body: ; preds = %vector.body, %vector.ph
570 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
571 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
572 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
573 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
574 %2 = getelementptr inbounds i8, i8* %a, i32 %index
575 %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
576 %4 = bitcast i8* %2 to <4 x i8>*
577 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
578 %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
579 %6 = getelementptr inbounds i8, i8* %b, i32 %index
580 %7 = bitcast i8* %6 to <4 x i8>*
581 %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef)
582 %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32>
583 %9 = mul nuw nsw <4 x i32> %8, %5
584 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23
585 %11 = getelementptr inbounds i32, i32* %res, i32 %index
586 %12 = bitcast i32* %11 to <4 x i32>*
587 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3)
588 %index.next = add i32 %index, 4
589 %13 = icmp eq i32 %index.next, %n.vec
590 br i1 %13, label %for.cond.cleanup, label %vector.body
; Remainder handling for the scalar fallback: run %xtraiter epilogue
; iterations (0..3) left over from the 4x-unrolled body.
592 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
593 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
594 %lcmp.mod = icmp eq i32 %xtraiter, 0
595 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
; Scalar epilogue: one element per iteration.
597 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
598 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
599 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
600 %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil
601 %14 = load i8, i8* %arrayidx.epil, align 1
602 %conv.epil = zext i8 %14 to i32
603 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil
604 %15 = load i8, i8* %arrayidx1.epil, align 1
605 %conv2.epil = zext i8 %15 to i32
606 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
607 %add.epil = add nuw nsw i32 %mul.epil, %conv3
608 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil
609 store i32 %add.epil, i32* %arrayidx4.epil, align 4
610 %inc.epil = add nuw i32 %i.011.epil, 1
611 %epil.iter.sub = add i32 %epil.iter, -1
612 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
613 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
615 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
; Scalar main loop, unrolled 4x (aliasing path); 'or' computes i+1/i+2/i+3
; since i.011 is always a multiple of 4 here.
618 for.body: ; preds = %for.body, %for.body.preheader.new
619 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
620 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
621 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
622 %16 = load i8, i8* %arrayidx, align 1
623 %conv = zext i8 %16 to i32
624 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
625 %17 = load i8, i8* %arrayidx1, align 1
626 %conv2 = zext i8 %17 to i32
627 %mul = mul nuw nsw i32 %conv2, %conv
628 %add = add nuw nsw i32 %mul, %conv3
629 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011
630 store i32 %add, i32* %arrayidx4, align 4
631 %inc = or i32 %i.011, 1
632 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc
633 %18 = load i8, i8* %arrayidx.1, align 1
634 %conv.1 = zext i8 %18 to i32
635 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc
636 %19 = load i8, i8* %arrayidx1.1, align 1
637 %conv2.1 = zext i8 %19 to i32
638 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
639 %add.1 = add nuw nsw i32 %mul.1, %conv3
640 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc
641 store i32 %add.1, i32* %arrayidx4.1, align 4
642 %inc.1 = or i32 %i.011, 2
643 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1
644 %20 = load i8, i8* %arrayidx.2, align 1
645 %conv.2 = zext i8 %20 to i32
646 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1
647 %21 = load i8, i8* %arrayidx1.2, align 1
648 %conv2.2 = zext i8 %21 to i32
649 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
650 %add.2 = add nuw nsw i32 %mul.2, %conv3
651 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
652 store i32 %add.2, i32* %arrayidx4.2, align 4
653 %inc.2 = or i32 %i.011, 3
654 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2
655 %22 = load i8, i8* %arrayidx.3, align 1
656 %conv.3 = zext i8 %22 to i32
657 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2
658 %23 = load i8, i8* %arrayidx1.3, align 1
659 %conv2.3 = zext i8 %23 to i32
660 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
661 %add.3 = add nuw nsw i32 %mul.3, %conv3
662 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
663 store i32 %add.3, i32* %arrayidx4.3, align 4
664 %inc.3 = add nuw i32 %i.011, 4
665 %niter.nsub.3 = add i32 %niter, -4
666 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
667 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; Elementwise kernel: res[i] = sext(a[i]) * sext(b[i]) + sext(c) for N i16
; elements, i32 stores. No nocapture aliasing hazard is checked at runtime
; here (unlike the char variant), so the IR is just the tail-folded masked
; vector loop; the assertions expect a vptt block predicating both
; halfword loads and a predicated vstrwt.32 store.
670 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
671 ; CHECK-LABEL: test_vec_mul_scalar_add_short:
672 ; CHECK: @ %bb.0: @ %entry
673 ; CHECK-NEXT: push {r4, lr}
674 ; CHECK-NEXT: ldr.w r12, [sp, #8]
675 ; CHECK-NEXT: cmp.w r12, #0
677 ; CHECK-NEXT: popeq {r4, pc}
678 ; CHECK-NEXT: add.w lr, r12, #3
679 ; CHECK-NEXT: movs r4, #1
680 ; CHECK-NEXT: bic lr, lr, #3
681 ; CHECK-NEXT: sub.w lr, lr, #4
682 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2
683 ; CHECK-NEXT: sub.w r4, r12, #1
684 ; CHECK-NEXT: vdup.32 q0, r4
685 ; CHECK-NEXT: adr r4, .LCPI6_0
686 ; CHECK-NEXT: vldrw.u32 q1, [r4]
687 ; CHECK-NEXT: mov.w r12, #0
688 ; CHECK-NEXT: dls lr, lr
689 ; CHECK-NEXT: .LBB6_1: @ %vector.body
690 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
691 ; CHECK-NEXT: vadd.i32 q2, q1, r12
692 ; CHECK-NEXT: add.w r12, r12, #4
693 ; CHECK-NEXT: vptt.u32 cs, q0, q2
694 ; CHECK-NEXT: vldrht.s32 q2, [r0]
695 ; CHECK-NEXT: vldrht.s32 q3, [r1]
696 ; CHECK-NEXT: adds r0, #8
697 ; CHECK-NEXT: vmul.i32 q2, q3, q2
698 ; CHECK-NEXT: adds r1, #8
699 ; CHECK-NEXT: vadd.i32 q2, q2, r2
701 ; CHECK-NEXT: vstrwt.32 q2, [r3]
702 ; CHECK-NEXT: adds r3, #16
703 ; CHECK-NEXT: le lr, .LBB6_1
704 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
705 ; CHECK-NEXT: pop {r4, pc}
706 ; CHECK-NEXT: .p2align 4
707 ; CHECK-NEXT: @ %bb.3:
708 ; CHECK-NEXT: .LCPI6_0:
709 ; CHECK-NEXT: .long 0 @ 0x0
710 ; CHECK-NEXT: .long 1 @ 0x1
711 ; CHECK-NEXT: .long 2 @ 0x2
712 ; CHECK-NEXT: .long 3 @ 0x3
; NOTE(review): extraction dropped some lines here (gaps at embedded
; numbers 676, 700, 713, and the 'entry:' label / 'ret void' / closing
; brace); restore from the upstream test.
; Entry block: nothing to do when N == 0.
714 %cmp10 = icmp eq i32 %N, 0
715 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
; Preheader: round trip count to a multiple of 4; splat N-1 and sext(c).
717 vector.ph: ; preds = %entry
718 %conv3 = sext i16 %c to i32
719 %n.rnd.up = add i32 %N, 3
720 %n.vec = and i32 %n.rnd.up, -4
721 %trip.count.minus.1 = add i32 %N, -1
722 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
723 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
724 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
725 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
726 br label %vector.body
; Tail-folded loop: two masked sign-extending i16 loads, multiply, add
; splat(c), masked i32 store; mask %1 derived from the induction vector.
728 vector.body: ; preds = %vector.body, %vector.ph
729 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
730 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
731 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
732 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
733 %0 = getelementptr inbounds i16, i16* %a, i32 %index
734 %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
735 %2 = bitcast i16* %0 to <4 x i16>*
736 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
737 %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
738 %4 = getelementptr inbounds i16, i16* %b, i32 %index
739 %5 = bitcast i16* %4 to <4 x i16>*
740 %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef)
741 %6 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
742 %7 = mul nsw <4 x i32> %6, %3
743 %8 = add nsw <4 x i32> %7, %broadcast.splat16
744 %9 = getelementptr inbounds i32, i32* %res, i32 %index
745 %10 = bitcast i32* %9 to <4 x i32>*
746 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1)
747 %index.next = add i32 %index, 4
748 %11 = icmp eq i32 %index.next, %n.vec
749 br i1 %11, label %for.cond.cleanup, label %vector.body
751 for.cond.cleanup: ; preds = %vector.body, %entry
; res[i] = (u32)a[i] * (u32)b[i] + c over zero-extended u8 elements.
; With a runtime alias conflict between res and a/b, the scalar 4x-unrolled
; loop (plus epilogue) runs; otherwise a masked MVE vector loop using
; vldrbt.u32/vstrwt.32 under a VPT block, driven by dls/le, is expected.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
755 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) {
756 ; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
757 ; CHECK: @ %bb.0: @ %entry
758 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
759 ; CHECK-NEXT: ldr r7, [sp, #28]
760 ; CHECK-NEXT: cmp r7, #0
761 ; CHECK-NEXT: beq.w .LBB7_12
762 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
763 ; CHECK-NEXT: add.w r4, r3, r7, lsl #2
764 ; CHECK-NEXT: adds r5, r1, r7
765 ; CHECK-NEXT: cmp r4, r1
766 ; CHECK-NEXT: add.w r6, r0, r7
767 ; CHECK-NEXT: cset r12, hi
768 ; CHECK-NEXT: cmp r5, r3
769 ; CHECK-NEXT: cset r5, hi
770 ; CHECK-NEXT: cmp r4, r0
771 ; CHECK-NEXT: cset r4, hi
772 ; CHECK-NEXT: cmp r6, r3
773 ; CHECK-NEXT: cset r6, hi
774 ; CHECK-NEXT: ands r6, r4
775 ; CHECK-NEXT: lsls r6, r6, #31
777 ; CHECK-NEXT: andeq.w r6, r5, r12
778 ; CHECK-NEXT: lslseq.w r6, r6, #31
779 ; CHECK-NEXT: beq .LBB7_4
780 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
781 ; CHECK-NEXT: subs r6, r7, #1
782 ; CHECK-NEXT: and lr, r7, #3
783 ; CHECK-NEXT: cmp r6, #3
784 ; CHECK-NEXT: bhs .LBB7_6
785 ; CHECK-NEXT: @ %bb.3:
786 ; CHECK-NEXT: movs r7, #0
787 ; CHECK-NEXT: b .LBB7_9
788 ; CHECK-NEXT: .LBB7_4: @ %vector.ph
789 ; CHECK-NEXT: adds r6, r7, #3
790 ; CHECK-NEXT: movs r5, #1
791 ; CHECK-NEXT: bic r6, r6, #3
792 ; CHECK-NEXT: subs r7, #1
793 ; CHECK-NEXT: subs r6, #4
794 ; CHECK-NEXT: vdup.32 q0, r7
795 ; CHECK-NEXT: movs r7, #0
796 ; CHECK-NEXT: add.w lr, r5, r6, lsr #2
797 ; CHECK-NEXT: adr r6, .LCPI7_0
798 ; CHECK-NEXT: vldrw.u32 q1, [r6]
799 ; CHECK-NEXT: dls lr, lr
800 ; CHECK-NEXT: .LBB7_5: @ %vector.body
801 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
802 ; CHECK-NEXT: vadd.i32 q2, q1, r7
803 ; CHECK-NEXT: adds r4, r0, r7
804 ; CHECK-NEXT: vpt.u32 cs, q0, q2
805 ; CHECK-NEXT: vldrbt.u32 q2, [r4]
806 ; CHECK-NEXT: adds r4, r1, r7
808 ; CHECK-NEXT: vldrbt.u32 q3, [r4]
809 ; CHECK-NEXT: vmul.i32 q2, q3, q2
810 ; CHECK-NEXT: vadd.i32 q2, q2, r2
812 ; CHECK-NEXT: vstrwt.32 q2, [r3]
813 ; CHECK-NEXT: adds r3, #16
814 ; CHECK-NEXT: adds r7, #4
815 ; CHECK-NEXT: le lr, .LBB7_5
816 ; CHECK-NEXT: b .LBB7_12
817 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
818 ; CHECK-NEXT: sub.w r12, lr, r7
819 ; CHECK-NEXT: subs r4, r1, #3
820 ; CHECK-NEXT: subs r5, r0, #3
821 ; CHECK-NEXT: sub.w r7, r3, #16
822 ; CHECK-NEXT: mov.w r9, #0
823 ; CHECK-NEXT: .LBB7_7: @ %for.body
824 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
825 ; CHECK-NEXT: ldrb.w r8, [r5, #3]
826 ; CHECK-NEXT: sub.w r9, r9, #4
827 ; CHECK-NEXT: ldrb r6, [r4, #3]
828 ; CHECK-NEXT: cmp r12, r9
829 ; CHECK-NEXT: smlabb r6, r6, r8, r2
830 ; CHECK-NEXT: str r6, [r7, #16]!
831 ; CHECK-NEXT: ldrb r8, [r5, #4]!
832 ; CHECK-NEXT: ldrb r6, [r4, #4]!
833 ; CHECK-NEXT: smlabb r6, r6, r8, r2
834 ; CHECK-NEXT: str r6, [r7, #4]
835 ; CHECK-NEXT: ldrb.w r8, [r5, #1]
836 ; CHECK-NEXT: ldrb r6, [r4, #1]
837 ; CHECK-NEXT: smlabb r6, r6, r8, r2
838 ; CHECK-NEXT: str r6, [r7, #8]
839 ; CHECK-NEXT: ldrb.w r8, [r5, #2]
840 ; CHECK-NEXT: ldrb r6, [r4, #2]
841 ; CHECK-NEXT: smlabb r6, r6, r8, r2
842 ; CHECK-NEXT: str r6, [r7, #12]
843 ; CHECK-NEXT: bne .LBB7_7
844 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit
845 ; CHECK-NEXT: rsb.w r7, r9, #0
846 ; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup.loopexit.unr-lcssa
847 ; CHECK-NEXT: wls lr, lr, .LBB7_12
848 ; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader
849 ; CHECK-NEXT: subs r7, #1
850 ; CHECK-NEXT: add r0, r7
851 ; CHECK-NEXT: add r1, r7
852 ; CHECK-NEXT: add.w r3, r3, r7, lsl #2
853 ; CHECK-NEXT: .LBB7_11: @ %for.body.epil
854 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
855 ; CHECK-NEXT: ldrb r7, [r0, #1]!
856 ; CHECK-NEXT: ldrb r6, [r1, #1]!
857 ; CHECK-NEXT: smlabb r7, r6, r7, r2
858 ; CHECK-NEXT: str r7, [r3, #4]!
859 ; CHECK-NEXT: le lr, .LBB7_11
860 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup
861 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
862 ; CHECK-NEXT: .p2align 4
863 ; CHECK-NEXT: @ %bb.13:
864 ; CHECK-NEXT: .LCPI7_0:
865 ; CHECK-NEXT: .long 0 @ 0x0
866 ; CHECK-NEXT: .long 1 @ 0x1
867 ; CHECK-NEXT: .long 2 @ 0x2
868 ; CHECK-NEXT: .long 3 @ 0x3
; IR below: entry does the vectorizer's runtime alias checks; vector.body is
; the masked vector loop; for.body/.epil are the scalar fallback + remainder.
870 %res12 = bitcast i32* %res to i8*
871 %cmp10 = icmp eq i32 %N, 0
872 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph
874 for.body.lr.ph: ; preds = %entry
875 %conv3 = zext i8 %c to i32
876 %scevgep = getelementptr i32, i32* %res, i32 %N
877 %scevgep13 = bitcast i32* %scevgep to i8*
878 %scevgep14 = getelementptr i8, i8* %a, i32 %N
879 %scevgep15 = getelementptr i8, i8* %b, i32 %N
880 %bound0 = icmp ugt i8* %scevgep14, %res12
881 %bound1 = icmp ugt i8* %scevgep13, %a
882 %found.conflict = and i1 %bound0, %bound1
883 %bound016 = icmp ugt i8* %scevgep15, %res12
884 %bound117 = icmp ugt i8* %scevgep13, %b
885 %found.conflict18 = and i1 %bound016, %bound117
886 %conflict.rdx = or i1 %found.conflict, %found.conflict18
887 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
889 for.body.preheader: ; preds = %for.body.lr.ph
891 %xtraiter = and i32 %N, 3
892 %1 = icmp ult i32 %0, 3
893 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
895 for.body.preheader.new: ; preds = %for.body.preheader
896 %unroll_iter = sub i32 %N, %xtraiter
899 vector.ph: ; preds = %for.body.lr.ph
900 %n.rnd.up = add i32 %N, 3
901 %n.vec = and i32 %n.rnd.up, -4
902 %trip.count.minus.1 = add i32 %N, -1
903 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
904 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
905 %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
906 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
907 br label %vector.body
909 vector.body: ; preds = %vector.body, %vector.ph
910 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
911 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
912 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
913 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
914 %2 = getelementptr inbounds i8, i8* %a, i32 %index
915 %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
916 %4 = bitcast i8* %2 to <4 x i8>*
917 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
918 %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
919 %6 = getelementptr inbounds i8, i8* %b, i32 %index
920 %7 = bitcast i8* %6 to <4 x i8>*
921 %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef)
922 %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32>
923 %9 = mul nuw nsw <4 x i32> %8, %5
924 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23
925 %11 = getelementptr inbounds i32, i32* %res, i32 %index
926 %12 = bitcast i32* %11 to <4 x i32>*
927 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3)
928 %index.next = add i32 %index, 4
929 %13 = icmp eq i32 %index.next, %n.vec
930 br i1 %13, label %for.cond.cleanup, label %vector.body
932 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
933 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
934 %lcmp.mod = icmp eq i32 %xtraiter, 0
935 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
937 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
938 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
939 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
940 %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil
941 %14 = load i8, i8* %arrayidx.epil, align 1
942 %conv.epil = zext i8 %14 to i32
943 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil
944 %15 = load i8, i8* %arrayidx1.epil, align 1
945 %conv2.epil = zext i8 %15 to i32
946 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
947 %add.epil = add nuw nsw i32 %mul.epil, %conv3
948 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil
949 store i32 %add.epil, i32* %arrayidx4.epil, align 4
950 %inc.epil = add nuw i32 %i.011.epil, 1
951 %epil.iter.sub = add i32 %epil.iter, -1
952 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
953 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
955 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
958 for.body: ; preds = %for.body, %for.body.preheader.new
959 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
960 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
961 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
962 %16 = load i8, i8* %arrayidx, align 1
963 %conv = zext i8 %16 to i32
964 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
965 %17 = load i8, i8* %arrayidx1, align 1
966 %conv2 = zext i8 %17 to i32
967 %mul = mul nuw nsw i32 %conv2, %conv
968 %add = add nuw nsw i32 %mul, %conv3
969 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011
970 store i32 %add, i32* %arrayidx4, align 4
971 %inc = or i32 %i.011, 1
972 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc
973 %18 = load i8, i8* %arrayidx.1, align 1
974 %conv.1 = zext i8 %18 to i32
975 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc
976 %19 = load i8, i8* %arrayidx1.1, align 1
977 %conv2.1 = zext i8 %19 to i32
978 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
979 %add.1 = add nuw nsw i32 %mul.1, %conv3
980 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc
981 store i32 %add.1, i32* %arrayidx4.1, align 4
982 %inc.1 = or i32 %i.011, 2
983 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1
984 %20 = load i8, i8* %arrayidx.2, align 1
985 %conv.2 = zext i8 %20 to i32
986 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1
987 %21 = load i8, i8* %arrayidx1.2, align 1
988 %conv2.2 = zext i8 %21 to i32
989 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
990 %add.2 = add nuw nsw i32 %mul.2, %conv3
991 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
992 store i32 %add.2, i32* %arrayidx4.2, align 4
993 %inc.2 = or i32 %i.011, 3
994 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2
995 %22 = load i8, i8* %arrayidx.3, align 1
996 %conv.3 = zext i8 %22 to i32
997 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2
998 %23 = load i8, i8* %arrayidx1.3, align 1
999 %conv2.3 = zext i8 %23 to i32
1000 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
1001 %add.3 = add nuw nsw i32 %mul.3, %conv3
1002 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
1003 store i32 %add.3, i32* %arrayidx4.3, align 4
1004 %inc.3 = add nuw i32 %i.011, 4
1005 %niter.nsub.3 = add i32 %niter, -4
1006 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1007 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; res[i] = (u32)a[i] * (u32)b[i] + (i32)c over zero-extended u16 elements
; (c itself is sign-extended).  No alias checks here, so only the masked MVE
; vector loop is generated: vldrht.u32 pair + vstrwt.32 inside a VPTT block,
; with the trip count managed by dls/le.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
1010 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
1011 ; CHECK-LABEL: test_vec_mul_scalar_add_ushort:
1012 ; CHECK: @ %bb.0: @ %entry
1013 ; CHECK-NEXT: push {r4, lr}
1014 ; CHECK-NEXT: ldr.w r12, [sp, #8]
1015 ; CHECK-NEXT: cmp.w r12, #0
1017 ; CHECK-NEXT: popeq {r4, pc}
1018 ; CHECK-NEXT: add.w lr, r12, #3
1019 ; CHECK-NEXT: movs r4, #1
1020 ; CHECK-NEXT: bic lr, lr, #3
1021 ; CHECK-NEXT: sub.w lr, lr, #4
1022 ; CHECK-NEXT: add.w lr, r4, lr, lsr #2
1023 ; CHECK-NEXT: sub.w r4, r12, #1
1024 ; CHECK-NEXT: vdup.32 q0, r4
1025 ; CHECK-NEXT: adr r4, .LCPI8_0
1026 ; CHECK-NEXT: vldrw.u32 q1, [r4]
1027 ; CHECK-NEXT: mov.w r12, #0
1028 ; CHECK-NEXT: dls lr, lr
1029 ; CHECK-NEXT: .LBB8_1: @ %vector.body
1030 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1031 ; CHECK-NEXT: vadd.i32 q2, q1, r12
1032 ; CHECK-NEXT: add.w r12, r12, #4
1033 ; CHECK-NEXT: vptt.u32 cs, q0, q2
1034 ; CHECK-NEXT: vldrht.u32 q2, [r0]
1035 ; CHECK-NEXT: vldrht.u32 q3, [r1]
1036 ; CHECK-NEXT: adds r0, #8
1037 ; CHECK-NEXT: vmul.i32 q2, q3, q2
1038 ; CHECK-NEXT: adds r1, #8
1039 ; CHECK-NEXT: vadd.i32 q2, q2, r2
1041 ; CHECK-NEXT: vstrwt.32 q2, [r3]
1042 ; CHECK-NEXT: adds r3, #16
1043 ; CHECK-NEXT: le lr, .LBB8_1
1044 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
1045 ; CHECK-NEXT: pop {r4, pc}
1046 ; CHECK-NEXT: .p2align 4
1047 ; CHECK-NEXT: @ %bb.3:
1048 ; CHECK-NEXT: .LCPI8_0:
1049 ; CHECK-NEXT: .long 0 @ 0x0
1050 ; CHECK-NEXT: .long 1 @ 0x1
1051 ; CHECK-NEXT: .long 2 @ 0x2
1052 ; CHECK-NEXT: .long 3 @ 0x3
; IR below: single masked vector loop, lanes predicated on
; induction <= trip.count.minus.1 (the splat in %broadcast.splat13).
1054 %cmp10 = icmp eq i32 %N, 0
1055 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
1057 vector.ph: ; preds = %entry
1058 %conv3 = sext i16 %c to i32
1059 %n.rnd.up = add i32 %N, 3
1060 %n.vec = and i32 %n.rnd.up, -4
1061 %trip.count.minus.1 = add i32 %N, -1
1062 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1063 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
1064 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
1065 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
1066 br label %vector.body
1068 vector.body: ; preds = %vector.body, %vector.ph
1069 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1070 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1071 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1072 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1073 %0 = getelementptr inbounds i16, i16* %a, i32 %index
1074 %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
1075 %2 = bitcast i16* %0 to <4 x i16>*
1076 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
1077 %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
1078 %4 = getelementptr inbounds i16, i16* %b, i32 %index
1079 %5 = bitcast i16* %4 to <4 x i16>*
1080 %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef)
1081 %6 = zext <4 x i16> %wide.masked.load14 to <4 x i32>
1082 %7 = mul nuw nsw <4 x i32> %6, %3
1083 %8 = add nsw <4 x i32> %7, %broadcast.splat16
1084 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1085 %10 = bitcast i32* %9 to <4 x i32>*
1086 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1)
1087 %index.next = add i32 %index, 4
1088 %11 = icmp eq i32 %index.next, %n.vec
1089 br i1 %11, label %for.cond.cleanup, label %vector.body
1091 for.cond.cleanup: ; preds = %vector.body, %entry
; res[i] = a[i] * b[i] + c over i32 elements.  Like the uchar variant this
; has runtime alias checks, but here the vector loop is tail-predicated with
; vctp.32 (element count in r12 decremented by 4 each iteration) instead of
; an explicit induction-vs-trip-count vector compare.
; CHECK lines are autogenerated by update_llc_test_checks.py — do not hand-edit.
1095 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %c, i32* nocapture %res, i32 %N) {
1096 ; CHECK-LABEL: test_vec_mul_scalar_add_int:
1097 ; CHECK: @ %bb.0: @ %entry
1098 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1099 ; CHECK-NEXT: ldr.w r12, [sp, #32]
1100 ; CHECK-NEXT: cmp.w r12, #0
1101 ; CHECK-NEXT: beq.w .LBB9_11
1102 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
1103 ; CHECK-NEXT: add.w r4, r3, r12, lsl #2
1104 ; CHECK-NEXT: add.w r5, r1, r12, lsl #2
1105 ; CHECK-NEXT: cmp r4, r1
1106 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
1107 ; CHECK-NEXT: cset r7, hi
1108 ; CHECK-NEXT: cmp r5, r3
1109 ; CHECK-NEXT: cset r5, hi
1110 ; CHECK-NEXT: cmp r4, r0
1111 ; CHECK-NEXT: cset r4, hi
1112 ; CHECK-NEXT: cmp r6, r3
1113 ; CHECK-NEXT: cset r6, hi
1114 ; CHECK-NEXT: mov.w lr, #1
1115 ; CHECK-NEXT: ands r6, r4
1116 ; CHECK-NEXT: lsls r6, r6, #31
1117 ; CHECK-NEXT: itt eq
1118 ; CHECK-NEXT: andeq.w r4, r5, r7
1119 ; CHECK-NEXT: lslseq.w r4, r4, #31
1120 ; CHECK-NEXT: beq .LBB9_4
1121 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
1122 ; CHECK-NEXT: sub.w r4, r12, #1
1123 ; CHECK-NEXT: and r5, r12, #3
1124 ; CHECK-NEXT: cmp r4, #3
1125 ; CHECK-NEXT: bhs .LBB9_6
1126 ; CHECK-NEXT: @ %bb.3:
1127 ; CHECK-NEXT: mov r10, r5
1128 ; CHECK-NEXT: mov.w r12, #0
1129 ; CHECK-NEXT: b .LBB9_8
1130 ; CHECK-NEXT: .LBB9_4: @ %vector.ph
1131 ; CHECK-NEXT: add.w r4, r12, #3
1132 ; CHECK-NEXT: bic r4, r4, #3
1133 ; CHECK-NEXT: subs r4, #4
1134 ; CHECK-NEXT: add.w lr, lr, r4, lsr #2
1135 ; CHECK-NEXT: dls lr, lr
1136 ; CHECK-NEXT: .LBB9_5: @ %vector.body
1137 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1138 ; CHECK-NEXT: vctp.32 r12
1140 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
1141 ; CHECK-NEXT: vldrwt.u32 q1, [r1]
1142 ; CHECK-NEXT: vmul.i32 q0, q1, q0
1143 ; CHECK-NEXT: adds r0, #16
1144 ; CHECK-NEXT: vadd.i32 q0, q0, r2
1146 ; CHECK-NEXT: vstrwt.32 q0, [r3]
1147 ; CHECK-NEXT: adds r1, #16
1148 ; CHECK-NEXT: adds r3, #16
1149 ; CHECK-NEXT: sub.w r12, r12, #4
1150 ; CHECK-NEXT: le lr, .LBB9_5
1151 ; CHECK-NEXT: b .LBB9_11
1152 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
1153 ; CHECK-NEXT: sub.w r7, r12, r5
1154 ; CHECK-NEXT: mov r10, r5
1155 ; CHECK-NEXT: subs r7, #4
1156 ; CHECK-NEXT: movs r4, #0
1157 ; CHECK-NEXT: mov.w r12, #0
1158 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2
1159 ; CHECK-NEXT: dls lr, lr
1160 ; CHECK-NEXT: .LBB9_7: @ %for.body
1161 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1162 ; CHECK-NEXT: ldr r5, [r0, r4]
1163 ; CHECK-NEXT: add.w r9, r0, r4
1164 ; CHECK-NEXT: ldr r6, [r1, r4]
1165 ; CHECK-NEXT: adds r7, r1, r4
1166 ; CHECK-NEXT: add.w r12, r12, #4
1167 ; CHECK-NEXT: mla r5, r6, r5, r2
1168 ; CHECK-NEXT: str r5, [r3, r4]
1169 ; CHECK-NEXT: ldr.w r8, [r9, #4]
1170 ; CHECK-NEXT: ldr r6, [r7, #4]
1171 ; CHECK-NEXT: mla r8, r6, r8, r2
1172 ; CHECK-NEXT: adds r6, r3, r4
1173 ; CHECK-NEXT: adds r4, #16
1174 ; CHECK-NEXT: str.w r8, [r6, #4]
1175 ; CHECK-NEXT: ldr.w r8, [r9, #8]
1176 ; CHECK-NEXT: ldr r5, [r7, #8]
1177 ; CHECK-NEXT: mla r5, r5, r8, r2
1178 ; CHECK-NEXT: str r5, [r6, #8]
1179 ; CHECK-NEXT: ldr.w r5, [r9, #12]
1180 ; CHECK-NEXT: ldr r7, [r7, #12]
1181 ; CHECK-NEXT: mla r5, r7, r5, r2
1182 ; CHECK-NEXT: str r5, [r6, #12]
1183 ; CHECK-NEXT: le lr, .LBB9_7
1184 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa
1185 ; CHECK-NEXT: wls lr, r10, .LBB9_11
1186 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
1187 ; CHECK-NEXT: mvn r7, #3
1188 ; CHECK-NEXT: mov lr, r10
1189 ; CHECK-NEXT: add.w r7, r7, r12, lsl #2
1190 ; CHECK-NEXT: add r0, r7
1191 ; CHECK-NEXT: add r1, r7
1192 ; CHECK-NEXT: add r3, r7
1193 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil
1194 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1195 ; CHECK-NEXT: ldr r7, [r0, #4]!
1196 ; CHECK-NEXT: ldr r6, [r1, #4]!
1197 ; CHECK-NEXT: mla r7, r6, r7, r2
1198 ; CHECK-NEXT: str r7, [r3, #4]!
1199 ; CHECK-NEXT: le lr, .LBB9_10
1200 ; CHECK-NEXT: .LBB9_11: @ %for.cond.cleanup
1201 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
; IR below: vector.memcheck guards the masked vector loop; the scalar
; 4x-unrolled for.body plus for.body.epil remainder handles the alias case.
1203 %cmp8 = icmp eq i32 %N, 0
1204 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck
1206 vector.memcheck: ; preds = %entry
1207 %scevgep = getelementptr i32, i32* %res, i32 %N
1208 %scevgep13 = getelementptr i32, i32* %a, i32 %N
1209 %scevgep16 = getelementptr i32, i32* %b, i32 %N
1210 %bound0 = icmp ugt i32* %scevgep13, %res
1211 %bound1 = icmp ugt i32* %scevgep, %a
1212 %found.conflict = and i1 %bound0, %bound1
1213 %bound018 = icmp ugt i32* %scevgep16, %res
1214 %bound119 = icmp ugt i32* %scevgep, %b
1215 %found.conflict20 = and i1 %bound018, %bound119
1216 %conflict.rdx = or i1 %found.conflict, %found.conflict20
1217 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
1219 for.body.preheader: ; preds = %vector.memcheck
1221 %xtraiter = and i32 %N, 3
1222 %1 = icmp ult i32 %0, 3
1223 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1225 for.body.preheader.new: ; preds = %for.body.preheader
1226 %unroll_iter = sub i32 %N, %xtraiter
1229 vector.ph: ; preds = %vector.memcheck
1230 %n.rnd.up = add i32 %N, 3
1231 %n.vec = and i32 %n.rnd.up, -4
1232 %trip.count.minus.1 = add i32 %N, -1
1233 %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1234 %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
1235 %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0
1236 %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer
1237 br label %vector.body
1239 vector.body: ; preds = %vector.body, %vector.ph
1240 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1241 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1242 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1243 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1244 %2 = getelementptr inbounds i32, i32* %a, i32 %index
1245 %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
1246 %4 = bitcast i32* %2 to <4 x i32>*
1247 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
1248 %5 = getelementptr inbounds i32, i32* %b, i32 %index
1249 %6 = bitcast i32* %5 to <4 x i32>*
1250 %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %6, i32 4, <4 x i1> %3, <4 x i32> undef)
1251 %7 = mul nsw <4 x i32> %wide.masked.load23, %wide.masked.load
1252 %8 = add nsw <4 x i32> %7, %broadcast.splat25
1253 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1254 %10 = bitcast i32* %9 to <4 x i32>*
1255 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %3)
1256 %index.next = add i32 %index, 4
1257 %11 = icmp eq i32 %index.next, %n.vec
1258 br i1 %11, label %for.cond.cleanup, label %vector.body
1260 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1261 %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1262 %lcmp.mod = icmp eq i32 %xtraiter, 0
1263 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1265 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1266 %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1267 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1268 %arrayidx.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1269 %12 = load i32, i32* %arrayidx.epil, align 4
1270 %arrayidx1.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1271 %13 = load i32, i32* %arrayidx1.epil, align 4
1272 %mul.epil = mul nsw i32 %13, %12
1273 %add.epil = add nsw i32 %mul.epil, %c
1274 %arrayidx2.epil = getelementptr inbounds i32, i32* %res, i32 %i.09.epil
1275 store i32 %add.epil, i32* %arrayidx2.epil, align 4
1276 %inc.epil = add nuw i32 %i.09.epil, 1
1277 %epil.iter.sub = add i32 %epil.iter, -1
1278 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1279 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1281 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1284 for.body: ; preds = %for.body, %for.body.preheader.new
1285 %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1286 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1287 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09
1288 %14 = load i32, i32* %arrayidx, align 4
1289 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
1290 %15 = load i32, i32* %arrayidx1, align 4
1291 %mul = mul nsw i32 %15, %14
1292 %add = add nsw i32 %mul, %c
1293 %arrayidx2 = getelementptr inbounds i32, i32* %res, i32 %i.09
1294 store i32 %add, i32* %arrayidx2, align 4
1295 %inc = or i32 %i.09, 1
1296 %arrayidx.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1297 %16 = load i32, i32* %arrayidx.1, align 4
1298 %arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1299 %17 = load i32, i32* %arrayidx1.1, align 4
1300 %mul.1 = mul nsw i32 %17, %16
1301 %add.1 = add nsw i32 %mul.1, %c
1302 %arrayidx2.1 = getelementptr inbounds i32, i32* %res, i32 %inc
1303 store i32 %add.1, i32* %arrayidx2.1, align 4
1304 %inc.1 = or i32 %i.09, 2
1305 %arrayidx.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1306 %18 = load i32, i32* %arrayidx.2, align 4
1307 %arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1308 %19 = load i32, i32* %arrayidx1.2, align 4
1309 %mul.2 = mul nsw i32 %19, %18
1310 %add.2 = add nsw i32 %mul.2, %c
1311 %arrayidx2.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
1312 store i32 %add.2, i32* %arrayidx2.2, align 4
1313 %inc.2 = or i32 %i.09, 3
1314 %arrayidx.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1315 %20 = load i32, i32* %arrayidx.3, align 4
1316 %arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1317 %21 = load i32, i32* %arrayidx1.3, align 4
1318 %mul.3 = mul nsw i32 %21, %20
1319 %add.3 = add nsw i32 %mul.3, %c
1320 %arrayidx2.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
1321 store i32 %add.3, i32* %arrayidx2.3, align 4
1322 %inc.3 = add nuw i32 %i.09, 4
1323 %niter.nsub.3 = add i32 %niter, -4
1324 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1325 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; Declarations of the target-independent intrinsics used by the test
; functions above: masked loads/stores (lowered to predicated MVE
; vldr*/vstr* instructions) and the vector add reduction.
1328 ; Function Attrs: argmemonly nounwind readonly willreturn
1329 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #2
1331 ; Function Attrs: nounwind readnone willreturn
1332 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #3
1334 ; Function Attrs: argmemonly nounwind readonly willreturn
1335 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #2
1337 ; Function Attrs: argmemonly nounwind readonly willreturn
1338 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
1340 ; Function Attrs: argmemonly nounwind willreturn
1341 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4