1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -disable-mve-tail-predication=false -enable-arm-maskedldst=true %s -o - | FileCheck %s
; test_acc_scalar_char: reduction of sum += zext(b[i]) * zext(a) over N i8
; elements, vectorized 4-wide via @llvm.masked.load.v4i8. The autogenerated
; CHECK lines expect the <4 x i1> mask to be moved to a GPR (vmrs) and rebuilt
; with ubfx/rsbs/bfi, the masked byte load scalarized into per-lane predicated
; ldrb/vmov.32 inserts, all inside a dls/le low-overhead loop (i.e. the loop is
; NOT tail-predicated, despite -disable-mve-tail-predication=false).
; NOTE(review): this excerpt appears to be missing lines (gaps in the original
; numbering: e.g. the "entry:" label, the it/itt lines before the predicated
; loads, the final ret/closing brace) -- verify against the full file before
; regenerating checks with update_llc_test_checks.py.
4 define arm_aapcs_vfpcc i32 @test_acc_scalar_char(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
5 ; CHECK-LABEL: test_acc_scalar_char:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: mov r12, r0
8 ; CHECK-NEXT: movs r0, #0
9 ; CHECK-NEXT: cmp r2, #0
12 ; CHECK-NEXT: push {r4, lr}
13 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
14 ; CHECK-NEXT: sub sp, #8
15 ; CHECK-NEXT: adds r3, r2, #3
16 ; CHECK-NEXT: subs r2, #1
17 ; CHECK-NEXT: bic r3, r3, #3
18 ; CHECK-NEXT: vdup.32 q0, r2
19 ; CHECK-NEXT: sub.w lr, r3, #4
20 ; CHECK-NEXT: adr r2, .LCPI0_0
21 ; CHECK-NEXT: movs r3, #1
22 ; CHECK-NEXT: vldrw.u32 q1, [r2]
23 ; CHECK-NEXT: add.w lr, r3, lr, lsr #2
24 ; CHECK-NEXT: vmov.i32 q4, #0x0
25 ; CHECK-NEXT: vmov.i32 q2, #0xff
26 ; CHECK-NEXT: dls lr, lr
27 ; CHECK-NEXT: .LBB0_1: @ %vector.body
28 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
29 ; CHECK-NEXT: vmov q3, q4
30 ; CHECK-NEXT: vadd.i32 q4, q1, r0
31 ; CHECK-NEXT: vcmp.u32 cs, q0, q4
32 ; CHECK-NEXT: @ implicit-def: $q4
33 ; CHECK-NEXT: vmrs r3, p0
34 ; CHECK-NEXT: and r2, r3, #1
35 ; CHECK-NEXT: rsbs r4, r2, #0
36 ; CHECK-NEXT: movs r2, #0
37 ; CHECK-NEXT: bfi r2, r4, #0, #1
38 ; CHECK-NEXT: ubfx r4, r3, #4, #1
39 ; CHECK-NEXT: rsbs r4, r4, #0
40 ; CHECK-NEXT: bfi r2, r4, #1, #1
41 ; CHECK-NEXT: ubfx r4, r3, #8, #1
42 ; CHECK-NEXT: ubfx r3, r3, #12, #1
43 ; CHECK-NEXT: rsbs r4, r4, #0
44 ; CHECK-NEXT: bfi r2, r4, #2, #1
45 ; CHECK-NEXT: rsbs r3, r3, #0
46 ; CHECK-NEXT: bfi r2, r3, #3, #1
47 ; CHECK-NEXT: lsls r3, r2, #31
48 ; CHECK-NEXT: add.w r3, r1, r0
50 ; CHECK-NEXT: ldrbne r4, [r3]
51 ; CHECK-NEXT: vmovne.32 q4[0], r4
52 ; CHECK-NEXT: lsls r4, r2, #30
54 ; CHECK-NEXT: ldrbmi r4, [r3, #1]
55 ; CHECK-NEXT: vmovmi.32 q4[1], r4
56 ; CHECK-NEXT: lsls r4, r2, #29
58 ; CHECK-NEXT: ldrbmi r4, [r3, #2]
59 ; CHECK-NEXT: vmovmi.32 q4[2], r4
60 ; CHECK-NEXT: lsls r2, r2, #28
62 ; CHECK-NEXT: ldrbmi r2, [r3, #3]
63 ; CHECK-NEXT: vmovmi.32 q4[3], r2
64 ; CHECK-NEXT: vand q5, q4, q2
65 ; CHECK-NEXT: vmov q4, q3
66 ; CHECK-NEXT: adds r0, #4
67 ; CHECK-NEXT: vmla.u32 q4, q5, r12
68 ; CHECK-NEXT: le lr, .LBB0_1
69 ; CHECK-NEXT: @ %bb.2: @ %middle.block
70 ; CHECK-NEXT: vpsel q0, q4, q3
71 ; CHECK-NEXT: vaddv.u32 r0, q0
72 ; CHECK-NEXT: add sp, #8
73 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
74 ; CHECK-NEXT: pop.w {r4, lr}
76 ; CHECK-NEXT: .p2align 4
77 ; CHECK-NEXT: @ %bb.3:
78 ; CHECK-NEXT: .LCPI0_0:
79 ; CHECK-NEXT: .long 0 @ 0x0
80 ; CHECK-NEXT: .long 1 @ 0x1
81 ; CHECK-NEXT: .long 2 @ 0x2
82 ; CHECK-NEXT: .long 3 @ 0x3
; IR input below (the "entry:" label is absent from this excerpt).
84 %cmp7 = icmp eq i32 %N, 0
85 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
87 vector.ph: ; preds = %entry
88 %conv = zext i8 %a to i32
89 %n.rnd.up = add i32 %N, 3
90 %n.vec = and i32 %n.rnd.up, -4
91 %trip.count.minus.1 = add i32 %N, -1
92 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
93 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
94 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
95 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
98 vector.body: ; preds = %vector.body, %vector.ph
99 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
100 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
101 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
102 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
103 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
104 %0 = getelementptr inbounds i8, i8* %b, i32 %index
105 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
106 %2 = bitcast i8* %0 to <4 x i8>*
107 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
108 %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
109 %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
110 %5 = add nuw nsw <4 x i32> %4, %vec.phi
111 %index.next = add i32 %index, 4
112 %6 = icmp eq i32 %index.next, %n.vec
113 br i1 %6, label %middle.block, label %vector.body
115 middle.block: ; preds = %vector.body
116 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
117 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
118 br label %for.cond.cleanup
120 for.cond.cleanup: ; preds = %middle.block, %entry
121 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; test_acc_scalar_short: reduction of sum += sext(b[i]) * sext(a) over N i16
; elements via @llvm.masked.load.v4i16. CHECK lines (autogenerated) expect the
; mask to be expanded through vmrs/ubfx/bfi with per-lane predicated
; ldrh/vmov.32 inserts, followed by vmovlb.s16 for the sign extension, inside
; a dls/le low-overhead loop (no tail predication of the load itself).
; NOTE(review): the excerpt's own numbering has gaps here (e.g. the "entry:"
; label and it/itt predicate lines are missing) -- confirm against the full
; file before editing.
125 define arm_aapcs_vfpcc i32 @test_acc_scalar_short(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
126 ; CHECK-LABEL: test_acc_scalar_short:
127 ; CHECK: @ %bb.0: @ %entry
128 ; CHECK-NEXT: mov r12, r0
129 ; CHECK-NEXT: movs r0, #0
130 ; CHECK-NEXT: cmp r2, #0
132 ; CHECK-NEXT: bxeq lr
133 ; CHECK-NEXT: push {r4, lr}
134 ; CHECK-NEXT: vpush {d8, d9}
135 ; CHECK-NEXT: sub sp, #8
136 ; CHECK-NEXT: adds r3, r2, #3
137 ; CHECK-NEXT: subs r2, #1
138 ; CHECK-NEXT: bic r3, r3, #3
139 ; CHECK-NEXT: vdup.32 q0, r2
140 ; CHECK-NEXT: sub.w lr, r3, #4
141 ; CHECK-NEXT: adr r2, .LCPI1_0
142 ; CHECK-NEXT: movs r3, #1
143 ; CHECK-NEXT: vldrw.u32 q1, [r2]
144 ; CHECK-NEXT: add.w lr, r3, lr, lsr #2
145 ; CHECK-NEXT: vmov.i32 q3, #0x0
146 ; CHECK-NEXT: dls lr, lr
147 ; CHECK-NEXT: .LBB1_1: @ %vector.body
148 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
149 ; CHECK-NEXT: vmov q2, q3
150 ; CHECK-NEXT: vadd.i32 q3, q1, r0
151 ; CHECK-NEXT: vcmp.u32 cs, q0, q3
152 ; CHECK-NEXT: @ implicit-def: $q3
153 ; CHECK-NEXT: adds r0, #4
154 ; CHECK-NEXT: vmrs r3, p0
155 ; CHECK-NEXT: and r2, r3, #1
156 ; CHECK-NEXT: rsbs r4, r2, #0
157 ; CHECK-NEXT: movs r2, #0
158 ; CHECK-NEXT: bfi r2, r4, #0, #1
159 ; CHECK-NEXT: ubfx r4, r3, #4, #1
160 ; CHECK-NEXT: rsbs r4, r4, #0
161 ; CHECK-NEXT: bfi r2, r4, #1, #1
162 ; CHECK-NEXT: ubfx r4, r3, #8, #1
163 ; CHECK-NEXT: ubfx r3, r3, #12, #1
164 ; CHECK-NEXT: rsbs r4, r4, #0
165 ; CHECK-NEXT: bfi r2, r4, #2, #1
166 ; CHECK-NEXT: rsbs r3, r3, #0
167 ; CHECK-NEXT: bfi r2, r3, #3, #1
168 ; CHECK-NEXT: lsls r3, r2, #31
170 ; CHECK-NEXT: ldrhne r3, [r1]
171 ; CHECK-NEXT: vmovne.32 q3[0], r3
172 ; CHECK-NEXT: lsls r3, r2, #30
174 ; CHECK-NEXT: ldrhmi r3, [r1, #2]
175 ; CHECK-NEXT: vmovmi.32 q3[1], r3
176 ; CHECK-NEXT: lsls r3, r2, #29
178 ; CHECK-NEXT: ldrhmi r3, [r1, #4]
179 ; CHECK-NEXT: vmovmi.32 q3[2], r3
180 ; CHECK-NEXT: lsls r2, r2, #28
182 ; CHECK-NEXT: ldrhmi r2, [r1, #6]
183 ; CHECK-NEXT: vmovmi.32 q3[3], r2
184 ; CHECK-NEXT: vmovlb.s16 q4, q3
185 ; CHECK-NEXT: vmov q3, q2
186 ; CHECK-NEXT: adds r1, #8
187 ; CHECK-NEXT: vmla.u32 q3, q4, r12
188 ; CHECK-NEXT: le lr, .LBB1_1
189 ; CHECK-NEXT: @ %bb.2: @ %middle.block
190 ; CHECK-NEXT: vpsel q0, q3, q2
191 ; CHECK-NEXT: vaddv.u32 r0, q0
192 ; CHECK-NEXT: add sp, #8
193 ; CHECK-NEXT: vpop {d8, d9}
194 ; CHECK-NEXT: pop.w {r4, lr}
196 ; CHECK-NEXT: .p2align 4
197 ; CHECK-NEXT: @ %bb.3:
198 ; CHECK-NEXT: .LCPI1_0:
199 ; CHECK-NEXT: .long 0 @ 0x0
200 ; CHECK-NEXT: .long 1 @ 0x1
201 ; CHECK-NEXT: .long 2 @ 0x2
202 ; CHECK-NEXT: .long 3 @ 0x3
; IR input below (the "entry:" label is absent from this excerpt).
204 %cmp7 = icmp eq i32 %N, 0
205 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
207 vector.ph: ; preds = %entry
208 %conv = sext i16 %a to i32
209 %n.rnd.up = add i32 %N, 3
210 %n.vec = and i32 %n.rnd.up, -4
211 %trip.count.minus.1 = add i32 %N, -1
212 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
213 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
214 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
215 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
216 br label %vector.body
218 vector.body: ; preds = %vector.body, %vector.ph
219 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
220 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
221 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
222 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
223 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
224 %0 = getelementptr inbounds i16, i16* %b, i32 %index
225 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
226 %2 = bitcast i16* %0 to <4 x i16>*
227 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
228 %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
229 %4 = mul nsw <4 x i32> %broadcast.splat13, %3
230 %5 = add nsw <4 x i32> %4, %vec.phi
231 %index.next = add i32 %index, 4
232 %6 = icmp eq i32 %index.next, %n.vec
233 br i1 %6, label %middle.block, label %vector.body
235 middle.block: ; preds = %vector.body
236 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
237 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
238 br label %for.cond.cleanup
240 for.cond.cleanup: ; preds = %middle.block, %entry
241 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; test_acc_scalar_uchar: same computation and expected codegen as
; test_acc_scalar_char (zext'd i8 masked load, scalarized per-lane predicated
; ldrb/vmov.32 with vand #0xff, inside a dls/le loop); kept as a separate test
; for the unsigned-char variant of the source loop.
; NOTE(review): excerpt is missing lines (gaps in the embedded numbering:
; "entry:" label, it/itt lines, ret/closing brace) -- verify against the full
; file before regenerating checks.
245 define arm_aapcs_vfpcc i32 @test_acc_scalar_uchar(i8 zeroext %a, i8* nocapture readonly %b, i32 %N) {
246 ; CHECK-LABEL: test_acc_scalar_uchar:
247 ; CHECK: @ %bb.0: @ %entry
248 ; CHECK-NEXT: mov r12, r0
249 ; CHECK-NEXT: movs r0, #0
250 ; CHECK-NEXT: cmp r2, #0
252 ; CHECK-NEXT: bxeq lr
253 ; CHECK-NEXT: push {r4, lr}
254 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
255 ; CHECK-NEXT: sub sp, #8
256 ; CHECK-NEXT: adds r3, r2, #3
257 ; CHECK-NEXT: subs r2, #1
258 ; CHECK-NEXT: bic r3, r3, #3
259 ; CHECK-NEXT: vdup.32 q0, r2
260 ; CHECK-NEXT: sub.w lr, r3, #4
261 ; CHECK-NEXT: adr r2, .LCPI2_0
262 ; CHECK-NEXT: movs r3, #1
263 ; CHECK-NEXT: vldrw.u32 q1, [r2]
264 ; CHECK-NEXT: add.w lr, r3, lr, lsr #2
265 ; CHECK-NEXT: vmov.i32 q4, #0x0
266 ; CHECK-NEXT: vmov.i32 q2, #0xff
267 ; CHECK-NEXT: dls lr, lr
268 ; CHECK-NEXT: .LBB2_1: @ %vector.body
269 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
270 ; CHECK-NEXT: vmov q3, q4
271 ; CHECK-NEXT: vadd.i32 q4, q1, r0
272 ; CHECK-NEXT: vcmp.u32 cs, q0, q4
273 ; CHECK-NEXT: @ implicit-def: $q4
274 ; CHECK-NEXT: vmrs r3, p0
275 ; CHECK-NEXT: and r2, r3, #1
276 ; CHECK-NEXT: rsbs r4, r2, #0
277 ; CHECK-NEXT: movs r2, #0
278 ; CHECK-NEXT: bfi r2, r4, #0, #1
279 ; CHECK-NEXT: ubfx r4, r3, #4, #1
280 ; CHECK-NEXT: rsbs r4, r4, #0
281 ; CHECK-NEXT: bfi r2, r4, #1, #1
282 ; CHECK-NEXT: ubfx r4, r3, #8, #1
283 ; CHECK-NEXT: ubfx r3, r3, #12, #1
284 ; CHECK-NEXT: rsbs r4, r4, #0
285 ; CHECK-NEXT: bfi r2, r4, #2, #1
286 ; CHECK-NEXT: rsbs r3, r3, #0
287 ; CHECK-NEXT: bfi r2, r3, #3, #1
288 ; CHECK-NEXT: lsls r3, r2, #31
289 ; CHECK-NEXT: add.w r3, r1, r0
291 ; CHECK-NEXT: ldrbne r4, [r3]
292 ; CHECK-NEXT: vmovne.32 q4[0], r4
293 ; CHECK-NEXT: lsls r4, r2, #30
295 ; CHECK-NEXT: ldrbmi r4, [r3, #1]
296 ; CHECK-NEXT: vmovmi.32 q4[1], r4
297 ; CHECK-NEXT: lsls r4, r2, #29
299 ; CHECK-NEXT: ldrbmi r4, [r3, #2]
300 ; CHECK-NEXT: vmovmi.32 q4[2], r4
301 ; CHECK-NEXT: lsls r2, r2, #28
303 ; CHECK-NEXT: ldrbmi r2, [r3, #3]
304 ; CHECK-NEXT: vmovmi.32 q4[3], r2
305 ; CHECK-NEXT: vand q5, q4, q2
306 ; CHECK-NEXT: vmov q4, q3
307 ; CHECK-NEXT: adds r0, #4
308 ; CHECK-NEXT: vmla.u32 q4, q5, r12
309 ; CHECK-NEXT: le lr, .LBB2_1
310 ; CHECK-NEXT: @ %bb.2: @ %middle.block
311 ; CHECK-NEXT: vpsel q0, q4, q3
312 ; CHECK-NEXT: vaddv.u32 r0, q0
313 ; CHECK-NEXT: add sp, #8
314 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
315 ; CHECK-NEXT: pop.w {r4, lr}
317 ; CHECK-NEXT: .p2align 4
318 ; CHECK-NEXT: @ %bb.3:
319 ; CHECK-NEXT: .LCPI2_0:
320 ; CHECK-NEXT: .long 0 @ 0x0
321 ; CHECK-NEXT: .long 1 @ 0x1
322 ; CHECK-NEXT: .long 2 @ 0x2
323 ; CHECK-NEXT: .long 3 @ 0x3
; IR input below (the "entry:" label is absent from this excerpt).
325 %cmp7 = icmp eq i32 %N, 0
326 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
328 vector.ph: ; preds = %entry
329 %conv = zext i8 %a to i32
330 %n.rnd.up = add i32 %N, 3
331 %n.vec = and i32 %n.rnd.up, -4
332 %trip.count.minus.1 = add i32 %N, -1
333 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
334 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
335 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
336 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
337 br label %vector.body
339 vector.body: ; preds = %vector.body, %vector.ph
340 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
341 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
342 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
343 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
344 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
345 %0 = getelementptr inbounds i8, i8* %b, i32 %index
346 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
347 %2 = bitcast i8* %0 to <4 x i8>*
348 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %2, i32 1, <4 x i1> %1, <4 x i8> undef)
349 %3 = zext <4 x i8> %wide.masked.load to <4 x i32>
350 %4 = mul nuw nsw <4 x i32> %broadcast.splat13, %3
351 %5 = add nuw nsw <4 x i32> %4, %vec.phi
352 %index.next = add i32 %index, 4
353 %6 = icmp eq i32 %index.next, %n.vec
354 br i1 %6, label %middle.block, label %vector.body
356 middle.block: ; preds = %vector.body
357 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
358 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
359 br label %for.cond.cleanup
361 for.cond.cleanup: ; preds = %middle.block, %entry
362 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; test_acc_scalar_ushort: unsigned-short variant of test_acc_scalar_short --
; the scalar %a is still sext'd, but the masked <4 x i16> load is zext'd, so
; the CHECK lines expect vmovlb.u16 (unsigned bottom-lane widen) instead of
; vmovlb.s16; otherwise the same scalarized predicated-ldrh pattern in a
; dls/le low-overhead loop.
; NOTE(review): excerpt is missing lines (gaps in the embedded numbering:
; "entry:" label, it/itt lines, ret/closing brace) -- verify against the full
; file before regenerating checks.
366 define arm_aapcs_vfpcc i32 @test_acc_scalar_ushort(i16 signext %a, i16* nocapture readonly %b, i32 %N) {
367 ; CHECK-LABEL: test_acc_scalar_ushort:
368 ; CHECK: @ %bb.0: @ %entry
369 ; CHECK-NEXT: mov r12, r0
370 ; CHECK-NEXT: movs r0, #0
371 ; CHECK-NEXT: cmp r2, #0
373 ; CHECK-NEXT: bxeq lr
374 ; CHECK-NEXT: push {r4, lr}
375 ; CHECK-NEXT: vpush {d8, d9}
376 ; CHECK-NEXT: sub sp, #8
377 ; CHECK-NEXT: adds r3, r2, #3
378 ; CHECK-NEXT: subs r2, #1
379 ; CHECK-NEXT: bic r3, r3, #3
380 ; CHECK-NEXT: vdup.32 q0, r2
381 ; CHECK-NEXT: sub.w lr, r3, #4
382 ; CHECK-NEXT: adr r2, .LCPI3_0
383 ; CHECK-NEXT: movs r3, #1
384 ; CHECK-NEXT: vldrw.u32 q1, [r2]
385 ; CHECK-NEXT: add.w lr, r3, lr, lsr #2
386 ; CHECK-NEXT: vmov.i32 q3, #0x0
387 ; CHECK-NEXT: dls lr, lr
388 ; CHECK-NEXT: .LBB3_1: @ %vector.body
389 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
390 ; CHECK-NEXT: vmov q2, q3
391 ; CHECK-NEXT: vadd.i32 q3, q1, r0
392 ; CHECK-NEXT: vcmp.u32 cs, q0, q3
393 ; CHECK-NEXT: @ implicit-def: $q3
394 ; CHECK-NEXT: adds r0, #4
395 ; CHECK-NEXT: vmrs r3, p0
396 ; CHECK-NEXT: and r2, r3, #1
397 ; CHECK-NEXT: rsbs r4, r2, #0
398 ; CHECK-NEXT: movs r2, #0
399 ; CHECK-NEXT: bfi r2, r4, #0, #1
400 ; CHECK-NEXT: ubfx r4, r3, #4, #1
401 ; CHECK-NEXT: rsbs r4, r4, #0
402 ; CHECK-NEXT: bfi r2, r4, #1, #1
403 ; CHECK-NEXT: ubfx r4, r3, #8, #1
404 ; CHECK-NEXT: ubfx r3, r3, #12, #1
405 ; CHECK-NEXT: rsbs r4, r4, #0
406 ; CHECK-NEXT: bfi r2, r4, #2, #1
407 ; CHECK-NEXT: rsbs r3, r3, #0
408 ; CHECK-NEXT: bfi r2, r3, #3, #1
409 ; CHECK-NEXT: lsls r3, r2, #31
411 ; CHECK-NEXT: ldrhne r3, [r1]
412 ; CHECK-NEXT: vmovne.32 q3[0], r3
413 ; CHECK-NEXT: lsls r3, r2, #30
415 ; CHECK-NEXT: ldrhmi r3, [r1, #2]
416 ; CHECK-NEXT: vmovmi.32 q3[1], r3
417 ; CHECK-NEXT: lsls r3, r2, #29
419 ; CHECK-NEXT: ldrhmi r3, [r1, #4]
420 ; CHECK-NEXT: vmovmi.32 q3[2], r3
421 ; CHECK-NEXT: lsls r2, r2, #28
423 ; CHECK-NEXT: ldrhmi r2, [r1, #6]
424 ; CHECK-NEXT: vmovmi.32 q3[3], r2
425 ; CHECK-NEXT: vmovlb.u16 q4, q3
426 ; CHECK-NEXT: vmov q3, q2
427 ; CHECK-NEXT: adds r1, #8
428 ; CHECK-NEXT: vmla.u32 q3, q4, r12
429 ; CHECK-NEXT: le lr, .LBB3_1
430 ; CHECK-NEXT: @ %bb.2: @ %middle.block
431 ; CHECK-NEXT: vpsel q0, q3, q2
432 ; CHECK-NEXT: vaddv.u32 r0, q0
433 ; CHECK-NEXT: add sp, #8
434 ; CHECK-NEXT: vpop {d8, d9}
435 ; CHECK-NEXT: pop.w {r4, lr}
437 ; CHECK-NEXT: .p2align 4
438 ; CHECK-NEXT: @ %bb.3:
439 ; CHECK-NEXT: .LCPI3_0:
440 ; CHECK-NEXT: .long 0 @ 0x0
441 ; CHECK-NEXT: .long 1 @ 0x1
442 ; CHECK-NEXT: .long 2 @ 0x2
443 ; CHECK-NEXT: .long 3 @ 0x3
; IR input below (the "entry:" label is absent from this excerpt).
445 %cmp7 = icmp eq i32 %N, 0
446 br i1 %cmp7, label %for.cond.cleanup, label %vector.ph
448 vector.ph: ; preds = %entry
449 %conv = sext i16 %a to i32
450 %n.rnd.up = add i32 %N, 3
451 %n.vec = and i32 %n.rnd.up, -4
452 %trip.count.minus.1 = add i32 %N, -1
453 %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
454 %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
455 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %conv, i32 0
456 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
457 br label %vector.body
459 vector.body: ; preds = %vector.body, %vector.ph
460 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
461 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %5, %vector.body ]
462 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
463 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
464 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
465 %0 = getelementptr inbounds i16, i16* %b, i32 %index
466 %1 = icmp ule <4 x i32> %induction, %broadcast.splat11
467 %2 = bitcast i16* %0 to <4 x i16>*
468 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
469 %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
470 %4 = mul nsw <4 x i32> %broadcast.splat13, %3
471 %5 = add nsw <4 x i32> %4, %vec.phi
472 %index.next = add i32 %index, 4
473 %6 = icmp eq i32 %index.next, %n.vec
474 br i1 %6, label %middle.block, label %vector.body
476 middle.block: ; preds = %vector.body
477 %7 = select <4 x i1> %1, <4 x i32> %5, <4 x i32> %vec.phi
478 %8 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %7)
479 br label %for.cond.cleanup
481 for.cond.cleanup: ; preds = %middle.block, %entry
482 %res.0.lcssa = phi i32 [ 0, %entry ], [ %8, %middle.block ]
; test_acc_scalar_int: reduction of sum += b[i] * a over N i32 elements via
; @llvm.masked.load.v4i32. Unlike the narrower-element tests above, here the
; CHECK lines expect proper tail predication: vctp.32 generates the mask and
; the load becomes a predicated vldrwt.u32 inside the dls/le loop, with a
; final vctp.32/vpsel/vaddv.u32 reduction in %middle.block.
; NOTE(review): excerpt is missing lines (gaps in the embedded numbering:
; "entry:" label, the vpst before vldrwt, ret/closing brace) -- verify against
; the full file before regenerating checks.
486 define arm_aapcs_vfpcc i32 @test_acc_scalar_int(i32 %a, i32* nocapture readonly %b, i32 %N) {
487 ; CHECK-LABEL: test_acc_scalar_int:
488 ; CHECK: @ %bb.0: @ %entry
489 ; CHECK-NEXT: cmp r2, #0
491 ; CHECK-NEXT: moveq r0, #0
492 ; CHECK-NEXT: bxeq lr
493 ; CHECK-NEXT: push {r7, lr}
494 ; CHECK-NEXT: adds r3, r2, #3
495 ; CHECK-NEXT: vmov.i32 q0, #0x0
496 ; CHECK-NEXT: bic r3, r3, #3
497 ; CHECK-NEXT: sub.w r12, r3, #4
498 ; CHECK-NEXT: movs r3, #1
499 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
500 ; CHECK-NEXT: dls lr, lr
501 ; CHECK-NEXT: .LBB4_1: @ %vector.body
502 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
503 ; CHECK-NEXT: subs r2, #4
504 ; CHECK-NEXT: vmov q1, q0
505 ; CHECK-NEXT: vctp.32 r2
507 ; CHECK-NEXT: vldrwt.u32 q2, [r1]
508 ; CHECK-NEXT: adds r1, #16
509 ; CHECK-NEXT: vmla.u32 q0, q2, r0
510 ; CHECK-NEXT: le lr, .LBB4_1
511 ; CHECK-NEXT: @ %bb.2: @ %middle.block
512 ; CHECK-NEXT: vctp.32 r2
513 ; CHECK-NEXT: vpsel q0, q0, q1
514 ; CHECK-NEXT: vaddv.u32 r0, q0
515 ; CHECK-NEXT: pop {r7, pc}
; IR input below (the "entry:" label is absent from this excerpt).
517 %cmp6 = icmp eq i32 %N, 0
518 br i1 %cmp6, label %for.cond.cleanup, label %vector.ph
520 vector.ph: ; preds = %entry
521 %n.rnd.up = add i32 %N, 3
522 %n.vec = and i32 %n.rnd.up, -4
523 %trip.count.minus.1 = add i32 %N, -1
524 %broadcast.splatinsert9 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
525 %broadcast.splat10 = shufflevector <4 x i32> %broadcast.splatinsert9, <4 x i32> undef, <4 x i32> zeroinitializer
526 %broadcast.splatinsert11 = insertelement <4 x i32> undef, i32 %a, i32 0
527 %broadcast.splat12 = shufflevector <4 x i32> %broadcast.splatinsert11, <4 x i32> undef, <4 x i32> zeroinitializer
528 br label %vector.body
530 vector.body: ; preds = %vector.body, %vector.ph
531 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
532 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %4, %vector.body ]
533 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
534 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
535 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
536 %0 = getelementptr inbounds i32, i32* %b, i32 %index
537 %1 = icmp ule <4 x i32> %induction, %broadcast.splat10
538 %2 = bitcast i32* %0 to <4 x i32>*
539 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %2, i32 4, <4 x i1> %1, <4 x i32> undef)
540 %3 = mul nsw <4 x i32> %wide.masked.load, %broadcast.splat12
541 %4 = add nsw <4 x i32> %3, %vec.phi
542 %index.next = add i32 %index, 4
543 %5 = icmp eq i32 %index.next, %n.vec
544 br i1 %5, label %middle.block, label %vector.body
546 middle.block: ; preds = %vector.body
547 %6 = select <4 x i1> %1, <4 x i32> %4, <4 x i32> %vec.phi
548 %7 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %6)
549 br label %for.cond.cleanup
551 for.cond.cleanup: ; preds = %middle.block, %entry
552 %res.0.lcssa = phi i32 [ 0, %entry ], [ %7, %middle.block ]
556 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) {
557 ; CHECK-LABEL: test_vec_mul_scalar_add_char:
558 ; CHECK: @ %bb.0: @ %entry
559 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
560 ; CHECK-NEXT: sub sp, #4
561 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
562 ; CHECK-NEXT: sub sp, #8
563 ; CHECK-NEXT: ldr.w r12, [sp, #72]
564 ; CHECK-NEXT: cmp.w r12, #0
565 ; CHECK-NEXT: beq.w .LBB5_12
566 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
567 ; CHECK-NEXT: add.w r5, r3, r12, lsl #2
568 ; CHECK-NEXT: add.w r6, r1, r12
569 ; CHECK-NEXT: cmp r5, r1
570 ; CHECK-NEXT: add.w r4, r0, r12
571 ; CHECK-NEXT: cset r7, hi
572 ; CHECK-NEXT: cmp r6, r3
573 ; CHECK-NEXT: cset r6, hi
574 ; CHECK-NEXT: cmp r5, r0
575 ; CHECK-NEXT: cset r5, hi
576 ; CHECK-NEXT: cmp r4, r3
577 ; CHECK-NEXT: cset r4, hi
578 ; CHECK-NEXT: ands r5, r4
579 ; CHECK-NEXT: lsls r5, r5, #31
581 ; CHECK-NEXT: andeq r7, r6
582 ; CHECK-NEXT: lslseq.w r7, r7, #31
583 ; CHECK-NEXT: beq .LBB5_4
584 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
585 ; CHECK-NEXT: sub.w r4, r12, #1
586 ; CHECK-NEXT: and lr, r12, #3
587 ; CHECK-NEXT: cmp r4, #3
588 ; CHECK-NEXT: bhs.w .LBB5_6
589 ; CHECK-NEXT: @ %bb.3:
590 ; CHECK-NEXT: movs r7, #0
591 ; CHECK-NEXT: b .LBB5_9
592 ; CHECK-NEXT: .LBB5_4: @ %vector.ph
593 ; CHECK-NEXT: add.w r7, r12, #3
594 ; CHECK-NEXT: adr r5, .LCPI5_0
595 ; CHECK-NEXT: bic r7, r7, #3
596 ; CHECK-NEXT: sub.w r4, r12, #1
597 ; CHECK-NEXT: subs r7, #4
598 ; CHECK-NEXT: movs r6, #1
599 ; CHECK-NEXT: vldrw.u32 q1, [r5]
600 ; CHECK-NEXT: vdup.32 q0, r4
601 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
602 ; CHECK-NEXT: movs r4, #0
603 ; CHECK-NEXT: vmov.i32 q2, #0xff
604 ; CHECK-NEXT: vmov.i32 q3, #0xff
605 ; CHECK-NEXT: dls lr, lr
606 ; CHECK-NEXT: .LBB5_5: @ %vector.body
607 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
608 ; CHECK-NEXT: vadd.i32 q4, q1, r4
609 ; CHECK-NEXT: @ implicit-def: $q5
610 ; CHECK-NEXT: sub.w r12, r12, #4
611 ; CHECK-NEXT: vcmp.u32 cs, q0, q4
612 ; CHECK-NEXT: @ implicit-def: $q4
613 ; CHECK-NEXT: vmrs r6, p0
614 ; CHECK-NEXT: and r5, r6, #1
615 ; CHECK-NEXT: rsbs r7, r5, #0
616 ; CHECK-NEXT: movs r5, #0
617 ; CHECK-NEXT: bfi r5, r7, #0, #1
618 ; CHECK-NEXT: ubfx r7, r6, #4, #1
619 ; CHECK-NEXT: rsbs r7, r7, #0
620 ; CHECK-NEXT: bfi r5, r7, #1, #1
621 ; CHECK-NEXT: ubfx r7, r6, #8, #1
622 ; CHECK-NEXT: ubfx r6, r6, #12, #1
623 ; CHECK-NEXT: rsbs r7, r7, #0
624 ; CHECK-NEXT: bfi r5, r7, #2, #1
625 ; CHECK-NEXT: rsbs r6, r6, #0
626 ; CHECK-NEXT: bfi r5, r6, #3, #1
627 ; CHECK-NEXT: lsls r6, r5, #31
628 ; CHECK-NEXT: add.w r6, r0, r4
630 ; CHECK-NEXT: ldrbne r7, [r6]
631 ; CHECK-NEXT: vmovne.32 q4[0], r7
632 ; CHECK-NEXT: lsls r7, r5, #30
634 ; CHECK-NEXT: ldrbmi r7, [r6, #1]
635 ; CHECK-NEXT: vmovmi.32 q4[1], r7
636 ; CHECK-NEXT: lsls r7, r5, #29
638 ; CHECK-NEXT: ldrbmi r7, [r6, #2]
639 ; CHECK-NEXT: vmovmi.32 q4[2], r7
640 ; CHECK-NEXT: lsls r5, r5, #28
642 ; CHECK-NEXT: ldrbmi r5, [r6, #3]
643 ; CHECK-NEXT: vmovmi.32 q4[3], r5
644 ; CHECK-NEXT: vmrs r6, p0
645 ; CHECK-NEXT: vand q4, q4, q2
646 ; CHECK-NEXT: and r5, r6, #1
647 ; CHECK-NEXT: rsbs r7, r5, #0
648 ; CHECK-NEXT: movs r5, #0
649 ; CHECK-NEXT: bfi r5, r7, #0, #1
650 ; CHECK-NEXT: ubfx r7, r6, #4, #1
651 ; CHECK-NEXT: rsbs r7, r7, #0
652 ; CHECK-NEXT: bfi r5, r7, #1, #1
653 ; CHECK-NEXT: ubfx r7, r6, #8, #1
654 ; CHECK-NEXT: ubfx r6, r6, #12, #1
655 ; CHECK-NEXT: rsbs r7, r7, #0
656 ; CHECK-NEXT: bfi r5, r7, #2, #1
657 ; CHECK-NEXT: rsbs r6, r6, #0
658 ; CHECK-NEXT: bfi r5, r6, #3, #1
659 ; CHECK-NEXT: lsls r6, r5, #31
660 ; CHECK-NEXT: add.w r6, r1, r4
662 ; CHECK-NEXT: ldrbne r7, [r6]
663 ; CHECK-NEXT: vmovne.32 q5[0], r7
664 ; CHECK-NEXT: lsls r7, r5, #30
666 ; CHECK-NEXT: ldrbmi r7, [r6, #1]
667 ; CHECK-NEXT: vmovmi.32 q5[1], r7
668 ; CHECK-NEXT: lsls r7, r5, #29
670 ; CHECK-NEXT: ldrbmi r7, [r6, #2]
671 ; CHECK-NEXT: vmovmi.32 q5[2], r7
672 ; CHECK-NEXT: lsls r5, r5, #28
674 ; CHECK-NEXT: ldrbmi r5, [r6, #3]
675 ; CHECK-NEXT: vmovmi.32 q5[3], r5
676 ; CHECK-NEXT: vand q5, q5, q3
677 ; CHECK-NEXT: vctp.32 r12
678 ; CHECK-NEXT: vmul.i32 q4, q5, q4
679 ; CHECK-NEXT: adds r4, #4
680 ; CHECK-NEXT: vadd.i32 q4, q4, r2
682 ; CHECK-NEXT: vstrwt.32 q4, [r3]
683 ; CHECK-NEXT: adds r3, #16
684 ; CHECK-NEXT: le lr, .LBB5_5
685 ; CHECK-NEXT: b .LBB5_12
686 ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
687 ; CHECK-NEXT: sub.w r12, lr, r12
688 ; CHECK-NEXT: subs r4, r1, #3
689 ; CHECK-NEXT: subs r5, r0, #3
690 ; CHECK-NEXT: sub.w r7, r3, #16
691 ; CHECK-NEXT: mov.w r9, #0
692 ; CHECK-NEXT: .LBB5_7: @ %for.body
693 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
694 ; CHECK-NEXT: ldrb.w r8, [r5, #3]
695 ; CHECK-NEXT: sub.w r9, r9, #4
696 ; CHECK-NEXT: ldrb r6, [r4, #3]
697 ; CHECK-NEXT: cmp r12, r9
698 ; CHECK-NEXT: smlabb r6, r6, r8, r2
699 ; CHECK-NEXT: str r6, [r7, #16]!
700 ; CHECK-NEXT: ldrb r8, [r5, #4]!
701 ; CHECK-NEXT: ldrb r6, [r4, #4]!
702 ; CHECK-NEXT: smlabb r6, r6, r8, r2
703 ; CHECK-NEXT: str r6, [r7, #4]
704 ; CHECK-NEXT: ldrb.w r8, [r5, #1]
705 ; CHECK-NEXT: ldrb r6, [r4, #1]
706 ; CHECK-NEXT: smlabb r6, r6, r8, r2
707 ; CHECK-NEXT: str r6, [r7, #8]
708 ; CHECK-NEXT: ldrb.w r8, [r5, #2]
709 ; CHECK-NEXT: ldrb r6, [r4, #2]
710 ; CHECK-NEXT: smlabb r6, r6, r8, r2
711 ; CHECK-NEXT: str r6, [r7, #12]
712 ; CHECK-NEXT: bne .LBB5_7
713 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit
714 ; CHECK-NEXT: rsb.w r7, r9, #0
715 ; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup.loopexit.unr-lcssa
716 ; CHECK-NEXT: wls lr, lr, .LBB5_12
717 ; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader
718 ; CHECK-NEXT: subs r7, #1
719 ; CHECK-NEXT: add r0, r7
720 ; CHECK-NEXT: add r1, r7
721 ; CHECK-NEXT: add.w r3, r3, r7, lsl #2
722 ; CHECK-NEXT: .LBB5_11: @ %for.body.epil
723 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
724 ; CHECK-NEXT: ldrb r7, [r0, #1]!
725 ; CHECK-NEXT: ldrb r6, [r1, #1]!
726 ; CHECK-NEXT: smlabb r7, r6, r7, r2
727 ; CHECK-NEXT: str r7, [r3, #4]!
728 ; CHECK-NEXT: le lr, .LBB5_11
729 ; CHECK-NEXT: .LBB5_12: @ %for.cond.cleanup
730 ; CHECK-NEXT: add sp, #8
731 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
732 ; CHECK-NEXT: add sp, #4
733 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
734 ; CHECK-NEXT: .p2align 4
735 ; CHECK-NEXT: @ %bb.13:
736 ; CHECK-NEXT: .LCPI5_0:
737 ; CHECK-NEXT: .long 0 @ 0x0
738 ; CHECK-NEXT: .long 1 @ 0x1
739 ; CHECK-NEXT: .long 2 @ 0x2
740 ; CHECK-NEXT: .long 3 @ 0x3
742 %res12 = bitcast i32* %res to i8*
743 %cmp10 = icmp eq i32 %N, 0
744 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph
746 for.body.lr.ph: ; preds = %entry
747 %conv3 = zext i8 %c to i32
748 %scevgep = getelementptr i32, i32* %res, i32 %N
749 %scevgep13 = bitcast i32* %scevgep to i8*
750 %scevgep14 = getelementptr i8, i8* %a, i32 %N
751 %scevgep15 = getelementptr i8, i8* %b, i32 %N
752 %bound0 = icmp ugt i8* %scevgep14, %res12
753 %bound1 = icmp ugt i8* %scevgep13, %a
754 %found.conflict = and i1 %bound0, %bound1
755 %bound016 = icmp ugt i8* %scevgep15, %res12
756 %bound117 = icmp ugt i8* %scevgep13, %b
757 %found.conflict18 = and i1 %bound016, %bound117
758 %conflict.rdx = or i1 %found.conflict, %found.conflict18
759 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
761 for.body.preheader: ; preds = %for.body.lr.ph
763 %xtraiter = and i32 %N, 3
764 %1 = icmp ult i32 %0, 3
765 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
767 for.body.preheader.new: ; preds = %for.body.preheader
768 %unroll_iter = sub i32 %N, %xtraiter
771 vector.ph: ; preds = %for.body.lr.ph
772 %n.rnd.up = add i32 %N, 3
773 %n.vec = and i32 %n.rnd.up, -4
774 %trip.count.minus.1 = add i32 %N, -1
775 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
776 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
777 %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
778 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
779 br label %vector.body
781 vector.body: ; preds = %vector.body, %vector.ph
782 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
783 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
784 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
785 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
786 %2 = getelementptr inbounds i8, i8* %a, i32 %index
787 %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
788 %4 = bitcast i8* %2 to <4 x i8>*
789 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
790 %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
791 %6 = getelementptr inbounds i8, i8* %b, i32 %index
792 %7 = bitcast i8* %6 to <4 x i8>*
793 %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef)
794 %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32>
795 %9 = mul nuw nsw <4 x i32> %8, %5
796 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23
797 %11 = getelementptr inbounds i32, i32* %res, i32 %index
798 %12 = bitcast i32* %11 to <4 x i32>*
799 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3)
800 %index.next = add i32 %index, 4
801 %13 = icmp eq i32 %index.next, %n.vec
802 br i1 %13, label %for.cond.cleanup, label %vector.body
804 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
805 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
806 %lcmp.mod = icmp eq i32 %xtraiter, 0
807 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
809 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
810 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
811 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
812 %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil
813 %14 = load i8, i8* %arrayidx.epil, align 1
814 %conv.epil = zext i8 %14 to i32
815 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil
816 %15 = load i8, i8* %arrayidx1.epil, align 1
817 %conv2.epil = zext i8 %15 to i32
818 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
819 %add.epil = add nuw nsw i32 %mul.epil, %conv3
820 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil
821 store i32 %add.epil, i32* %arrayidx4.epil, align 4
822 %inc.epil = add nuw i32 %i.011.epil, 1
823 %epil.iter.sub = add i32 %epil.iter, -1
824 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
825 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
827 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
830 for.body: ; preds = %for.body, %for.body.preheader.new
831 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
832 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
833 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
834 %16 = load i8, i8* %arrayidx, align 1
835 %conv = zext i8 %16 to i32
836 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
837 %17 = load i8, i8* %arrayidx1, align 1
838 %conv2 = zext i8 %17 to i32
839 %mul = mul nuw nsw i32 %conv2, %conv
840 %add = add nuw nsw i32 %mul, %conv3
841 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011
842 store i32 %add, i32* %arrayidx4, align 4
843 %inc = or i32 %i.011, 1
844 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc
845 %18 = load i8, i8* %arrayidx.1, align 1
846 %conv.1 = zext i8 %18 to i32
847 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc
848 %19 = load i8, i8* %arrayidx1.1, align 1
849 %conv2.1 = zext i8 %19 to i32
850 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
851 %add.1 = add nuw nsw i32 %mul.1, %conv3
852 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc
853 store i32 %add.1, i32* %arrayidx4.1, align 4
854 %inc.1 = or i32 %i.011, 2
855 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1
856 %20 = load i8, i8* %arrayidx.2, align 1
857 %conv.2 = zext i8 %20 to i32
858 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1
859 %21 = load i8, i8* %arrayidx1.2, align 1
860 %conv2.2 = zext i8 %21 to i32
861 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
862 %add.2 = add nuw nsw i32 %mul.2, %conv3
863 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
864 store i32 %add.2, i32* %arrayidx4.2, align 4
865 %inc.2 = or i32 %i.011, 3
866 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2
867 %22 = load i8, i8* %arrayidx.3, align 1
868 %conv.3 = zext i8 %22 to i32
869 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2
870 %23 = load i8, i8* %arrayidx1.3, align 1
871 %conv2.3 = zext i8 %23 to i32
872 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
873 %add.3 = add nuw nsw i32 %mul.3, %conv3
874 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
875 store i32 %add.3, i32* %arrayidx4.3, align 4
876 %inc.3 = add nuw i32 %i.011, 4
877 %niter.nsub.3 = add i32 %niter, -4
878 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
879 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
882 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_short(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
883 ; CHECK-LABEL: test_vec_mul_scalar_add_short:
884 ; CHECK: @ %bb.0: @ %entry
885 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
886 ; CHECK-NEXT: sub sp, #8
887 ; CHECK-NEXT: ldr.w r12, [sp, #28]
888 ; CHECK-NEXT: cmp.w r12, #0
889 ; CHECK-NEXT: beq.w .LBB6_3
890 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
891 ; CHECK-NEXT: add.w r5, r12, #3
892 ; CHECK-NEXT: movs r4, #1
893 ; CHECK-NEXT: bic r5, r5, #3
894 ; CHECK-NEXT: subs r5, #4
895 ; CHECK-NEXT: add.w lr, r4, r5, lsr #2
896 ; CHECK-NEXT: adr r5, .LCPI6_0
897 ; CHECK-NEXT: sub.w r4, r12, #1
898 ; CHECK-NEXT: vldrw.u32 q1, [r5]
899 ; CHECK-NEXT: vdup.32 q0, r4
900 ; CHECK-NEXT: movs r4, #0
901 ; CHECK-NEXT: dls lr, lr
902 ; CHECK-NEXT: .LBB6_2: @ %vector.body
903 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
904 ; CHECK-NEXT: vadd.i32 q2, q1, r4
905 ; CHECK-NEXT: @ implicit-def: $q3
906 ; CHECK-NEXT: sub.w r12, r12, #4
907 ; CHECK-NEXT: vcmp.u32 cs, q0, q2
908 ; CHECK-NEXT: @ implicit-def: $q2
909 ; CHECK-NEXT: adds r4, #4
910 ; CHECK-NEXT: vmrs r6, p0
911 ; CHECK-NEXT: and r5, r6, #1
912 ; CHECK-NEXT: rsbs r7, r5, #0
913 ; CHECK-NEXT: movs r5, #0
914 ; CHECK-NEXT: bfi r5, r7, #0, #1
915 ; CHECK-NEXT: ubfx r7, r6, #4, #1
916 ; CHECK-NEXT: rsbs r7, r7, #0
917 ; CHECK-NEXT: bfi r5, r7, #1, #1
918 ; CHECK-NEXT: ubfx r7, r6, #8, #1
919 ; CHECK-NEXT: ubfx r6, r6, #12, #1
920 ; CHECK-NEXT: rsbs r7, r7, #0
921 ; CHECK-NEXT: bfi r5, r7, #2, #1
922 ; CHECK-NEXT: rsbs r6, r6, #0
923 ; CHECK-NEXT: bfi r5, r6, #3, #1
924 ; CHECK-NEXT: lsls r6, r5, #31
926 ; CHECK-NEXT: ldrhne r6, [r0]
927 ; CHECK-NEXT: vmovne.32 q2[0], r6
928 ; CHECK-NEXT: lsls r6, r5, #30
930 ; CHECK-NEXT: ldrhmi r6, [r0, #2]
931 ; CHECK-NEXT: vmovmi.32 q2[1], r6
932 ; CHECK-NEXT: lsls r6, r5, #29
934 ; CHECK-NEXT: ldrhmi r6, [r0, #4]
935 ; CHECK-NEXT: vmovmi.32 q2[2], r6
936 ; CHECK-NEXT: lsls r5, r5, #28
938 ; CHECK-NEXT: ldrhmi r5, [r0, #6]
939 ; CHECK-NEXT: vmovmi.32 q2[3], r5
940 ; CHECK-NEXT: vmrs r6, p0
941 ; CHECK-NEXT: vmovlb.s16 q2, q2
942 ; CHECK-NEXT: adds r0, #8
943 ; CHECK-NEXT: and r5, r6, #1
944 ; CHECK-NEXT: rsbs r7, r5, #0
945 ; CHECK-NEXT: movs r5, #0
946 ; CHECK-NEXT: bfi r5, r7, #0, #1
947 ; CHECK-NEXT: ubfx r7, r6, #4, #1
948 ; CHECK-NEXT: rsbs r7, r7, #0
949 ; CHECK-NEXT: bfi r5, r7, #1, #1
950 ; CHECK-NEXT: ubfx r7, r6, #8, #1
951 ; CHECK-NEXT: ubfx r6, r6, #12, #1
952 ; CHECK-NEXT: rsbs r7, r7, #0
953 ; CHECK-NEXT: bfi r5, r7, #2, #1
954 ; CHECK-NEXT: rsbs r6, r6, #0
955 ; CHECK-NEXT: bfi r5, r6, #3, #1
956 ; CHECK-NEXT: lsls r6, r5, #31
958 ; CHECK-NEXT: ldrhne r6, [r1]
959 ; CHECK-NEXT: vmovne.32 q3[0], r6
960 ; CHECK-NEXT: lsls r6, r5, #30
962 ; CHECK-NEXT: ldrhmi r6, [r1, #2]
963 ; CHECK-NEXT: vmovmi.32 q3[1], r6
964 ; CHECK-NEXT: lsls r6, r5, #29
966 ; CHECK-NEXT: ldrhmi r6, [r1, #4]
967 ; CHECK-NEXT: vmovmi.32 q3[2], r6
968 ; CHECK-NEXT: lsls r5, r5, #28
970 ; CHECK-NEXT: ldrhmi r5, [r1, #6]
971 ; CHECK-NEXT: vmovmi.32 q3[3], r5
972 ; CHECK-NEXT: vmovlb.s16 q3, q3
973 ; CHECK-NEXT: vctp.32 r12
974 ; CHECK-NEXT: vmul.i32 q2, q3, q2
975 ; CHECK-NEXT: adds r1, #8
976 ; CHECK-NEXT: vadd.i32 q2, q2, r2
978 ; CHECK-NEXT: vstrwt.32 q2, [r3]
979 ; CHECK-NEXT: adds r3, #16
980 ; CHECK-NEXT: le lr, .LBB6_2
981 ; CHECK-NEXT: .LBB6_3: @ %for.cond.cleanup
982 ; CHECK-NEXT: add sp, #8
983 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
984 ; CHECK-NEXT: .p2align 4
985 ; CHECK-NEXT: @ %bb.4:
986 ; CHECK-NEXT: .LCPI6_0:
987 ; CHECK-NEXT: .long 0 @ 0x0
988 ; CHECK-NEXT: .long 1 @ 0x1
989 ; CHECK-NEXT: .long 2 @ 0x2
990 ; CHECK-NEXT: .long 3 @ 0x3
992 %cmp10 = icmp eq i32 %N, 0
993 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
995 vector.ph: ; preds = %entry
996 %conv3 = sext i16 %c to i32
997 %n.rnd.up = add i32 %N, 3
998 %n.vec = and i32 %n.rnd.up, -4
999 %trip.count.minus.1 = add i32 %N, -1
1000 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1001 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
1002 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
1003 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
1004 br label %vector.body
1006 vector.body: ; preds = %vector.body, %vector.ph
1007 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1008 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1009 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1010 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1011 %0 = getelementptr inbounds i16, i16* %a, i32 %index
1012 %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
1013 %2 = bitcast i16* %0 to <4 x i16>*
1014 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
1015 %3 = sext <4 x i16> %wide.masked.load to <4 x i32>
1016 %4 = getelementptr inbounds i16, i16* %b, i32 %index
1017 %5 = bitcast i16* %4 to <4 x i16>*
1018 %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef)
1019 %6 = sext <4 x i16> %wide.masked.load14 to <4 x i32>
1020 %7 = mul nsw <4 x i32> %6, %3
1021 %8 = add nsw <4 x i32> %7, %broadcast.splat16
1022 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1023 %10 = bitcast i32* %9 to <4 x i32>*
1024 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1)
1025 %index.next = add i32 %index, 4
1026 %11 = icmp eq i32 %index.next, %n.vec
1027 br i1 %11, label %for.cond.cleanup, label %vector.body
1029 for.cond.cleanup: ; preds = %vector.body, %entry
1033 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonly %a, i8* nocapture readonly %b, i8 zeroext %c, i32* nocapture %res, i32 %N) {
1034 ; CHECK-LABEL: test_vec_mul_scalar_add_uchar:
1035 ; CHECK: @ %bb.0: @ %entry
1036 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
1037 ; CHECK-NEXT: sub sp, #4
1038 ; CHECK-NEXT: vpush {d8, d9, d10, d11}
1039 ; CHECK-NEXT: sub sp, #8
1040 ; CHECK-NEXT: ldr.w r12, [sp, #72]
1041 ; CHECK-NEXT: cmp.w r12, #0
1042 ; CHECK-NEXT: beq.w .LBB7_12
1043 ; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
1044 ; CHECK-NEXT: add.w r5, r3, r12, lsl #2
1045 ; CHECK-NEXT: add.w r6, r1, r12
1046 ; CHECK-NEXT: cmp r5, r1
1047 ; CHECK-NEXT: add.w r4, r0, r12
1048 ; CHECK-NEXT: cset r7, hi
1049 ; CHECK-NEXT: cmp r6, r3
1050 ; CHECK-NEXT: cset r6, hi
1051 ; CHECK-NEXT: cmp r5, r0
1052 ; CHECK-NEXT: cset r5, hi
1053 ; CHECK-NEXT: cmp r4, r3
1054 ; CHECK-NEXT: cset r4, hi
1055 ; CHECK-NEXT: ands r5, r4
1056 ; CHECK-NEXT: lsls r5, r5, #31
1057 ; CHECK-NEXT: itt eq
1058 ; CHECK-NEXT: andeq r7, r6
1059 ; CHECK-NEXT: lslseq.w r7, r7, #31
1060 ; CHECK-NEXT: beq .LBB7_4
1061 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
1062 ; CHECK-NEXT: sub.w r4, r12, #1
1063 ; CHECK-NEXT: and lr, r12, #3
1064 ; CHECK-NEXT: cmp r4, #3
1065 ; CHECK-NEXT: bhs.w .LBB7_6
1066 ; CHECK-NEXT: @ %bb.3:
1067 ; CHECK-NEXT: movs r7, #0
1068 ; CHECK-NEXT: b .LBB7_9
1069 ; CHECK-NEXT: .LBB7_4: @ %vector.ph
1070 ; CHECK-NEXT: add.w r7, r12, #3
1071 ; CHECK-NEXT: adr r5, .LCPI7_0
1072 ; CHECK-NEXT: bic r7, r7, #3
1073 ; CHECK-NEXT: sub.w r4, r12, #1
1074 ; CHECK-NEXT: subs r7, #4
1075 ; CHECK-NEXT: movs r6, #1
1076 ; CHECK-NEXT: vldrw.u32 q1, [r5]
1077 ; CHECK-NEXT: vdup.32 q0, r4
1078 ; CHECK-NEXT: add.w lr, r6, r7, lsr #2
1079 ; CHECK-NEXT: movs r4, #0
1080 ; CHECK-NEXT: vmov.i32 q2, #0xff
1081 ; CHECK-NEXT: vmov.i32 q3, #0xff
1082 ; CHECK-NEXT: dls lr, lr
1083 ; CHECK-NEXT: .LBB7_5: @ %vector.body
1084 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1085 ; CHECK-NEXT: vadd.i32 q4, q1, r4
1086 ; CHECK-NEXT: @ implicit-def: $q5
1087 ; CHECK-NEXT: sub.w r12, r12, #4
1088 ; CHECK-NEXT: vcmp.u32 cs, q0, q4
1089 ; CHECK-NEXT: @ implicit-def: $q4
1090 ; CHECK-NEXT: vmrs r6, p0
1091 ; CHECK-NEXT: and r5, r6, #1
1092 ; CHECK-NEXT: rsbs r7, r5, #0
1093 ; CHECK-NEXT: movs r5, #0
1094 ; CHECK-NEXT: bfi r5, r7, #0, #1
1095 ; CHECK-NEXT: ubfx r7, r6, #4, #1
1096 ; CHECK-NEXT: rsbs r7, r7, #0
1097 ; CHECK-NEXT: bfi r5, r7, #1, #1
1098 ; CHECK-NEXT: ubfx r7, r6, #8, #1
1099 ; CHECK-NEXT: ubfx r6, r6, #12, #1
1100 ; CHECK-NEXT: rsbs r7, r7, #0
1101 ; CHECK-NEXT: bfi r5, r7, #2, #1
1102 ; CHECK-NEXT: rsbs r6, r6, #0
1103 ; CHECK-NEXT: bfi r5, r6, #3, #1
1104 ; CHECK-NEXT: lsls r6, r5, #31
1105 ; CHECK-NEXT: add.w r6, r0, r4
1106 ; CHECK-NEXT: itt ne
1107 ; CHECK-NEXT: ldrbne r7, [r6]
1108 ; CHECK-NEXT: vmovne.32 q4[0], r7
1109 ; CHECK-NEXT: lsls r7, r5, #30
1110 ; CHECK-NEXT: itt mi
1111 ; CHECK-NEXT: ldrbmi r7, [r6, #1]
1112 ; CHECK-NEXT: vmovmi.32 q4[1], r7
1113 ; CHECK-NEXT: lsls r7, r5, #29
1114 ; CHECK-NEXT: itt mi
1115 ; CHECK-NEXT: ldrbmi r7, [r6, #2]
1116 ; CHECK-NEXT: vmovmi.32 q4[2], r7
1117 ; CHECK-NEXT: lsls r5, r5, #28
1118 ; CHECK-NEXT: itt mi
1119 ; CHECK-NEXT: ldrbmi r5, [r6, #3]
1120 ; CHECK-NEXT: vmovmi.32 q4[3], r5
1121 ; CHECK-NEXT: vmrs r6, p0
1122 ; CHECK-NEXT: vand q4, q4, q2
1123 ; CHECK-NEXT: and r5, r6, #1
1124 ; CHECK-NEXT: rsbs r7, r5, #0
1125 ; CHECK-NEXT: movs r5, #0
1126 ; CHECK-NEXT: bfi r5, r7, #0, #1
1127 ; CHECK-NEXT: ubfx r7, r6, #4, #1
1128 ; CHECK-NEXT: rsbs r7, r7, #0
1129 ; CHECK-NEXT: bfi r5, r7, #1, #1
1130 ; CHECK-NEXT: ubfx r7, r6, #8, #1
1131 ; CHECK-NEXT: ubfx r6, r6, #12, #1
1132 ; CHECK-NEXT: rsbs r7, r7, #0
1133 ; CHECK-NEXT: bfi r5, r7, #2, #1
1134 ; CHECK-NEXT: rsbs r6, r6, #0
1135 ; CHECK-NEXT: bfi r5, r6, #3, #1
1136 ; CHECK-NEXT: lsls r6, r5, #31
1137 ; CHECK-NEXT: add.w r6, r1, r4
1138 ; CHECK-NEXT: itt ne
1139 ; CHECK-NEXT: ldrbne r7, [r6]
1140 ; CHECK-NEXT: vmovne.32 q5[0], r7
1141 ; CHECK-NEXT: lsls r7, r5, #30
1142 ; CHECK-NEXT: itt mi
1143 ; CHECK-NEXT: ldrbmi r7, [r6, #1]
1144 ; CHECK-NEXT: vmovmi.32 q5[1], r7
1145 ; CHECK-NEXT: lsls r7, r5, #29
1146 ; CHECK-NEXT: itt mi
1147 ; CHECK-NEXT: ldrbmi r7, [r6, #2]
1148 ; CHECK-NEXT: vmovmi.32 q5[2], r7
1149 ; CHECK-NEXT: lsls r5, r5, #28
1150 ; CHECK-NEXT: itt mi
1151 ; CHECK-NEXT: ldrbmi r5, [r6, #3]
1152 ; CHECK-NEXT: vmovmi.32 q5[3], r5
1153 ; CHECK-NEXT: vand q5, q5, q3
1154 ; CHECK-NEXT: vctp.32 r12
1155 ; CHECK-NEXT: vmul.i32 q4, q5, q4
1156 ; CHECK-NEXT: adds r4, #4
1157 ; CHECK-NEXT: vadd.i32 q4, q4, r2
1159 ; CHECK-NEXT: vstrwt.32 q4, [r3]
1160 ; CHECK-NEXT: adds r3, #16
1161 ; CHECK-NEXT: le lr, .LBB7_5
1162 ; CHECK-NEXT: b .LBB7_12
1163 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
1164 ; CHECK-NEXT: sub.w r12, lr, r12
1165 ; CHECK-NEXT: subs r4, r1, #3
1166 ; CHECK-NEXT: subs r5, r0, #3
1167 ; CHECK-NEXT: sub.w r7, r3, #16
1168 ; CHECK-NEXT: mov.w r9, #0
1169 ; CHECK-NEXT: .LBB7_7: @ %for.body
1170 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1171 ; CHECK-NEXT: ldrb.w r8, [r5, #3]
1172 ; CHECK-NEXT: sub.w r9, r9, #4
1173 ; CHECK-NEXT: ldrb r6, [r4, #3]
1174 ; CHECK-NEXT: cmp r12, r9
1175 ; CHECK-NEXT: smlabb r6, r6, r8, r2
1176 ; CHECK-NEXT: str r6, [r7, #16]!
1177 ; CHECK-NEXT: ldrb r8, [r5, #4]!
1178 ; CHECK-NEXT: ldrb r6, [r4, #4]!
1179 ; CHECK-NEXT: smlabb r6, r6, r8, r2
1180 ; CHECK-NEXT: str r6, [r7, #4]
1181 ; CHECK-NEXT: ldrb.w r8, [r5, #1]
1182 ; CHECK-NEXT: ldrb r6, [r4, #1]
1183 ; CHECK-NEXT: smlabb r6, r6, r8, r2
1184 ; CHECK-NEXT: str r6, [r7, #8]
1185 ; CHECK-NEXT: ldrb.w r8, [r5, #2]
1186 ; CHECK-NEXT: ldrb r6, [r4, #2]
1187 ; CHECK-NEXT: smlabb r6, r6, r8, r2
1188 ; CHECK-NEXT: str r6, [r7, #12]
1189 ; CHECK-NEXT: bne .LBB7_7
1190 ; CHECK-NEXT: @ %bb.8: @ %for.cond.cleanup.loopexit.unr-lcssa.loopexit
1191 ; CHECK-NEXT: rsb.w r7, r9, #0
1192 ; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup.loopexit.unr-lcssa
1193 ; CHECK-NEXT: wls lr, lr, .LBB7_12
1194 ; CHECK-NEXT: @ %bb.10: @ %for.body.epil.preheader
1195 ; CHECK-NEXT: subs r7, #1
1196 ; CHECK-NEXT: add r0, r7
1197 ; CHECK-NEXT: add r1, r7
1198 ; CHECK-NEXT: add.w r3, r3, r7, lsl #2
1199 ; CHECK-NEXT: .LBB7_11: @ %for.body.epil
1200 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1201 ; CHECK-NEXT: ldrb r7, [r0, #1]!
1202 ; CHECK-NEXT: ldrb r6, [r1, #1]!
1203 ; CHECK-NEXT: smlabb r7, r6, r7, r2
1204 ; CHECK-NEXT: str r7, [r3, #4]!
1205 ; CHECK-NEXT: le lr, .LBB7_11
1206 ; CHECK-NEXT: .LBB7_12: @ %for.cond.cleanup
1207 ; CHECK-NEXT: add sp, #8
1208 ; CHECK-NEXT: vpop {d8, d9, d10, d11}
1209 ; CHECK-NEXT: add sp, #4
1210 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
1211 ; CHECK-NEXT: .p2align 4
1212 ; CHECK-NEXT: @ %bb.13:
1213 ; CHECK-NEXT: .LCPI7_0:
1214 ; CHECK-NEXT: .long 0 @ 0x0
1215 ; CHECK-NEXT: .long 1 @ 0x1
1216 ; CHECK-NEXT: .long 2 @ 0x2
1217 ; CHECK-NEXT: .long 3 @ 0x3
1219 %res12 = bitcast i32* %res to i8*
1220 %cmp10 = icmp eq i32 %N, 0
1221 br i1 %cmp10, label %for.cond.cleanup, label %for.body.lr.ph
1223 for.body.lr.ph: ; preds = %entry
1224 %conv3 = zext i8 %c to i32
1225 %scevgep = getelementptr i32, i32* %res, i32 %N
1226 %scevgep13 = bitcast i32* %scevgep to i8*
1227 %scevgep14 = getelementptr i8, i8* %a, i32 %N
1228 %scevgep15 = getelementptr i8, i8* %b, i32 %N
1229 %bound0 = icmp ugt i8* %scevgep14, %res12
1230 %bound1 = icmp ugt i8* %scevgep13, %a
1231 %found.conflict = and i1 %bound0, %bound1
1232 %bound016 = icmp ugt i8* %scevgep15, %res12
1233 %bound117 = icmp ugt i8* %scevgep13, %b
1234 %found.conflict18 = and i1 %bound016, %bound117
1235 %conflict.rdx = or i1 %found.conflict, %found.conflict18
1236 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
1238 for.body.preheader: ; preds = %for.body.lr.ph
1240 %xtraiter = and i32 %N, 3
1241 %1 = icmp ult i32 %0, 3
1242 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1244 for.body.preheader.new: ; preds = %for.body.preheader
1245 %unroll_iter = sub i32 %N, %xtraiter
1248 vector.ph: ; preds = %for.body.lr.ph
1249 %n.rnd.up = add i32 %N, 3
1250 %n.vec = and i32 %n.rnd.up, -4
1251 %trip.count.minus.1 = add i32 %N, -1
1252 %broadcast.splatinsert19 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1253 %broadcast.splat20 = shufflevector <4 x i32> %broadcast.splatinsert19, <4 x i32> undef, <4 x i32> zeroinitializer
1254 %broadcast.splatinsert22 = insertelement <4 x i32> undef, i32 %conv3, i32 0
1255 %broadcast.splat23 = shufflevector <4 x i32> %broadcast.splatinsert22, <4 x i32> undef, <4 x i32> zeroinitializer
1256 br label %vector.body
1258 vector.body: ; preds = %vector.body, %vector.ph
1259 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1260 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1261 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1262 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1263 %2 = getelementptr inbounds i8, i8* %a, i32 %index
1264 %3 = icmp ule <4 x i32> %induction, %broadcast.splat20
1265 %4 = bitcast i8* %2 to <4 x i8>*
1266 %wide.masked.load = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %4, i32 1, <4 x i1> %3, <4 x i8> undef)
1267 %5 = zext <4 x i8> %wide.masked.load to <4 x i32>
1268 %6 = getelementptr inbounds i8, i8* %b, i32 %index
1269 %7 = bitcast i8* %6 to <4 x i8>*
1270 %wide.masked.load21 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %7, i32 1, <4 x i1> %3, <4 x i8> undef)
1271 %8 = zext <4 x i8> %wide.masked.load21 to <4 x i32>
1272 %9 = mul nuw nsw <4 x i32> %8, %5
1273 %10 = add nuw nsw <4 x i32> %9, %broadcast.splat23
1274 %11 = getelementptr inbounds i32, i32* %res, i32 %index
1275 %12 = bitcast i32* %11 to <4 x i32>*
1276 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %10, <4 x i32>* %12, i32 4, <4 x i1> %3)
1277 %index.next = add i32 %index, 4
1278 %13 = icmp eq i32 %index.next, %n.vec
1279 br i1 %13, label %for.cond.cleanup, label %vector.body
1281 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1282 %i.011.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1283 %lcmp.mod = icmp eq i32 %xtraiter, 0
1284 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1286 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1287 %i.011.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.011.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1288 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1289 %arrayidx.epil = getelementptr inbounds i8, i8* %a, i32 %i.011.epil
1290 %14 = load i8, i8* %arrayidx.epil, align 1
1291 %conv.epil = zext i8 %14 to i32
1292 %arrayidx1.epil = getelementptr inbounds i8, i8* %b, i32 %i.011.epil
1293 %15 = load i8, i8* %arrayidx1.epil, align 1
1294 %conv2.epil = zext i8 %15 to i32
1295 %mul.epil = mul nuw nsw i32 %conv2.epil, %conv.epil
1296 %add.epil = add nuw nsw i32 %mul.epil, %conv3
1297 %arrayidx4.epil = getelementptr inbounds i32, i32* %res, i32 %i.011.epil
1298 store i32 %add.epil, i32* %arrayidx4.epil, align 4
1299 %inc.epil = add nuw i32 %i.011.epil, 1
1300 %epil.iter.sub = add i32 %epil.iter, -1
1301 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1302 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1304 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1307 for.body: ; preds = %for.body, %for.body.preheader.new
1308 %i.011 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1309 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1310 %arrayidx = getelementptr inbounds i8, i8* %a, i32 %i.011
1311 %16 = load i8, i8* %arrayidx, align 1
1312 %conv = zext i8 %16 to i32
1313 %arrayidx1 = getelementptr inbounds i8, i8* %b, i32 %i.011
1314 %17 = load i8, i8* %arrayidx1, align 1
1315 %conv2 = zext i8 %17 to i32
1316 %mul = mul nuw nsw i32 %conv2, %conv
1317 %add = add nuw nsw i32 %mul, %conv3
1318 %arrayidx4 = getelementptr inbounds i32, i32* %res, i32 %i.011
1319 store i32 %add, i32* %arrayidx4, align 4
1320 %inc = or i32 %i.011, 1
1321 %arrayidx.1 = getelementptr inbounds i8, i8* %a, i32 %inc
1322 %18 = load i8, i8* %arrayidx.1, align 1
1323 %conv.1 = zext i8 %18 to i32
1324 %arrayidx1.1 = getelementptr inbounds i8, i8* %b, i32 %inc
1325 %19 = load i8, i8* %arrayidx1.1, align 1
1326 %conv2.1 = zext i8 %19 to i32
1327 %mul.1 = mul nuw nsw i32 %conv2.1, %conv.1
1328 %add.1 = add nuw nsw i32 %mul.1, %conv3
1329 %arrayidx4.1 = getelementptr inbounds i32, i32* %res, i32 %inc
1330 store i32 %add.1, i32* %arrayidx4.1, align 4
1331 %inc.1 = or i32 %i.011, 2
1332 %arrayidx.2 = getelementptr inbounds i8, i8* %a, i32 %inc.1
1333 %20 = load i8, i8* %arrayidx.2, align 1
1334 %conv.2 = zext i8 %20 to i32
1335 %arrayidx1.2 = getelementptr inbounds i8, i8* %b, i32 %inc.1
1336 %21 = load i8, i8* %arrayidx1.2, align 1
1337 %conv2.2 = zext i8 %21 to i32
1338 %mul.2 = mul nuw nsw i32 %conv2.2, %conv.2
1339 %add.2 = add nuw nsw i32 %mul.2, %conv3
1340 %arrayidx4.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
1341 store i32 %add.2, i32* %arrayidx4.2, align 4
1342 %inc.2 = or i32 %i.011, 3
1343 %arrayidx.3 = getelementptr inbounds i8, i8* %a, i32 %inc.2
1344 %22 = load i8, i8* %arrayidx.3, align 1
1345 %conv.3 = zext i8 %22 to i32
1346 %arrayidx1.3 = getelementptr inbounds i8, i8* %b, i32 %inc.2
1347 %23 = load i8, i8* %arrayidx1.3, align 1
1348 %conv2.3 = zext i8 %23 to i32
1349 %mul.3 = mul nuw nsw i32 %conv2.3, %conv.3
1350 %add.3 = add nuw nsw i32 %mul.3, %conv3
1351 %arrayidx4.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
1352 store i32 %add.3, i32* %arrayidx4.3, align 4
1353 %inc.3 = add nuw i32 %i.011, 4
1354 %niter.nsub.3 = add i32 %niter, -4
1355 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1356 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
1359 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_ushort(i16* nocapture readonly %a, i16* nocapture readonly %b, i16 signext %c, i32* nocapture %res, i32 %N) {
1360 ; CHECK-LABEL: test_vec_mul_scalar_add_ushort:
1361 ; CHECK: @ %bb.0: @ %entry
1362 ; CHECK-NEXT: push {r4, r5, r6, r7, lr}
1363 ; CHECK-NEXT: sub sp, #8
1364 ; CHECK-NEXT: ldr.w r12, [sp, #28]
1365 ; CHECK-NEXT: cmp.w r12, #0
1366 ; CHECK-NEXT: beq.w .LBB8_3
1367 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1368 ; CHECK-NEXT: add.w r5, r12, #3
1369 ; CHECK-NEXT: movs r4, #1
1370 ; CHECK-NEXT: bic r5, r5, #3
1371 ; CHECK-NEXT: subs r5, #4
1372 ; CHECK-NEXT: add.w lr, r4, r5, lsr #2
1373 ; CHECK-NEXT: adr r5, .LCPI8_0
1374 ; CHECK-NEXT: sub.w r4, r12, #1
1375 ; CHECK-NEXT: vldrw.u32 q1, [r5]
1376 ; CHECK-NEXT: vdup.32 q0, r4
1377 ; CHECK-NEXT: movs r4, #0
1378 ; CHECK-NEXT: dls lr, lr
1379 ; CHECK-NEXT: .LBB8_2: @ %vector.body
1380 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1381 ; CHECK-NEXT: vadd.i32 q2, q1, r4
1382 ; CHECK-NEXT: @ implicit-def: $q3
1383 ; CHECK-NEXT: sub.w r12, r12, #4
1384 ; CHECK-NEXT: vcmp.u32 cs, q0, q2
1385 ; CHECK-NEXT: @ implicit-def: $q2
1386 ; CHECK-NEXT: adds r4, #4
1387 ; CHECK-NEXT: vmrs r6, p0
1388 ; CHECK-NEXT: and r5, r6, #1
1389 ; CHECK-NEXT: rsbs r7, r5, #0
1390 ; CHECK-NEXT: movs r5, #0
1391 ; CHECK-NEXT: bfi r5, r7, #0, #1
1392 ; CHECK-NEXT: ubfx r7, r6, #4, #1
1393 ; CHECK-NEXT: rsbs r7, r7, #0
1394 ; CHECK-NEXT: bfi r5, r7, #1, #1
1395 ; CHECK-NEXT: ubfx r7, r6, #8, #1
1396 ; CHECK-NEXT: ubfx r6, r6, #12, #1
1397 ; CHECK-NEXT: rsbs r7, r7, #0
1398 ; CHECK-NEXT: bfi r5, r7, #2, #1
1399 ; CHECK-NEXT: rsbs r6, r6, #0
1400 ; CHECK-NEXT: bfi r5, r6, #3, #1
1401 ; CHECK-NEXT: lsls r6, r5, #31
1402 ; CHECK-NEXT: itt ne
1403 ; CHECK-NEXT: ldrhne r6, [r0]
1404 ; CHECK-NEXT: vmovne.32 q2[0], r6
1405 ; CHECK-NEXT: lsls r6, r5, #30
1406 ; CHECK-NEXT: itt mi
1407 ; CHECK-NEXT: ldrhmi r6, [r0, #2]
1408 ; CHECK-NEXT: vmovmi.32 q2[1], r6
1409 ; CHECK-NEXT: lsls r6, r5, #29
1410 ; CHECK-NEXT: itt mi
1411 ; CHECK-NEXT: ldrhmi r6, [r0, #4]
1412 ; CHECK-NEXT: vmovmi.32 q2[2], r6
1413 ; CHECK-NEXT: lsls r5, r5, #28
1414 ; CHECK-NEXT: itt mi
1415 ; CHECK-NEXT: ldrhmi r5, [r0, #6]
1416 ; CHECK-NEXT: vmovmi.32 q2[3], r5
1417 ; CHECK-NEXT: vmrs r6, p0
1418 ; CHECK-NEXT: vmovlb.u16 q2, q2
1419 ; CHECK-NEXT: adds r0, #8
1420 ; CHECK-NEXT: and r5, r6, #1
1421 ; CHECK-NEXT: rsbs r7, r5, #0
1422 ; CHECK-NEXT: movs r5, #0
1423 ; CHECK-NEXT: bfi r5, r7, #0, #1
1424 ; CHECK-NEXT: ubfx r7, r6, #4, #1
1425 ; CHECK-NEXT: rsbs r7, r7, #0
1426 ; CHECK-NEXT: bfi r5, r7, #1, #1
1427 ; CHECK-NEXT: ubfx r7, r6, #8, #1
1428 ; CHECK-NEXT: ubfx r6, r6, #12, #1
1429 ; CHECK-NEXT: rsbs r7, r7, #0
1430 ; CHECK-NEXT: bfi r5, r7, #2, #1
1431 ; CHECK-NEXT: rsbs r6, r6, #0
1432 ; CHECK-NEXT: bfi r5, r6, #3, #1
1433 ; CHECK-NEXT: lsls r6, r5, #31
1434 ; CHECK-NEXT: itt ne
1435 ; CHECK-NEXT: ldrhne r6, [r1]
1436 ; CHECK-NEXT: vmovne.32 q3[0], r6
1437 ; CHECK-NEXT: lsls r6, r5, #30
1438 ; CHECK-NEXT: itt mi
1439 ; CHECK-NEXT: ldrhmi r6, [r1, #2]
1440 ; CHECK-NEXT: vmovmi.32 q3[1], r6
1441 ; CHECK-NEXT: lsls r6, r5, #29
1442 ; CHECK-NEXT: itt mi
1443 ; CHECK-NEXT: ldrhmi r6, [r1, #4]
1444 ; CHECK-NEXT: vmovmi.32 q3[2], r6
1445 ; CHECK-NEXT: lsls r5, r5, #28
1446 ; CHECK-NEXT: itt mi
1447 ; CHECK-NEXT: ldrhmi r5, [r1, #6]
1448 ; CHECK-NEXT: vmovmi.32 q3[3], r5
1449 ; CHECK-NEXT: vmovlb.u16 q3, q3
1450 ; CHECK-NEXT: vctp.32 r12
1451 ; CHECK-NEXT: vmul.i32 q2, q3, q2
1452 ; CHECK-NEXT: adds r1, #8
1453 ; CHECK-NEXT: vadd.i32 q2, q2, r2
1455 ; CHECK-NEXT: vstrwt.32 q2, [r3]
1456 ; CHECK-NEXT: adds r3, #16
1457 ; CHECK-NEXT: le lr, .LBB8_2
1458 ; CHECK-NEXT: .LBB8_3: @ %for.cond.cleanup
1459 ; CHECK-NEXT: add sp, #8
1460 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
1461 ; CHECK-NEXT: .p2align 4
1462 ; CHECK-NEXT: @ %bb.4:
1463 ; CHECK-NEXT: .LCPI8_0:
1464 ; CHECK-NEXT: .long 0 @ 0x0
1465 ; CHECK-NEXT: .long 1 @ 0x1
1466 ; CHECK-NEXT: .long 2 @ 0x2
1467 ; CHECK-NEXT: .long 3 @ 0x3
1469 %cmp10 = icmp eq i32 %N, 0
1470 br i1 %cmp10, label %for.cond.cleanup, label %vector.ph
1472 vector.ph: ; preds = %entry
1473 %conv3 = sext i16 %c to i32
1474 %n.rnd.up = add i32 %N, 3
1475 %n.vec = and i32 %n.rnd.up, -4
1476 %trip.count.minus.1 = add i32 %N, -1
1477 %broadcast.splatinsert12 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1478 %broadcast.splat13 = shufflevector <4 x i32> %broadcast.splatinsert12, <4 x i32> undef, <4 x i32> zeroinitializer
1479 %broadcast.splatinsert15 = insertelement <4 x i32> undef, i32 %conv3, i32 0
1480 %broadcast.splat16 = shufflevector <4 x i32> %broadcast.splatinsert15, <4 x i32> undef, <4 x i32> zeroinitializer
1481 br label %vector.body
1483 vector.body: ; preds = %vector.body, %vector.ph
1484 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1485 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1486 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1487 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1488 %0 = getelementptr inbounds i16, i16* %a, i32 %index
1489 %1 = icmp ule <4 x i32> %induction, %broadcast.splat13
1490 %2 = bitcast i16* %0 to <4 x i16>*
1491 %wide.masked.load = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %2, i32 2, <4 x i1> %1, <4 x i16> undef)
1492 %3 = zext <4 x i16> %wide.masked.load to <4 x i32>
1493 %4 = getelementptr inbounds i16, i16* %b, i32 %index
1494 %5 = bitcast i16* %4 to <4 x i16>*
1495 %wide.masked.load14 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %5, i32 2, <4 x i1> %1, <4 x i16> undef)
1496 %6 = zext <4 x i16> %wide.masked.load14 to <4 x i32>
1497 %7 = mul nuw nsw <4 x i32> %6, %3
1498 %8 = add nsw <4 x i32> %7, %broadcast.splat16
1499 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1500 %10 = bitcast i32* %9 to <4 x i32>*
1501 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %1)
1502 %index.next = add i32 %index, 4
1503 %11 = icmp eq i32 %index.next, %n.vec
1504 br i1 %11, label %for.cond.cleanup, label %vector.body
1506 for.cond.cleanup: ; preds = %vector.body, %entry
1510 define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_int(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %c, i32* nocapture %res, i32 %N) {
1511 ; CHECK-LABEL: test_vec_mul_scalar_add_int:
1512 ; CHECK: @ %bb.0: @ %entry
1513 ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
1514 ; CHECK-NEXT: ldr.w r12, [sp, #32]
1515 ; CHECK-NEXT: cmp.w r12, #0
1516 ; CHECK-NEXT: beq.w .LBB9_11
1517 ; CHECK-NEXT: @ %bb.1: @ %vector.memcheck
1518 ; CHECK-NEXT: add.w r4, r3, r12, lsl #2
1519 ; CHECK-NEXT: add.w r5, r1, r12, lsl #2
1520 ; CHECK-NEXT: cmp r4, r1
1521 ; CHECK-NEXT: add.w r6, r0, r12, lsl #2
1522 ; CHECK-NEXT: cset r7, hi
1523 ; CHECK-NEXT: cmp r5, r3
1524 ; CHECK-NEXT: cset r5, hi
1525 ; CHECK-NEXT: cmp r4, r0
1526 ; CHECK-NEXT: cset r4, hi
1527 ; CHECK-NEXT: cmp r6, r3
1528 ; CHECK-NEXT: cset r6, hi
1529 ; CHECK-NEXT: mov.w lr, #1
1530 ; CHECK-NEXT: ands r6, r4
1531 ; CHECK-NEXT: lsls r6, r6, #31
1532 ; CHECK-NEXT: itt eq
1533 ; CHECK-NEXT: andeq.w r4, r5, r7
1534 ; CHECK-NEXT: lslseq.w r4, r4, #31
1535 ; CHECK-NEXT: beq .LBB9_4
1536 ; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
1537 ; CHECK-NEXT: sub.w r4, r12, #1
1538 ; CHECK-NEXT: and r5, r12, #3
1539 ; CHECK-NEXT: cmp r4, #3
1540 ; CHECK-NEXT: bhs .LBB9_6
1541 ; CHECK-NEXT: @ %bb.3:
1542 ; CHECK-NEXT: mov r10, r5
1543 ; CHECK-NEXT: mov.w r12, #0
1544 ; CHECK-NEXT: b .LBB9_8
1545 ; CHECK-NEXT: .LBB9_4: @ %vector.ph
1546 ; CHECK-NEXT: add.w r4, r12, #3
1547 ; CHECK-NEXT: bic r4, r4, #3
1548 ; CHECK-NEXT: subs r4, #4
1549 ; CHECK-NEXT: add.w lr, lr, r4, lsr #2
1550 ; CHECK-NEXT: dls lr, lr
1551 ; CHECK-NEXT: .LBB9_5: @ %vector.body
1552 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1553 ; CHECK-NEXT: sub.w r12, r12, #4
1554 ; CHECK-NEXT: vctp.32 r12
1556 ; CHECK-NEXT: vldrwt.u32 q0, [r0]
1557 ; CHECK-NEXT: vldrwt.u32 q1, [r1]
1558 ; CHECK-NEXT: adds r0, #16
1559 ; CHECK-NEXT: vmul.i32 q0, q1, q0
1560 ; CHECK-NEXT: adds r1, #16
1561 ; CHECK-NEXT: vadd.i32 q0, q0, r2
1563 ; CHECK-NEXT: vstrwt.32 q0, [r3]
1564 ; CHECK-NEXT: adds r3, #16
1565 ; CHECK-NEXT: le lr, .LBB9_5
1566 ; CHECK-NEXT: b .LBB9_11
1567 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new
1568 ; CHECK-NEXT: sub.w r7, r12, r5
1569 ; CHECK-NEXT: mov r10, r5
1570 ; CHECK-NEXT: subs r7, #4
1571 ; CHECK-NEXT: movs r4, #0
1572 ; CHECK-NEXT: mov.w r12, #0
1573 ; CHECK-NEXT: add.w lr, lr, r7, lsr #2
1574 ; CHECK-NEXT: dls lr, lr
1575 ; CHECK-NEXT: .LBB9_7: @ %for.body
1576 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1577 ; CHECK-NEXT: ldr r5, [r0, r4]
1578 ; CHECK-NEXT: add.w r9, r0, r4
1579 ; CHECK-NEXT: ldr r6, [r1, r4]
1580 ; CHECK-NEXT: adds r7, r1, r4
1581 ; CHECK-NEXT: add.w r12, r12, #4
1582 ; CHECK-NEXT: mla r5, r6, r5, r2
1583 ; CHECK-NEXT: str r5, [r3, r4]
1584 ; CHECK-NEXT: ldr.w r8, [r9, #4]
1585 ; CHECK-NEXT: ldr r6, [r7, #4]
1586 ; CHECK-NEXT: mla r8, r6, r8, r2
1587 ; CHECK-NEXT: adds r6, r3, r4
1588 ; CHECK-NEXT: adds r4, #16
1589 ; CHECK-NEXT: str.w r8, [r6, #4]
1590 ; CHECK-NEXT: ldr.w r8, [r9, #8]
1591 ; CHECK-NEXT: ldr r5, [r7, #8]
1592 ; CHECK-NEXT: mla r5, r5, r8, r2
1593 ; CHECK-NEXT: str r5, [r6, #8]
1594 ; CHECK-NEXT: ldr.w r5, [r9, #12]
1595 ; CHECK-NEXT: ldr r7, [r7, #12]
1596 ; CHECK-NEXT: mla r5, r7, r5, r2
1597 ; CHECK-NEXT: str r5, [r6, #12]
1598 ; CHECK-NEXT: le lr, .LBB9_7
1599 ; CHECK-NEXT: .LBB9_8: @ %for.cond.cleanup.loopexit.unr-lcssa
1600 ; CHECK-NEXT: wls lr, r10, .LBB9_11
1601 ; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
1602 ; CHECK-NEXT: mvn r7, #3
1603 ; CHECK-NEXT: mov lr, r10
1604 ; CHECK-NEXT: add.w r7, r7, r12, lsl #2
1605 ; CHECK-NEXT: add r0, r7
1606 ; CHECK-NEXT: add r1, r7
1607 ; CHECK-NEXT: add r3, r7
1608 ; CHECK-NEXT: .LBB9_10: @ %for.body.epil
1609 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1610 ; CHECK-NEXT: ldr r7, [r0, #4]!
1611 ; CHECK-NEXT: ldr r6, [r1, #4]!
1612 ; CHECK-NEXT: mla r7, r6, r7, r2
1613 ; CHECK-NEXT: str r7, [r3, #4]!
1614 ; CHECK-NEXT: le lr, .LBB9_10
1615 ; CHECK-NEXT: .LBB9_11: @ %for.cond.cleanup
1616 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
1618 %cmp8 = icmp eq i32 %N, 0
1619 br i1 %cmp8, label %for.cond.cleanup, label %vector.memcheck
1621 vector.memcheck: ; preds = %entry
1622 %scevgep = getelementptr i32, i32* %res, i32 %N
1623 %scevgep13 = getelementptr i32, i32* %a, i32 %N
1624 %scevgep16 = getelementptr i32, i32* %b, i32 %N
1625 %bound0 = icmp ugt i32* %scevgep13, %res
1626 %bound1 = icmp ugt i32* %scevgep, %a
1627 %found.conflict = and i1 %bound0, %bound1
1628 %bound018 = icmp ugt i32* %scevgep16, %res
1629 %bound119 = icmp ugt i32* %scevgep, %b
1630 %found.conflict20 = and i1 %bound018, %bound119
1631 %conflict.rdx = or i1 %found.conflict, %found.conflict20
1632 br i1 %conflict.rdx, label %for.body.preheader, label %vector.ph
1634 for.body.preheader: ; preds = %vector.memcheck
1636 %xtraiter = and i32 %N, 3
1637 %1 = icmp ult i32 %0, 3
1638 br i1 %1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body.preheader.new
1640 for.body.preheader.new: ; preds = %for.body.preheader
1641 %unroll_iter = sub i32 %N, %xtraiter
1644 vector.ph: ; preds = %vector.memcheck
1645 %n.rnd.up = add i32 %N, 3
1646 %n.vec = and i32 %n.rnd.up, -4
1647 %trip.count.minus.1 = add i32 %N, -1
1648 %broadcast.splatinsert21 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
1649 %broadcast.splat22 = shufflevector <4 x i32> %broadcast.splatinsert21, <4 x i32> undef, <4 x i32> zeroinitializer
1650 %broadcast.splatinsert24 = insertelement <4 x i32> undef, i32 %c, i32 0
1651 %broadcast.splat25 = shufflevector <4 x i32> %broadcast.splatinsert24, <4 x i32> undef, <4 x i32> zeroinitializer
1652 br label %vector.body
1654 vector.body: ; preds = %vector.body, %vector.ph
1655 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1656 %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
1657 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
1658 %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
1659 %2 = getelementptr inbounds i32, i32* %a, i32 %index
1660 %3 = icmp ule <4 x i32> %induction, %broadcast.splat22
1661 %4 = bitcast i32* %2 to <4 x i32>*
1662 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %3, <4 x i32> undef)
1663 %5 = getelementptr inbounds i32, i32* %b, i32 %index
1664 %6 = bitcast i32* %5 to <4 x i32>*
1665 %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %6, i32 4, <4 x i1> %3, <4 x i32> undef)
1666 %7 = mul nsw <4 x i32> %wide.masked.load23, %wide.masked.load
1667 %8 = add nsw <4 x i32> %7, %broadcast.splat25
1668 %9 = getelementptr inbounds i32, i32* %res, i32 %index
1669 %10 = bitcast i32* %9 to <4 x i32>*
1670 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %8, <4 x i32>* %10, i32 4, <4 x i1> %3)
1671 %index.next = add i32 %index, 4
1672 %11 = icmp eq i32 %index.next, %n.vec
1673 br i1 %11, label %for.cond.cleanup, label %vector.body
1675 for.cond.cleanup.loopexit.unr-lcssa: ; preds = %for.body, %for.body.preheader
1676 %i.09.unr = phi i32 [ 0, %for.body.preheader ], [ %inc.3, %for.body ]
1677 %lcmp.mod = icmp eq i32 %xtraiter, 0
1678 br i1 %lcmp.mod, label %for.cond.cleanup, label %for.body.epil
1680 for.body.epil: ; preds = %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil
1681 %i.09.epil = phi i32 [ %inc.epil, %for.body.epil ], [ %i.09.unr, %for.cond.cleanup.loopexit.unr-lcssa ]
1682 %epil.iter = phi i32 [ %epil.iter.sub, %for.body.epil ], [ %xtraiter, %for.cond.cleanup.loopexit.unr-lcssa ]
1683 %arrayidx.epil = getelementptr inbounds i32, i32* %a, i32 %i.09.epil
1684 %12 = load i32, i32* %arrayidx.epil, align 4
1685 %arrayidx1.epil = getelementptr inbounds i32, i32* %b, i32 %i.09.epil
1686 %13 = load i32, i32* %arrayidx1.epil, align 4
1687 %mul.epil = mul nsw i32 %13, %12
1688 %add.epil = add nsw i32 %mul.epil, %c
1689 %arrayidx2.epil = getelementptr inbounds i32, i32* %res, i32 %i.09.epil
1690 store i32 %add.epil, i32* %arrayidx2.epil, align 4
1691 %inc.epil = add nuw i32 %i.09.epil, 1
1692 %epil.iter.sub = add i32 %epil.iter, -1
1693 %epil.iter.cmp = icmp eq i32 %epil.iter.sub, 0
1694 br i1 %epil.iter.cmp, label %for.cond.cleanup, label %for.body.epil
1696 for.cond.cleanup: ; preds = %vector.body, %for.cond.cleanup.loopexit.unr-lcssa, %for.body.epil, %entry
1699 for.body: ; preds = %for.body, %for.body.preheader.new
1700 %i.09 = phi i32 [ 0, %for.body.preheader.new ], [ %inc.3, %for.body ]
1701 %niter = phi i32 [ %unroll_iter, %for.body.preheader.new ], [ %niter.nsub.3, %for.body ]
1702 %arrayidx = getelementptr inbounds i32, i32* %a, i32 %i.09
1703 %14 = load i32, i32* %arrayidx, align 4
1704 %arrayidx1 = getelementptr inbounds i32, i32* %b, i32 %i.09
1705 %15 = load i32, i32* %arrayidx1, align 4
1706 %mul = mul nsw i32 %15, %14
1707 %add = add nsw i32 %mul, %c
1708 %arrayidx2 = getelementptr inbounds i32, i32* %res, i32 %i.09
1709 store i32 %add, i32* %arrayidx2, align 4
1710 %inc = or i32 %i.09, 1
1711 %arrayidx.1 = getelementptr inbounds i32, i32* %a, i32 %inc
1712 %16 = load i32, i32* %arrayidx.1, align 4
1713 %arrayidx1.1 = getelementptr inbounds i32, i32* %b, i32 %inc
1714 %17 = load i32, i32* %arrayidx1.1, align 4
1715 %mul.1 = mul nsw i32 %17, %16
1716 %add.1 = add nsw i32 %mul.1, %c
1717 %arrayidx2.1 = getelementptr inbounds i32, i32* %res, i32 %inc
1718 store i32 %add.1, i32* %arrayidx2.1, align 4
1719 %inc.1 = or i32 %i.09, 2
1720 %arrayidx.2 = getelementptr inbounds i32, i32* %a, i32 %inc.1
1721 %18 = load i32, i32* %arrayidx.2, align 4
1722 %arrayidx1.2 = getelementptr inbounds i32, i32* %b, i32 %inc.1
1723 %19 = load i32, i32* %arrayidx1.2, align 4
1724 %mul.2 = mul nsw i32 %19, %18
1725 %add.2 = add nsw i32 %mul.2, %c
1726 %arrayidx2.2 = getelementptr inbounds i32, i32* %res, i32 %inc.1
1727 store i32 %add.2, i32* %arrayidx2.2, align 4
1728 %inc.2 = or i32 %i.09, 3
1729 %arrayidx.3 = getelementptr inbounds i32, i32* %a, i32 %inc.2
1730 %20 = load i32, i32* %arrayidx.3, align 4
1731 %arrayidx1.3 = getelementptr inbounds i32, i32* %b, i32 %inc.2
1732 %21 = load i32, i32* %arrayidx1.3, align 4
1733 %mul.3 = mul nsw i32 %21, %20
1734 %add.3 = add nsw i32 %mul.3, %c
1735 %arrayidx2.3 = getelementptr inbounds i32, i32* %res, i32 %inc.2
1736 store i32 %add.3, i32* %arrayidx2.3, align 4
1737 %inc.3 = add nuw i32 %i.09, 4
1738 %niter.nsub.3 = add i32 %niter, -4
1739 %niter.ncmp.3 = icmp eq i32 %niter.nsub.3, 0
1740 br i1 %niter.ncmp.3, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body
; Intrinsic declarations used by the masked-load/store test bodies above.
; NOTE(review): signatures are fixed by the LLVM intrinsic definitions — do not
; edit them by hand; regenerate the file with update_llc_test_checks.py instead.
1743 ; Function Attrs: argmemonly nounwind readonly willreturn
; Masked load of <4 x i8>: lanes with a false <4 x i1> mask bit yield the passthru operand.
1744 declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) #2
1746 ; Function Attrs: nounwind readnone willreturn
; Horizontal add-reduction of a <4 x i32> vector to a single i32.
1747 declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) #3
1749 ; Function Attrs: argmemonly nounwind readonly willreturn
; Masked load of <4 x i16>; same lane semantics as the v4i8 variant.
1750 declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) #2
1752 ; Function Attrs: argmemonly nounwind readonly willreturn
; Masked load of <4 x i32>; the i32 immarg is the pointer alignment in bytes.
1753 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
1755 ; Function Attrs: argmemonly nounwind willreturn
; Masked store of <4 x i32>: only lanes with a true mask bit are written to memory.
1756 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4