1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s
; add_i32: integer add reduction over the %n i32 values starting at %x.
; When %n is at least 4, vector.body consumes four elements per iteration and
; folds each <4 x i32> load into a scalar accumulator with
; llvm.vector.reduce.add inside the loop. Leftover elements (or all of them
; when %n is below 4) are handled one at a time in for.body. A non-positive
; %n (signed compare) selects 0 in the final %r.0.lcssa phi.
; The expected assembly below uses vaddva.u32 for the in-loop reduction and
; ends both loops with `le lr, <label>`.
4 define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
5 ; CHECK-LABEL: add_i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: cmp r1, #1
10 ; CHECK-NEXT: blt .LBB0_3
11 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
12 ; CHECK-NEXT: mov r12, r0
13 ; CHECK-NEXT: cmp r1, #4
14 ; CHECK-NEXT: bhs .LBB0_4
15 ; CHECK-NEXT: @ %bb.2:
16 ; CHECK-NEXT: movs r3, #0
17 ; CHECK-NEXT: movs r0, #0
18 ; CHECK-NEXT: b .LBB0_7
19 ; CHECK-NEXT: .LBB0_3:
20 ; CHECK-NEXT: movs r0, #0
21 ; CHECK-NEXT: b .LBB0_9
22 ; CHECK-NEXT: .LBB0_4: @ %vector.ph
23 ; CHECK-NEXT: bic r3, r1, #3
24 ; CHECK-NEXT: movs r2, #1
25 ; CHECK-NEXT: subs r0, r3, #4
26 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
27 ; CHECK-NEXT: movs r0, #0
28 ; CHECK-NEXT: mov r2, r12
29 ; CHECK-NEXT: .LBB0_5: @ %vector.body
30 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
31 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
32 ; CHECK-NEXT: vaddva.u32 r0, q0
33 ; CHECK-NEXT: le lr, .LBB0_5
34 ; CHECK-NEXT: @ %bb.6: @ %middle.block
35 ; CHECK-NEXT: cmp r3, r1
37 ; CHECK-NEXT: popeq {r7, pc}
38 ; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
39 ; CHECK-NEXT: sub.w lr, r1, r3
40 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
41 ; CHECK-NEXT: .LBB0_8: @ %for.body
42 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
43 ; CHECK-NEXT: ldr r1, [r2], #4
44 ; CHECK-NEXT: add r0, r1
45 ; CHECK-NEXT: le lr, .LBB0_8
46 ; CHECK-NEXT: .LBB0_9: @ %for.cond.cleanup
47 ; CHECK-NEXT: pop {r7, pc}
; Nothing to do when %n <= 0 (signed).
49 %cmp6 = icmp sgt i32 %n, 0
50 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
52 for.body.preheader: ; preds = %entry
53 %min.iters.check = icmp ult i32 %n, 4
54 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
56 vector.ph: ; preds = %for.body.preheader
57 %n.vec = and i32 %n, -4
; Vector loop: %index steps by 4 and %vec.phi carries the running scalar sum.
60 vector.body: ; preds = %vector.body, %vector.ph
61 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
62 %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
63 %0 = getelementptr inbounds i32, i32* %x, i32 %index
64 %1 = bitcast i32* %0 to <4 x i32>*
65 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
66 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
67 %3 = add i32 %2, %vec.phi
68 %index.next = add i32 %index, 4
69 %4 = icmp eq i32 %index.next, %n.vec
70 br i1 %4, label %middle.block, label %vector.body
; Done if the vector loop covered every element; otherwise run the scalar tail.
72 middle.block: ; preds = %vector.body
73 %cmp.n = icmp eq i32 %n.vec, %n
74 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/0 when the vector loop
; was skipped, %n.vec/%3 when it ran.
76 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
77 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
78 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
; Scalar loop: adds one element per iteration until %inc reaches %n.
81 for.body: ; preds = %for.body.preheader1, %for.body
82 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
83 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
84 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
85 %5 = load i32, i32* %arrayidx, align 4
86 %add = add nsw i32 %5, %r.07
87 %inc = add nuw nsw i32 %i.08, 1
88 %exitcond = icmp eq i32 %inc, %n
89 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths (empty input, vector loop
; only, scalar loop).
91 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
92 %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
; mul_i32: integer multiply reduction over the %n i32 values starting at %x.
; The vector loop keeps a <4 x i32> accumulator seeded with all ones and
; multiplies each loaded vector into it; the four lanes are combined with
; llvm.vector.reduce.mul after the loop, in middle.block. Leftover elements
; (or all of them when %n is below 4) go through the scalar for.body loop.
; A non-positive %n (signed compare) selects 1 in the final %r.0.lcssa phi.
; The expected assembly below uses vmul.i32 in the loop and lowers the final
; reduction to lane extraction (vmov) followed by scalar mul instructions.
96 define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
97 ; CHECK-LABEL: mul_i32:
98 ; CHECK: @ %bb.0: @ %entry
99 ; CHECK-NEXT: .save {r4, lr}
100 ; CHECK-NEXT: push {r4, lr}
101 ; CHECK-NEXT: movs r2, #1
102 ; CHECK-NEXT: cmp r1, #1
103 ; CHECK-NEXT: blt .LBB1_8
104 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
105 ; CHECK-NEXT: cmp r1, #4
106 ; CHECK-NEXT: bhs .LBB1_3
107 ; CHECK-NEXT: @ %bb.2:
108 ; CHECK-NEXT: mov.w r12, #0
109 ; CHECK-NEXT: b .LBB1_6
110 ; CHECK-NEXT: .LBB1_3: @ %vector.ph
111 ; CHECK-NEXT: bic r12, r1, #3
112 ; CHECK-NEXT: vmov.i32 q0, #0x1
113 ; CHECK-NEXT: sub.w r3, r12, #4
114 ; CHECK-NEXT: add.w lr, r2, r3, lsr #2
115 ; CHECK-NEXT: mov r2, r0
116 ; CHECK-NEXT: .LBB1_4: @ %vector.body
117 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
118 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
119 ; CHECK-NEXT: vmul.i32 q0, q1, q0
120 ; CHECK-NEXT: le lr, .LBB1_4
121 ; CHECK-NEXT: @ %bb.5: @ %middle.block
122 ; CHECK-NEXT: vmov lr, r3, d1
123 ; CHECK-NEXT: cmp r12, r1
124 ; CHECK-NEXT: vmov r2, r4, d0
125 ; CHECK-NEXT: mul r3, lr, r3
126 ; CHECK-NEXT: mul r2, r4, r2
127 ; CHECK-NEXT: mul r2, r3, r2
128 ; CHECK-NEXT: beq .LBB1_8
129 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
130 ; CHECK-NEXT: sub.w lr, r1, r12
131 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2
132 ; CHECK-NEXT: .LBB1_7: @ %for.body
133 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
134 ; CHECK-NEXT: ldr r1, [r0], #4
135 ; CHECK-NEXT: muls r2, r1, r2
136 ; CHECK-NEXT: le lr, .LBB1_7
137 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
138 ; CHECK-NEXT: mov r0, r2
139 ; CHECK-NEXT: pop {r4, pc}
; Nothing to do when %n <= 0 (signed).
141 %cmp6 = icmp sgt i32 %n, 0
142 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
144 for.body.preheader: ; preds = %entry
145 %min.iters.check = icmp ult i32 %n, 4
146 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
148 vector.ph: ; preds = %for.body.preheader
149 %n.vec = and i32 %n, -4
150 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running product,
; seeded with 1 in every lane.
152 vector.body: ; preds = %vector.body, %vector.ph
153 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
154 %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
155 %0 = getelementptr inbounds i32, i32* %x, i32 %index
156 %1 = bitcast i32* %0 to <4 x i32>*
157 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
158 %2 = mul <4 x i32> %wide.load, %vec.phi
159 %index.next = add i32 %index, 4
160 %3 = icmp eq i32 %index.next, %n.vec
161 br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
163 middle.block: ; preds = %vector.body
164 %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
165 %cmp.n = icmp eq i32 %n.vec, %n
166 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/1 when the vector loop
; was skipped, %n.vec/%4 when it ran.
168 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
169 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
170 %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running product here.
173 for.body: ; preds = %for.body.preheader1, %for.body
174 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
175 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
176 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
177 %5 = load i32, i32* %arrayidx, align 4
178 %add = mul nsw i32 %5, %r.07
179 %inc = add nuw nsw i32 %i.08, 1
180 %exitcond = icmp eq i32 %inc, %n
181 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
183 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
184 %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; and_i32: bitwise AND reduction over the %n i32 values starting at %x.
; The vector loop keeps a <4 x i32> accumulator seeded with -1 (all bits set)
; in every lane and ANDs each loaded vector into it; the lanes are combined
; with llvm.vector.reduce.and after the loop, in middle.block. Leftover
; elements (or all of them when %n is below 4) go through the scalar for.body
; loop. A non-positive %n (signed compare) selects -1 in the final
; %r.0.lcssa phi.
; The expected assembly below uses vand in the loop and lowers the final
; reduction to lane extraction (vmov) followed by scalar and.w instructions.
188 define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
189 ; CHECK-LABEL: and_i32:
190 ; CHECK: @ %bb.0: @ %entry
191 ; CHECK-NEXT: .save {r4, lr}
192 ; CHECK-NEXT: push {r4, lr}
193 ; CHECK-NEXT: cmp r1, #1
194 ; CHECK-NEXT: blt .LBB2_3
195 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
196 ; CHECK-NEXT: cmp r1, #4
197 ; CHECK-NEXT: bhs .LBB2_4
198 ; CHECK-NEXT: @ %bb.2:
199 ; CHECK-NEXT: mov.w r2, #-1
200 ; CHECK-NEXT: movs r3, #0
201 ; CHECK-NEXT: b .LBB2_7
202 ; CHECK-NEXT: .LBB2_3:
203 ; CHECK-NEXT: mov.w r2, #-1
204 ; CHECK-NEXT: b .LBB2_9
205 ; CHECK-NEXT: .LBB2_4: @ %vector.ph
206 ; CHECK-NEXT: bic r3, r1, #3
207 ; CHECK-NEXT: movs r2, #1
208 ; CHECK-NEXT: sub.w r12, r3, #4
209 ; CHECK-NEXT: vmov.i8 q0, #0xff
210 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
211 ; CHECK-NEXT: mov r2, r0
212 ; CHECK-NEXT: .LBB2_5: @ %vector.body
213 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
214 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
215 ; CHECK-NEXT: vand q0, q1, q0
216 ; CHECK-NEXT: le lr, .LBB2_5
217 ; CHECK-NEXT: @ %bb.6: @ %middle.block
218 ; CHECK-NEXT: vmov lr, r12, d1
219 ; CHECK-NEXT: cmp r3, r1
220 ; CHECK-NEXT: vmov r2, r4, d0
221 ; CHECK-NEXT: and.w r12, r12, lr
222 ; CHECK-NEXT: and.w r2, r2, r4
223 ; CHECK-NEXT: and.w r2, r2, r12
224 ; CHECK-NEXT: beq .LBB2_9
225 ; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
226 ; CHECK-NEXT: sub.w lr, r1, r3
227 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
228 ; CHECK-NEXT: .LBB2_8: @ %for.body
229 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
230 ; CHECK-NEXT: ldr r1, [r0], #4
231 ; CHECK-NEXT: ands r2, r1
232 ; CHECK-NEXT: le lr, .LBB2_8
233 ; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
234 ; CHECK-NEXT: mov r0, r2
235 ; CHECK-NEXT: pop {r4, pc}
; Nothing to do when %n <= 0 (signed).
237 %cmp6 = icmp sgt i32 %n, 0
238 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
240 for.body.preheader: ; preds = %entry
241 %min.iters.check = icmp ult i32 %n, 4
242 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
244 vector.ph: ; preds = %for.body.preheader
245 %n.vec = and i32 %n, -4
246 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running AND,
; seeded with -1 in every lane.
248 vector.body: ; preds = %vector.body, %vector.ph
249 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
250 %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
251 %0 = getelementptr inbounds i32, i32* %x, i32 %index
252 %1 = bitcast i32* %0 to <4 x i32>*
253 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
254 %2 = and <4 x i32> %wide.load, %vec.phi
255 %index.next = add i32 %index, 4
256 %3 = icmp eq i32 %index.next, %n.vec
257 br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
259 middle.block: ; preds = %vector.body
260 %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
261 %cmp.n = icmp eq i32 %n.vec, %n
262 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/-1 when the vector loop
; was skipped, %n.vec/%4 when it ran.
264 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
265 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
266 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running AND here.
269 for.body: ; preds = %for.body.preheader1, %for.body
270 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
271 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
272 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
273 %5 = load i32, i32* %arrayidx, align 4
274 %add = and i32 %5, %r.07
275 %inc = add nuw nsw i32 %i.08, 1
276 %exitcond = icmp eq i32 %inc, %n
277 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
279 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
280 %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; or_i32: bitwise OR reduction over the %n i32 values starting at %x.
; The vector loop keeps a zero-initialised <4 x i32> accumulator and ORs each
; loaded vector into it; the lanes are combined with llvm.vector.reduce.or
; after the loop, in middle.block. Leftover elements (or all of them when %n
; is below 4) go through the scalar for.body loop. A non-positive %n (signed
; compare) selects 0 in the final %r.0.lcssa phi.
; The expected assembly below uses vorr in the loop and lowers the final
; reduction to lane extraction (vmov) followed by scalar orr.w instructions.
284 define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
285 ; CHECK-LABEL: or_i32:
286 ; CHECK: @ %bb.0: @ %entry
287 ; CHECK-NEXT: .save {r4, lr}
288 ; CHECK-NEXT: push {r4, lr}
289 ; CHECK-NEXT: cmp r1, #1
290 ; CHECK-NEXT: blt .LBB3_3
291 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
292 ; CHECK-NEXT: cmp r1, #4
293 ; CHECK-NEXT: bhs .LBB3_4
294 ; CHECK-NEXT: @ %bb.2:
295 ; CHECK-NEXT: movs r3, #0
296 ; CHECK-NEXT: movs r2, #0
297 ; CHECK-NEXT: b .LBB3_7
298 ; CHECK-NEXT: .LBB3_3:
299 ; CHECK-NEXT: movs r2, #0
300 ; CHECK-NEXT: b .LBB3_9
301 ; CHECK-NEXT: .LBB3_4: @ %vector.ph
302 ; CHECK-NEXT: bic r3, r1, #3
303 ; CHECK-NEXT: movs r2, #1
304 ; CHECK-NEXT: sub.w r12, r3, #4
305 ; CHECK-NEXT: vmov.i32 q0, #0x0
306 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
307 ; CHECK-NEXT: mov r2, r0
308 ; CHECK-NEXT: .LBB3_5: @ %vector.body
309 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
310 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
311 ; CHECK-NEXT: vorr q0, q1, q0
312 ; CHECK-NEXT: le lr, .LBB3_5
313 ; CHECK-NEXT: @ %bb.6: @ %middle.block
314 ; CHECK-NEXT: vmov lr, r12, d1
315 ; CHECK-NEXT: cmp r3, r1
316 ; CHECK-NEXT: vmov r2, r4, d0
317 ; CHECK-NEXT: orr.w r12, r12, lr
318 ; CHECK-NEXT: orr.w r2, r2, r4
319 ; CHECK-NEXT: orr.w r2, r2, r12
320 ; CHECK-NEXT: beq .LBB3_9
321 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
322 ; CHECK-NEXT: sub.w lr, r1, r3
323 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
324 ; CHECK-NEXT: .LBB3_8: @ %for.body
325 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
326 ; CHECK-NEXT: ldr r1, [r0], #4
327 ; CHECK-NEXT: orrs r2, r1
328 ; CHECK-NEXT: le lr, .LBB3_8
329 ; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
330 ; CHECK-NEXT: mov r0, r2
331 ; CHECK-NEXT: pop {r4, pc}
; Nothing to do when %n <= 0 (signed).
333 %cmp6 = icmp sgt i32 %n, 0
334 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
336 for.body.preheader: ; preds = %entry
337 %min.iters.check = icmp ult i32 %n, 4
338 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
340 vector.ph: ; preds = %for.body.preheader
341 %n.vec = and i32 %n, -4
342 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running OR,
; starting from all-zero lanes.
344 vector.body: ; preds = %vector.body, %vector.ph
345 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
346 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
347 %0 = getelementptr inbounds i32, i32* %x, i32 %index
348 %1 = bitcast i32* %0 to <4 x i32>*
349 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
350 %2 = or <4 x i32> %wide.load, %vec.phi
351 %index.next = add i32 %index, 4
352 %3 = icmp eq i32 %index.next, %n.vec
353 br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
355 middle.block: ; preds = %vector.body
356 %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
357 %cmp.n = icmp eq i32 %n.vec, %n
358 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/0 when the vector loop
; was skipped, %n.vec/%4 when it ran.
360 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
361 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
362 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running OR here.
365 for.body: ; preds = %for.body.preheader1, %for.body
366 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
367 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
368 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
369 %5 = load i32, i32* %arrayidx, align 4
370 %add = or i32 %5, %r.07
371 %inc = add nuw nsw i32 %i.08, 1
372 %exitcond = icmp eq i32 %inc, %n
373 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
375 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
376 %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; xor_i32: bitwise XOR reduction over the %n i32 values starting at %x.
; The vector loop keeps a zero-initialised <4 x i32> accumulator and XORs each
; loaded vector into it; the lanes are combined with llvm.vector.reduce.xor
; after the loop, in middle.block. Leftover elements (or all of them when %n
; is below 4) go through the scalar for.body loop. A non-positive %n (signed
; compare) selects 0 in the final %r.0.lcssa phi.
; The expected assembly below uses veor in the loop and lowers the final
; reduction to lane extraction (vmov) followed by scalar eor.w instructions.
380 define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
381 ; CHECK-LABEL: xor_i32:
382 ; CHECK: @ %bb.0: @ %entry
383 ; CHECK-NEXT: .save {r4, lr}
384 ; CHECK-NEXT: push {r4, lr}
385 ; CHECK-NEXT: cmp r1, #1
386 ; CHECK-NEXT: blt .LBB4_3
387 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
388 ; CHECK-NEXT: cmp r1, #4
389 ; CHECK-NEXT: bhs .LBB4_4
390 ; CHECK-NEXT: @ %bb.2:
391 ; CHECK-NEXT: movs r3, #0
392 ; CHECK-NEXT: movs r2, #0
393 ; CHECK-NEXT: b .LBB4_7
394 ; CHECK-NEXT: .LBB4_3:
395 ; CHECK-NEXT: movs r2, #0
396 ; CHECK-NEXT: b .LBB4_9
397 ; CHECK-NEXT: .LBB4_4: @ %vector.ph
398 ; CHECK-NEXT: bic r3, r1, #3
399 ; CHECK-NEXT: movs r2, #1
400 ; CHECK-NEXT: sub.w r12, r3, #4
401 ; CHECK-NEXT: vmov.i32 q0, #0x0
402 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
403 ; CHECK-NEXT: mov r2, r0
404 ; CHECK-NEXT: .LBB4_5: @ %vector.body
405 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
406 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
407 ; CHECK-NEXT: veor q0, q1, q0
408 ; CHECK-NEXT: le lr, .LBB4_5
409 ; CHECK-NEXT: @ %bb.6: @ %middle.block
410 ; CHECK-NEXT: vmov lr, r12, d1
411 ; CHECK-NEXT: cmp r3, r1
412 ; CHECK-NEXT: vmov r2, r4, d0
413 ; CHECK-NEXT: eor.w r12, r12, lr
414 ; CHECK-NEXT: eor.w r2, r2, r4
415 ; CHECK-NEXT: eor.w r2, r2, r12
416 ; CHECK-NEXT: beq .LBB4_9
417 ; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
418 ; CHECK-NEXT: sub.w lr, r1, r3
419 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
420 ; CHECK-NEXT: .LBB4_8: @ %for.body
421 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
422 ; CHECK-NEXT: ldr r1, [r0], #4
423 ; CHECK-NEXT: eors r2, r1
424 ; CHECK-NEXT: le lr, .LBB4_8
425 ; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
426 ; CHECK-NEXT: mov r0, r2
427 ; CHECK-NEXT: pop {r4, pc}
; Nothing to do when %n <= 0 (signed).
429 %cmp6 = icmp sgt i32 %n, 0
430 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
432 for.body.preheader: ; preds = %entry
433 %min.iters.check = icmp ult i32 %n, 4
434 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
436 vector.ph: ; preds = %for.body.preheader
437 %n.vec = and i32 %n, -4
438 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running XOR,
; starting from all-zero lanes.
440 vector.body: ; preds = %vector.body, %vector.ph
441 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
442 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
443 %0 = getelementptr inbounds i32, i32* %x, i32 %index
444 %1 = bitcast i32* %0 to <4 x i32>*
445 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
446 %2 = xor <4 x i32> %wide.load, %vec.phi
447 %index.next = add i32 %index, 4
448 %3 = icmp eq i32 %index.next, %n.vec
449 br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
451 middle.block: ; preds = %vector.body
452 %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
453 %cmp.n = icmp eq i32 %n.vec, %n
454 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/0 when the vector loop
; was skipped, %n.vec/%4 when it ran.
456 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
457 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
458 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running XOR here.
461 for.body: ; preds = %for.body.preheader1, %for.body
462 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
463 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
464 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
465 %5 = load i32, i32* %arrayidx, align 4
466 %add = xor i32 %5, %r.07
467 %inc = add nuw nsw i32 %i.08, 1
468 %exitcond = icmp eq i32 %inc, %n
469 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
471 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
472 %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; fadd_f32: floating-point add reduction over the %n float values starting at
; %x, with `fast` math flags on every fadd and on the reduction call.
; The vector loop keeps a zero-initialised <4 x float> accumulator and adds
; each loaded vector into it; the lanes are combined with
; llvm.vector.reduce.fadd (start value 0.0) after the loop, in middle.block.
; Leftover elements (or all of them when %n is below 4) go through the scalar
; for.body loop. A non-positive %n (signed compare) selects 0.0 in the final
; %r.0.lcssa phi.
; The expected assembly below uses a q-register vadd.f32 in the loop, three
; s-register vadd.f32 instructions for the final reduction, and loads the
; scalar 0.0 from a constant-pool entry (.LCPI5_0).
476 define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
477 ; CHECK-LABEL: fadd_f32:
478 ; CHECK: @ %bb.0: @ %entry
479 ; CHECK-NEXT: .save {r7, lr}
480 ; CHECK-NEXT: push {r7, lr}
481 ; CHECK-NEXT: cmp r1, #1
482 ; CHECK-NEXT: blt .LBB5_3
483 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
484 ; CHECK-NEXT: cmp r1, #4
485 ; CHECK-NEXT: bhs .LBB5_4
486 ; CHECK-NEXT: @ %bb.2:
487 ; CHECK-NEXT: vldr s0, .LCPI5_0
488 ; CHECK-NEXT: movs r2, #0
489 ; CHECK-NEXT: b .LBB5_7
490 ; CHECK-NEXT: .LBB5_3:
491 ; CHECK-NEXT: vldr s0, .LCPI5_0
492 ; CHECK-NEXT: b .LBB5_9
493 ; CHECK-NEXT: .LBB5_4: @ %vector.ph
494 ; CHECK-NEXT: bic r2, r1, #3
495 ; CHECK-NEXT: movs r3, #1
496 ; CHECK-NEXT: sub.w r12, r2, #4
497 ; CHECK-NEXT: vmov.i32 q0, #0x0
498 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
499 ; CHECK-NEXT: mov r3, r0
500 ; CHECK-NEXT: .LBB5_5: @ %vector.body
501 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
502 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
503 ; CHECK-NEXT: vadd.f32 q0, q1, q0
504 ; CHECK-NEXT: le lr, .LBB5_5
505 ; CHECK-NEXT: @ %bb.6: @ %middle.block
506 ; CHECK-NEXT: vadd.f32 s2, s2, s3
507 ; CHECK-NEXT: cmp r2, r1
508 ; CHECK-NEXT: vadd.f32 s0, s0, s1
509 ; CHECK-NEXT: vadd.f32 s0, s0, s2
510 ; CHECK-NEXT: beq .LBB5_9
511 ; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
512 ; CHECK-NEXT: sub.w lr, r1, r2
513 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
514 ; CHECK-NEXT: .LBB5_8: @ %for.body
515 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
516 ; CHECK-NEXT: vldmia r0!, {s2}
517 ; CHECK-NEXT: vadd.f32 s0, s2, s0
518 ; CHECK-NEXT: le lr, .LBB5_8
519 ; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
520 ; CHECK-NEXT: vmov r0, s0
521 ; CHECK-NEXT: pop {r7, pc}
522 ; CHECK-NEXT: .p2align 2
523 ; CHECK-NEXT: @ %bb.10:
524 ; CHECK-NEXT: .LCPI5_0:
525 ; CHECK-NEXT: .long 0x00000000 @ float 0
; Nothing to do when %n <= 0 (signed).
527 %cmp6 = icmp sgt i32 %n, 0
528 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
530 for.body.preheader: ; preds = %entry
531 %min.iters.check = icmp ult i32 %n, 4
532 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
534 vector.ph: ; preds = %for.body.preheader
535 %n.vec = and i32 %n, -4
536 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running sum,
; starting from all-zero lanes.
538 vector.body: ; preds = %vector.body, %vector.ph
539 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
540 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
541 %0 = getelementptr inbounds float, float* %x, i32 %index
542 %1 = bitcast float* %0 to <4 x float>*
543 %wide.load = load <4 x float>, <4 x float>* %1, align 4
544 %2 = fadd fast <4 x float> %wide.load, %vec.phi
545 %index.next = add i32 %index, 4
546 %3 = icmp eq i32 %index.next, %n.vec
547 br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
549 middle.block: ; preds = %vector.body
550 %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
551 %cmp.n = icmp eq i32 %n.vec, %n
552 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/0.0 when the vector loop
; was skipped, %n.vec/%4 when it ran.
554 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
555 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
556 %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: adds one element per iteration until %inc reaches %n.
559 for.body: ; preds = %for.body.preheader1, %for.body
560 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
561 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
562 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
563 %5 = load float, float* %arrayidx, align 4
564 %add = fadd fast float %5, %r.07
565 %inc = add nuw nsw i32 %i.08, 1
566 %exitcond = icmp eq i32 %inc, %n
567 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
569 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
570 %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; fmul_f32: floating-point multiply reduction over the %n float values
; starting at %x, with `fast` math flags on every fmul and on the reduction
; call. The vector loop keeps a <4 x float> accumulator seeded with 1.0 in
; every lane and multiplies each loaded vector into it; the lanes are combined
; with llvm.vector.reduce.fmul (start value 1.0) after the loop, in
; middle.block. Leftover elements (or all of them when %n is below 4) go
; through the scalar for.body loop. A non-positive %n (signed compare) selects
; 1.0 in the final %r.0.lcssa phi.
; The expected assembly below uses a q-register vmul.f32 in the loop and three
; s-register vmul.f32 instructions for the final reduction; 1.0 is
; materialised with vmov.f32 immediates.
574 define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
575 ; CHECK-LABEL: fmul_f32:
576 ; CHECK: @ %bb.0: @ %entry
577 ; CHECK-NEXT: .save {r7, lr}
578 ; CHECK-NEXT: push {r7, lr}
579 ; CHECK-NEXT: cmp r1, #1
580 ; CHECK-NEXT: blt .LBB6_3
581 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
582 ; CHECK-NEXT: cmp r1, #4
583 ; CHECK-NEXT: bhs .LBB6_4
584 ; CHECK-NEXT: @ %bb.2:
585 ; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
586 ; CHECK-NEXT: movs r2, #0
587 ; CHECK-NEXT: b .LBB6_7
588 ; CHECK-NEXT: .LBB6_3:
589 ; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
590 ; CHECK-NEXT: b .LBB6_9
591 ; CHECK-NEXT: .LBB6_4: @ %vector.ph
592 ; CHECK-NEXT: bic r2, r1, #3
593 ; CHECK-NEXT: movs r3, #1
594 ; CHECK-NEXT: sub.w r12, r2, #4
595 ; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
596 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
597 ; CHECK-NEXT: mov r3, r0
598 ; CHECK-NEXT: .LBB6_5: @ %vector.body
599 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
600 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
601 ; CHECK-NEXT: vmul.f32 q0, q1, q0
602 ; CHECK-NEXT: le lr, .LBB6_5
603 ; CHECK-NEXT: @ %bb.6: @ %middle.block
604 ; CHECK-NEXT: vmul.f32 s2, s2, s3
605 ; CHECK-NEXT: cmp r2, r1
606 ; CHECK-NEXT: vmul.f32 s0, s0, s1
607 ; CHECK-NEXT: vmul.f32 s0, s0, s2
608 ; CHECK-NEXT: beq .LBB6_9
609 ; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
610 ; CHECK-NEXT: sub.w lr, r1, r2
611 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
612 ; CHECK-NEXT: .LBB6_8: @ %for.body
613 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
614 ; CHECK-NEXT: vldmia r0!, {s2}
615 ; CHECK-NEXT: vmul.f32 s0, s2, s0
616 ; CHECK-NEXT: le lr, .LBB6_8
617 ; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
618 ; CHECK-NEXT: vmov r0, s0
619 ; CHECK-NEXT: pop {r7, pc}
; Nothing to do when %n <= 0 (signed).
621 %cmp6 = icmp sgt i32 %n, 0
622 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
624 for.body.preheader: ; preds = %entry
625 %min.iters.check = icmp ult i32 %n, 4
626 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
628 vector.ph: ; preds = %for.body.preheader
629 %n.vec = and i32 %n, -4
630 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running product,
; seeded with 1.0 in every lane.
632 vector.body: ; preds = %vector.body, %vector.ph
633 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
634 %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
635 %0 = getelementptr inbounds float, float* %x, i32 %index
636 %1 = bitcast float* %0 to <4 x float>*
637 %wide.load = load <4 x float>, <4 x float>* %1, align 4
638 %2 = fmul fast <4 x float> %wide.load, %vec.phi
639 %index.next = add i32 %index, 4
640 %3 = icmp eq i32 %index.next, %n.vec
641 br i1 %3, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
643 middle.block: ; preds = %vector.body
644 %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
645 %cmp.n = icmp eq i32 %n.vec, %n
646 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/1.0 when the vector loop
; was skipped, %n.vec/%4 when it ran.
648 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
649 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
650 %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running product here.
653 for.body: ; preds = %for.body.preheader1, %for.body
654 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
655 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
656 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
657 %5 = load float, float* %arrayidx, align 4
658 %add = fmul fast float %5, %r.07
659 %inc = add nuw nsw i32 %i.08, 1
660 %exitcond = icmp eq i32 %inc, %n
661 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
663 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
664 %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; smin_i32: signed-minimum reduction over the %n i32 values starting at %x.
; The vector loop keeps a <4 x i32> accumulator seeded with 2147483647 in
; every lane and updates it with an icmp slt + select pair; the lanes are
; combined with llvm.vector.reduce.smin after the loop, in middle.block.
; Leftover elements (or all of them when %n is below 4) go through the scalar
; for.body loop, which uses the same icmp slt + select form. A non-positive %n
; (signed compare) selects 2147483647 in the final %r.0.lcssa phi.
; The expected assembly below uses vmin.s32 in the vector loop, a single
; vminv.s32 for the final reduction, and cmp + csel in the scalar loop.
668 define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
669 ; CHECK-LABEL: smin_i32:
670 ; CHECK: @ %bb.0: @ %entry
671 ; CHECK-NEXT: .save {r7, lr}
672 ; CHECK-NEXT: push {r7, lr}
673 ; CHECK-NEXT: cmp r1, #1
674 ; CHECK-NEXT: blt .LBB7_3
675 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
676 ; CHECK-NEXT: cmp r1, #4
677 ; CHECK-NEXT: bhs .LBB7_4
678 ; CHECK-NEXT: @ %bb.2:
679 ; CHECK-NEXT: mvn r2, #-2147483648
680 ; CHECK-NEXT: movs r3, #0
681 ; CHECK-NEXT: b .LBB7_7
682 ; CHECK-NEXT: .LBB7_3:
683 ; CHECK-NEXT: mvn r2, #-2147483648
684 ; CHECK-NEXT: b .LBB7_9
685 ; CHECK-NEXT: .LBB7_4: @ %vector.ph
686 ; CHECK-NEXT: bic r3, r1, #3
687 ; CHECK-NEXT: movs r2, #1
688 ; CHECK-NEXT: sub.w r12, r3, #4
689 ; CHECK-NEXT: vmvn.i32 q0, #0x80000000
690 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
691 ; CHECK-NEXT: mov r2, r0
692 ; CHECK-NEXT: .LBB7_5: @ %vector.body
693 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
694 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
695 ; CHECK-NEXT: vmin.s32 q0, q0, q1
696 ; CHECK-NEXT: le lr, .LBB7_5
697 ; CHECK-NEXT: @ %bb.6: @ %middle.block
698 ; CHECK-NEXT: mvn r2, #-2147483648
699 ; CHECK-NEXT: cmp r3, r1
700 ; CHECK-NEXT: vminv.s32 r2, q0
701 ; CHECK-NEXT: beq .LBB7_9
702 ; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
703 ; CHECK-NEXT: sub.w lr, r1, r3
704 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
705 ; CHECK-NEXT: .LBB7_8: @ %for.body
706 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
707 ; CHECK-NEXT: ldr r1, [r0], #4
708 ; CHECK-NEXT: cmp r2, r1
709 ; CHECK-NEXT: csel r2, r2, r1, lt
710 ; CHECK-NEXT: le lr, .LBB7_8
711 ; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
712 ; CHECK-NEXT: mov r0, r2
713 ; CHECK-NEXT: pop {r7, pc}
; Nothing to do when %n <= 0 (signed).
715 %cmp6 = icmp sgt i32 %n, 0
716 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
718 for.body.preheader: ; preds = %entry
719 %min.iters.check = icmp ult i32 %n, 4
720 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
722 vector.ph: ; preds = %for.body.preheader
723 %n.vec = and i32 %n, -4
724 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the per-lane running minimum,
; seeded with 2147483647 in every lane. The select keeps the accumulator lane
; when it is strictly less than the loaded lane.
726 vector.body: ; preds = %vector.body, %vector.ph
727 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
728 %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
729 %0 = getelementptr inbounds i32, i32* %x, i32 %index
730 %1 = bitcast i32* %0 to <4 x i32>*
731 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
732 %2 = icmp slt <4 x i32> %vec.phi, %wide.load
733 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
734 %index.next = add i32 %index, 4
735 %4 = icmp eq i32 %index.next, %n.vec
736 br i1 %4, label %middle.block, label %vector.body
; Collapse the four lanes to one scalar, then finish if the vector loop
; covered every element; otherwise run the scalar tail.
738 middle.block: ; preds = %vector.body
739 %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
740 %cmp.n = icmp eq i32 %n.vec, %n
741 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/2147483647 when the
; vector loop was skipped, %n.vec/%5 when it ran.
743 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
744 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
745 %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running minimum here.
748 for.body: ; preds = %for.body.preheader1, %for.body
749 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
750 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
751 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
752 %6 = load i32, i32* %arrayidx, align 4
753 %c = icmp slt i32 %r.07, %6
754 %add = select i1 %c, i32 %r.07, i32 %6
755 %inc = add nuw nsw i32 %i.08, 1
756 %exitcond = icmp eq i32 %inc, %n
757 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
759 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
760 %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; smin_i32_inloop: signed-minimum reduction over the %n i32 values starting
; at %x, with the vector reduction performed inside the loop. Each iteration
; of vector.body reduces the loaded <4 x i32> to a scalar with
; llvm.vector.reduce.smin and then folds it into a scalar accumulator (seeded
; with 2147483647) through an icmp slt + select pair, so middle.block has no
; reduction call of its own. Leftover elements (or all of them when %n is
; below 4) go through the scalar for.body loop. A non-positive %n (signed
; compare) selects 2147483647 in the final %r.0.lcssa phi.
; The expected assembly below folds the in-loop reduction and the scalar
; select into one accumulating vminv.s32 r0, q0 per iteration.
764 define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
765 ; CHECK-LABEL: smin_i32_inloop:
766 ; CHECK: @ %bb.0: @ %entry
767 ; CHECK-NEXT: .save {r7, lr}
768 ; CHECK-NEXT: push {r7, lr}
769 ; CHECK-NEXT: cmp r1, #1
770 ; CHECK-NEXT: blt .LBB8_3
771 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
772 ; CHECK-NEXT: mov r12, r0
773 ; CHECK-NEXT: cmp r1, #4
774 ; CHECK-NEXT: bhs .LBB8_4
775 ; CHECK-NEXT: @ %bb.2:
776 ; CHECK-NEXT: mvn r0, #-2147483648
777 ; CHECK-NEXT: movs r3, #0
778 ; CHECK-NEXT: b .LBB8_7
779 ; CHECK-NEXT: .LBB8_3:
780 ; CHECK-NEXT: mvn r0, #-2147483648
781 ; CHECK-NEXT: b .LBB8_9
782 ; CHECK-NEXT: .LBB8_4: @ %vector.ph
783 ; CHECK-NEXT: bic r3, r1, #3
784 ; CHECK-NEXT: movs r2, #1
785 ; CHECK-NEXT: subs r0, r3, #4
786 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
787 ; CHECK-NEXT: mvn r0, #-2147483648
788 ; CHECK-NEXT: mov r2, r12
789 ; CHECK-NEXT: .LBB8_5: @ %vector.body
790 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
791 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
792 ; CHECK-NEXT: vminv.s32 r0, q0
793 ; CHECK-NEXT: le lr, .LBB8_5
794 ; CHECK-NEXT: @ %bb.6: @ %middle.block
795 ; CHECK-NEXT: cmp r3, r1
797 ; CHECK-NEXT: popeq {r7, pc}
798 ; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
799 ; CHECK-NEXT: sub.w lr, r1, r3
800 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
801 ; CHECK-NEXT: .LBB8_8: @ %for.body
802 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
803 ; CHECK-NEXT: ldr r1, [r2], #4
804 ; CHECK-NEXT: cmp r0, r1
805 ; CHECK-NEXT: csel r0, r0, r1, lt
806 ; CHECK-NEXT: le lr, .LBB8_8
807 ; CHECK-NEXT: .LBB8_9: @ %for.cond.cleanup
808 ; CHECK-NEXT: pop {r7, pc}
; Nothing to do when %n <= 0 (signed).
810 %cmp6 = icmp sgt i32 %n, 0
811 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
; Fewer than four elements: bypass the vector loop entirely.
813 for.body.preheader: ; preds = %entry
814 %min.iters.check = icmp ult i32 %n, 4
815 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
; %n.vec is %n with its two low bits cleared, i.e. the element count the
; vector loop covers.
817 vector.ph: ; preds = %for.body.preheader
818 %n.vec = and i32 %n, -4
819 br label %vector.body
; Vector loop: %index steps by 4; %vec.phi is the running scalar minimum.
; %l5 is the minimum of the four loaded lanes; the select keeps %vec.phi when
; it is strictly less than %l5.
821 vector.body: ; preds = %vector.body, %vector.ph
822 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
823 %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
824 %0 = getelementptr inbounds i32, i32* %x, i32 %index
825 %1 = bitcast i32* %0 to <4 x i32>*
826 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
827 %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
828 %2 = icmp slt i32 %vec.phi, %l5
829 %3 = select i1 %2, i32 %vec.phi, i32 %l5
830 %index.next = add i32 %index, 4
831 %4 = icmp eq i32 %index.next, %n.vec
832 br i1 %4, label %middle.block, label %vector.body
; %5 is a single-entry phi that just forwards the loop result %3. Finish if
; the vector loop covered every element; otherwise run the scalar tail.
834 middle.block: ; preds = %vector.body
835 %5 = phi i32 [ %3, %vector.body ]
836 %cmp.n = icmp eq i32 %n.vec, %n
837 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
; Start index and accumulator for the scalar loop: 0/2147483647 when the
; vector loop was skipped, %n.vec/%5 when it ran.
839 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
840 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
841 %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
; Scalar loop: one element per iteration until %inc reaches %n.
; Despite its name, %add holds the running minimum here.
844 for.body: ; preds = %for.body.preheader1, %for.body
845 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
846 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
847 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
848 %6 = load i32, i32* %arrayidx, align 4
849 %c = icmp slt i32 %r.07, %6
850 %add = select i1 %c, i32 %r.07, i32 %6
851 %inc = add nuw nsw i32 %i.08, 1
852 %exitcond = icmp eq i32 %inc, %n
853 br i1 %exitcond, label %for.cond.cleanup, label %for.body
; Merge the result from the three incoming paths.
855 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
856 %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; smax_i32: signed-max reduction over the %n i32 elements at %x.
; The vector loop keeps a <4 x i32> accumulator (icmp sgt + select) that starts
; at -2147483648 in every lane and is reduced once, in middle.block, with
; llvm.vector.reduce.smax; a scalar loop then processes elements %n.vec..%n.
; The CHECK lines expect vmax.s32 inside the loop and one vmaxv.s32 after it.
860 define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
861 ; CHECK-LABEL: smax_i32:
862 ; CHECK: @ %bb.0: @ %entry
863 ; CHECK-NEXT: .save {r7, lr}
864 ; CHECK-NEXT: push {r7, lr}
865 ; CHECK-NEXT: cmp r1, #1
866 ; CHECK-NEXT: blt .LBB9_3
867 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
868 ; CHECK-NEXT: cmp r1, #4
869 ; CHECK-NEXT: bhs .LBB9_4
870 ; CHECK-NEXT: @ %bb.2:
871 ; CHECK-NEXT: mov.w r2, #-2147483648
872 ; CHECK-NEXT: movs r3, #0
873 ; CHECK-NEXT: b .LBB9_7
874 ; CHECK-NEXT: .LBB9_3:
875 ; CHECK-NEXT: mov.w r2, #-2147483648
876 ; CHECK-NEXT: b .LBB9_9
877 ; CHECK-NEXT: .LBB9_4: @ %vector.ph
878 ; CHECK-NEXT: bic r3, r1, #3
879 ; CHECK-NEXT: movs r2, #1
880 ; CHECK-NEXT: sub.w r12, r3, #4
881 ; CHECK-NEXT: vmov.i32 q0, #0x80000000
882 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
883 ; CHECK-NEXT: mov r2, r0
884 ; CHECK-NEXT: .LBB9_5: @ %vector.body
885 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
886 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
887 ; CHECK-NEXT: vmax.s32 q0, q0, q1
888 ; CHECK-NEXT: le lr, .LBB9_5
889 ; CHECK-NEXT: @ %bb.6: @ %middle.block
890 ; CHECK-NEXT: mov.w r2, #-2147483648
891 ; CHECK-NEXT: cmp r3, r1
892 ; CHECK-NEXT: vmaxv.s32 r2, q0
893 ; CHECK-NEXT: beq .LBB9_9
894 ; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1
895 ; CHECK-NEXT: sub.w lr, r1, r3
896 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
897 ; CHECK-NEXT: .LBB9_8: @ %for.body
898 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
899 ; CHECK-NEXT: ldr r1, [r0], #4
900 ; CHECK-NEXT: cmp r2, r1
901 ; CHECK-NEXT: csel r2, r2, r1, gt
902 ; CHECK-NEXT: le lr, .LBB9_8
903 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
904 ; CHECK-NEXT: mov r0, r2
905 ; CHECK-NEXT: pop {r7, pc}
; n <= 0: branch straight to for.cond.cleanup, whose phi yields -2147483648.
907 %cmp6 = icmp sgt i32 %n, 0
908 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
910 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
911 %min.iters.check = icmp ult i32 %n, 4
912 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
914 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
915 %n.vec = and i32 %n, -4
916 br label %vector.body
918 vector.body: ; preds = %vector.body, %vector.ph
919 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
920 %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
921 %0 = getelementptr inbounds i32, i32* %x, i32 %index
922 %1 = bitcast i32* %0 to <4 x i32>*
923 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise signed max of the accumulator and the freshly loaded vector.
924 %2 = icmp sgt <4 x i32> %vec.phi, %wide.load
925 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
926 %index.next = add i32 %index, 4
927 %4 = icmp eq i32 %index.next, %n.vec
928 br i1 %4, label %middle.block, label %vector.body
930 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar; skip the scalar loop if %n.vec == %n.
931 %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
932 %cmp.n = icmp eq i32 %n.vec, %n
933 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
935 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
936 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
937 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
940 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, signed max via icmp sgt + select.
941 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
942 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
943 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
944 %6 = load i32, i32* %arrayidx, align 4
945 %c = icmp sgt i32 %r.07, %6
946 %add = select i1 %c, i32 %r.07, i32 %6
947 %inc = add nuw nsw i32 %i.08, 1
948 %exitcond = icmp eq i32 %inc, %n
949 br i1 %exitcond, label %for.cond.cleanup, label %for.body
951 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
952 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; smax_i32_inloop: signed-max reduction over the %n i32 elements at %x with the
; reduction performed inside the vector loop. Each iteration reduces the loaded
; <4 x i32> with llvm.vector.reduce.smax and folds the result into a scalar
; accumulator (icmp sgt + select) that starts at -2147483648.
; The CHECK lines expect a single accumulating vmaxv.s32 r0, q0 in the loop.
956 define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
957 ; CHECK-LABEL: smax_i32_inloop:
958 ; CHECK: @ %bb.0: @ %entry
959 ; CHECK-NEXT: .save {r7, lr}
960 ; CHECK-NEXT: push {r7, lr}
961 ; CHECK-NEXT: cmp r1, #1
962 ; CHECK-NEXT: blt .LBB10_3
963 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
964 ; CHECK-NEXT: mov r12, r0
965 ; CHECK-NEXT: cmp r1, #4
966 ; CHECK-NEXT: bhs .LBB10_4
967 ; CHECK-NEXT: @ %bb.2:
968 ; CHECK-NEXT: mov.w r0, #-2147483648
969 ; CHECK-NEXT: movs r3, #0
970 ; CHECK-NEXT: b .LBB10_7
971 ; CHECK-NEXT: .LBB10_3:
972 ; CHECK-NEXT: mov.w r0, #-2147483648
973 ; CHECK-NEXT: b .LBB10_9
974 ; CHECK-NEXT: .LBB10_4: @ %vector.ph
975 ; CHECK-NEXT: bic r3, r1, #3
976 ; CHECK-NEXT: movs r2, #1
977 ; CHECK-NEXT: subs r0, r3, #4
978 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
979 ; CHECK-NEXT: mov.w r0, #-2147483648
980 ; CHECK-NEXT: mov r2, r12
981 ; CHECK-NEXT: .LBB10_5: @ %vector.body
982 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
983 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
984 ; CHECK-NEXT: vmaxv.s32 r0, q0
985 ; CHECK-NEXT: le lr, .LBB10_5
986 ; CHECK-NEXT: @ %bb.6: @ %middle.block
987 ; CHECK-NEXT: cmp r3, r1
989 ; CHECK-NEXT: popeq {r7, pc}
990 ; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1
991 ; CHECK-NEXT: sub.w lr, r1, r3
992 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
993 ; CHECK-NEXT: .LBB10_8: @ %for.body
994 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
995 ; CHECK-NEXT: ldr r1, [r2], #4
996 ; CHECK-NEXT: cmp r0, r1
997 ; CHECK-NEXT: csel r0, r0, r1, gt
998 ; CHECK-NEXT: le lr, .LBB10_8
999 ; CHECK-NEXT: .LBB10_9: @ %for.cond.cleanup
1000 ; CHECK-NEXT: pop {r7, pc}
; n <= 0: branch straight to for.cond.cleanup, whose phi yields -2147483648.
1002 %cmp6 = icmp sgt i32 %n, 0
1003 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1005 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1006 %min.iters.check = icmp ult i32 %n, 4
1007 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1009 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1010 %n.vec = and i32 %n, -4
1011 br label %vector.body
1013 vector.body: ; preds = %vector.body, %vector.ph
1014 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1015 %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
1016 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1017 %1 = bitcast i32* %0 to <4 x i32>*
1018 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Reduce the loaded vector to a scalar, then fold it into the scalar accumulator.
1019 %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
1020 %2 = icmp sgt i32 %vec.phi, %l5
1021 %3 = select i1 %2, i32 %vec.phi, i32 %l5
1022 %index.next = add i32 %index, 4
1023 %4 = icmp eq i32 %index.next, %n.vec
1024 br i1 %4, label %middle.block, label %vector.body
1026 middle.block: ; preds = %vector.body
; The accumulator is already scalar here; skip the scalar loop if %n.vec == %n.
1027 %5 = phi i32 [ %3, %vector.body ]
1028 %cmp.n = icmp eq i32 %n.vec, %n
1029 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1031 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1032 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1033 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
1036 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, signed max via icmp sgt + select.
1037 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1038 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1039 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1040 %6 = load i32, i32* %arrayidx, align 4
1041 %c = icmp sgt i32 %r.07, %6
1042 %add = select i1 %c, i32 %r.07, i32 %6
1043 %inc = add nuw nsw i32 %i.08, 1
1044 %exitcond = icmp eq i32 %inc, %n
1045 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1047 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1048 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umin_i32: unsigned-min reduction over the %n i32 elements at %x.
; The vector loop keeps a <4 x i32> accumulator (icmp ult + select) that starts
; at -1 (all bits set) in every lane and is reduced once, in middle.block, with
; llvm.vector.reduce.umin; a scalar loop then processes elements %n.vec..%n.
; The CHECK lines expect vmin.u32 inside the loop and one vminv.u32 after it.
1052 define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
1053 ; CHECK-LABEL: umin_i32:
1054 ; CHECK: @ %bb.0: @ %entry
1055 ; CHECK-NEXT: .save {r7, lr}
1056 ; CHECK-NEXT: push {r7, lr}
1057 ; CHECK-NEXT: cmp r1, #1
1058 ; CHECK-NEXT: blt .LBB11_3
1059 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1060 ; CHECK-NEXT: cmp r1, #4
1061 ; CHECK-NEXT: bhs .LBB11_4
1062 ; CHECK-NEXT: @ %bb.2:
1063 ; CHECK-NEXT: mov.w r2, #-1
1064 ; CHECK-NEXT: movs r3, #0
1065 ; CHECK-NEXT: b .LBB11_7
1066 ; CHECK-NEXT: .LBB11_3:
1067 ; CHECK-NEXT: mov.w r2, #-1
1068 ; CHECK-NEXT: b .LBB11_9
1069 ; CHECK-NEXT: .LBB11_4: @ %vector.ph
1070 ; CHECK-NEXT: bic r3, r1, #3
1071 ; CHECK-NEXT: movs r2, #1
1072 ; CHECK-NEXT: sub.w r12, r3, #4
1073 ; CHECK-NEXT: vmov.i8 q0, #0xff
1074 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
1075 ; CHECK-NEXT: mov r2, r0
1076 ; CHECK-NEXT: .LBB11_5: @ %vector.body
1077 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1078 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
1079 ; CHECK-NEXT: vmin.u32 q0, q0, q1
1080 ; CHECK-NEXT: le lr, .LBB11_5
1081 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1082 ; CHECK-NEXT: mov.w r2, #-1
1083 ; CHECK-NEXT: cmp r3, r1
1084 ; CHECK-NEXT: vminv.u32 r2, q0
1085 ; CHECK-NEXT: beq .LBB11_9
1086 ; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1
1087 ; CHECK-NEXT: sub.w lr, r1, r3
1088 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
1089 ; CHECK-NEXT: .LBB11_8: @ %for.body
1090 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1091 ; CHECK-NEXT: ldr r1, [r0], #4
1092 ; CHECK-NEXT: cmp r2, r1
1093 ; CHECK-NEXT: csel r2, r2, r1, lo
1094 ; CHECK-NEXT: le lr, .LBB11_8
1095 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
1096 ; CHECK-NEXT: mov r0, r2
1097 ; CHECK-NEXT: pop {r7, pc}
; n <= 0: branch straight to for.cond.cleanup, whose phi yields -1.
1099 %cmp6 = icmp sgt i32 %n, 0
1100 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1102 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1103 %min.iters.check = icmp ult i32 %n, 4
1104 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1106 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1107 %n.vec = and i32 %n, -4
1108 br label %vector.body
1110 vector.body: ; preds = %vector.body, %vector.ph
1111 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1112 %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
1113 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1114 %1 = bitcast i32* %0 to <4 x i32>*
1115 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise unsigned min of the accumulator and the freshly loaded vector.
1116 %2 = icmp ult <4 x i32> %vec.phi, %wide.load
1117 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1118 %index.next = add i32 %index, 4
1119 %4 = icmp eq i32 %index.next, %n.vec
1120 br i1 %4, label %middle.block, label %vector.body
1122 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar; skip the scalar loop if %n.vec == %n.
1123 %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
1124 %cmp.n = icmp eq i32 %n.vec, %n
1125 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1127 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1128 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1129 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1132 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, unsigned min via icmp ult + select.
1133 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1134 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1135 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1136 %6 = load i32, i32* %arrayidx, align 4
1137 %c = icmp ult i32 %r.07, %6
1138 %add = select i1 %c, i32 %r.07, i32 %6
1139 %inc = add nuw nsw i32 %i.08, 1
1140 %exitcond = icmp eq i32 %inc, %n
1141 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1143 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1144 %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umin_i32_inloop: unsigned-min reduction over the %n i32 elements at %x with
; the reduction performed inside the vector loop. Each iteration reduces the
; loaded <4 x i32> with llvm.vector.reduce.umin and folds the result into a
; scalar accumulator (icmp ult + select) that starts at -1 (all bits set).
; The CHECK lines expect a single accumulating vminv.u32 r0, q0 in the loop.
; NOTE(review): the scalar loop compares with ult (unsigned min), matching the
; vector body; its csel CHECK line was adjusted by hand to "lo" - re-run
; utils/update_llc_test_checks.py to confirm the expected output.
1148 define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
1149 ; CHECK-LABEL: umin_i32_inloop:
1150 ; CHECK: @ %bb.0: @ %entry
1151 ; CHECK-NEXT: .save {r7, lr}
1152 ; CHECK-NEXT: push {r7, lr}
1153 ; CHECK-NEXT: cmp r1, #1
1154 ; CHECK-NEXT: blt .LBB12_3
1155 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1156 ; CHECK-NEXT: mov r12, r0
1157 ; CHECK-NEXT: cmp r1, #4
1158 ; CHECK-NEXT: bhs .LBB12_4
1159 ; CHECK-NEXT: @ %bb.2:
1160 ; CHECK-NEXT: mov.w r0, #-1
1161 ; CHECK-NEXT: movs r3, #0
1162 ; CHECK-NEXT: b .LBB12_7
1163 ; CHECK-NEXT: .LBB12_3:
1164 ; CHECK-NEXT: mov.w r0, #-1
1165 ; CHECK-NEXT: b .LBB12_9
1166 ; CHECK-NEXT: .LBB12_4: @ %vector.ph
1167 ; CHECK-NEXT: bic r3, r1, #3
1168 ; CHECK-NEXT: movs r2, #1
1169 ; CHECK-NEXT: subs r0, r3, #4
1170 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
1171 ; CHECK-NEXT: mov.w r0, #-1
1172 ; CHECK-NEXT: mov r2, r12
1173 ; CHECK-NEXT: .LBB12_5: @ %vector.body
1174 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1175 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
1176 ; CHECK-NEXT: vminv.u32 r0, q0
1177 ; CHECK-NEXT: le lr, .LBB12_5
1178 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1179 ; CHECK-NEXT: cmp r3, r1
1181 ; CHECK-NEXT: popeq {r7, pc}
1182 ; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1
1183 ; CHECK-NEXT: sub.w lr, r1, r3
1184 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
1185 ; CHECK-NEXT: .LBB12_8: @ %for.body
1186 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1187 ; CHECK-NEXT: ldr r1, [r2], #4
1188 ; CHECK-NEXT: cmp r0, r1
1189 ; CHECK-NEXT: csel r0, r0, r1, lo
1190 ; CHECK-NEXT: le lr, .LBB12_8
1191 ; CHECK-NEXT: .LBB12_9: @ %for.cond.cleanup
1192 ; CHECK-NEXT: pop {r7, pc}
; n <= 0: branch straight to for.cond.cleanup, whose phi yields -1.
1194 %cmp6 = icmp sgt i32 %n, 0
1195 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1197 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1198 %min.iters.check = icmp ult i32 %n, 4
1199 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1201 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1202 %n.vec = and i32 %n, -4
1203 br label %vector.body
1205 vector.body: ; preds = %vector.body, %vector.ph
1206 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1207 %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
1208 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1209 %1 = bitcast i32* %0 to <4 x i32>*
1210 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Reduce the loaded vector to a scalar, then fold it into the scalar accumulator.
1211 %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
1212 %2 = icmp ult i32 %vec.phi, %l5
1213 %3 = select i1 %2, i32 %vec.phi, i32 %l5
1214 %index.next = add i32 %index, 4
1215 %4 = icmp eq i32 %index.next, %n.vec
1216 br i1 %4, label %middle.block, label %vector.body
1218 middle.block: ; preds = %vector.body
; The accumulator is already scalar here; skip the scalar loop if %n.vec == %n.
1219 %5 = phi i32 [ %3, %vector.body ]
1220 %cmp.n = icmp eq i32 %n.vec, %n
1221 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1223 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1224 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1225 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1228 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, unsigned min via icmp ult + select.
1229 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1230 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1231 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1232 %6 = load i32, i32* %arrayidx, align 4
1233 %c = icmp ult i32 %r.07, %6
1234 %add = select i1 %c, i32 %r.07, i32 %6
1235 %inc = add nuw nsw i32 %i.08, 1
1236 %exitcond = icmp eq i32 %inc, %n
1237 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1239 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1240 %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umax_i32: unsigned-max reduction over the %n i32 elements at %x.
; The vector loop keeps a <4 x i32> accumulator (icmp ugt + select) that starts
; at zero and is reduced once, in middle.block, with llvm.vector.reduce.umax;
; a scalar loop then processes elements %n.vec..%n.
; The CHECK lines expect vmax.u32 inside the loop and one vmaxv.u32 after it.
1244 define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
1245 ; CHECK-LABEL: umax_i32:
1246 ; CHECK: @ %bb.0: @ %entry
1247 ; CHECK-NEXT: .save {r7, lr}
1248 ; CHECK-NEXT: push {r7, lr}
1249 ; CHECK-NEXT: cmp r1, #1
1250 ; CHECK-NEXT: blt .LBB13_3
1251 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1252 ; CHECK-NEXT: cmp r1, #4
1253 ; CHECK-NEXT: bhs .LBB13_4
1254 ; CHECK-NEXT: @ %bb.2:
1255 ; CHECK-NEXT: movs r3, #0
1256 ; CHECK-NEXT: movs r2, #0
1257 ; CHECK-NEXT: b .LBB13_7
1258 ; CHECK-NEXT: .LBB13_3:
1259 ; CHECK-NEXT: movs r2, #0
1260 ; CHECK-NEXT: b .LBB13_9
1261 ; CHECK-NEXT: .LBB13_4: @ %vector.ph
1262 ; CHECK-NEXT: bic r3, r1, #3
1263 ; CHECK-NEXT: movs r2, #1
1264 ; CHECK-NEXT: sub.w r12, r3, #4
1265 ; CHECK-NEXT: vmov.i32 q0, #0x0
1266 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
1267 ; CHECK-NEXT: mov r2, r0
1268 ; CHECK-NEXT: .LBB13_5: @ %vector.body
1269 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1270 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
1271 ; CHECK-NEXT: vmax.u32 q0, q0, q1
1272 ; CHECK-NEXT: le lr, .LBB13_5
1273 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1274 ; CHECK-NEXT: movs r2, #0
1275 ; CHECK-NEXT: cmp r3, r1
1276 ; CHECK-NEXT: vmaxv.u32 r2, q0
1277 ; CHECK-NEXT: beq .LBB13_9
1278 ; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1
1279 ; CHECK-NEXT: sub.w lr, r1, r3
1280 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
1281 ; CHECK-NEXT: .LBB13_8: @ %for.body
1282 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1283 ; CHECK-NEXT: ldr r1, [r0], #4
1284 ; CHECK-NEXT: cmp r2, r1
1285 ; CHECK-NEXT: csel r2, r2, r1, hi
1286 ; CHECK-NEXT: le lr, .LBB13_8
1287 ; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup
1288 ; CHECK-NEXT: mov r0, r2
1289 ; CHECK-NEXT: pop {r7, pc}
; n <= 0: branch straight to for.cond.cleanup, whose phi yields 0.
1291 %cmp6 = icmp sgt i32 %n, 0
1292 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1294 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1295 %min.iters.check = icmp ult i32 %n, 4
1296 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1298 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1299 %n.vec = and i32 %n, -4
1300 br label %vector.body
1302 vector.body: ; preds = %vector.body, %vector.ph
1303 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1304 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1305 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1306 %1 = bitcast i32* %0 to <4 x i32>*
1307 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise unsigned max of the accumulator and the freshly loaded vector.
1308 %2 = icmp ugt <4 x i32> %vec.phi, %wide.load
1309 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1310 %index.next = add i32 %index, 4
1311 %4 = icmp eq i32 %index.next, %n.vec
1312 br i1 %4, label %middle.block, label %vector.body
1314 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar; skip the scalar loop if %n.vec == %n.
1315 %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
1316 %cmp.n = icmp eq i32 %n.vec, %n
1317 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1319 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1320 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1321 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1324 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, unsigned max via icmp ugt + select.
1325 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1326 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1327 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1328 %6 = load i32, i32* %arrayidx, align 4
1329 %c = icmp ugt i32 %r.07, %6
1330 %add = select i1 %c, i32 %r.07, i32 %6
1331 %inc = add nuw nsw i32 %i.08, 1
1332 %exitcond = icmp eq i32 %inc, %n
1333 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1335 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1336 %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umax_i32_inloop: unsigned-max reduction over the %n i32 elements at %x with
; the reduction performed inside the vector loop. Each iteration reduces the
; loaded <4 x i32> with llvm.vector.reduce.umax and folds the result into a
; scalar accumulator (icmp ugt + select) that starts at 0.
; The CHECK lines expect a single accumulating vmaxv.u32 r0, q0 in the loop.
1340 define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
1341 ; CHECK-LABEL: umax_i32_inloop:
1342 ; CHECK: @ %bb.0: @ %entry
1343 ; CHECK-NEXT: .save {r7, lr}
1344 ; CHECK-NEXT: push {r7, lr}
1345 ; CHECK-NEXT: cmp r1, #1
1346 ; CHECK-NEXT: blt .LBB14_3
1347 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1348 ; CHECK-NEXT: mov r12, r0
1349 ; CHECK-NEXT: cmp r1, #4
1350 ; CHECK-NEXT: bhs .LBB14_4
1351 ; CHECK-NEXT: @ %bb.2:
1352 ; CHECK-NEXT: movs r3, #0
1353 ; CHECK-NEXT: movs r0, #0
1354 ; CHECK-NEXT: b .LBB14_7
1355 ; CHECK-NEXT: .LBB14_3:
1356 ; CHECK-NEXT: movs r0, #0
1357 ; CHECK-NEXT: b .LBB14_9
1358 ; CHECK-NEXT: .LBB14_4: @ %vector.ph
1359 ; CHECK-NEXT: bic r3, r1, #3
1360 ; CHECK-NEXT: movs r2, #1
1361 ; CHECK-NEXT: subs r0, r3, #4
1362 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
1363 ; CHECK-NEXT: movs r0, #0
1364 ; CHECK-NEXT: mov r2, r12
1365 ; CHECK-NEXT: .LBB14_5: @ %vector.body
1366 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1367 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
1368 ; CHECK-NEXT: vmaxv.u32 r0, q0
1369 ; CHECK-NEXT: le lr, .LBB14_5
1370 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1371 ; CHECK-NEXT: cmp r3, r1
1373 ; CHECK-NEXT: popeq {r7, pc}
1374 ; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1
1375 ; CHECK-NEXT: sub.w lr, r1, r3
1376 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
1377 ; CHECK-NEXT: .LBB14_8: @ %for.body
1378 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1379 ; CHECK-NEXT: ldr r1, [r2], #4
1380 ; CHECK-NEXT: cmp r0, r1
1381 ; CHECK-NEXT: csel r0, r0, r1, hi
1382 ; CHECK-NEXT: le lr, .LBB14_8
1383 ; CHECK-NEXT: .LBB14_9: @ %for.cond.cleanup
1384 ; CHECK-NEXT: pop {r7, pc}
; n <= 0: branch straight to for.cond.cleanup, whose phi yields 0.
1386 %cmp6 = icmp sgt i32 %n, 0
1387 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1389 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1390 %min.iters.check = icmp ult i32 %n, 4
1391 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1393 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1394 %n.vec = and i32 %n, -4
1395 br label %vector.body
1397 vector.body: ; preds = %vector.body, %vector.ph
1398 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1399 %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
1400 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1401 %1 = bitcast i32* %0 to <4 x i32>*
1402 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Reduce the loaded vector to a scalar, then fold it into the scalar accumulator.
1403 %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
1404 %2 = icmp ugt i32 %vec.phi, %l5
1405 %3 = select i1 %2, i32 %vec.phi, i32 %l5
1406 %index.next = add i32 %index, 4
1407 %4 = icmp eq i32 %index.next, %n.vec
1408 br i1 %4, label %middle.block, label %vector.body
1410 middle.block: ; preds = %vector.body
; The accumulator is already scalar here; skip the scalar loop if %n.vec == %n.
1411 %5 = phi i32 [ %3, %vector.body ]
1412 %cmp.n = icmp eq i32 %n.vec, %n
1413 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1415 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1416 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1417 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1420 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, unsigned max via icmp ugt + select.
1421 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1422 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1423 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1424 %6 = load i32, i32* %arrayidx, align 4
1425 %c = icmp ugt i32 %r.07, %6
1426 %add = select i1 %c, i32 %r.07, i32 %6
1427 %inc = add nuw nsw i32 %i.08, 1
1428 %exitcond = icmp eq i32 %inc, %n
1429 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1431 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1432 %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; fmin_f32: float min-style reduction over the %n float elements at %x.
; The vector loop keeps a <4 x float> accumulator (fcmp ult + select) that
; starts at zero and is reduced once, in middle.block, with
; llvm.vector.reduce.fmin; a scalar loop then processes elements %n.vec..%n.
; The CHECK lines expect vcmp.f32 + vpsel inside the loop, three scalar
; vminnm.f32 in the middle block, and a constant-pool entry holding float 0.
1436 define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
1437 ; CHECK-LABEL: fmin_f32:
1438 ; CHECK: @ %bb.0: @ %entry
1439 ; CHECK-NEXT: .save {r7, lr}
1440 ; CHECK-NEXT: push {r7, lr}
1441 ; CHECK-NEXT: cmp r1, #1
1442 ; CHECK-NEXT: blt .LBB15_3
1443 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1444 ; CHECK-NEXT: cmp r1, #4
1445 ; CHECK-NEXT: bhs .LBB15_4
1446 ; CHECK-NEXT: @ %bb.2:
1447 ; CHECK-NEXT: vldr s0, .LCPI15_0
1448 ; CHECK-NEXT: movs r2, #0
1449 ; CHECK-NEXT: b .LBB15_7
1450 ; CHECK-NEXT: .LBB15_3:
1451 ; CHECK-NEXT: vldr s0, .LCPI15_0
1452 ; CHECK-NEXT: b .LBB15_9
1453 ; CHECK-NEXT: .LBB15_4: @ %vector.ph
1454 ; CHECK-NEXT: bic r2, r1, #3
1455 ; CHECK-NEXT: movs r3, #1
1456 ; CHECK-NEXT: sub.w r12, r2, #4
1457 ; CHECK-NEXT: vmov.i32 q0, #0x0
1458 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
1459 ; CHECK-NEXT: mov r3, r0
1460 ; CHECK-NEXT: .LBB15_5: @ %vector.body
1461 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1462 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
1463 ; CHECK-NEXT: vcmp.f32 lt, q0, q1
1464 ; CHECK-NEXT: vpsel q0, q0, q1
1465 ; CHECK-NEXT: le lr, .LBB15_5
1466 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1467 ; CHECK-NEXT: vminnm.f32 s2, s2, s3
1468 ; CHECK-NEXT: vminnm.f32 s0, s0, s1
1469 ; CHECK-NEXT: vminnm.f32 s0, s0, s2
1470 ; CHECK-NEXT: cmp r2, r1
1471 ; CHECK-NEXT: beq .LBB15_9
1472 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
1473 ; CHECK-NEXT: sub.w lr, r1, r2
1474 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
1475 ; CHECK-NEXT: .LBB15_8: @ %for.body
1476 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1477 ; CHECK-NEXT: vldmia r0!, {s2}
1478 ; CHECK-NEXT: vcmp.f32 s0, s2
1479 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
1480 ; CHECK-NEXT: vselge.f32 s0, s2, s0
1481 ; CHECK-NEXT: le lr, .LBB15_8
1482 ; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup
1483 ; CHECK-NEXT: vmov r0, s0
1484 ; CHECK-NEXT: pop {r7, pc}
1485 ; CHECK-NEXT: .p2align 2
1486 ; CHECK-NEXT: @ %bb.10:
1487 ; CHECK-NEXT: .LCPI15_0:
1488 ; CHECK-NEXT: .long 0x00000000 @ float 0
; n <= 0: branch straight to for.cond.cleanup, whose phi yields 0.0.
1490 %cmp6 = icmp sgt i32 %n, 0
1491 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1493 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1494 %min.iters.check = icmp ult i32 %n, 4
1495 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1497 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1498 %n.vec = and i32 %n, -4
1499 br label %vector.body
1501 vector.body: ; preds = %vector.body, %vector.ph
1502 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1503 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1504 %0 = getelementptr inbounds float, float* %x, i32 %index
1505 %1 = bitcast float* %0 to <4 x float>*
1506 %wide.load = load <4 x float>, <4 x float>* %1, align 4
; Lane-wise: keep the accumulator lane where fcmp ult holds, else the loaded lane.
1507 %2 = fcmp ult <4 x float> %vec.phi, %wide.load
1508 %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1509 %index.next = add i32 %index, 4
1510 %4 = icmp eq i32 %index.next, %n.vec
1511 br i1 %4, label %middle.block, label %vector.body
1513 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar; skip the scalar loop if %n.vec == %n.
1514 %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
1515 %cmp.n = icmp eq i32 %n.vec, %n
1516 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1518 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1519 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1520 %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1523 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, using fcmp ult + select.
1524 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1525 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1526 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
1527 %6 = load float, float* %arrayidx, align 4
1528 %c = fcmp ult float %r.07, %6
1529 %add = select i1 %c, float %r.07, float %6
1530 %inc = add nuw nsw i32 %i.08, 1
1531 %exitcond = icmp eq i32 %inc, %n
1532 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1534 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1535 %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1536 ret float %r.0.lcssa
; fmax_f32: float max-style reduction over the %n float elements at %x.
; The vector loop keeps a <4 x float> accumulator (fcmp ugt + select) that
; starts at zero and is reduced once, in middle.block, with
; llvm.vector.reduce.fmax; a scalar loop then processes elements %n.vec..%n.
; The CHECK lines expect vcmp.f32 + vpsel inside the loop, three scalar
; vmaxnm.f32 in the middle block, and a constant-pool entry holding float 0.
1539 define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
1540 ; CHECK-LABEL: fmax_f32:
1541 ; CHECK: @ %bb.0: @ %entry
1542 ; CHECK-NEXT: .save {r7, lr}
1543 ; CHECK-NEXT: push {r7, lr}
1544 ; CHECK-NEXT: cmp r1, #1
1545 ; CHECK-NEXT: blt .LBB16_3
1546 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1547 ; CHECK-NEXT: cmp r1, #4
1548 ; CHECK-NEXT: bhs .LBB16_4
1549 ; CHECK-NEXT: @ %bb.2:
1550 ; CHECK-NEXT: vldr s0, .LCPI16_0
1551 ; CHECK-NEXT: movs r2, #0
1552 ; CHECK-NEXT: b .LBB16_7
1553 ; CHECK-NEXT: .LBB16_3:
1554 ; CHECK-NEXT: vldr s0, .LCPI16_0
1555 ; CHECK-NEXT: b .LBB16_9
1556 ; CHECK-NEXT: .LBB16_4: @ %vector.ph
1557 ; CHECK-NEXT: bic r2, r1, #3
1558 ; CHECK-NEXT: movs r3, #1
1559 ; CHECK-NEXT: sub.w r12, r2, #4
1560 ; CHECK-NEXT: vmov.i32 q0, #0x0
1561 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
1562 ; CHECK-NEXT: mov r3, r0
1563 ; CHECK-NEXT: .LBB16_5: @ %vector.body
1564 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1565 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
1566 ; CHECK-NEXT: vcmp.f32 lt, q1, q0
1567 ; CHECK-NEXT: vpsel q0, q0, q1
1568 ; CHECK-NEXT: le lr, .LBB16_5
1569 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1570 ; CHECK-NEXT: vmaxnm.f32 s2, s2, s3
1571 ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
1572 ; CHECK-NEXT: vmaxnm.f32 s0, s0, s2
1573 ; CHECK-NEXT: cmp r2, r1
1574 ; CHECK-NEXT: beq .LBB16_9
1575 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1
1576 ; CHECK-NEXT: sub.w lr, r1, r2
1577 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
1578 ; CHECK-NEXT: .LBB16_8: @ %for.body
1579 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1580 ; CHECK-NEXT: vldmia r0!, {s2}
1581 ; CHECK-NEXT: vcmp.f32 s2, s0
1582 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
1583 ; CHECK-NEXT: vselge.f32 s0, s2, s0
1584 ; CHECK-NEXT: le lr, .LBB16_8
1585 ; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup
1586 ; CHECK-NEXT: vmov r0, s0
1587 ; CHECK-NEXT: pop {r7, pc}
1588 ; CHECK-NEXT: .p2align 2
1589 ; CHECK-NEXT: @ %bb.10:
1590 ; CHECK-NEXT: .LCPI16_0:
1591 ; CHECK-NEXT: .long 0x00000000 @ float 0
; n <= 0: branch straight to for.cond.cleanup, whose phi yields 0.0.
1593 %cmp6 = icmp sgt i32 %n, 0
1594 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1596 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and use the scalar loop only.
1597 %min.iters.check = icmp ult i32 %n, 4
1598 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1600 vector.ph: ; preds = %for.body.preheader
; %n.vec = %n with the low two bits cleared (a multiple of 4).
1601 %n.vec = and i32 %n, -4
1602 br label %vector.body
1604 vector.body: ; preds = %vector.body, %vector.ph
1605 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1606 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1607 %0 = getelementptr inbounds float, float* %x, i32 %index
1608 %1 = bitcast float* %0 to <4 x float>*
1609 %wide.load = load <4 x float>, <4 x float>* %1, align 4
; Lane-wise: keep the accumulator lane where fcmp ugt holds, else the loaded lane.
1610 %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
1611 %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1612 %index.next = add i32 %index, 4
1613 %4 = icmp eq i32 %index.next, %n.vec
1614 br i1 %4, label %middle.block, label %vector.body
1616 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar; skip the scalar loop if %n.vec == %n.
1617 %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
1618 %cmp.n = icmp eq i32 %n.vec, %n
1619 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1621 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1622 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1623 %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1626 for.body: ; preds = %for.body.preheader1, %for.body
; Scalar loop: one element per iteration, using fcmp ugt + select.
1627 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1628 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1629 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
1630 %6 = load float, float* %arrayidx, align 4
1631 %c = fcmp ugt float %r.07, %6
1632 %add = select i1 %c, float %r.07, float %6
1633 %inc = add nuw nsw i32 %i.08, 1
1634 %exitcond = icmp eq i32 %inc, %n
1635 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1637 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1638 %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1639 ret float %r.0.lcssa
; add4i32: tail-predicated sum of the i32 elements %x[0 .. %n).
; Each iteration loads 4 lanes under the active lane mask, zeroes the inactive
; lanes, reduces the vector with llvm.vector.reduce.add and adds the result to
; a scalar accumulator phi. The CHECK lines expect a dlstp.32/letp loop around
; a single vaddva.u32.
1642 define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
1643 ; CHECK-LABEL: add4i32:
1644 ; CHECK: @ %bb.0: @ %entry
1645 ; CHECK-NEXT: .save {r7, lr}
1646 ; CHECK-NEXT: push {r7, lr}
1647 ; CHECK-NEXT: cbz r1, .LBB17_4
1648 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1649 ; CHECK-NEXT: movs r2, #0
1650 ; CHECK-NEXT: dlstp.32 lr, r1
1651 ; CHECK-NEXT: .LBB17_2: @ %vector.body
1652 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1653 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1654 ; CHECK-NEXT: vaddva.u32 r2, q0
1655 ; CHECK-NEXT: letp lr, .LBB17_2
1656 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1657 ; CHECK-NEXT: mov r0, r2
1658 ; CHECK-NEXT: pop {r7, pc}
1659 ; CHECK-NEXT: .LBB17_4:
1660 ; CHECK-NEXT: movs r2, #0
1661 ; CHECK-NEXT: mov r0, r2
1662 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1664 %cmp6.not = icmp eq i32 %n, 0
1665 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1667 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 4 (the lane count) for the exit test.
1668 %n.rnd.up = add i32 %n, 3
1669 %n.vec = and i32 %n.rnd.up, -4
1670 br label %vector.body
1672 vector.body: ; preds = %vector.body, %vector.ph
1673 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1674 %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
1675 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1676 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1677 %1 = bitcast i32* %0 to <4 x i32>*
1678 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
; Force masked-off lanes to zero so they do not contribute to the sum.
1679 %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
1680 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1681 %4 = add i32 %3, %vec.phi
1682 %index.next = add i32 %index, 4
1683 %5 = icmp eq i32 %index.next, %n.vec
1684 br i1 %5, label %for.cond.cleanup, label %vector.body
1686 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
1687 %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
; mla4i32: tail-predicated i32 dot product of %x[0 .. %n) and %y[0 .. %n).
; Each iteration does two masked 4-lane loads, multiplies them lane-wise,
; zeroes the inactive lanes, reduces with llvm.vector.reduce.add and
; accumulates into a scalar phi. The CHECK lines expect a dlstp.32/letp loop
; around a single vmlava.u32.
1691 define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
1692 ; CHECK-LABEL: mla4i32:
1693 ; CHECK: @ %bb.0: @ %entry
1694 ; CHECK-NEXT: .save {r7, lr}
1695 ; CHECK-NEXT: push {r7, lr}
1696 ; CHECK-NEXT: cbz r2, .LBB18_4
1697 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1698 ; CHECK-NEXT: mov.w r12, #0
1699 ; CHECK-NEXT: dlstp.32 lr, r2
1700 ; CHECK-NEXT: .LBB18_2: @ %vector.body
1701 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1702 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1703 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
1704 ; CHECK-NEXT: vmlava.u32 r12, q1, q0
1705 ; CHECK-NEXT: letp lr, .LBB18_2
1706 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1707 ; CHECK-NEXT: mov r0, r12
1708 ; CHECK-NEXT: pop {r7, pc}
1709 ; CHECK-NEXT: .LBB18_4:
1710 ; CHECK-NEXT: mov.w r12, #0
1711 ; CHECK-NEXT: mov r0, r12
1712 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1714 %cmp8.not = icmp eq i32 %n, 0
1715 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
1717 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 4 (the lane count) for the exit test.
1718 %n.rnd.up = add i32 %n, 3
1719 %n.vec = and i32 %n.rnd.up, -4
1720 br label %vector.body
1722 vector.body: ; preds = %vector.body, %vector.ph
1723 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1724 %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
1725 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1726 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1727 %1 = bitcast i32* %0 to <4 x i32>*
1728 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1729 %2 = getelementptr inbounds i32, i32* %y, i32 %index
1730 %3 = bitcast i32* %2 to <4 x i32>*
1731 %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1732 %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
; Force masked-off lanes of the product to zero before the reduction.
1733 %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
1734 %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
1735 %7 = add i32 %6, %vec.phi
1736 %index.next = add i32 %index, 4
1737 %8 = icmp eq i32 %index.next, %n.vec
1738 br i1 %8, label %for.cond.cleanup, label %vector.body
1740 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
1741 %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
; add8i32: tail-predicated sum of the i16 elements %x[0 .. %n), accumulated
; as i32. Each iteration loads 8 lanes under the active lane mask,
; sign-extends them to i32, zeroes the inactive lanes and reduces with
; llvm.vector.reduce.add. The CHECK lines expect a dlstp.16/letp loop around
; a single vaddva.s16 (the sext is folded into the signed reduction).
1745 define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
1746 ; CHECK-LABEL: add8i32:
1747 ; CHECK: @ %bb.0: @ %entry
1748 ; CHECK-NEXT: .save {r7, lr}
1749 ; CHECK-NEXT: push {r7, lr}
1750 ; CHECK-NEXT: cbz r1, .LBB19_4
1751 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1752 ; CHECK-NEXT: movs r2, #0
1753 ; CHECK-NEXT: dlstp.16 lr, r1
1754 ; CHECK-NEXT: .LBB19_2: @ %vector.body
1755 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1756 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1757 ; CHECK-NEXT: vaddva.s16 r2, q0
1758 ; CHECK-NEXT: letp lr, .LBB19_2
1759 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1760 ; CHECK-NEXT: mov r0, r2
1761 ; CHECK-NEXT: pop {r7, pc}
1762 ; CHECK-NEXT: .LBB19_4:
1763 ; CHECK-NEXT: movs r2, #0
1764 ; CHECK-NEXT: mov r0, r2
1765 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1767 %cmp6.not = icmp eq i32 %n, 0
1768 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1770 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 8 (the lane count) for the exit test.
1771 %n.rnd.up = add i32 %n, 7
1772 %n.vec = and i32 %n.rnd.up, -8
1773 br label %vector.body
1775 vector.body: ; preds = %vector.body, %vector.ph
1776 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1777 %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
1778 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1779 %0 = getelementptr inbounds i16, i16* %x, i32 %index
1780 %1 = bitcast i16* %0 to <8 x i16>*
1781 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1782 %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
; Force masked-off lanes to zero so they do not contribute to the sum.
1783 %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
1784 %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
1785 %5 = add i32 %4, %vec.phi
1786 %index.next = add i32 %index, 8
1787 %6 = icmp eq i32 %index.next, %n.vec
1788 br i1 %6, label %for.cond.cleanup, label %vector.body
1790 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
1791 %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
; mla8i32: tail-predicated dot product of the i16 arrays %x and %y over
; [0 .. %n), accumulated as i32. Both masked 8-lane loads are sign-extended
; to i32 before the lane-wise multiply; inactive lanes are zeroed and the
; product is reduced with llvm.vector.reduce.add. The CHECK lines expect a
; dlstp.16/letp loop around a single vmlava.s16.
1795 define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
1796 ; CHECK-LABEL: mla8i32:
1797 ; CHECK: @ %bb.0: @ %entry
1798 ; CHECK-NEXT: .save {r7, lr}
1799 ; CHECK-NEXT: push {r7, lr}
1800 ; CHECK-NEXT: cbz r2, .LBB20_4
1801 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1802 ; CHECK-NEXT: mov.w r12, #0
1803 ; CHECK-NEXT: dlstp.16 lr, r2
1804 ; CHECK-NEXT: .LBB20_2: @ %vector.body
1805 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1806 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1807 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1808 ; CHECK-NEXT: vmlava.s16 r12, q1, q0
1809 ; CHECK-NEXT: letp lr, .LBB20_2
1810 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1811 ; CHECK-NEXT: mov r0, r12
1812 ; CHECK-NEXT: pop {r7, pc}
1813 ; CHECK-NEXT: .LBB20_4:
1814 ; CHECK-NEXT: mov.w r12, #0
1815 ; CHECK-NEXT: mov r0, r12
1816 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1818 %cmp9.not = icmp eq i32 %n, 0
1819 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
1821 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 8 (the lane count) for the exit test.
1822 %n.rnd.up = add i32 %n, 7
1823 %n.vec = and i32 %n.rnd.up, -8
1824 br label %vector.body
1826 vector.body: ; preds = %vector.body, %vector.ph
1827 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1828 %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
1829 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1830 %0 = getelementptr inbounds i16, i16* %x, i32 %index
1831 %1 = bitcast i16* %0 to <8 x i16>*
1832 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1833 %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
1834 %3 = getelementptr inbounds i16, i16* %y, i32 %index
1835 %4 = bitcast i16* %3 to <8 x i16>*
1836 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1837 %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
1838 %6 = mul nsw <8 x i32> %5, %2
; Force masked-off lanes of the product to zero before the reduction.
1839 %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
1840 %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
1841 %9 = add i32 %8, %vec.phi
1842 %index.next = add i32 %index, 8
1843 %10 = icmp eq i32 %index.next, %n.vec
1844 br i1 %10, label %for.cond.cleanup, label %vector.body
1846 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
1847 %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
; add16i32: tail-predicated sum of the i8 elements %x[0 .. %n), accumulated
; as i32. Each iteration loads 16 lanes under the active lane mask,
; zero-extends them to i32, zeroes the inactive lanes and reduces with
; llvm.vector.reduce.add. The CHECK lines expect a dlstp.8/letp loop around
; a single vaddva.u8 (the zext is folded into the unsigned reduction).
1851 define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
1852 ; CHECK-LABEL: add16i32:
1853 ; CHECK: @ %bb.0: @ %entry
1854 ; CHECK-NEXT: .save {r7, lr}
1855 ; CHECK-NEXT: push {r7, lr}
1856 ; CHECK-NEXT: cbz r1, .LBB21_4
1857 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1858 ; CHECK-NEXT: movs r2, #0
1859 ; CHECK-NEXT: dlstp.8 lr, r1
1860 ; CHECK-NEXT: .LBB21_2: @ %vector.body
1861 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1862 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
1863 ; CHECK-NEXT: vaddva.u8 r2, q0
1864 ; CHECK-NEXT: letp lr, .LBB21_2
1865 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1866 ; CHECK-NEXT: mov r0, r2
1867 ; CHECK-NEXT: pop {r7, pc}
1868 ; CHECK-NEXT: .LBB21_4:
1869 ; CHECK-NEXT: movs r2, #0
1870 ; CHECK-NEXT: mov r0, r2
1871 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1873 %cmp6.not = icmp eq i32 %n, 0
1874 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
1876 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 16 (the lane count) for the exit test.
1877 %n.rnd.up = add i32 %n, 15
1878 %n.vec = and i32 %n.rnd.up, -16
1879 br label %vector.body
1881 vector.body: ; preds = %vector.body, %vector.ph
1882 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1883 %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
1884 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1885 %0 = getelementptr inbounds i8, i8* %x, i32 %index
1886 %1 = bitcast i8* %0 to <16 x i8>*
1887 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1888 %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
; Force masked-off lanes to zero so they do not contribute to the sum.
1889 %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
1890 %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
1891 %5 = add i32 %4, %vec.phi
1892 %index.next = add i32 %index, 16
1893 %6 = icmp eq i32 %index.next, %n.vec
1894 br i1 %6, label %for.cond.cleanup, label %vector.body
1896 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
1897 %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
; mla16i32: tail-predicated dot product of the i8 arrays %x and %y over
; [0 .. %n), accumulated as i32. Both masked 16-lane loads are zero-extended
; to i32 before the lane-wise multiply; inactive lanes are zeroed and the
; product is reduced with llvm.vector.reduce.add. The CHECK lines expect a
; dlstp.8/letp loop around a single vmlava.u8.
1901 define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
1902 ; CHECK-LABEL: mla16i32:
1903 ; CHECK: @ %bb.0: @ %entry
1904 ; CHECK-NEXT: .save {r7, lr}
1905 ; CHECK-NEXT: push {r7, lr}
1906 ; CHECK-NEXT: cbz r2, .LBB22_4
1907 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1908 ; CHECK-NEXT: mov.w r12, #0
1909 ; CHECK-NEXT: dlstp.8 lr, r2
1910 ; CHECK-NEXT: .LBB22_2: @ %vector.body
1911 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1912 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
1913 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
1914 ; CHECK-NEXT: vmlava.u8 r12, q1, q0
1915 ; CHECK-NEXT: letp lr, .LBB22_2
1916 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1917 ; CHECK-NEXT: mov r0, r12
1918 ; CHECK-NEXT: pop {r7, pc}
1919 ; CHECK-NEXT: .LBB22_4:
1920 ; CHECK-NEXT: mov.w r12, #0
1921 ; CHECK-NEXT: mov r0, r12
1922 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1924 %cmp9.not = icmp eq i32 %n, 0
1925 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
1927 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 16 (the lane count) for the exit test.
1928 %n.rnd.up = add i32 %n, 15
1929 %n.vec = and i32 %n.rnd.up, -16
1930 br label %vector.body
1932 vector.body: ; preds = %vector.body, %vector.ph
1933 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1934 %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
1935 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1936 %0 = getelementptr inbounds i8, i8* %x, i32 %index
1937 %1 = bitcast i8* %0 to <16 x i8>*
1938 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1939 %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
1940 %3 = getelementptr inbounds i8, i8* %y, i32 %index
1941 %4 = bitcast i8* %3 to <16 x i8>*
1942 %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1943 %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
1944 %6 = mul nuw nsw <16 x i32> %5, %2
; Force masked-off lanes of the product to zero before the reduction.
1945 %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
1946 %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
1947 %9 = add i32 %8, %vec.phi
1948 %index.next = add i32 %index, 16
1949 %10 = icmp eq i32 %index.next, %n.vec
1950 br i1 %10, label %for.cond.cleanup, label %vector.body
1952 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
1953 %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
; add8i16: tail-predicated sum of the i16 elements %x[0 .. %n), accumulated
; in i16 (no widening). Each iteration loads 8 lanes under the active lane
; mask, zeroes the inactive lanes and reduces with llvm.vector.reduce.add.
; The CHECK lines expect a dlstp.16/letp loop around vaddva.u16, followed by
; an sxth of the accumulator for the signext i16 return.
1957 define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
1958 ; CHECK-LABEL: add8i16:
1959 ; CHECK: @ %bb.0: @ %entry
1960 ; CHECK-NEXT: .save {r7, lr}
1961 ; CHECK-NEXT: push {r7, lr}
1962 ; CHECK-NEXT: cbz r1, .LBB23_4
1963 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1964 ; CHECK-NEXT: movs r2, #0
1965 ; CHECK-NEXT: dlstp.16 lr, r1
1966 ; CHECK-NEXT: .LBB23_2: @ %vector.body
1967 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1968 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1969 ; CHECK-NEXT: vaddva.u16 r2, q0
1970 ; CHECK-NEXT: letp lr, .LBB23_2
1971 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1972 ; CHECK-NEXT: sxth r0, r2
1973 ; CHECK-NEXT: pop {r7, pc}
1974 ; CHECK-NEXT: .LBB23_4:
1975 ; CHECK-NEXT: movs r2, #0
1976 ; CHECK-NEXT: sxth r0, r2
1977 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
1979 %cmp8.not = icmp eq i32 %n, 0
1980 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
1982 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 8 (the lane count) for the exit test.
1983 %n.rnd.up = add i32 %n, 7
1984 %n.vec = and i32 %n.rnd.up, -8
1985 br label %vector.body
1987 vector.body: ; preds = %vector.body, %vector.ph
1988 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1989 %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
1990 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1991 %0 = getelementptr inbounds i16, i16* %x, i32 %index
1992 %1 = bitcast i16* %0 to <8 x i16>*
1993 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
; Force masked-off lanes to zero so they do not contribute to the sum.
1994 %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
1995 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
1996 %4 = add i16 %3, %vec.phi
1997 %index.next = add i32 %index, 8
1998 %5 = icmp eq i32 %index.next, %n.vec
1999 br i1 %5, label %for.cond.cleanup, label %vector.body
2001 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2002 %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
; mla8i16: tail-predicated dot product of the i16 arrays %x and %y over
; [0 .. %n), accumulated in i16 (no widening). Each iteration does two masked
; 8-lane loads, multiplies them lane-wise, zeroes the inactive lanes and
; reduces with llvm.vector.reduce.add. The CHECK lines expect a
; dlstp.16/letp loop around vmlava.u16, followed by an sxth.w of the
; accumulator for the signext i16 return.
2006 define signext i16 @mla8i16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
2007 ; CHECK-LABEL: mla8i16:
2008 ; CHECK: @ %bb.0: @ %entry
2009 ; CHECK-NEXT: .save {r7, lr}
2010 ; CHECK-NEXT: push {r7, lr}
2011 ; CHECK-NEXT: cbz r2, .LBB24_4
2012 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2013 ; CHECK-NEXT: mov.w r12, #0
2014 ; CHECK-NEXT: dlstp.16 lr, r2
2015 ; CHECK-NEXT: .LBB24_2: @ %vector.body
2016 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2017 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
2018 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
2019 ; CHECK-NEXT: vmlava.u16 r12, q1, q0
2020 ; CHECK-NEXT: letp lr, .LBB24_2
2021 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2022 ; CHECK-NEXT: sxth.w r0, r12
2023 ; CHECK-NEXT: pop {r7, pc}
2024 ; CHECK-NEXT: .LBB24_4:
2025 ; CHECK-NEXT: mov.w r12, #0
2026 ; CHECK-NEXT: sxth.w r0, r12
2027 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2029 %cmp11.not = icmp eq i32 %n, 0
2030 br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph
2032 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 8 (the lane count) for the exit test.
2033 %n.rnd.up = add i32 %n, 7
2034 %n.vec = and i32 %n.rnd.up, -8
2035 br label %vector.body
2037 vector.body: ; preds = %vector.body, %vector.ph
2038 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2039 %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
2040 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2041 %0 = getelementptr inbounds i16, i16* %x, i32 %index
2042 %1 = bitcast i16* %0 to <8 x i16>*
2043 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2044 %2 = getelementptr inbounds i16, i16* %y, i32 %index
2045 %3 = bitcast i16* %2 to <8 x i16>*
2046 %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2047 %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
; Force masked-off lanes of the product to zero before the reduction.
2048 %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
2049 %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
2050 %7 = add i16 %6, %vec.phi
2051 %index.next = add i32 %index, 8
2052 %8 = icmp eq i32 %index.next, %n.vec
2053 br i1 %8, label %for.cond.cleanup, label %vector.body
2055 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2056 %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
; add16i16: tail-predicated sum of the i8 elements %x[0 .. %n), accumulated
; as i16. Each iteration loads 16 lanes under the active lane mask,
; zero-extends them to i16, zeroes the inactive lanes and reduces with
; llvm.vector.reduce.add. The CHECK lines expect a dlstp.8/letp loop around
; vaddva.u8, followed by an sxth of the accumulator for the signext i16
; return.
2060 define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
2061 ; CHECK-LABEL: add16i16:
2062 ; CHECK: @ %bb.0: @ %entry
2063 ; CHECK-NEXT: .save {r7, lr}
2064 ; CHECK-NEXT: push {r7, lr}
2065 ; CHECK-NEXT: cbz r1, .LBB25_4
2066 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2067 ; CHECK-NEXT: movs r2, #0
2068 ; CHECK-NEXT: dlstp.8 lr, r1
2069 ; CHECK-NEXT: .LBB25_2: @ %vector.body
2070 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2071 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2072 ; CHECK-NEXT: vaddva.u8 r2, q0
2073 ; CHECK-NEXT: letp lr, .LBB25_2
2074 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2075 ; CHECK-NEXT: sxth r0, r2
2076 ; CHECK-NEXT: pop {r7, pc}
2077 ; CHECK-NEXT: .LBB25_4:
2078 ; CHECK-NEXT: movs r2, #0
2079 ; CHECK-NEXT: sxth r0, r2
2080 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2082 %cmp8.not = icmp eq i32 %n, 0
2083 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
2085 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 16 (the lane count) for the exit test.
2086 %n.rnd.up = add i32 %n, 15
2087 %n.vec = and i32 %n.rnd.up, -16
2088 br label %vector.body
2090 vector.body: ; preds = %vector.body, %vector.ph
2091 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2092 %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
2093 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2094 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2095 %1 = bitcast i8* %0 to <16 x i8>*
2096 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2097 %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
; Force masked-off lanes to zero so they do not contribute to the sum.
2098 %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
2099 %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
2100 %5 = add i16 %4, %vec.phi
2101 %index.next = add i32 %index, 16
2102 %6 = icmp eq i32 %index.next, %n.vec
2103 br i1 %6, label %for.cond.cleanup, label %vector.body
2105 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2106 %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
; mla16i16: tail-predicated dot product of the i8 arrays %x and %y over
; [0 .. %n), accumulated as i16. Both masked 16-lane loads are zero-extended
; to i16 before the lane-wise multiply; inactive lanes are zeroed and the
; product is reduced with llvm.vector.reduce.add. The CHECK lines expect a
; dlstp.8/letp loop around vmlava.u8, followed by an sxth.w of the
; accumulator for the signext i16 return.
2110 define signext i16 @mla16i16(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
2111 ; CHECK-LABEL: mla16i16:
2112 ; CHECK: @ %bb.0: @ %entry
2113 ; CHECK-NEXT: .save {r7, lr}
2114 ; CHECK-NEXT: push {r7, lr}
2115 ; CHECK-NEXT: cbz r2, .LBB26_4
2116 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2117 ; CHECK-NEXT: mov.w r12, #0
2118 ; CHECK-NEXT: dlstp.8 lr, r2
2119 ; CHECK-NEXT: .LBB26_2: @ %vector.body
2120 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2121 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2122 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2123 ; CHECK-NEXT: vmlava.u8 r12, q1, q0
2124 ; CHECK-NEXT: letp lr, .LBB26_2
2125 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2126 ; CHECK-NEXT: sxth.w r0, r12
2127 ; CHECK-NEXT: pop {r7, pc}
2128 ; CHECK-NEXT: .LBB26_4:
2129 ; CHECK-NEXT: mov.w r12, #0
2130 ; CHECK-NEXT: sxth.w r0, r12
2131 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2133 %cmp13.not = icmp eq i32 %n, 0
2134 br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph
2136 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 16 (the lane count) for the exit test.
2137 %n.rnd.up = add i32 %n, 15
2138 %n.vec = and i32 %n.rnd.up, -16
2139 br label %vector.body
2141 vector.body: ; preds = %vector.body, %vector.ph
2142 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2143 %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
2144 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2145 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2146 %1 = bitcast i8* %0 to <16 x i8>*
2147 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2148 %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
2149 %3 = getelementptr inbounds i8, i8* %y, i32 %index
2150 %4 = bitcast i8* %3 to <16 x i8>*
2151 %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2152 %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
2153 %6 = mul nuw <16 x i16> %5, %2
; Force masked-off lanes of the product to zero before the reduction.
2154 %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
2155 %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
2156 %9 = add i16 %8, %vec.phi
2157 %index.next = add i32 %index, 16
2158 %10 = icmp eq i32 %index.next, %n.vec
2159 br i1 %10, label %for.cond.cleanup, label %vector.body
2161 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2162 %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
; add16i8: tail-predicated sum of the i8 elements %x[0 .. %n), accumulated
; in i8 (no widening). Each iteration loads 16 lanes under the active lane
; mask, zeroes the inactive lanes and reduces with llvm.vector.reduce.add.
; The CHECK lines expect a dlstp.8/letp loop around vaddva.u8, followed by a
; uxtb of the accumulator for the zeroext i8 return.
2166 define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
2167 ; CHECK-LABEL: add16i8:
2168 ; CHECK: @ %bb.0: @ %entry
2169 ; CHECK-NEXT: .save {r7, lr}
2170 ; CHECK-NEXT: push {r7, lr}
2171 ; CHECK-NEXT: cbz r1, .LBB27_4
2172 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2173 ; CHECK-NEXT: movs r2, #0
2174 ; CHECK-NEXT: dlstp.8 lr, r1
2175 ; CHECK-NEXT: .LBB27_2: @ %vector.body
2176 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2177 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2178 ; CHECK-NEXT: vaddva.u8 r2, q0
2179 ; CHECK-NEXT: letp lr, .LBB27_2
2180 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2181 ; CHECK-NEXT: uxtb r0, r2
2182 ; CHECK-NEXT: pop {r7, pc}
2183 ; CHECK-NEXT: .LBB27_4:
2184 ; CHECK-NEXT: movs r2, #0
2185 ; CHECK-NEXT: uxtb r0, r2
2186 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2188 %cmp7.not = icmp eq i32 %n, 0
2189 br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph
2191 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 16 (the lane count) for the exit test.
2192 %n.rnd.up = add i32 %n, 15
2193 %n.vec = and i32 %n.rnd.up, -16
2194 br label %vector.body
2196 vector.body: ; preds = %vector.body, %vector.ph
2197 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2198 %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
2199 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2200 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2201 %1 = bitcast i8* %0 to <16 x i8>*
2202 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
; Force masked-off lanes to zero so they do not contribute to the sum.
2203 %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
2204 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
2205 %4 = add i8 %3, %vec.phi
2206 %index.next = add i32 %index, 16
2207 %5 = icmp eq i32 %index.next, %n.vec
2208 br i1 %5, label %for.cond.cleanup, label %vector.body
2210 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2211 %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
; mla16i8: tail-predicated dot product of the i8 arrays %x and %y over
; [0 .. %n), accumulated in i8 (no widening). Each iteration does two masked
; 16-lane loads, multiplies them lane-wise, zeroes the inactive lanes and
; reduces with llvm.vector.reduce.add. The CHECK lines expect a dlstp.8/letp
; loop around vmlava.u8, followed by a uxtb.w of the accumulator for the
; zeroext i8 return.
2215 define zeroext i8 @mla16i8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
2216 ; CHECK-LABEL: mla16i8:
2217 ; CHECK: @ %bb.0: @ %entry
2218 ; CHECK-NEXT: .save {r7, lr}
2219 ; CHECK-NEXT: push {r7, lr}
2220 ; CHECK-NEXT: cbz r2, .LBB28_4
2221 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2222 ; CHECK-NEXT: mov.w r12, #0
2223 ; CHECK-NEXT: dlstp.8 lr, r2
2224 ; CHECK-NEXT: .LBB28_2: @ %vector.body
2225 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2226 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2227 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2228 ; CHECK-NEXT: vmlava.u8 r12, q1, q0
2229 ; CHECK-NEXT: letp lr, .LBB28_2
2230 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2231 ; CHECK-NEXT: uxtb.w r0, r12
2232 ; CHECK-NEXT: pop {r7, pc}
2233 ; CHECK-NEXT: .LBB28_4:
2234 ; CHECK-NEXT: mov.w r12, #0
2235 ; CHECK-NEXT: uxtb.w r0, r12
2236 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2238 %cmp10.not = icmp eq i32 %n, 0
2239 br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
2241 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 16 (the lane count) for the exit test.
2242 %n.rnd.up = add i32 %n, 15
2243 %n.vec = and i32 %n.rnd.up, -16
2244 br label %vector.body
2246 vector.body: ; preds = %vector.body, %vector.ph
2247 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2248 %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
2249 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2250 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2251 %1 = bitcast i8* %0 to <16 x i8>*
2252 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2253 %2 = getelementptr inbounds i8, i8* %y, i32 %index
2254 %3 = bitcast i8* %2 to <16 x i8>*
2255 %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2256 %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
; Force masked-off lanes of the product to zero before the reduction.
2257 %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
2258 %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
2259 %7 = add i8 %6, %vec.phi
2260 %index.next = add i32 %index, 16
2261 %8 = icmp eq i32 %index.next, %n.vec
2262 br i1 %8, label %for.cond.cleanup, label %vector.body
2264 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2265 %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
; add4i64: tail-predicated sum of the i32 elements %x[0 .. %n), accumulated
; as i64. Each iteration loads 4 lanes under the active lane mask,
; sign-extends them to i64, zeroes the inactive lanes and reduces with
; llvm.vector.reduce.add. The CHECK lines expect a dlstp.32/letp loop around
; a single vaddlva.s32 that accumulates into the r2/r3 register pair.
2269 define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
2270 ; CHECK-LABEL: add4i64:
2271 ; CHECK: @ %bb.0: @ %entry
2272 ; CHECK-NEXT: .save {r7, lr}
2273 ; CHECK-NEXT: push {r7, lr}
2274 ; CHECK-NEXT: cbz r1, .LBB29_3
2275 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2276 ; CHECK-NEXT: movs r2, #0
2277 ; CHECK-NEXT: mov r3, r2
2278 ; CHECK-NEXT: dlstp.32 lr, r1
2279 ; CHECK-NEXT: .LBB29_2: @ %vector.body
2280 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2281 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2282 ; CHECK-NEXT: vaddlva.s32 r2, r3, q0
2283 ; CHECK-NEXT: letp lr, .LBB29_2
2284 ; CHECK-NEXT: b .LBB29_4
2285 ; CHECK-NEXT: .LBB29_3:
2286 ; CHECK-NEXT: movs r2, #0
2287 ; CHECK-NEXT: mov r3, r2
2288 ; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup
2289 ; CHECK-NEXT: mov r0, r2
2290 ; CHECK-NEXT: mov r1, r3
2291 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2293 %cmp6.not = icmp eq i32 %n, 0
2294 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
2296 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 4 (the lane count) for the exit test.
2297 %n.rnd.up = add i32 %n, 3
2298 %n.vec = and i32 %n.rnd.up, -4
2299 br label %vector.body
2301 vector.body: ; preds = %vector.body, %vector.ph
2302 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2303 %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
2304 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2305 %0 = getelementptr inbounds i32, i32* %x, i32 %index
2306 %1 = bitcast i32* %0 to <4 x i32>*
2307 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2308 %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
; Force masked-off lanes to zero so they do not contribute to the sum.
2309 %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
2310 %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
2311 %5 = add i64 %4, %vec.phi
2312 %index.next = add i32 %index, 4
2313 %6 = icmp eq i32 %index.next, %n.vec
2314 br i1 %6, label %for.cond.cleanup, label %vector.body
2316 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2317 %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
; mla4i64: tail-predicated dot product of the i32 arrays %x and %y over
; [0 .. %n), accumulated as i64. Both masked 4-lane loads are sign-extended
; to i64 before the lane-wise multiply; inactive lanes are zeroed and the
; product is reduced with llvm.vector.reduce.add. The CHECK lines expect a
; dlstp.32/letp loop around a single vmlalva.s32 that accumulates into the
; r12/r3 register pair.
2321 define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
2322 ; CHECK-LABEL: mla4i64:
2323 ; CHECK: @ %bb.0: @ %entry
2324 ; CHECK-NEXT: .save {r7, lr}
2325 ; CHECK-NEXT: push {r7, lr}
2326 ; CHECK-NEXT: cbz r2, .LBB30_3
2327 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2328 ; CHECK-NEXT: mov.w r12, #0
2329 ; CHECK-NEXT: mov r3, r12
2330 ; CHECK-NEXT: dlstp.32 lr, r2
2331 ; CHECK-NEXT: .LBB30_2: @ %vector.body
2332 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2333 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2334 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
2335 ; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0
2336 ; CHECK-NEXT: letp lr, .LBB30_2
2337 ; CHECK-NEXT: b .LBB30_4
2338 ; CHECK-NEXT: .LBB30_3:
2339 ; CHECK-NEXT: mov.w r12, #0
2340 ; CHECK-NEXT: mov r3, r12
2341 ; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup
2342 ; CHECK-NEXT: mov r0, r12
2343 ; CHECK-NEXT: mov r1, r3
2344 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2346 %cmp9.not = icmp eq i32 %n, 0
2347 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2349 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 4 (the lane count) for the exit test.
2350 %n.rnd.up = add i32 %n, 3
2351 %n.vec = and i32 %n.rnd.up, -4
2352 br label %vector.body
2354 vector.body: ; preds = %vector.body, %vector.ph
2355 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2356 %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
2357 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2358 %0 = getelementptr inbounds i32, i32* %x, i32 %index
2359 %1 = bitcast i32* %0 to <4 x i32>*
2360 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2361 %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
2362 %3 = getelementptr inbounds i32, i32* %y, i32 %index
2363 %4 = bitcast i32* %3 to <4 x i32>*
2364 %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2365 %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
2366 %6 = mul nsw <4 x i64> %5, %2
; Force masked-off lanes of the product to zero before the reduction.
2367 %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
2368 %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
2369 %9 = add i64 %8, %vec.phi
2370 %index.next = add i32 %index, 4
2371 %10 = icmp eq i32 %index.next, %n.vec
2372 br i1 %10, label %for.cond.cleanup, label %vector.body
2374 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2375 %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
; mla8i64: tail-predicated dot product of the i16 arrays %x and %y over
; [0 .. %n), accumulated as i64. Both masked 8-lane loads are sign-extended
; to i64 before the lane-wise multiply; inactive lanes are zeroed and the
; product is reduced with llvm.vector.reduce.add. The CHECK lines expect a
; dlstp.16/letp loop around a single vmlalva.s16 that accumulates into the
; r12/r3 register pair.
2379 define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
2380 ; CHECK-LABEL: mla8i64:
2381 ; CHECK: @ %bb.0: @ %entry
2382 ; CHECK-NEXT: .save {r7, lr}
2383 ; CHECK-NEXT: push {r7, lr}
2384 ; CHECK-NEXT: cbz r2, .LBB31_3
2385 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2386 ; CHECK-NEXT: mov.w r12, #0
2387 ; CHECK-NEXT: mov r3, r12
2388 ; CHECK-NEXT: dlstp.16 lr, r2
2389 ; CHECK-NEXT: .LBB31_2: @ %vector.body
2390 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2391 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
2392 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
2393 ; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0
2394 ; CHECK-NEXT: letp lr, .LBB31_2
2395 ; CHECK-NEXT: b .LBB31_4
2396 ; CHECK-NEXT: .LBB31_3:
2397 ; CHECK-NEXT: mov.w r12, #0
2398 ; CHECK-NEXT: mov r3, r12
2399 ; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup
2400 ; CHECK-NEXT: mov r0, r12
2401 ; CHECK-NEXT: mov r1, r3
2402 ; CHECK-NEXT: pop {r7, pc}
; Skip the loop entirely when %n == 0.
2404 %cmp9.not = icmp eq i32 %n, 0
2405 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2407 vector.ph: ; preds = %entry
; Round %n up to the next multiple of 8 (the lane count) for the exit test.
2408 %n.rnd.up = add i32 %n, 7
2409 %n.vec = and i32 %n.rnd.up, -8
2410 br label %vector.body
2412 vector.body: ; preds = %vector.body, %vector.ph
2413 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2414 %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
2415 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2416 %0 = getelementptr inbounds i16, i16* %x, i32 %index
2417 %1 = bitcast i16* %0 to <8 x i16>*
2418 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2419 %2 = sext <8 x i16> %wide.masked.load to <8 x i64>
2420 %3 = getelementptr inbounds i16, i16* %y, i32 %index
2421 %4 = bitcast i16* %3 to <8 x i16>*
2422 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2423 %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
2424 %6 = mul nsw <8 x i64> %5, %2
; Force masked-off lanes of the product to zero before the reduction.
2425 %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
2426 %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
2427 %9 = add i64 %8, %vec.phi
2428 %index.next = add i32 %index, 8
2429 %10 = icmp eq i32 %index.next, %n.vec
2430 br i1 %10, label %for.cond.cleanup, label %vector.body
2432 for.cond.cleanup: ; preds = %vector.body, %entry
; 0 when the loop was skipped (%n == 0), otherwise the final accumulator.
2433 %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
; Intrinsic declarations.
; First group: active-lane-mask, masked-load and integer add-reduction
; intrinsics. These carry attribute group references (#1, #2, #3).
; NOTE(review): the attribute group definitions are not visible in this part of
; the file - confirm where (or whether) they are defined.
2437 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
2438 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
2439 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
2440 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
2441 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
2442 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
2443 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
2444 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
2445 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
2446 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
2447 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
2448 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
2449 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
; Second group: four-lane reductions (v4i32 integer and v4f32 floating point),
; declared without attribute group references. The fadd/fmul reductions take an
; extra scalar float operand ahead of the vector.
2451 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
2452 declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
2453 declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
2454 declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2455 declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
2456 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
2457 declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
2458 declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
2459 declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
2460 declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
2461 declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
2462 declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
2463 declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)