1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -tail-predication=enabled -verify-machineinstrs %s -o - | FileCheck %s
; add_i32: integer add reduction over the i32 elements of %x, with the
; reduction done inside the vector loop. Each vector iteration loads a
; <4 x i32>, reduces it with llvm.vector.reduce.add and adds that to a scalar
; i32 accumulator phi. The expected assembly below does the per-iteration
; accumulate with a single vaddva.u32 into r0.
; Counts below 4 branch straight to the scalar loop; otherwise the scalar loop
; only processes the elements from index %n.vec (%n & -4) up to %n.
4 define i32 @add_i32(i32* nocapture readonly %x, i32 %n) {
5 ; CHECK-LABEL: add_i32:
6 ; CHECK: @ %bb.0: @ %entry
7 ; CHECK-NEXT: .save {r7, lr}
8 ; CHECK-NEXT: push {r7, lr}
9 ; CHECK-NEXT: cmp r1, #1
10 ; CHECK-NEXT: blt .LBB0_3
11 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
12 ; CHECK-NEXT: mov r12, r0
13 ; CHECK-NEXT: cmp r1, #4
14 ; CHECK-NEXT: bhs .LBB0_4
15 ; CHECK-NEXT: @ %bb.2:
16 ; CHECK-NEXT: movs r3, #0
17 ; CHECK-NEXT: movs r0, #0
18 ; CHECK-NEXT: b .LBB0_7
19 ; CHECK-NEXT: .LBB0_3:
20 ; CHECK-NEXT: movs r0, #0
21 ; CHECK-NEXT: pop {r7, pc}
22 ; CHECK-NEXT: .LBB0_4: @ %vector.ph
23 ; CHECK-NEXT: bic r3, r1, #3
24 ; CHECK-NEXT: movs r2, #1
25 ; CHECK-NEXT: subs r0, r3, #4
26 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
27 ; CHECK-NEXT: movs r0, #0
28 ; CHECK-NEXT: mov r2, r12
29 ; CHECK-NEXT: .LBB0_5: @ %vector.body
30 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
31 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
32 ; CHECK-NEXT: vaddva.u32 r0, q0
33 ; CHECK-NEXT: le lr, .LBB0_5
34 ; CHECK-NEXT: @ %bb.6: @ %middle.block
35 ; CHECK-NEXT: cmp r3, r1
37 ; CHECK-NEXT: popeq {r7, pc}
38 ; CHECK-NEXT: .LBB0_7: @ %for.body.preheader1
39 ; CHECK-NEXT: sub.w lr, r1, r3
40 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
41 ; CHECK-NEXT: .LBB0_8: @ %for.body
42 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
43 ; CHECK-NEXT: ldr r1, [r2], #4
44 ; CHECK-NEXT: add r0, r1
45 ; CHECK-NEXT: le lr, .LBB0_8
46 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
47 ; CHECK-NEXT: pop {r7, pc}
49 %cmp6 = icmp sgt i32 %n, 0
50 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
52 for.body.preheader: ; preds = %entry
53 %min.iters.check = icmp ult i32 %n, 4
54 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
56 vector.ph: ; preds = %for.body.preheader
57 %n.vec = and i32 %n, -4
; Vector loop: four elements per iteration; the accumulator %vec.phi is a
; scalar i32, so the horizontal reduction happens on every iteration.
60 vector.body: ; preds = %vector.body, %vector.ph
61 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
62 %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
63 %0 = getelementptr inbounds i32, i32* %x, i32 %index
64 %1 = bitcast i32* %0 to <4 x i32>*
65 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
66 %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %wide.load)
67 %3 = add i32 %2, %vec.phi
68 %index.next = add i32 %index, 4
69 %4 = icmp eq i32 %index.next, %n.vec
70 br i1 %4, label %middle.block, label %vector.body
72 middle.block: ; preds = %vector.body
73 %cmp.n = icmp eq i32 %n.vec, %n
74 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
76 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
77 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
78 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %3, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running sum %r.07.ph.
81 for.body: ; preds = %for.body.preheader1, %for.body
82 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
83 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
84 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
85 %5 = load i32, i32* %arrayidx, align 4
86 %add = add nsw i32 %5, %r.07
87 %inc = add nuw nsw i32 %i.08, 1
88 %exitcond = icmp eq i32 %inc, %n
89 br i1 %exitcond, label %for.cond.cleanup, label %for.body
91 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
92 %r.0.lcssa = phi i32 [ 0, %entry ], [ %3, %middle.block ], [ %add, %for.body ]
; mul_i32: integer multiply reduction over the i32 elements of %x. The vector
; loop keeps a <4 x i32> accumulator (all lanes start at 1) and multiplies it
; lane-wise with each load; llvm.vector.reduce.mul is called once, in
; middle.block. The expected assembly below uses vmul.i32 in the loop, then
; moves the four lanes to core registers with vmov and combines them with
; three scalar mul instructions.
96 define i32 @mul_i32(i32* nocapture readonly %x, i32 %n) {
97 ; CHECK-LABEL: mul_i32:
98 ; CHECK: @ %bb.0: @ %entry
99 ; CHECK-NEXT: .save {r4, lr}
100 ; CHECK-NEXT: push {r4, lr}
101 ; CHECK-NEXT: movs r2, #1
102 ; CHECK-NEXT: cmp r1, #1
103 ; CHECK-NEXT: blt .LBB1_8
104 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
105 ; CHECK-NEXT: cmp r1, #4
106 ; CHECK-NEXT: bhs .LBB1_3
107 ; CHECK-NEXT: @ %bb.2:
108 ; CHECK-NEXT: mov.w r12, #0
109 ; CHECK-NEXT: b .LBB1_6
110 ; CHECK-NEXT: .LBB1_3: @ %vector.ph
111 ; CHECK-NEXT: bic r12, r1, #3
112 ; CHECK-NEXT: vmov.i32 q0, #0x1
113 ; CHECK-NEXT: sub.w r3, r12, #4
114 ; CHECK-NEXT: add.w lr, r2, r3, lsr #2
115 ; CHECK-NEXT: mov r2, r0
116 ; CHECK-NEXT: .LBB1_4: @ %vector.body
117 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
118 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
119 ; CHECK-NEXT: vmul.i32 q0, q1, q0
120 ; CHECK-NEXT: le lr, .LBB1_4
121 ; CHECK-NEXT: @ %bb.5: @ %middle.block
122 ; CHECK-NEXT: vmov lr, r3, d1
123 ; CHECK-NEXT: cmp r12, r1
124 ; CHECK-NEXT: vmov r2, r4, d0
125 ; CHECK-NEXT: mul r3, lr, r3
126 ; CHECK-NEXT: mul r2, r4, r2
127 ; CHECK-NEXT: mul r2, r3, r2
128 ; CHECK-NEXT: beq .LBB1_8
129 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader1
130 ; CHECK-NEXT: sub.w lr, r1, r12
131 ; CHECK-NEXT: add.w r0, r0, r12, lsl #2
132 ; CHECK-NEXT: .LBB1_7: @ %for.body
133 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
134 ; CHECK-NEXT: ldr r1, [r0], #4
135 ; CHECK-NEXT: muls r2, r1, r2
136 ; CHECK-NEXT: le lr, .LBB1_7
137 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup
138 ; CHECK-NEXT: mov r0, r2
139 ; CHECK-NEXT: pop {r4, pc}
141 %cmp6 = icmp sgt i32 %n, 0
142 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
144 for.body.preheader: ; preds = %entry
145 %min.iters.check = icmp ult i32 %n, 4
146 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
148 vector.ph: ; preds = %for.body.preheader
149 %n.vec = and i32 %n, -4
150 br label %vector.body
; Vector loop: four elements per iteration, accumulated lane-wise in %vec.phi.
152 vector.body: ; preds = %vector.body, %vector.ph
153 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
154 %vec.phi = phi <4 x i32> [ <i32 1, i32 1, i32 1, i32 1>, %vector.ph ], [ %2, %vector.body ]
155 %0 = getelementptr inbounds i32, i32* %x, i32 %index
156 %1 = bitcast i32* %0 to <4 x i32>*
157 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
158 %2 = mul <4 x i32> %wide.load, %vec.phi
159 %index.next = add i32 %index, 4
160 %3 = icmp eq i32 %index.next, %n.vec
161 br i1 %3, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
163 middle.block: ; preds = %vector.body
164 %4 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %2)
165 %cmp.n = icmp eq i32 %n.vec, %n
166 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
168 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
169 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
170 %r.07.ph = phi i32 [ 1, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running product %r.07.ph.
173 for.body: ; preds = %for.body.preheader1, %for.body
174 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
175 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
176 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
177 %5 = load i32, i32* %arrayidx, align 4
178 %add = mul nsw i32 %5, %r.07
179 %inc = add nuw nsw i32 %i.08, 1
180 %exitcond = icmp eq i32 %inc, %n
181 br i1 %exitcond, label %for.cond.cleanup, label %for.body
183 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
184 %r.0.lcssa = phi i32 [ 1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; and_i32: bitwise AND reduction over the i32 elements of %x. The vector loop
; keeps a <4 x i32> accumulator (all lanes start at -1) and ANDs it lane-wise
; with each load; llvm.vector.reduce.and is called once, in middle.block. The
; expected assembly below uses vand in the loop, then moves the four lanes to
; core registers with vmov and combines them with three and.w instructions.
188 define i32 @and_i32(i32* nocapture readonly %x, i32 %n) {
189 ; CHECK-LABEL: and_i32:
190 ; CHECK: @ %bb.0: @ %entry
191 ; CHECK-NEXT: .save {r4, lr}
192 ; CHECK-NEXT: push {r4, lr}
193 ; CHECK-NEXT: cmp r1, #1
194 ; CHECK-NEXT: blt .LBB2_3
195 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
196 ; CHECK-NEXT: cmp r1, #4
197 ; CHECK-NEXT: bhs .LBB2_4
198 ; CHECK-NEXT: @ %bb.2:
199 ; CHECK-NEXT: mov.w r2, #-1
200 ; CHECK-NEXT: movs r3, #0
201 ; CHECK-NEXT: b .LBB2_7
202 ; CHECK-NEXT: .LBB2_3:
203 ; CHECK-NEXT: mov.w r2, #-1
204 ; CHECK-NEXT: mov r0, r2
205 ; CHECK-NEXT: pop {r4, pc}
206 ; CHECK-NEXT: .LBB2_4: @ %vector.ph
207 ; CHECK-NEXT: bic r3, r1, #3
208 ; CHECK-NEXT: movs r2, #1
209 ; CHECK-NEXT: sub.w r12, r3, #4
210 ; CHECK-NEXT: vmov.i8 q0, #0xff
211 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
212 ; CHECK-NEXT: mov r2, r0
213 ; CHECK-NEXT: .LBB2_5: @ %vector.body
214 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
215 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
216 ; CHECK-NEXT: vand q0, q1, q0
217 ; CHECK-NEXT: le lr, .LBB2_5
218 ; CHECK-NEXT: @ %bb.6: @ %middle.block
219 ; CHECK-NEXT: vmov lr, r12, d1
220 ; CHECK-NEXT: cmp r3, r1
221 ; CHECK-NEXT: vmov r2, r4, d0
222 ; CHECK-NEXT: and.w r12, r12, lr
223 ; CHECK-NEXT: and.w r2, r2, r4
224 ; CHECK-NEXT: and.w r2, r2, r12
225 ; CHECK-NEXT: beq .LBB2_9
226 ; CHECK-NEXT: .LBB2_7: @ %for.body.preheader1
227 ; CHECK-NEXT: sub.w lr, r1, r3
228 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
229 ; CHECK-NEXT: .LBB2_8: @ %for.body
230 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
231 ; CHECK-NEXT: ldr r1, [r0], #4
232 ; CHECK-NEXT: ands r2, r1
233 ; CHECK-NEXT: le lr, .LBB2_8
234 ; CHECK-NEXT: .LBB2_9: @ %for.cond.cleanup
235 ; CHECK-NEXT: mov r0, r2
236 ; CHECK-NEXT: pop {r4, pc}
238 %cmp6 = icmp sgt i32 %n, 0
239 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
241 for.body.preheader: ; preds = %entry
242 %min.iters.check = icmp ult i32 %n, 4
243 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
245 vector.ph: ; preds = %for.body.preheader
246 %n.vec = and i32 %n, -4
247 br label %vector.body
; Vector loop: four elements per iteration, accumulated lane-wise in %vec.phi.
249 vector.body: ; preds = %vector.body, %vector.ph
250 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
251 %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %2, %vector.body ]
252 %0 = getelementptr inbounds i32, i32* %x, i32 %index
253 %1 = bitcast i32* %0 to <4 x i32>*
254 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
255 %2 = and <4 x i32> %wide.load, %vec.phi
256 %index.next = add i32 %index, 4
257 %3 = icmp eq i32 %index.next, %n.vec
258 br i1 %3, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
260 middle.block: ; preds = %vector.body
261 %4 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %2)
262 %cmp.n = icmp eq i32 %n.vec, %n
263 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
265 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
266 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
267 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running value %r.07.ph.
270 for.body: ; preds = %for.body.preheader1, %for.body
271 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
272 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
273 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
274 %5 = load i32, i32* %arrayidx, align 4
275 %add = and i32 %5, %r.07
276 %inc = add nuw nsw i32 %i.08, 1
277 %exitcond = icmp eq i32 %inc, %n
278 br i1 %exitcond, label %for.cond.cleanup, label %for.body
280 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
281 %r.0.lcssa = phi i32 [ -1, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; or_i32: bitwise OR reduction over the i32 elements of %x. The vector loop
; keeps a zero-initialised <4 x i32> accumulator and ORs it lane-wise with
; each load; llvm.vector.reduce.or is called once, in middle.block. The
; expected assembly below uses vorr in the loop, then moves the four lanes to
; core registers with vmov and combines them with three orr.w instructions.
285 define i32 @or_i32(i32* nocapture readonly %x, i32 %n) {
286 ; CHECK-LABEL: or_i32:
287 ; CHECK: @ %bb.0: @ %entry
288 ; CHECK-NEXT: .save {r4, lr}
289 ; CHECK-NEXT: push {r4, lr}
290 ; CHECK-NEXT: cmp r1, #1
291 ; CHECK-NEXT: blt .LBB3_3
292 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
293 ; CHECK-NEXT: cmp r1, #4
294 ; CHECK-NEXT: bhs .LBB3_4
295 ; CHECK-NEXT: @ %bb.2:
296 ; CHECK-NEXT: movs r3, #0
297 ; CHECK-NEXT: movs r2, #0
298 ; CHECK-NEXT: b .LBB3_7
299 ; CHECK-NEXT: .LBB3_3:
300 ; CHECK-NEXT: movs r2, #0
301 ; CHECK-NEXT: mov r0, r2
302 ; CHECK-NEXT: pop {r4, pc}
303 ; CHECK-NEXT: .LBB3_4: @ %vector.ph
304 ; CHECK-NEXT: bic r3, r1, #3
305 ; CHECK-NEXT: movs r2, #1
306 ; CHECK-NEXT: sub.w r12, r3, #4
307 ; CHECK-NEXT: vmov.i32 q0, #0x0
308 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
309 ; CHECK-NEXT: mov r2, r0
310 ; CHECK-NEXT: .LBB3_5: @ %vector.body
311 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
312 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
313 ; CHECK-NEXT: vorr q0, q1, q0
314 ; CHECK-NEXT: le lr, .LBB3_5
315 ; CHECK-NEXT: @ %bb.6: @ %middle.block
316 ; CHECK-NEXT: vmov lr, r12, d1
317 ; CHECK-NEXT: cmp r3, r1
318 ; CHECK-NEXT: vmov r2, r4, d0
319 ; CHECK-NEXT: orr.w r12, r12, lr
320 ; CHECK-NEXT: orr.w r2, r2, r4
321 ; CHECK-NEXT: orr.w r2, r2, r12
322 ; CHECK-NEXT: beq .LBB3_9
323 ; CHECK-NEXT: .LBB3_7: @ %for.body.preheader1
324 ; CHECK-NEXT: sub.w lr, r1, r3
325 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
326 ; CHECK-NEXT: .LBB3_8: @ %for.body
327 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
328 ; CHECK-NEXT: ldr r1, [r0], #4
329 ; CHECK-NEXT: orrs r2, r1
330 ; CHECK-NEXT: le lr, .LBB3_8
331 ; CHECK-NEXT: .LBB3_9: @ %for.cond.cleanup
332 ; CHECK-NEXT: mov r0, r2
333 ; CHECK-NEXT: pop {r4, pc}
335 %cmp6 = icmp sgt i32 %n, 0
336 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
338 for.body.preheader: ; preds = %entry
339 %min.iters.check = icmp ult i32 %n, 4
340 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
342 vector.ph: ; preds = %for.body.preheader
343 %n.vec = and i32 %n, -4
344 br label %vector.body
; Vector loop: four elements per iteration, accumulated lane-wise in %vec.phi.
346 vector.body: ; preds = %vector.body, %vector.ph
347 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
348 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
349 %0 = getelementptr inbounds i32, i32* %x, i32 %index
350 %1 = bitcast i32* %0 to <4 x i32>*
351 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
352 %2 = or <4 x i32> %wide.load, %vec.phi
353 %index.next = add i32 %index, 4
354 %3 = icmp eq i32 %index.next, %n.vec
355 br i1 %3, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
357 middle.block: ; preds = %vector.body
358 %4 = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %2)
359 %cmp.n = icmp eq i32 %n.vec, %n
360 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
362 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
363 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
364 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running value %r.07.ph.
367 for.body: ; preds = %for.body.preheader1, %for.body
368 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
369 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
370 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
371 %5 = load i32, i32* %arrayidx, align 4
372 %add = or i32 %5, %r.07
373 %inc = add nuw nsw i32 %i.08, 1
374 %exitcond = icmp eq i32 %inc, %n
375 br i1 %exitcond, label %for.cond.cleanup, label %for.body
377 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
378 %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; xor_i32: bitwise XOR reduction over the i32 elements of %x. The vector loop
; keeps a zero-initialised <4 x i32> accumulator and XORs it lane-wise with
; each load; llvm.vector.reduce.xor is called once, in middle.block. The
; expected assembly below uses veor in the loop, then moves the four lanes to
; core registers with vmov and combines them with three eor.w instructions.
382 define i32 @xor_i32(i32* nocapture readonly %x, i32 %n) {
383 ; CHECK-LABEL: xor_i32:
384 ; CHECK: @ %bb.0: @ %entry
385 ; CHECK-NEXT: .save {r4, lr}
386 ; CHECK-NEXT: push {r4, lr}
387 ; CHECK-NEXT: cmp r1, #1
388 ; CHECK-NEXT: blt .LBB4_3
389 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
390 ; CHECK-NEXT: cmp r1, #4
391 ; CHECK-NEXT: bhs .LBB4_4
392 ; CHECK-NEXT: @ %bb.2:
393 ; CHECK-NEXT: movs r3, #0
394 ; CHECK-NEXT: movs r2, #0
395 ; CHECK-NEXT: b .LBB4_7
396 ; CHECK-NEXT: .LBB4_3:
397 ; CHECK-NEXT: movs r2, #0
398 ; CHECK-NEXT: mov r0, r2
399 ; CHECK-NEXT: pop {r4, pc}
400 ; CHECK-NEXT: .LBB4_4: @ %vector.ph
401 ; CHECK-NEXT: bic r3, r1, #3
402 ; CHECK-NEXT: movs r2, #1
403 ; CHECK-NEXT: sub.w r12, r3, #4
404 ; CHECK-NEXT: vmov.i32 q0, #0x0
405 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
406 ; CHECK-NEXT: mov r2, r0
407 ; CHECK-NEXT: .LBB4_5: @ %vector.body
408 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
409 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
410 ; CHECK-NEXT: veor q0, q1, q0
411 ; CHECK-NEXT: le lr, .LBB4_5
412 ; CHECK-NEXT: @ %bb.6: @ %middle.block
413 ; CHECK-NEXT: vmov lr, r12, d1
414 ; CHECK-NEXT: cmp r3, r1
415 ; CHECK-NEXT: vmov r2, r4, d0
416 ; CHECK-NEXT: eor.w r12, r12, lr
417 ; CHECK-NEXT: eor.w r2, r2, r4
418 ; CHECK-NEXT: eor.w r2, r2, r12
419 ; CHECK-NEXT: beq .LBB4_9
420 ; CHECK-NEXT: .LBB4_7: @ %for.body.preheader1
421 ; CHECK-NEXT: sub.w lr, r1, r3
422 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
423 ; CHECK-NEXT: .LBB4_8: @ %for.body
424 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
425 ; CHECK-NEXT: ldr r1, [r0], #4
426 ; CHECK-NEXT: eors r2, r1
427 ; CHECK-NEXT: le lr, .LBB4_8
428 ; CHECK-NEXT: .LBB4_9: @ %for.cond.cleanup
429 ; CHECK-NEXT: mov r0, r2
430 ; CHECK-NEXT: pop {r4, pc}
432 %cmp6 = icmp sgt i32 %n, 0
433 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
435 for.body.preheader: ; preds = %entry
436 %min.iters.check = icmp ult i32 %n, 4
437 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
439 vector.ph: ; preds = %for.body.preheader
440 %n.vec = and i32 %n, -4
441 br label %vector.body
; Vector loop: four elements per iteration, accumulated lane-wise in %vec.phi.
443 vector.body: ; preds = %vector.body, %vector.ph
444 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
445 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
446 %0 = getelementptr inbounds i32, i32* %x, i32 %index
447 %1 = bitcast i32* %0 to <4 x i32>*
448 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
449 %2 = xor <4 x i32> %wide.load, %vec.phi
450 %index.next = add i32 %index, 4
451 %3 = icmp eq i32 %index.next, %n.vec
452 br i1 %3, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
454 middle.block: ; preds = %vector.body
455 %4 = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %2)
456 %cmp.n = icmp eq i32 %n.vec, %n
457 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
459 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
460 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
461 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running value %r.07.ph.
464 for.body: ; preds = %for.body.preheader1, %for.body
465 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
466 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
467 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
468 %5 = load i32, i32* %arrayidx, align 4
469 %add = xor i32 %5, %r.07
470 %inc = add nuw nsw i32 %i.08, 1
471 %exitcond = icmp eq i32 %inc, %n
472 br i1 %exitcond, label %for.cond.cleanup, label %for.body
474 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
475 %r.0.lcssa = phi i32 [ 0, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; fadd_f32: floating-point add reduction over the float elements of %x, using
; the `fast` flag throughout. The vector loop keeps a zero-initialised
; <4 x float> accumulator; llvm.vector.reduce.fadd (start value 0.0) is called
; once, in middle.block. The expected assembly below uses vadd.f32 on q
; registers in the loop and three scalar vadd.f32 on s0-s3 afterwards. The
; paths that skip the vector loop load the 0.0 start value from the constant
; pool entry .LCPI5_0.
479 define float @fadd_f32(float* nocapture readonly %x, i32 %n) {
480 ; CHECK-LABEL: fadd_f32:
481 ; CHECK: @ %bb.0: @ %entry
482 ; CHECK-NEXT: .save {r7, lr}
483 ; CHECK-NEXT: push {r7, lr}
484 ; CHECK-NEXT: cmp r1, #1
485 ; CHECK-NEXT: blt .LBB5_3
486 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
487 ; CHECK-NEXT: cmp r1, #4
488 ; CHECK-NEXT: bhs .LBB5_4
489 ; CHECK-NEXT: @ %bb.2:
490 ; CHECK-NEXT: vldr s0, .LCPI5_0
491 ; CHECK-NEXT: movs r2, #0
492 ; CHECK-NEXT: b .LBB5_7
493 ; CHECK-NEXT: .LBB5_3:
494 ; CHECK-NEXT: vldr s0, .LCPI5_0
495 ; CHECK-NEXT: vmov r0, s0
496 ; CHECK-NEXT: pop {r7, pc}
497 ; CHECK-NEXT: .LBB5_4: @ %vector.ph
498 ; CHECK-NEXT: bic r2, r1, #3
499 ; CHECK-NEXT: movs r3, #1
500 ; CHECK-NEXT: sub.w r12, r2, #4
501 ; CHECK-NEXT: vmov.i32 q0, #0x0
502 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
503 ; CHECK-NEXT: mov r3, r0
504 ; CHECK-NEXT: .LBB5_5: @ %vector.body
505 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
506 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
507 ; CHECK-NEXT: vadd.f32 q0, q1, q0
508 ; CHECK-NEXT: le lr, .LBB5_5
509 ; CHECK-NEXT: @ %bb.6: @ %middle.block
510 ; CHECK-NEXT: vadd.f32 s2, s2, s3
511 ; CHECK-NEXT: cmp r2, r1
512 ; CHECK-NEXT: vadd.f32 s0, s0, s1
513 ; CHECK-NEXT: vadd.f32 s0, s0, s2
514 ; CHECK-NEXT: beq .LBB5_9
515 ; CHECK-NEXT: .LBB5_7: @ %for.body.preheader1
516 ; CHECK-NEXT: sub.w lr, r1, r2
517 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
518 ; CHECK-NEXT: .LBB5_8: @ %for.body
519 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
520 ; CHECK-NEXT: vldmia r0!, {s2}
521 ; CHECK-NEXT: vadd.f32 s0, s2, s0
522 ; CHECK-NEXT: le lr, .LBB5_8
523 ; CHECK-NEXT: .LBB5_9: @ %for.cond.cleanup
524 ; CHECK-NEXT: vmov r0, s0
525 ; CHECK-NEXT: pop {r7, pc}
526 ; CHECK-NEXT: .p2align 2
527 ; CHECK-NEXT: @ %bb.10:
528 ; CHECK-NEXT: .LCPI5_0:
529 ; CHECK-NEXT: .long 0x00000000 @ float 0
531 %cmp6 = icmp sgt i32 %n, 0
532 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
534 for.body.preheader: ; preds = %entry
535 %min.iters.check = icmp ult i32 %n, 4
536 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
538 vector.ph: ; preds = %for.body.preheader
539 %n.vec = and i32 %n, -4
540 br label %vector.body
; Vector loop: four elements per iteration, accumulated lane-wise in %vec.phi.
542 vector.body: ; preds = %vector.body, %vector.ph
543 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
544 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ]
545 %0 = getelementptr inbounds float, float* %x, i32 %index
546 %1 = bitcast float* %0 to <4 x float>*
547 %wide.load = load <4 x float>, <4 x float>* %1, align 4
548 %2 = fadd fast <4 x float> %wide.load, %vec.phi
549 %index.next = add i32 %index, 4
550 %3 = icmp eq i32 %index.next, %n.vec
551 br i1 %3, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
553 middle.block: ; preds = %vector.body
554 %4 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float 0.000000e+00, <4 x float> %2)
555 %cmp.n = icmp eq i32 %n.vec, %n
556 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
558 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
559 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
560 %r.07.ph = phi float [ 0.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running sum %r.07.ph.
563 for.body: ; preds = %for.body.preheader1, %for.body
564 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
565 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
566 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
567 %5 = load float, float* %arrayidx, align 4
568 %add = fadd fast float %5, %r.07
569 %inc = add nuw nsw i32 %i.08, 1
570 %exitcond = icmp eq i32 %inc, %n
571 br i1 %exitcond, label %for.cond.cleanup, label %for.body
573 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
574 %r.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; fmul_f32: floating-point multiply reduction over the float elements of %x,
; using the `fast` flag throughout. The vector loop keeps a <4 x float>
; accumulator (all lanes start at 1.0); llvm.vector.reduce.fmul (start value
; 1.0) is called once, in middle.block. The expected assembly below uses
; vmul.f32 on q registers in the loop and three scalar vmul.f32 on s0-s3
; afterwards.
578 define float @fmul_f32(float* nocapture readonly %x, i32 %n) {
579 ; CHECK-LABEL: fmul_f32:
580 ; CHECK: @ %bb.0: @ %entry
581 ; CHECK-NEXT: .save {r7, lr}
582 ; CHECK-NEXT: push {r7, lr}
583 ; CHECK-NEXT: cmp r1, #1
584 ; CHECK-NEXT: blt .LBB6_3
585 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
586 ; CHECK-NEXT: cmp r1, #4
587 ; CHECK-NEXT: bhs .LBB6_4
588 ; CHECK-NEXT: @ %bb.2:
589 ; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
590 ; CHECK-NEXT: movs r2, #0
591 ; CHECK-NEXT: b .LBB6_7
592 ; CHECK-NEXT: .LBB6_3:
593 ; CHECK-NEXT: vmov.f32 s0, #1.000000e+00
594 ; CHECK-NEXT: vmov r0, s0
595 ; CHECK-NEXT: pop {r7, pc}
596 ; CHECK-NEXT: .LBB6_4: @ %vector.ph
597 ; CHECK-NEXT: bic r2, r1, #3
598 ; CHECK-NEXT: movs r3, #1
599 ; CHECK-NEXT: sub.w r12, r2, #4
600 ; CHECK-NEXT: vmov.f32 q0, #1.000000e+00
601 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
602 ; CHECK-NEXT: mov r3, r0
603 ; CHECK-NEXT: .LBB6_5: @ %vector.body
604 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
605 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
606 ; CHECK-NEXT: vmul.f32 q0, q1, q0
607 ; CHECK-NEXT: le lr, .LBB6_5
608 ; CHECK-NEXT: @ %bb.6: @ %middle.block
609 ; CHECK-NEXT: vmul.f32 s2, s2, s3
610 ; CHECK-NEXT: cmp r2, r1
611 ; CHECK-NEXT: vmul.f32 s0, s0, s1
612 ; CHECK-NEXT: vmul.f32 s0, s0, s2
613 ; CHECK-NEXT: beq .LBB6_9
614 ; CHECK-NEXT: .LBB6_7: @ %for.body.preheader1
615 ; CHECK-NEXT: sub.w lr, r1, r2
616 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
617 ; CHECK-NEXT: .LBB6_8: @ %for.body
618 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
619 ; CHECK-NEXT: vldmia r0!, {s2}
620 ; CHECK-NEXT: vmul.f32 s0, s2, s0
621 ; CHECK-NEXT: le lr, .LBB6_8
622 ; CHECK-NEXT: .LBB6_9: @ %for.cond.cleanup
623 ; CHECK-NEXT: vmov r0, s0
624 ; CHECK-NEXT: pop {r7, pc}
626 %cmp6 = icmp sgt i32 %n, 0
627 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
629 for.body.preheader: ; preds = %entry
630 %min.iters.check = icmp ult i32 %n, 4
631 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
633 vector.ph: ; preds = %for.body.preheader
634 %n.vec = and i32 %n, -4
635 br label %vector.body
; Vector loop: four elements per iteration, accumulated lane-wise in %vec.phi.
637 vector.body: ; preds = %vector.body, %vector.ph
638 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
639 %vec.phi = phi <4 x float> [ <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %vector.ph ], [ %2, %vector.body ]
640 %0 = getelementptr inbounds float, float* %x, i32 %index
641 %1 = bitcast float* %0 to <4 x float>*
642 %wide.load = load <4 x float>, <4 x float>* %1, align 4
643 %2 = fmul fast <4 x float> %wide.load, %vec.phi
644 %index.next = add i32 %index, 4
645 %3 = icmp eq i32 %index.next, %n.vec
646 br i1 %3, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
648 middle.block: ; preds = %vector.body
649 %4 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.000000e+00, <4 x float> %2)
650 %cmp.n = icmp eq i32 %n.vec, %n
651 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
653 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
654 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
655 %r.07.ph = phi float [ 1.000000e+00, %for.body.preheader ], [ %4, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running product %r.07.ph.
658 for.body: ; preds = %for.body.preheader1, %for.body
659 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
660 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
661 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
662 %5 = load float, float* %arrayidx, align 4
663 %add = fmul fast float %5, %r.07
664 %inc = add nuw nsw i32 %i.08, 1
665 %exitcond = icmp eq i32 %inc, %n
666 br i1 %exitcond, label %for.cond.cleanup, label %for.body
668 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
669 %r.0.lcssa = phi float [ 1.000000e+00, %entry ], [ %4, %middle.block ], [ %add, %for.body ]
; smin_i32: signed-minimum reduction over the i32 elements of %x, written as
; icmp slt + select. The vector loop keeps a <4 x i32> accumulator (all lanes
; start at 2147483647); llvm.vector.reduce.smin is called once, in
; middle.block. The expected assembly below uses vmin.s32 in the loop and a
; single vminv.s32 afterwards, with r2 first set by `mvn r2, #-2147483648`.
; The scalar loop is expected to use cmp + csel ... lt.
673 define i32 @smin_i32(i32* nocapture readonly %x, i32 %n) {
674 ; CHECK-LABEL: smin_i32:
675 ; CHECK: @ %bb.0: @ %entry
676 ; CHECK-NEXT: .save {r7, lr}
677 ; CHECK-NEXT: push {r7, lr}
678 ; CHECK-NEXT: cmp r1, #1
679 ; CHECK-NEXT: blt .LBB7_3
680 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
681 ; CHECK-NEXT: cmp r1, #4
682 ; CHECK-NEXT: bhs .LBB7_4
683 ; CHECK-NEXT: @ %bb.2:
684 ; CHECK-NEXT: mvn r2, #-2147483648
685 ; CHECK-NEXT: movs r3, #0
686 ; CHECK-NEXT: b .LBB7_7
687 ; CHECK-NEXT: .LBB7_3:
688 ; CHECK-NEXT: mvn r2, #-2147483648
689 ; CHECK-NEXT: mov r0, r2
690 ; CHECK-NEXT: pop {r7, pc}
691 ; CHECK-NEXT: .LBB7_4: @ %vector.ph
692 ; CHECK-NEXT: bic r3, r1, #3
693 ; CHECK-NEXT: movs r2, #1
694 ; CHECK-NEXT: sub.w r12, r3, #4
695 ; CHECK-NEXT: vmvn.i32 q0, #0x80000000
696 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
697 ; CHECK-NEXT: mov r2, r0
698 ; CHECK-NEXT: .LBB7_5: @ %vector.body
699 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
700 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
701 ; CHECK-NEXT: vmin.s32 q0, q0, q1
702 ; CHECK-NEXT: le lr, .LBB7_5
703 ; CHECK-NEXT: @ %bb.6: @ %middle.block
704 ; CHECK-NEXT: mvn r2, #-2147483648
705 ; CHECK-NEXT: cmp r3, r1
706 ; CHECK-NEXT: vminv.s32 r2, q0
707 ; CHECK-NEXT: beq .LBB7_9
708 ; CHECK-NEXT: .LBB7_7: @ %for.body.preheader1
709 ; CHECK-NEXT: sub.w lr, r1, r3
710 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
711 ; CHECK-NEXT: .LBB7_8: @ %for.body
712 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
713 ; CHECK-NEXT: ldr r1, [r0], #4
714 ; CHECK-NEXT: cmp r2, r1
715 ; CHECK-NEXT: csel r2, r2, r1, lt
716 ; CHECK-NEXT: le lr, .LBB7_8
717 ; CHECK-NEXT: .LBB7_9: @ %for.cond.cleanup
718 ; CHECK-NEXT: mov r0, r2
719 ; CHECK-NEXT: pop {r7, pc}
721 %cmp6 = icmp sgt i32 %n, 0
722 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
724 for.body.preheader: ; preds = %entry
725 %min.iters.check = icmp ult i32 %n, 4
726 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
728 vector.ph: ; preds = %for.body.preheader
729 %n.vec = and i32 %n, -4
730 br label %vector.body
; Vector loop: four elements per iteration; the lane-wise minimum is kept in
; %vec.phi.
732 vector.body: ; preds = %vector.body, %vector.ph
733 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
734 %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %vector.ph ], [ %3, %vector.body ]
735 %0 = getelementptr inbounds i32, i32* %x, i32 %index
736 %1 = bitcast i32* %0 to <4 x i32>*
737 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
738 %2 = icmp slt <4 x i32> %vec.phi, %wide.load
739 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
740 %index.next = add i32 %index, 4
741 %4 = icmp eq i32 %index.next, %n.vec
742 br i1 %4, label %middle.block, label %vector.body
; The horizontal reduction of the vector accumulator happens once, here.
744 middle.block: ; preds = %vector.body
745 %5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %3)
746 %cmp.n = icmp eq i32 %n.vec, %n
747 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
749 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
750 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
751 %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running minimum %r.07.ph.
754 for.body: ; preds = %for.body.preheader1, %for.body
755 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
756 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
757 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
758 %6 = load i32, i32* %arrayidx, align 4
759 %c = icmp slt i32 %r.07, %6
760 %add = select i1 %c, i32 %r.07, i32 %6
761 %inc = add nuw nsw i32 %i.08, 1
762 %exitcond = icmp eq i32 %inc, %n
763 br i1 %exitcond, label %for.cond.cleanup, label %for.body
765 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
766 %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; smin_i32_inloop: signed-minimum reduction over the i32 elements of %x, with
; the reduction done inside the vector loop. Each vector iteration reduces the
; loaded <4 x i32> with llvm.vector.reduce.smin and then takes the minimum of
; that and a scalar i32 accumulator phi (initial value 2147483647) via
; icmp slt + select. The expected assembly below does the per-iteration update
; with a single vminv.s32 into r0.
770 define i32 @smin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
771 ; CHECK-LABEL: smin_i32_inloop:
772 ; CHECK: @ %bb.0: @ %entry
773 ; CHECK-NEXT: .save {r7, lr}
774 ; CHECK-NEXT: push {r7, lr}
775 ; CHECK-NEXT: cmp r1, #1
776 ; CHECK-NEXT: blt .LBB8_3
777 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
778 ; CHECK-NEXT: mov r12, r0
779 ; CHECK-NEXT: cmp r1, #4
780 ; CHECK-NEXT: bhs .LBB8_4
781 ; CHECK-NEXT: @ %bb.2:
782 ; CHECK-NEXT: mvn r0, #-2147483648
783 ; CHECK-NEXT: movs r3, #0
784 ; CHECK-NEXT: b .LBB8_7
785 ; CHECK-NEXT: .LBB8_3:
786 ; CHECK-NEXT: mvn r0, #-2147483648
787 ; CHECK-NEXT: pop {r7, pc}
788 ; CHECK-NEXT: .LBB8_4: @ %vector.ph
789 ; CHECK-NEXT: bic r3, r1, #3
790 ; CHECK-NEXT: movs r2, #1
791 ; CHECK-NEXT: subs r0, r3, #4
792 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
793 ; CHECK-NEXT: mvn r0, #-2147483648
794 ; CHECK-NEXT: mov r2, r12
795 ; CHECK-NEXT: .LBB8_5: @ %vector.body
796 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
797 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
798 ; CHECK-NEXT: vminv.s32 r0, q0
799 ; CHECK-NEXT: le lr, .LBB8_5
800 ; CHECK-NEXT: @ %bb.6: @ %middle.block
801 ; CHECK-NEXT: cmp r3, r1
803 ; CHECK-NEXT: popeq {r7, pc}
804 ; CHECK-NEXT: .LBB8_7: @ %for.body.preheader1
805 ; CHECK-NEXT: sub.w lr, r1, r3
806 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
807 ; CHECK-NEXT: .LBB8_8: @ %for.body
808 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
809 ; CHECK-NEXT: ldr r1, [r2], #4
810 ; CHECK-NEXT: cmp r0, r1
811 ; CHECK-NEXT: csel r0, r0, r1, lt
812 ; CHECK-NEXT: le lr, .LBB8_8
813 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
814 ; CHECK-NEXT: pop {r7, pc}
816 %cmp6 = icmp sgt i32 %n, 0
817 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
819 for.body.preheader: ; preds = %entry
820 %min.iters.check = icmp ult i32 %n, 4
821 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
823 vector.ph: ; preds = %for.body.preheader
824 %n.vec = and i32 %n, -4
825 br label %vector.body
; Vector loop: four elements per iteration; the accumulator %vec.phi is a
; scalar i32, so the horizontal reduction happens on every iteration.
827 vector.body: ; preds = %vector.body, %vector.ph
828 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
829 %vec.phi = phi i32 [ 2147483647, %vector.ph ], [ %3, %vector.body ]
830 %0 = getelementptr inbounds i32, i32* %x, i32 %index
831 %1 = bitcast i32* %0 to <4 x i32>*
832 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
833 %l5 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %wide.load)
834 %2 = icmp slt i32 %vec.phi, %l5
835 %3 = select i1 %2, i32 %vec.phi, i32 %l5
836 %index.next = add i32 %index, 4
837 %4 = icmp eq i32 %index.next, %n.vec
838 br i1 %4, label %middle.block, label %vector.body
; No reduction call here: the single-input phi just carries the in-loop result.
840 middle.block: ; preds = %vector.body
841 %5 = phi i32 [ %3, %vector.body ]
842 %cmp.n = icmp eq i32 %n.vec, %n
843 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
845 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
846 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
847 %r.07.ph = phi i32 [ 2147483647, %for.body.preheader ], [ %5, %middle.block ]
; Scalar loop: one element per iteration, starting at index %i.08.ph with the
; running minimum %r.07.ph.
850 for.body: ; preds = %for.body.preheader1, %for.body
851 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
852 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
853 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
854 %6 = load i32, i32* %arrayidx, align 4
855 %c = icmp slt i32 %r.07, %6
856 %add = select i1 %c, i32 %r.07, i32 %6
857 %inc = add nuw nsw i32 %i.08, 1
858 %exitcond = icmp eq i32 %inc, %n
859 br i1 %exitcond, label %for.cond.cleanup, label %for.body
861 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
862 %r.0.lcssa = phi i32 [ 2147483647, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; smax_i32: signed-maximum reduction over the first %n i32 elements at %x,
; written as an out-of-loop reduction. The vector loop carries a <4 x i32>
; accumulator (every lane seeded with -2147483648) that is updated with
; icmp sgt + select; llvm.vector.reduce.smax is called once in middle.block.
; The assertions below expect vmax.s32 inside the le-terminated vector loop,
; a single vmaxv.s32 after it, and cmp + csel gt in the scalar remainder loop.
866 define i32 @smax_i32(i32* nocapture readonly %x, i32 %n) {
867 ; CHECK-LABEL: smax_i32:
868 ; CHECK: @ %bb.0: @ %entry
869 ; CHECK-NEXT: .save {r7, lr}
870 ; CHECK-NEXT: push {r7, lr}
871 ; CHECK-NEXT: cmp r1, #1
872 ; CHECK-NEXT: blt .LBB9_3
873 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
874 ; CHECK-NEXT: cmp r1, #4
875 ; CHECK-NEXT: bhs .LBB9_4
876 ; CHECK-NEXT: @ %bb.2:
877 ; CHECK-NEXT: mov.w r2, #-2147483648
878 ; CHECK-NEXT: movs r3, #0
879 ; CHECK-NEXT: b .LBB9_7
880 ; CHECK-NEXT: .LBB9_3:
881 ; CHECK-NEXT: mov.w r2, #-2147483648
882 ; CHECK-NEXT: mov r0, r2
883 ; CHECK-NEXT: pop {r7, pc}
884 ; CHECK-NEXT: .LBB9_4: @ %vector.ph
885 ; CHECK-NEXT: bic r3, r1, #3
886 ; CHECK-NEXT: movs r2, #1
887 ; CHECK-NEXT: sub.w r12, r3, #4
888 ; CHECK-NEXT: vmov.i32 q0, #0x80000000
889 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
890 ; CHECK-NEXT: mov r2, r0
891 ; CHECK-NEXT: .LBB9_5: @ %vector.body
892 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
893 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
894 ; CHECK-NEXT: vmax.s32 q0, q0, q1
895 ; CHECK-NEXT: le lr, .LBB9_5
896 ; CHECK-NEXT: @ %bb.6: @ %middle.block
897 ; CHECK-NEXT: mov.w r2, #-2147483648
898 ; CHECK-NEXT: cmp r3, r1
899 ; CHECK-NEXT: vmaxv.s32 r2, q0
900 ; CHECK-NEXT: beq .LBB9_9
901 ; CHECK-NEXT: .LBB9_7: @ %for.body.preheader1
902 ; CHECK-NEXT: sub.w lr, r1, r3
903 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
904 ; CHECK-NEXT: .LBB9_8: @ %for.body
905 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
906 ; CHECK-NEXT: ldr r1, [r0], #4
907 ; CHECK-NEXT: cmp r2, r1
908 ; CHECK-NEXT: csel r2, r2, r1, gt
909 ; CHECK-NEXT: le lr, .LBB9_8
910 ; CHECK-NEXT: .LBB9_9: @ %for.cond.cleanup
911 ; CHECK-NEXT: mov r0, r2
912 ; CHECK-NEXT: pop {r7, pc}
; When %n <= 0 no element is read and the result phi takes the seed value.
914 %cmp6 = icmp sgt i32 %n, 0
915 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
917 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
918 %min.iters.check = icmp ult i32 %n, 4
919 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
921 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
922 %n.vec = and i32 %n, -4
923 br label %vector.body
925 vector.body: ; preds = %vector.body, %vector.ph
926 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
927 %vec.phi = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %vector.ph ], [ %3, %vector.body ]
928 %0 = getelementptr inbounds i32, i32* %x, i32 %index
929 %1 = bitcast i32* %0 to <4 x i32>*
930 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise signed max of the accumulator and the freshly loaded vector.
931 %2 = icmp sgt <4 x i32> %vec.phi, %wide.load
932 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
933 %index.next = add i32 %index, 4
934 %4 = icmp eq i32 %index.next, %n.vec
935 br i1 %4, label %middle.block, label %vector.body
937 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar, once, after the vector loop.
938 %5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %3)
; If the vector loop covered all %n elements there is no scalar remainder.
939 %cmp.n = icmp eq i32 %n.vec, %n
940 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
942 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
943 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
944 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
947 for.body: ; preds = %for.body.preheader1, %for.body
948 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
949 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
950 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
951 %6 = load i32, i32* %arrayidx, align 4
; Scalar signed max; despite its name, %add holds the running maximum.
952 %c = icmp sgt i32 %r.07, %6
953 %add = select i1 %c, i32 %r.07, i32 %6
954 %inc = add nuw nsw i32 %i.08, 1
955 %exitcond = icmp eq i32 %inc, %n
956 br i1 %exitcond, label %for.cond.cleanup, label %for.body
958 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
959 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; smax_i32_inloop: signed-maximum reduction over the first %n i32 elements at
; %x, written as an in-loop reduction. Each iteration of vector.body calls
; llvm.vector.reduce.smax on the loaded <4 x i32> and folds the result into a
; scalar i32 accumulator (seeded with -2147483648) via icmp sgt + select.
; The assertions below expect that pair to become a single vmaxv.s32 r0, q0
; inside the vector loop, with r0 preloaded with the seed, and cmp + csel gt
; in the scalar remainder loop.
963 define i32 @smax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
964 ; CHECK-LABEL: smax_i32_inloop:
965 ; CHECK: @ %bb.0: @ %entry
966 ; CHECK-NEXT: .save {r7, lr}
967 ; CHECK-NEXT: push {r7, lr}
968 ; CHECK-NEXT: cmp r1, #1
969 ; CHECK-NEXT: blt .LBB10_3
970 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
971 ; CHECK-NEXT: mov r12, r0
972 ; CHECK-NEXT: cmp r1, #4
973 ; CHECK-NEXT: bhs .LBB10_4
974 ; CHECK-NEXT: @ %bb.2:
975 ; CHECK-NEXT: mov.w r0, #-2147483648
976 ; CHECK-NEXT: movs r3, #0
977 ; CHECK-NEXT: b .LBB10_7
978 ; CHECK-NEXT: .LBB10_3:
979 ; CHECK-NEXT: mov.w r0, #-2147483648
980 ; CHECK-NEXT: pop {r7, pc}
981 ; CHECK-NEXT: .LBB10_4: @ %vector.ph
982 ; CHECK-NEXT: bic r3, r1, #3
983 ; CHECK-NEXT: movs r2, #1
984 ; CHECK-NEXT: subs r0, r3, #4
985 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
986 ; CHECK-NEXT: mov.w r0, #-2147483648
987 ; CHECK-NEXT: mov r2, r12
988 ; CHECK-NEXT: .LBB10_5: @ %vector.body
989 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
990 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
991 ; CHECK-NEXT: vmaxv.s32 r0, q0
992 ; CHECK-NEXT: le lr, .LBB10_5
993 ; CHECK-NEXT: @ %bb.6: @ %middle.block
994 ; CHECK-NEXT: cmp r3, r1
996 ; CHECK-NEXT: popeq {r7, pc}
997 ; CHECK-NEXT: .LBB10_7: @ %for.body.preheader1
998 ; CHECK-NEXT: sub.w lr, r1, r3
999 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
1000 ; CHECK-NEXT: .LBB10_8: @ %for.body
1001 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1002 ; CHECK-NEXT: ldr r1, [r2], #4
1003 ; CHECK-NEXT: cmp r0, r1
1004 ; CHECK-NEXT: csel r0, r0, r1, gt
1005 ; CHECK-NEXT: le lr, .LBB10_8
1006 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
1007 ; CHECK-NEXT: pop {r7, pc}
; When %n <= 0 no element is read and the result phi takes the seed value.
1009 %cmp6 = icmp sgt i32 %n, 0
1010 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1012 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1013 %min.iters.check = icmp ult i32 %n, 4
1014 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1016 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1017 %n.vec = and i32 %n, -4
1018 br label %vector.body
1020 vector.body: ; preds = %vector.body, %vector.ph
1021 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1022 %vec.phi = phi i32 [ -2147483648, %vector.ph ], [ %3, %vector.body ]
1023 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1024 %1 = bitcast i32* %0 to <4 x i32>*
1025 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Reduce this iteration's vector to a scalar, then fold it into the running max.
1026 %l5 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %wide.load)
1027 %2 = icmp sgt i32 %vec.phi, %l5
1028 %3 = select i1 %2, i32 %vec.phi, i32 %l5
1029 %index.next = add i32 %index, 4
1030 %4 = icmp eq i32 %index.next, %n.vec
1031 br i1 %4, label %middle.block, label %vector.body
1033 middle.block: ; preds = %vector.body
; Single-input phi that forwards the loop's scalar result out of vector.body.
1034 %5 = phi i32 [ %3, %vector.body ]
; If the vector loop covered all %n elements there is no scalar remainder.
1035 %cmp.n = icmp eq i32 %n.vec, %n
1036 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1038 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1039 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1040 %r.07.ph = phi i32 [ -2147483648, %for.body.preheader ], [ %5, %middle.block ]
1043 for.body: ; preds = %for.body.preheader1, %for.body
1044 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1045 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1046 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1047 %6 = load i32, i32* %arrayidx, align 4
; Scalar signed max; despite its name, %add holds the running maximum.
1048 %c = icmp sgt i32 %r.07, %6
1049 %add = select i1 %c, i32 %r.07, i32 %6
1050 %inc = add nuw nsw i32 %i.08, 1
1051 %exitcond = icmp eq i32 %inc, %n
1052 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1054 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1055 %r.0.lcssa = phi i32 [ -2147483648, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umin_i32: unsigned-minimum reduction over the first %n i32 elements at %x,
; written as an out-of-loop reduction. The vector loop carries a <4 x i32>
; accumulator (every lane seeded with -1, i.e. all bits set) that is updated
; with icmp ult + select; llvm.vector.reduce.umin is called once in
; middle.block. The assertions below expect vmin.u32 inside the le-terminated
; vector loop, a single vminv.u32 after it, and cmp + csel lo in the scalar
; remainder loop.
1059 define i32 @umin_i32(i32* nocapture readonly %x, i32 %n) {
1060 ; CHECK-LABEL: umin_i32:
1061 ; CHECK: @ %bb.0: @ %entry
1062 ; CHECK-NEXT: .save {r7, lr}
1063 ; CHECK-NEXT: push {r7, lr}
1064 ; CHECK-NEXT: cmp r1, #1
1065 ; CHECK-NEXT: blt .LBB11_3
1066 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1067 ; CHECK-NEXT: cmp r1, #4
1068 ; CHECK-NEXT: bhs .LBB11_4
1069 ; CHECK-NEXT: @ %bb.2:
1070 ; CHECK-NEXT: mov.w r2, #-1
1071 ; CHECK-NEXT: movs r3, #0
1072 ; CHECK-NEXT: b .LBB11_7
1073 ; CHECK-NEXT: .LBB11_3:
1074 ; CHECK-NEXT: mov.w r2, #-1
1075 ; CHECK-NEXT: mov r0, r2
1076 ; CHECK-NEXT: pop {r7, pc}
1077 ; CHECK-NEXT: .LBB11_4: @ %vector.ph
1078 ; CHECK-NEXT: bic r3, r1, #3
1079 ; CHECK-NEXT: movs r2, #1
1080 ; CHECK-NEXT: sub.w r12, r3, #4
1081 ; CHECK-NEXT: vmov.i8 q0, #0xff
1082 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
1083 ; CHECK-NEXT: mov r2, r0
1084 ; CHECK-NEXT: .LBB11_5: @ %vector.body
1085 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1086 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
1087 ; CHECK-NEXT: vmin.u32 q0, q0, q1
1088 ; CHECK-NEXT: le lr, .LBB11_5
1089 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1090 ; CHECK-NEXT: mov.w r2, #-1
1091 ; CHECK-NEXT: cmp r3, r1
1092 ; CHECK-NEXT: vminv.u32 r2, q0
1093 ; CHECK-NEXT: beq .LBB11_9
1094 ; CHECK-NEXT: .LBB11_7: @ %for.body.preheader1
1095 ; CHECK-NEXT: sub.w lr, r1, r3
1096 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
1097 ; CHECK-NEXT: .LBB11_8: @ %for.body
1098 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1099 ; CHECK-NEXT: ldr r1, [r0], #4
1100 ; CHECK-NEXT: cmp r2, r1
1101 ; CHECK-NEXT: csel r2, r2, r1, lo
1102 ; CHECK-NEXT: le lr, .LBB11_8
1103 ; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
1104 ; CHECK-NEXT: mov r0, r2
1105 ; CHECK-NEXT: pop {r7, pc}
; When %n <= 0 (signed compare) no element is read and the result phi takes the seed value.
1107 %cmp6 = icmp sgt i32 %n, 0
1108 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1110 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1111 %min.iters.check = icmp ult i32 %n, 4
1112 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1114 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1115 %n.vec = and i32 %n, -4
1116 br label %vector.body
1118 vector.body: ; preds = %vector.body, %vector.ph
1119 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1120 %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %vector.ph ], [ %3, %vector.body ]
1121 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1122 %1 = bitcast i32* %0 to <4 x i32>*
1123 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise unsigned min of the accumulator and the freshly loaded vector.
1124 %2 = icmp ult <4 x i32> %vec.phi, %wide.load
1125 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1126 %index.next = add i32 %index, 4
1127 %4 = icmp eq i32 %index.next, %n.vec
1128 br i1 %4, label %middle.block, label %vector.body
1130 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar, once, after the vector loop.
1131 %5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %3)
; If the vector loop covered all %n elements there is no scalar remainder.
1132 %cmp.n = icmp eq i32 %n.vec, %n
1133 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1135 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1136 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1137 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1140 for.body: ; preds = %for.body.preheader1, %for.body
1141 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1142 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1143 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1144 %6 = load i32, i32* %arrayidx, align 4
; Scalar unsigned min; despite its name, %add holds the running minimum.
1145 %c = icmp ult i32 %r.07, %6
1146 %add = select i1 %c, i32 %r.07, i32 %6
1147 %inc = add nuw nsw i32 %i.08, 1
1148 %exitcond = icmp eq i32 %inc, %n
1149 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1151 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1152 %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umin_i32_inloop: unsigned-minimum reduction over the first %n i32 elements
; at %x, written as an in-loop reduction. Each iteration of vector.body calls
; llvm.vector.reduce.umin on the loaded <4 x i32> and folds the result into a
; scalar i32 accumulator (seeded with -1, i.e. all bits set) via icmp ult +
; select. The assertions below expect that pair to become a single
; vminv.u32 r0, q0 inside the vector loop, with r0 preloaded with the seed.
; The scalar remainder loop uses the same icmp ult predicate, so both loops
; compute an unsigned minimum and the expected lowering is cmp + csel lo.
; NOTE(review): the csel condition in the assertions was edited by hand to
; match icmp ult; re-run utils/update_llc_test_checks.py to confirm it.
1156 define i32 @umin_i32_inloop(i32* nocapture readonly %x, i32 %n) {
1157 ; CHECK-LABEL: umin_i32_inloop:
1158 ; CHECK: @ %bb.0: @ %entry
1159 ; CHECK-NEXT: .save {r7, lr}
1160 ; CHECK-NEXT: push {r7, lr}
1161 ; CHECK-NEXT: cmp r1, #1
1162 ; CHECK-NEXT: blt .LBB12_3
1163 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1164 ; CHECK-NEXT: mov r12, r0
1165 ; CHECK-NEXT: cmp r1, #4
1166 ; CHECK-NEXT: bhs .LBB12_4
1167 ; CHECK-NEXT: @ %bb.2:
1168 ; CHECK-NEXT: mov.w r0, #-1
1169 ; CHECK-NEXT: movs r3, #0
1170 ; CHECK-NEXT: b .LBB12_7
1171 ; CHECK-NEXT: .LBB12_3:
1172 ; CHECK-NEXT: mov.w r0, #-1
1173 ; CHECK-NEXT: pop {r7, pc}
1174 ; CHECK-NEXT: .LBB12_4: @ %vector.ph
1175 ; CHECK-NEXT: bic r3, r1, #3
1176 ; CHECK-NEXT: movs r2, #1
1177 ; CHECK-NEXT: subs r0, r3, #4
1178 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
1179 ; CHECK-NEXT: mov.w r0, #-1
1180 ; CHECK-NEXT: mov r2, r12
1181 ; CHECK-NEXT: .LBB12_5: @ %vector.body
1182 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1183 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
1184 ; CHECK-NEXT: vminv.u32 r0, q0
1185 ; CHECK-NEXT: le lr, .LBB12_5
1186 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1187 ; CHECK-NEXT: cmp r3, r1
1189 ; CHECK-NEXT: popeq {r7, pc}
1190 ; CHECK-NEXT: .LBB12_7: @ %for.body.preheader1
1191 ; CHECK-NEXT: sub.w lr, r1, r3
1192 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
1193 ; CHECK-NEXT: .LBB12_8: @ %for.body
1194 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1195 ; CHECK-NEXT: ldr r1, [r2], #4
1196 ; CHECK-NEXT: cmp r0, r1
1197 ; CHECK-NEXT: csel r0, r0, r1, lo
1198 ; CHECK-NEXT: le lr, .LBB12_8
1199 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
1200 ; CHECK-NEXT: pop {r7, pc}
; When %n <= 0 (signed compare) no element is read and the result phi takes the seed value.
1202 %cmp6 = icmp sgt i32 %n, 0
1203 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1205 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1206 %min.iters.check = icmp ult i32 %n, 4
1207 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1209 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1210 %n.vec = and i32 %n, -4
1211 br label %vector.body
1213 vector.body: ; preds = %vector.body, %vector.ph
1214 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1215 %vec.phi = phi i32 [ -1, %vector.ph ], [ %3, %vector.body ]
1216 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1217 %1 = bitcast i32* %0 to <4 x i32>*
1218 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Reduce this iteration's vector to a scalar, then fold it into the running min.
1219 %l5 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %wide.load)
1220 %2 = icmp ult i32 %vec.phi, %l5
1221 %3 = select i1 %2, i32 %vec.phi, i32 %l5
1222 %index.next = add i32 %index, 4
1223 %4 = icmp eq i32 %index.next, %n.vec
1224 br i1 %4, label %middle.block, label %vector.body
1226 middle.block: ; preds = %vector.body
; Single-input phi that forwards the loop's scalar result out of vector.body.
1227 %5 = phi i32 [ %3, %vector.body ]
; If the vector loop covered all %n elements there is no scalar remainder.
1228 %cmp.n = icmp eq i32 %n.vec, %n
1229 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1231 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1232 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1233 %r.07.ph = phi i32 [ -1, %for.body.preheader ], [ %5, %middle.block ]
1236 for.body: ; preds = %for.body.preheader1, %for.body
1237 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1238 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1239 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1240 %6 = load i32, i32* %arrayidx, align 4
; Scalar unsigned min (ult keeps the smaller value); despite its name, %add
; holds the running minimum.
1241 %c = icmp ult i32 %r.07, %6
1242 %add = select i1 %c, i32 %r.07, i32 %6
1243 %inc = add nuw nsw i32 %i.08, 1
1244 %exitcond = icmp eq i32 %inc, %n
1245 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1247 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1248 %r.0.lcssa = phi i32 [ -1, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umax_i32: unsigned-maximum reduction over the first %n i32 elements at %x,
; written as an out-of-loop reduction. The vector loop carries a <4 x i32>
; accumulator (seeded with zeroinitializer) that is updated with icmp ugt +
; select; llvm.vector.reduce.umax is called once in middle.block.
; The assertions below expect vmax.u32 inside the le-terminated vector loop,
; a single vmaxv.u32 after it, and cmp + csel hi in the scalar remainder loop.
1252 define i32 @umax_i32(i32* nocapture readonly %x, i32 %n) {
1253 ; CHECK-LABEL: umax_i32:
1254 ; CHECK: @ %bb.0: @ %entry
1255 ; CHECK-NEXT: .save {r7, lr}
1256 ; CHECK-NEXT: push {r7, lr}
1257 ; CHECK-NEXT: cmp r1, #1
1258 ; CHECK-NEXT: blt .LBB13_3
1259 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1260 ; CHECK-NEXT: cmp r1, #4
1261 ; CHECK-NEXT: bhs .LBB13_4
1262 ; CHECK-NEXT: @ %bb.2:
1263 ; CHECK-NEXT: movs r3, #0
1264 ; CHECK-NEXT: movs r2, #0
1265 ; CHECK-NEXT: b .LBB13_7
1266 ; CHECK-NEXT: .LBB13_3:
1267 ; CHECK-NEXT: movs r2, #0
1268 ; CHECK-NEXT: mov r0, r2
1269 ; CHECK-NEXT: pop {r7, pc}
1270 ; CHECK-NEXT: .LBB13_4: @ %vector.ph
1271 ; CHECK-NEXT: bic r3, r1, #3
1272 ; CHECK-NEXT: movs r2, #1
1273 ; CHECK-NEXT: sub.w r12, r3, #4
1274 ; CHECK-NEXT: vmov.i32 q0, #0x0
1275 ; CHECK-NEXT: add.w lr, r2, r12, lsr #2
1276 ; CHECK-NEXT: mov r2, r0
1277 ; CHECK-NEXT: .LBB13_5: @ %vector.body
1278 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1279 ; CHECK-NEXT: vldrw.u32 q1, [r2], #16
1280 ; CHECK-NEXT: vmax.u32 q0, q0, q1
1281 ; CHECK-NEXT: le lr, .LBB13_5
1282 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1283 ; CHECK-NEXT: movs r2, #0
1284 ; CHECK-NEXT: cmp r3, r1
1285 ; CHECK-NEXT: vmaxv.u32 r2, q0
1286 ; CHECK-NEXT: beq .LBB13_9
1287 ; CHECK-NEXT: .LBB13_7: @ %for.body.preheader1
1288 ; CHECK-NEXT: sub.w lr, r1, r3
1289 ; CHECK-NEXT: add.w r0, r0, r3, lsl #2
1290 ; CHECK-NEXT: .LBB13_8: @ %for.body
1291 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1292 ; CHECK-NEXT: ldr r1, [r0], #4
1293 ; CHECK-NEXT: cmp r2, r1
1294 ; CHECK-NEXT: csel r2, r2, r1, hi
1295 ; CHECK-NEXT: le lr, .LBB13_8
1296 ; CHECK-NEXT: .LBB13_9: @ %for.cond.cleanup
1297 ; CHECK-NEXT: mov r0, r2
1298 ; CHECK-NEXT: pop {r7, pc}
; When %n <= 0 (signed compare) no element is read and the result phi takes the seed value.
1300 %cmp6 = icmp sgt i32 %n, 0
1301 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1303 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1304 %min.iters.check = icmp ult i32 %n, 4
1305 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1307 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1308 %n.vec = and i32 %n, -4
1309 br label %vector.body
1311 vector.body: ; preds = %vector.body, %vector.ph
1312 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1313 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1314 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1315 %1 = bitcast i32* %0 to <4 x i32>*
1316 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Lane-wise unsigned max of the accumulator and the freshly loaded vector.
1317 %2 = icmp ugt <4 x i32> %vec.phi, %wide.load
1318 %3 = select <4 x i1> %2, <4 x i32> %vec.phi, <4 x i32> %wide.load
1319 %index.next = add i32 %index, 4
1320 %4 = icmp eq i32 %index.next, %n.vec
1321 br i1 %4, label %middle.block, label %vector.body
1323 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar, once, after the vector loop.
1324 %5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %3)
; If the vector loop covered all %n elements there is no scalar remainder.
1325 %cmp.n = icmp eq i32 %n.vec, %n
1326 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1328 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1329 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1330 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1333 for.body: ; preds = %for.body.preheader1, %for.body
1334 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1335 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1336 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1337 %6 = load i32, i32* %arrayidx, align 4
; Scalar unsigned max; despite its name, %add holds the running maximum.
1338 %c = icmp ugt i32 %r.07, %6
1339 %add = select i1 %c, i32 %r.07, i32 %6
1340 %inc = add nuw nsw i32 %i.08, 1
1341 %exitcond = icmp eq i32 %inc, %n
1342 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1344 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1345 %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; umax_i32_inloop: unsigned-maximum reduction over the first %n i32 elements
; at %x, written as an in-loop reduction. Each iteration of vector.body calls
; llvm.vector.reduce.umax on the loaded <4 x i32> and folds the result into a
; scalar i32 accumulator (seeded with 0) via icmp ugt + select.
; The assertions below expect that pair to become a single vmaxv.u32 r0, q0
; inside the vector loop, with r0 preloaded with the seed, and cmp + csel hi
; in the scalar remainder loop.
1349 define i32 @umax_i32_inloop(i32* nocapture readonly %x, i32 %n) {
1350 ; CHECK-LABEL: umax_i32_inloop:
1351 ; CHECK: @ %bb.0: @ %entry
1352 ; CHECK-NEXT: .save {r7, lr}
1353 ; CHECK-NEXT: push {r7, lr}
1354 ; CHECK-NEXT: cmp r1, #1
1355 ; CHECK-NEXT: blt .LBB14_3
1356 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1357 ; CHECK-NEXT: mov r12, r0
1358 ; CHECK-NEXT: cmp r1, #4
1359 ; CHECK-NEXT: bhs .LBB14_4
1360 ; CHECK-NEXT: @ %bb.2:
1361 ; CHECK-NEXT: movs r3, #0
1362 ; CHECK-NEXT: movs r0, #0
1363 ; CHECK-NEXT: b .LBB14_7
1364 ; CHECK-NEXT: .LBB14_3:
1365 ; CHECK-NEXT: movs r0, #0
1366 ; CHECK-NEXT: pop {r7, pc}
1367 ; CHECK-NEXT: .LBB14_4: @ %vector.ph
1368 ; CHECK-NEXT: bic r3, r1, #3
1369 ; CHECK-NEXT: movs r2, #1
1370 ; CHECK-NEXT: subs r0, r3, #4
1371 ; CHECK-NEXT: add.w lr, r2, r0, lsr #2
1372 ; CHECK-NEXT: movs r0, #0
1373 ; CHECK-NEXT: mov r2, r12
1374 ; CHECK-NEXT: .LBB14_5: @ %vector.body
1375 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1376 ; CHECK-NEXT: vldrw.u32 q0, [r2], #16
1377 ; CHECK-NEXT: vmaxv.u32 r0, q0
1378 ; CHECK-NEXT: le lr, .LBB14_5
1379 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1380 ; CHECK-NEXT: cmp r3, r1
1382 ; CHECK-NEXT: popeq {r7, pc}
1383 ; CHECK-NEXT: .LBB14_7: @ %for.body.preheader1
1384 ; CHECK-NEXT: sub.w lr, r1, r3
1385 ; CHECK-NEXT: add.w r2, r12, r3, lsl #2
1386 ; CHECK-NEXT: .LBB14_8: @ %for.body
1387 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1388 ; CHECK-NEXT: ldr r1, [r2], #4
1389 ; CHECK-NEXT: cmp r0, r1
1390 ; CHECK-NEXT: csel r0, r0, r1, hi
1391 ; CHECK-NEXT: le lr, .LBB14_8
1392 ; CHECK-NEXT: @ %bb.9: @ %for.cond.cleanup
1393 ; CHECK-NEXT: pop {r7, pc}
; When %n <= 0 (signed compare) no element is read and the result phi takes the seed value.
1395 %cmp6 = icmp sgt i32 %n, 0
1396 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1398 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1399 %min.iters.check = icmp ult i32 %n, 4
1400 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1402 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1403 %n.vec = and i32 %n, -4
1404 br label %vector.body
1406 vector.body: ; preds = %vector.body, %vector.ph
1407 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1408 %vec.phi = phi i32 [ 0, %vector.ph ], [ %3, %vector.body ]
1409 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1410 %1 = bitcast i32* %0 to <4 x i32>*
1411 %wide.load = load <4 x i32>, <4 x i32>* %1, align 4
; Reduce this iteration's vector to a scalar, then fold it into the running max.
1412 %l5 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %wide.load)
1413 %2 = icmp ugt i32 %vec.phi, %l5
1414 %3 = select i1 %2, i32 %vec.phi, i32 %l5
1415 %index.next = add i32 %index, 4
1416 %4 = icmp eq i32 %index.next, %n.vec
1417 br i1 %4, label %middle.block, label %vector.body
1419 middle.block: ; preds = %vector.body
; Single-input phi that forwards the loop's scalar result out of vector.body.
1420 %5 = phi i32 [ %3, %vector.body ]
; If the vector loop covered all %n elements there is no scalar remainder.
1421 %cmp.n = icmp eq i32 %n.vec, %n
1422 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1424 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1425 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1426 %r.07.ph = phi i32 [ 0, %for.body.preheader ], [ %5, %middle.block ]
1429 for.body: ; preds = %for.body.preheader1, %for.body
1430 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1431 %r.07 = phi i32 [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1432 %arrayidx = getelementptr inbounds i32, i32* %x, i32 %i.08
1433 %6 = load i32, i32* %arrayidx, align 4
; Scalar unsigned max; despite its name, %add holds the running maximum.
1434 %c = icmp ugt i32 %r.07, %6
1435 %add = select i1 %c, i32 %r.07, i32 %6
1436 %inc = add nuw nsw i32 %i.08, 1
1437 %exitcond = icmp eq i32 %inc, %n
1438 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1440 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1441 %r.0.lcssa = phi i32 [ 0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
; fmin_f32: float minimum reduction over the first %n elements at %x, written
; as an out-of-loop reduction. The vector loop carries a <4 x float>
; accumulator (seeded with zeroinitializer, i.e. 0.0 in every lane) that is
; updated with fcmp ult + select; llvm.vector.reduce.fmin is called once in
; middle.block. Because the seed is 0.0, the result phi yields 0.0 when
; %n <= 0. The assertions below expect vcmp.f32 lt + vpsel inside the
; le-terminated vector loop, three scalar vminnm.f32 instructions for the
; final reduction, vcmp.f32 / vmrs / vselge.f32 in the scalar remainder loop,
; and the 0.0 seed loaded from the constant-pool entry .LCPI15_0.
1445 define float @fmin_f32(float* nocapture readonly %x, i32 %n) {
1446 ; CHECK-LABEL: fmin_f32:
1447 ; CHECK: @ %bb.0: @ %entry
1448 ; CHECK-NEXT: .save {r7, lr}
1449 ; CHECK-NEXT: push {r7, lr}
1450 ; CHECK-NEXT: cmp r1, #1
1451 ; CHECK-NEXT: blt .LBB15_3
1452 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1453 ; CHECK-NEXT: cmp r1, #4
1454 ; CHECK-NEXT: bhs .LBB15_4
1455 ; CHECK-NEXT: @ %bb.2:
1456 ; CHECK-NEXT: vldr s0, .LCPI15_0
1457 ; CHECK-NEXT: movs r2, #0
1458 ; CHECK-NEXT: b .LBB15_7
1459 ; CHECK-NEXT: .LBB15_3:
1460 ; CHECK-NEXT: vldr s0, .LCPI15_0
1461 ; CHECK-NEXT: vmov r0, s0
1462 ; CHECK-NEXT: pop {r7, pc}
1463 ; CHECK-NEXT: .LBB15_4: @ %vector.ph
1464 ; CHECK-NEXT: bic r2, r1, #3
1465 ; CHECK-NEXT: movs r3, #1
1466 ; CHECK-NEXT: sub.w r12, r2, #4
1467 ; CHECK-NEXT: vmov.i32 q0, #0x0
1468 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
1469 ; CHECK-NEXT: mov r3, r0
1470 ; CHECK-NEXT: .LBB15_5: @ %vector.body
1471 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1472 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
1473 ; CHECK-NEXT: vcmp.f32 lt, q0, q1
1474 ; CHECK-NEXT: vpsel q0, q0, q1
1475 ; CHECK-NEXT: le lr, .LBB15_5
1476 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1477 ; CHECK-NEXT: vminnm.f32 s2, s2, s3
1478 ; CHECK-NEXT: vminnm.f32 s0, s0, s1
1479 ; CHECK-NEXT: vminnm.f32 s0, s0, s2
1480 ; CHECK-NEXT: cmp r2, r1
1481 ; CHECK-NEXT: beq .LBB15_9
1482 ; CHECK-NEXT: .LBB15_7: @ %for.body.preheader1
1483 ; CHECK-NEXT: sub.w lr, r1, r2
1484 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
1485 ; CHECK-NEXT: .LBB15_8: @ %for.body
1486 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1487 ; CHECK-NEXT: vldmia r0!, {s2}
1488 ; CHECK-NEXT: vcmp.f32 s0, s2
1489 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
1490 ; CHECK-NEXT: vselge.f32 s0, s2, s0
1491 ; CHECK-NEXT: le lr, .LBB15_8
1492 ; CHECK-NEXT: .LBB15_9: @ %for.cond.cleanup
1493 ; CHECK-NEXT: vmov r0, s0
1494 ; CHECK-NEXT: pop {r7, pc}
1495 ; CHECK-NEXT: .p2align 2
1496 ; CHECK-NEXT: @ %bb.10:
1497 ; CHECK-NEXT: .LCPI15_0:
1498 ; CHECK-NEXT: .long 0x00000000 @ float 0
; When %n <= 0 no element is read and the result phi takes the 0.0 seed.
1500 %cmp6 = icmp sgt i32 %n, 0
1501 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1503 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1504 %min.iters.check = icmp ult i32 %n, 4
1505 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1507 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1508 %n.vec = and i32 %n, -4
1509 br label %vector.body
1511 vector.body: ; preds = %vector.body, %vector.ph
1512 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1513 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1514 %0 = getelementptr inbounds float, float* %x, i32 %index
1515 %1 = bitcast float* %0 to <4 x float>*
1516 %wide.load = load <4 x float>, <4 x float>* %1, align 4
; Lane-wise select: keep the accumulator lane when it is less than, or
; unordered with, the loaded lane (fcmp ult); otherwise take the loaded lane.
1517 %2 = fcmp ult <4 x float> %vec.phi, %wide.load
1518 %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1519 %index.next = add i32 %index, 4
1520 %4 = icmp eq i32 %index.next, %n.vec
1521 br i1 %4, label %middle.block, label %vector.body
1523 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar, once, after the vector loop.
1524 %5 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %3)
; If the vector loop covered all %n elements there is no scalar remainder.
1525 %cmp.n = icmp eq i32 %n.vec, %n
1526 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1528 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1529 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1530 %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1533 for.body: ; preds = %for.body.preheader1, %for.body
1534 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1535 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1536 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
1537 %6 = load float, float* %arrayidx, align 4
; Scalar form of the same ult select; despite its name, %add holds the
; running minimum.
1538 %c = fcmp ult float %r.07, %6
1539 %add = select i1 %c, float %r.07, float %6
1540 %inc = add nuw nsw i32 %i.08, 1
1541 %exitcond = icmp eq i32 %inc, %n
1542 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1544 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1545 %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1546 ret float %r.0.lcssa
; fmax_f32: float maximum reduction over the first %n elements at %x, written
; as an out-of-loop reduction. The vector loop carries a <4 x float>
; accumulator (seeded with zeroinitializer, i.e. 0.0 in every lane) that is
; updated with fcmp ugt + select; llvm.vector.reduce.fmax is called once in
; middle.block. Because the seed is 0.0, the result phi yields 0.0 when
; %n <= 0. The assertions below expect vcmp.f32 lt with swapped operands
; (q1, q0) + vpsel inside the le-terminated vector loop, three scalar
; vmaxnm.f32 instructions for the final reduction, vcmp.f32 / vmrs /
; vselge.f32 in the scalar remainder loop, and the 0.0 seed loaded from the
; constant-pool entry .LCPI16_0.
1549 define float @fmax_f32(float* nocapture readonly %x, i32 %n) {
1550 ; CHECK-LABEL: fmax_f32:
1551 ; CHECK: @ %bb.0: @ %entry
1552 ; CHECK-NEXT: .save {r7, lr}
1553 ; CHECK-NEXT: push {r7, lr}
1554 ; CHECK-NEXT: cmp r1, #1
1555 ; CHECK-NEXT: blt .LBB16_3
1556 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
1557 ; CHECK-NEXT: cmp r1, #4
1558 ; CHECK-NEXT: bhs .LBB16_4
1559 ; CHECK-NEXT: @ %bb.2:
1560 ; CHECK-NEXT: vldr s0, .LCPI16_0
1561 ; CHECK-NEXT: movs r2, #0
1562 ; CHECK-NEXT: b .LBB16_7
1563 ; CHECK-NEXT: .LBB16_3:
1564 ; CHECK-NEXT: vldr s0, .LCPI16_0
1565 ; CHECK-NEXT: vmov r0, s0
1566 ; CHECK-NEXT: pop {r7, pc}
1567 ; CHECK-NEXT: .LBB16_4: @ %vector.ph
1568 ; CHECK-NEXT: bic r2, r1, #3
1569 ; CHECK-NEXT: movs r3, #1
1570 ; CHECK-NEXT: sub.w r12, r2, #4
1571 ; CHECK-NEXT: vmov.i32 q0, #0x0
1572 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2
1573 ; CHECK-NEXT: mov r3, r0
1574 ; CHECK-NEXT: .LBB16_5: @ %vector.body
1575 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1576 ; CHECK-NEXT: vldrw.u32 q1, [r3], #16
1577 ; CHECK-NEXT: vcmp.f32 lt, q1, q0
1578 ; CHECK-NEXT: vpsel q0, q0, q1
1579 ; CHECK-NEXT: le lr, .LBB16_5
1580 ; CHECK-NEXT: @ %bb.6: @ %middle.block
1581 ; CHECK-NEXT: vmaxnm.f32 s2, s2, s3
1582 ; CHECK-NEXT: vmaxnm.f32 s0, s0, s1
1583 ; CHECK-NEXT: vmaxnm.f32 s0, s0, s2
1584 ; CHECK-NEXT: cmp r2, r1
1585 ; CHECK-NEXT: beq .LBB16_9
1586 ; CHECK-NEXT: .LBB16_7: @ %for.body.preheader1
1587 ; CHECK-NEXT: sub.w lr, r1, r2
1588 ; CHECK-NEXT: add.w r0, r0, r2, lsl #2
1589 ; CHECK-NEXT: .LBB16_8: @ %for.body
1590 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1591 ; CHECK-NEXT: vldmia r0!, {s2}
1592 ; CHECK-NEXT: vcmp.f32 s2, s0
1593 ; CHECK-NEXT: vmrs APSR_nzcv, fpscr
1594 ; CHECK-NEXT: vselge.f32 s0, s2, s0
1595 ; CHECK-NEXT: le lr, .LBB16_8
1596 ; CHECK-NEXT: .LBB16_9: @ %for.cond.cleanup
1597 ; CHECK-NEXT: vmov r0, s0
1598 ; CHECK-NEXT: pop {r7, pc}
1599 ; CHECK-NEXT: .p2align 2
1600 ; CHECK-NEXT: @ %bb.10:
1601 ; CHECK-NEXT: .LCPI16_0:
1602 ; CHECK-NEXT: .long 0x00000000 @ float 0
; When %n <= 0 no element is read and the result phi takes the 0.0 seed.
1604 %cmp6 = icmp sgt i32 %n, 0
1605 br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
1607 for.body.preheader: ; preds = %entry
; Fewer than 4 elements: bypass the vector loop and run only the scalar loop.
1608 %min.iters.check = icmp ult i32 %n, 4
1609 br i1 %min.iters.check, label %for.body.preheader1, label %vector.ph
1611 vector.ph: ; preds = %for.body.preheader
; %n.vec is %n with its low two bits cleared: the element count handled 4 at a time.
1612 %n.vec = and i32 %n, -4
1613 br label %vector.body
1615 vector.body: ; preds = %vector.body, %vector.ph
1616 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1617 %vec.phi = phi <4 x float> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
1618 %0 = getelementptr inbounds float, float* %x, i32 %index
1619 %1 = bitcast float* %0 to <4 x float>*
1620 %wide.load = load <4 x float>, <4 x float>* %1, align 4
; Lane-wise select: keep the accumulator lane when it is greater than, or
; unordered with, the loaded lane (fcmp ugt); otherwise take the loaded lane.
1621 %2 = fcmp ugt <4 x float> %vec.phi, %wide.load
1622 %3 = select <4 x i1> %2, <4 x float> %vec.phi, <4 x float> %wide.load
1623 %index.next = add i32 %index, 4
1624 %4 = icmp eq i32 %index.next, %n.vec
1625 br i1 %4, label %middle.block, label %vector.body
1627 middle.block: ; preds = %vector.body
; Collapse the four lanes to one scalar, once, after the vector loop.
1628 %5 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %3)
; If the vector loop covered all %n elements there is no scalar remainder.
1629 %cmp.n = icmp eq i32 %n.vec, %n
1630 br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader1
1632 for.body.preheader1: ; preds = %middle.block, %for.body.preheader
1633 %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
1634 %r.07.ph = phi float [ 0.0, %for.body.preheader ], [ %5, %middle.block ]
1637 for.body: ; preds = %for.body.preheader1, %for.body
1638 %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader1 ]
1639 %r.07 = phi float [ %add, %for.body ], [ %r.07.ph, %for.body.preheader1 ]
1640 %arrayidx = getelementptr inbounds float, float* %x, i32 %i.08
1641 %6 = load float, float* %arrayidx, align 4
; Scalar form of the same ugt select; despite its name, %add holds the
; running maximum.
1642 %c = fcmp ugt float %r.07, %6
1643 %add = select i1 %c, float %r.07, float %6
1644 %inc = add nuw nsw i32 %i.08, 1
1645 %exitcond = icmp eq i32 %inc, %n
1646 br i1 %exitcond, label %for.cond.cleanup, label %for.body
1648 for.cond.cleanup: ; preds = %for.body, %middle.block, %entry
1649 %r.0.lcssa = phi float [ 0.0, %entry ], [ %5, %middle.block ], [ %add, %for.body ]
1650 ret float %r.0.lcssa
; add4i32: predicated i32 sum over %x.
; Each trip loads four i32 lanes under the active lane mask for (%index, %n),
; zeroes the disabled lanes, add-reduces the vector and accumulates the result
; in an i32 running total. The expected assembly is a dlstp.32/letp loop whose
; body is a post-incremented vldrw.u32 followed by vaddva.u32.
1653 define i32 @add4i32(i32* noalias nocapture readonly %x, i32 %n) {
1654 ; CHECK-LABEL: add4i32:
1655 ; CHECK: @ %bb.0: @ %entry
1656 ; CHECK-NEXT: .save {r7, lr}
1657 ; CHECK-NEXT: push {r7, lr}
1658 ; CHECK-NEXT: cbz r1, .LBB17_4
1659 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1660 ; CHECK-NEXT: movs r2, #0
1661 ; CHECK-NEXT: dlstp.32 lr, r1
1662 ; CHECK-NEXT: .LBB17_2: @ %vector.body
1663 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1664 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1665 ; CHECK-NEXT: vaddva.u32 r2, q0
1666 ; CHECK-NEXT: letp lr, .LBB17_2
1667 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1668 ; CHECK-NEXT: mov r0, r2
1669 ; CHECK-NEXT: pop {r7, pc}
1670 ; CHECK-NEXT: .LBB17_4:
1671 ; CHECK-NEXT: movs r2, #0
1672 ; CHECK-NEXT: mov r0, r2
1673 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1675 %cmp6.not = icmp eq i32 %n, 0
1676 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 4 (the step of %index).
1678 vector.ph: ; preds = %entry
1679 %n.rnd.up = add i32 %n, 3
1680 %n.vec = and i32 %n.rnd.up, -4
1681 br label %vector.body
1683 vector.body: ; preds = %vector.body, %vector.ph
1684 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1685 %vec.phi = phi i32 [ 0, %vector.ph ], [ %4, %vector.body ]
1686 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1687 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1688 %1 = bitcast i32* %0 to <4 x i32>*
1689 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; before the reduction.
1690 %2 = select <4 x i1> %active.lane.mask, <4 x i32> %wide.masked.load, <4 x i32> zeroinitializer
1691 %3 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %2)
1692 %4 = add i32 %3, %vec.phi
1693 %index.next = add i32 %index, 4
1694 %5 = icmp eq i32 %index.next, %n.vec
1695 br i1 %5, label %for.cond.cleanup, label %vector.body
1697 for.cond.cleanup: ; preds = %vector.body, %entry
1698 %s.0.lcssa = phi i32 [ 0, %entry ], [ %4, %vector.body ]
; mla4i32: predicated i32 multiply-accumulate over %x and %y.
; Each trip loads four i32 lanes from both arrays under the same active lane
; mask, multiplies them lane-wise, zeroes the disabled lanes, add-reduces the
; products and accumulates in an i32 running total. The expected assembly is a
; dlstp.32/letp loop with two vldrw.u32 loads feeding vmlava.u32.
1702 define i32 @mla4i32(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
1703 ; CHECK-LABEL: mla4i32:
1704 ; CHECK: @ %bb.0: @ %entry
1705 ; CHECK-NEXT: .save {r7, lr}
1706 ; CHECK-NEXT: push {r7, lr}
1707 ; CHECK-NEXT: cbz r2, .LBB18_4
1708 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1709 ; CHECK-NEXT: mov.w r12, #0
1710 ; CHECK-NEXT: dlstp.32 lr, r2
1711 ; CHECK-NEXT: .LBB18_2: @ %vector.body
1712 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1713 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
1714 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
1715 ; CHECK-NEXT: vmlava.u32 r12, q1, q0
1716 ; CHECK-NEXT: letp lr, .LBB18_2
1717 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1718 ; CHECK-NEXT: mov r0, r12
1719 ; CHECK-NEXT: pop {r7, pc}
1720 ; CHECK-NEXT: .LBB18_4:
1721 ; CHECK-NEXT: mov.w r12, #0
1722 ; CHECK-NEXT: mov r0, r12
1723 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1725 %cmp8.not = icmp eq i32 %n, 0
1726 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 4 (the step of %index).
1728 vector.ph: ; preds = %entry
1729 %n.rnd.up = add i32 %n, 3
1730 %n.vec = and i32 %n.rnd.up, -4
1731 br label %vector.body
1733 vector.body: ; preds = %vector.body, %vector.ph
1734 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1735 %vec.phi = phi i32 [ 0, %vector.ph ], [ %7, %vector.body ]
1736 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
1737 %0 = getelementptr inbounds i32, i32* %x, i32 %index
1738 %1 = bitcast i32* %0 to <4 x i32>*
1739 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1740 %2 = getelementptr inbounds i32, i32* %y, i32 %index
1741 %3 = bitcast i32* %2 to <4 x i32>*
1742 %wide.masked.load13 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
1743 %4 = mul nsw <4 x i32> %wide.masked.load13, %wide.masked.load
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
1744 %5 = select <4 x i1> %active.lane.mask, <4 x i32> %4, <4 x i32> zeroinitializer
1745 %6 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %5)
1746 %7 = add i32 %6, %vec.phi
1747 %index.next = add i32 %index, 4
1748 %8 = icmp eq i32 %index.next, %n.vec
1749 br i1 %8, label %for.cond.cleanup, label %vector.body
1751 for.cond.cleanup: ; preds = %vector.body, %entry
1752 %s.0.lcssa = phi i32 [ 0, %entry ], [ %7, %vector.body ]
; add8i32: predicated sum of i16 elements widened to i32.
; Each trip loads eight i16 lanes under the active lane mask, sign-extends them
; to i32, zeroes the disabled lanes, add-reduces the <8 x i32> vector and
; accumulates in an i32 running total. The expected assembly is a dlstp.16/letp
; loop with vldrh.u16 feeding vaddva.s16 (no separate extend instruction).
1756 define i32 @add8i32(i16* noalias nocapture readonly %x, i32 %n) {
1757 ; CHECK-LABEL: add8i32:
1758 ; CHECK: @ %bb.0: @ %entry
1759 ; CHECK-NEXT: .save {r7, lr}
1760 ; CHECK-NEXT: push {r7, lr}
1761 ; CHECK-NEXT: cbz r1, .LBB19_4
1762 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1763 ; CHECK-NEXT: movs r2, #0
1764 ; CHECK-NEXT: dlstp.16 lr, r1
1765 ; CHECK-NEXT: .LBB19_2: @ %vector.body
1766 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1767 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1768 ; CHECK-NEXT: vaddva.s16 r2, q0
1769 ; CHECK-NEXT: letp lr, .LBB19_2
1770 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1771 ; CHECK-NEXT: mov r0, r2
1772 ; CHECK-NEXT: pop {r7, pc}
1773 ; CHECK-NEXT: .LBB19_4:
1774 ; CHECK-NEXT: movs r2, #0
1775 ; CHECK-NEXT: mov r0, r2
1776 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1778 %cmp6.not = icmp eq i32 %n, 0
1779 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 8 (the step of %index).
1781 vector.ph: ; preds = %entry
1782 %n.rnd.up = add i32 %n, 7
1783 %n.vec = and i32 %n.rnd.up, -8
1784 br label %vector.body
1786 vector.body: ; preds = %vector.body, %vector.ph
1787 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1788 %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
1789 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1790 %0 = getelementptr inbounds i16, i16* %x, i32 %index
1791 %1 = bitcast i16* %0 to <8 x i16>*
1792 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1793 %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; (after widening) before the reduction.
1794 %3 = select <8 x i1> %active.lane.mask, <8 x i32> %2, <8 x i32> zeroinitializer
1795 %4 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
1796 %5 = add i32 %4, %vec.phi
1797 %index.next = add i32 %index, 8
1798 %6 = icmp eq i32 %index.next, %n.vec
1799 br i1 %6, label %for.cond.cleanup, label %vector.body
1801 for.cond.cleanup: ; preds = %vector.body, %entry
1802 %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
; mla8i32: predicated multiply-accumulate of i16 elements widened to i32.
; Each trip loads eight i16 lanes from %x and %y under the same active lane
; mask, sign-extends both to i32, multiplies lane-wise, zeroes the disabled
; lanes, add-reduces the products and accumulates in an i32 running total.
; The expected assembly is a dlstp.16/letp loop with two vldrh.u16 loads
; feeding vmlava.s16.
1806 define i32 @mla8i32(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
1807 ; CHECK-LABEL: mla8i32:
1808 ; CHECK: @ %bb.0: @ %entry
1809 ; CHECK-NEXT: .save {r7, lr}
1810 ; CHECK-NEXT: push {r7, lr}
1811 ; CHECK-NEXT: cbz r2, .LBB20_4
1812 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1813 ; CHECK-NEXT: mov.w r12, #0
1814 ; CHECK-NEXT: dlstp.16 lr, r2
1815 ; CHECK-NEXT: .LBB20_2: @ %vector.body
1816 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1817 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1818 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
1819 ; CHECK-NEXT: vmlava.s16 r12, q1, q0
1820 ; CHECK-NEXT: letp lr, .LBB20_2
1821 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1822 ; CHECK-NEXT: mov r0, r12
1823 ; CHECK-NEXT: pop {r7, pc}
1824 ; CHECK-NEXT: .LBB20_4:
1825 ; CHECK-NEXT: mov.w r12, #0
1826 ; CHECK-NEXT: mov r0, r12
1827 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1829 %cmp9.not = icmp eq i32 %n, 0
1830 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 8 (the step of %index).
1832 vector.ph: ; preds = %entry
1833 %n.rnd.up = add i32 %n, 7
1834 %n.vec = and i32 %n.rnd.up, -8
1835 br label %vector.body
1837 vector.body: ; preds = %vector.body, %vector.ph
1838 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1839 %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
1840 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
1841 %0 = getelementptr inbounds i16, i16* %x, i32 %index
1842 %1 = bitcast i16* %0 to <8 x i16>*
1843 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1844 %2 = sext <8 x i16> %wide.masked.load to <8 x i32>
1845 %3 = getelementptr inbounds i16, i16* %y, i32 %index
1846 %4 = bitcast i16* %3 to <8 x i16>*
1847 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
1848 %5 = sext <8 x i16> %wide.masked.load14 to <8 x i32>
1849 %6 = mul nsw <8 x i32> %5, %2
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
1850 %7 = select <8 x i1> %active.lane.mask, <8 x i32> %6, <8 x i32> zeroinitializer
1851 %8 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %7)
1852 %9 = add i32 %8, %vec.phi
1853 %index.next = add i32 %index, 8
1854 %10 = icmp eq i32 %index.next, %n.vec
1855 br i1 %10, label %for.cond.cleanup, label %vector.body
1857 for.cond.cleanup: ; preds = %vector.body, %entry
1858 %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
; add16i32: predicated sum of i8 elements widened to i32.
; Each trip loads sixteen i8 lanes under the active lane mask, zero-extends
; them to i32, zeroes the disabled lanes, add-reduces the <16 x i32> vector and
; accumulates in an i32 running total. The expected assembly is a dlstp.8/letp
; loop with vldrb.u8 feeding vaddva.u8 (no separate extend instruction).
1862 define i32 @add16i32(i8* noalias nocapture readonly %x, i32 %n) {
1863 ; CHECK-LABEL: add16i32:
1864 ; CHECK: @ %bb.0: @ %entry
1865 ; CHECK-NEXT: .save {r7, lr}
1866 ; CHECK-NEXT: push {r7, lr}
1867 ; CHECK-NEXT: cbz r1, .LBB21_4
1868 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1869 ; CHECK-NEXT: movs r2, #0
1870 ; CHECK-NEXT: dlstp.8 lr, r1
1871 ; CHECK-NEXT: .LBB21_2: @ %vector.body
1872 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1873 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
1874 ; CHECK-NEXT: vaddva.u8 r2, q0
1875 ; CHECK-NEXT: letp lr, .LBB21_2
1876 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1877 ; CHECK-NEXT: mov r0, r2
1878 ; CHECK-NEXT: pop {r7, pc}
1879 ; CHECK-NEXT: .LBB21_4:
1880 ; CHECK-NEXT: movs r2, #0
1881 ; CHECK-NEXT: mov r0, r2
1882 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1884 %cmp6.not = icmp eq i32 %n, 0
1885 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 16 (the step of %index).
1887 vector.ph: ; preds = %entry
1888 %n.rnd.up = add i32 %n, 15
1889 %n.vec = and i32 %n.rnd.up, -16
1890 br label %vector.body
1892 vector.body: ; preds = %vector.body, %vector.ph
1893 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1894 %vec.phi = phi i32 [ 0, %vector.ph ], [ %5, %vector.body ]
1895 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1896 %0 = getelementptr inbounds i8, i8* %x, i32 %index
1897 %1 = bitcast i8* %0 to <16 x i8>*
1898 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1899 %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; (after widening) before the reduction.
1900 %3 = select <16 x i1> %active.lane.mask, <16 x i32> %2, <16 x i32> zeroinitializer
1901 %4 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
1902 %5 = add i32 %4, %vec.phi
1903 %index.next = add i32 %index, 16
1904 %6 = icmp eq i32 %index.next, %n.vec
1905 br i1 %6, label %for.cond.cleanup, label %vector.body
1907 for.cond.cleanup: ; preds = %vector.body, %entry
1908 %s.0.lcssa = phi i32 [ 0, %entry ], [ %5, %vector.body ]
; mla16i32: predicated multiply-accumulate of i8 elements widened to i32.
; Each trip loads sixteen i8 lanes from %x and %y under the same active lane
; mask, zero-extends both to i32, multiplies lane-wise, zeroes the disabled
; lanes, add-reduces the products and accumulates in an i32 running total.
; The expected assembly is a dlstp.8/letp loop with two vldrb.u8 loads feeding
; vmlava.u8.
1912 define i32 @mla16i32(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
1913 ; CHECK-LABEL: mla16i32:
1914 ; CHECK: @ %bb.0: @ %entry
1915 ; CHECK-NEXT: .save {r7, lr}
1916 ; CHECK-NEXT: push {r7, lr}
1917 ; CHECK-NEXT: cbz r2, .LBB22_4
1918 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1919 ; CHECK-NEXT: mov.w r12, #0
1920 ; CHECK-NEXT: dlstp.8 lr, r2
1921 ; CHECK-NEXT: .LBB22_2: @ %vector.body
1922 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1923 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
1924 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
1925 ; CHECK-NEXT: vmlava.u8 r12, q1, q0
1926 ; CHECK-NEXT: letp lr, .LBB22_2
1927 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1928 ; CHECK-NEXT: mov r0, r12
1929 ; CHECK-NEXT: pop {r7, pc}
1930 ; CHECK-NEXT: .LBB22_4:
1931 ; CHECK-NEXT: mov.w r12, #0
1932 ; CHECK-NEXT: mov r0, r12
1933 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1935 %cmp9.not = icmp eq i32 %n, 0
1936 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 16 (the step of %index).
1938 vector.ph: ; preds = %entry
1939 %n.rnd.up = add i32 %n, 15
1940 %n.vec = and i32 %n.rnd.up, -16
1941 br label %vector.body
1943 vector.body: ; preds = %vector.body, %vector.ph
1944 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
1945 %vec.phi = phi i32 [ 0, %vector.ph ], [ %9, %vector.body ]
1946 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
1947 %0 = getelementptr inbounds i8, i8* %x, i32 %index
1948 %1 = bitcast i8* %0 to <16 x i8>*
1949 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1950 %2 = zext <16 x i8> %wide.masked.load to <16 x i32>
1951 %3 = getelementptr inbounds i8, i8* %y, i32 %index
1952 %4 = bitcast i8* %3 to <16 x i8>*
1953 %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
1954 %5 = zext <16 x i8> %wide.masked.load14 to <16 x i32>
1955 %6 = mul nuw nsw <16 x i32> %5, %2
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
1956 %7 = select <16 x i1> %active.lane.mask, <16 x i32> %6, <16 x i32> zeroinitializer
1957 %8 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %7)
1958 %9 = add i32 %8, %vec.phi
1959 %index.next = add i32 %index, 16
1960 %10 = icmp eq i32 %index.next, %n.vec
1961 br i1 %10, label %for.cond.cleanup, label %vector.body
1963 for.cond.cleanup: ; preds = %vector.body, %entry
1964 %s.0.lcssa = phi i32 [ 0, %entry ], [ %9, %vector.body ]
; add8i16: predicated i16 sum over %x with an i16 accumulator.
; Each trip loads eight i16 lanes under the active lane mask, zeroes the
; disabled lanes, add-reduces the vector to i16 and accumulates in an i16
; running total. The function is declared signext; the expected assembly is a
; dlstp.16/letp loop with vldrh.u16 feeding vaddva.u16, followed by sxth on the
; accumulator register.
1968 define signext i16 @add8i16(i16* noalias nocapture readonly %x, i32 %n) {
1969 ; CHECK-LABEL: add8i16:
1970 ; CHECK: @ %bb.0: @ %entry
1971 ; CHECK-NEXT: .save {r7, lr}
1972 ; CHECK-NEXT: push {r7, lr}
1973 ; CHECK-NEXT: cbz r1, .LBB23_4
1974 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
1975 ; CHECK-NEXT: movs r2, #0
1976 ; CHECK-NEXT: dlstp.16 lr, r1
1977 ; CHECK-NEXT: .LBB23_2: @ %vector.body
1978 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
1979 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
1980 ; CHECK-NEXT: vaddva.u16 r2, q0
1981 ; CHECK-NEXT: letp lr, .LBB23_2
1982 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
1983 ; CHECK-NEXT: sxth r0, r2
1984 ; CHECK-NEXT: pop {r7, pc}
1985 ; CHECK-NEXT: .LBB23_4:
1986 ; CHECK-NEXT: movs r2, #0
1987 ; CHECK-NEXT: sxth r0, r2
1988 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
1990 %cmp8.not = icmp eq i32 %n, 0
1991 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 8 (the step of %index).
1993 vector.ph: ; preds = %entry
1994 %n.rnd.up = add i32 %n, 7
1995 %n.vec = and i32 %n.rnd.up, -8
1996 br label %vector.body
1998 vector.body: ; preds = %vector.body, %vector.ph
1999 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2000 %vec.phi = phi i16 [ 0, %vector.ph ], [ %4, %vector.body ]
2001 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2002 %0 = getelementptr inbounds i16, i16* %x, i32 %index
2003 %1 = bitcast i16* %0 to <8 x i16>*
2004 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; before the reduction.
2005 %2 = select <8 x i1> %active.lane.mask, <8 x i16> %wide.masked.load, <8 x i16> zeroinitializer
2006 %3 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %2)
2007 %4 = add i16 %3, %vec.phi
2008 %index.next = add i32 %index, 8
2009 %5 = icmp eq i32 %index.next, %n.vec
2010 br i1 %5, label %for.cond.cleanup, label %vector.body
2012 for.cond.cleanup: ; preds = %vector.body, %entry
2013 %s.0.lcssa = phi i16 [ 0, %entry ], [ %4, %vector.body ]
; mla8i16: predicated i16 multiply-accumulate over %x and %y with an i16
; accumulator.
; Each trip loads eight i16 lanes from both arrays under the same active lane
; mask, multiplies them lane-wise in i16, zeroes the disabled lanes, add-reduces
; the products to i16 and accumulates in an i16 running total. The function is
; declared signext; the expected assembly is a dlstp.16/letp loop with two
; vldrh.u16 loads feeding vmlava.u16, followed by sxth.w on the accumulator.
2017 define signext i16 @mla8i16(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
2018 ; CHECK-LABEL: mla8i16:
2019 ; CHECK: @ %bb.0: @ %entry
2020 ; CHECK-NEXT: .save {r7, lr}
2021 ; CHECK-NEXT: push {r7, lr}
2022 ; CHECK-NEXT: cbz r2, .LBB24_4
2023 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2024 ; CHECK-NEXT: mov.w r12, #0
2025 ; CHECK-NEXT: dlstp.16 lr, r2
2026 ; CHECK-NEXT: .LBB24_2: @ %vector.body
2027 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2028 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
2029 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
2030 ; CHECK-NEXT: vmlava.u16 r12, q1, q0
2031 ; CHECK-NEXT: letp lr, .LBB24_2
2032 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2033 ; CHECK-NEXT: sxth.w r0, r12
2034 ; CHECK-NEXT: pop {r7, pc}
2035 ; CHECK-NEXT: .LBB24_4:
2036 ; CHECK-NEXT: mov.w r12, #0
2037 ; CHECK-NEXT: sxth.w r0, r12
2038 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2040 %cmp11.not = icmp eq i32 %n, 0
2041 br i1 %cmp11.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 8 (the step of %index).
2043 vector.ph: ; preds = %entry
2044 %n.rnd.up = add i32 %n, 7
2045 %n.vec = and i32 %n.rnd.up, -8
2046 br label %vector.body
2048 vector.body: ; preds = %vector.body, %vector.ph
2049 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2050 %vec.phi = phi i16 [ 0, %vector.ph ], [ %7, %vector.body ]
2051 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
2052 %0 = getelementptr inbounds i16, i16* %x, i32 %index
2053 %1 = bitcast i16* %0 to <8 x i16>*
2054 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2055 %2 = getelementptr inbounds i16, i16* %y, i32 %index
2056 %3 = bitcast i16* %2 to <8 x i16>*
2057 %wide.masked.load16 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %3, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2058 %4 = mul <8 x i16> %wide.masked.load16, %wide.masked.load
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
2059 %5 = select <8 x i1> %active.lane.mask, <8 x i16> %4, <8 x i16> zeroinitializer
2060 %6 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %5)
2061 %7 = add i16 %6, %vec.phi
2062 %index.next = add i32 %index, 8
2063 %8 = icmp eq i32 %index.next, %n.vec
2064 br i1 %8, label %for.cond.cleanup, label %vector.body
2066 for.cond.cleanup: ; preds = %vector.body, %entry
2067 %s.0.lcssa = phi i16 [ 0, %entry ], [ %7, %vector.body ]
; add16i16: predicated sum of i8 elements widened to i16.
; Each trip loads sixteen i8 lanes under the active lane mask, zero-extends
; them to i16, zeroes the disabled lanes, add-reduces the <16 x i16> vector and
; accumulates in an i16 running total. The function is declared signext; the
; expected assembly is a dlstp.8/letp loop with vldrb.u8 feeding vaddva.u8,
; followed by sxth on the accumulator register.
2071 define signext i16 @add16i16(i8* noalias nocapture readonly %x, i32 %n) {
2072 ; CHECK-LABEL: add16i16:
2073 ; CHECK: @ %bb.0: @ %entry
2074 ; CHECK-NEXT: .save {r7, lr}
2075 ; CHECK-NEXT: push {r7, lr}
2076 ; CHECK-NEXT: cbz r1, .LBB25_4
2077 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2078 ; CHECK-NEXT: movs r2, #0
2079 ; CHECK-NEXT: dlstp.8 lr, r1
2080 ; CHECK-NEXT: .LBB25_2: @ %vector.body
2081 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2082 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2083 ; CHECK-NEXT: vaddva.u8 r2, q0
2084 ; CHECK-NEXT: letp lr, .LBB25_2
2085 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2086 ; CHECK-NEXT: sxth r0, r2
2087 ; CHECK-NEXT: pop {r7, pc}
2088 ; CHECK-NEXT: .LBB25_4:
2089 ; CHECK-NEXT: movs r2, #0
2090 ; CHECK-NEXT: sxth r0, r2
2091 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2093 %cmp8.not = icmp eq i32 %n, 0
2094 br i1 %cmp8.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 16 (the step of %index).
2096 vector.ph: ; preds = %entry
2097 %n.rnd.up = add i32 %n, 15
2098 %n.vec = and i32 %n.rnd.up, -16
2099 br label %vector.body
2101 vector.body: ; preds = %vector.body, %vector.ph
2102 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2103 %vec.phi = phi i16 [ 0, %vector.ph ], [ %5, %vector.body ]
2104 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2105 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2106 %1 = bitcast i8* %0 to <16 x i8>*
2107 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2108 %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; (after widening) before the reduction.
2109 %3 = select <16 x i1> %active.lane.mask, <16 x i16> %2, <16 x i16> zeroinitializer
2110 %4 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %3)
2111 %5 = add i16 %4, %vec.phi
2112 %index.next = add i32 %index, 16
2113 %6 = icmp eq i32 %index.next, %n.vec
2114 br i1 %6, label %for.cond.cleanup, label %vector.body
2116 for.cond.cleanup: ; preds = %vector.body, %entry
2117 %s.0.lcssa = phi i16 [ 0, %entry ], [ %5, %vector.body ]
; mla16i16: predicated multiply-accumulate of i8 elements widened to i16.
; Each trip loads sixteen i8 lanes from %x and %y under the same active lane
; mask, zero-extends both to i16, multiplies lane-wise, zeroes the disabled
; lanes, add-reduces the products to i16 and accumulates in an i16 running
; total. The function is declared signext; the expected assembly is a
; dlstp.8/letp loop with two vldrb.u8 loads feeding vmlava.u8, followed by
; sxth.w on the accumulator.
2121 define signext i16 @mla16i16(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
2122 ; CHECK-LABEL: mla16i16:
2123 ; CHECK: @ %bb.0: @ %entry
2124 ; CHECK-NEXT: .save {r7, lr}
2125 ; CHECK-NEXT: push {r7, lr}
2126 ; CHECK-NEXT: cbz r2, .LBB26_4
2127 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2128 ; CHECK-NEXT: mov.w r12, #0
2129 ; CHECK-NEXT: dlstp.8 lr, r2
2130 ; CHECK-NEXT: .LBB26_2: @ %vector.body
2131 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2132 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2133 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2134 ; CHECK-NEXT: vmlava.u8 r12, q1, q0
2135 ; CHECK-NEXT: letp lr, .LBB26_2
2136 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2137 ; CHECK-NEXT: sxth.w r0, r12
2138 ; CHECK-NEXT: pop {r7, pc}
2139 ; CHECK-NEXT: .LBB26_4:
2140 ; CHECK-NEXT: mov.w r12, #0
2141 ; CHECK-NEXT: sxth.w r0, r12
2142 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2144 %cmp13.not = icmp eq i32 %n, 0
2145 br i1 %cmp13.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 16 (the step of %index).
2147 vector.ph: ; preds = %entry
2148 %n.rnd.up = add i32 %n, 15
2149 %n.vec = and i32 %n.rnd.up, -16
2150 br label %vector.body
2152 vector.body: ; preds = %vector.body, %vector.ph
2153 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2154 %vec.phi = phi i16 [ 0, %vector.ph ], [ %9, %vector.body ]
2155 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2156 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2157 %1 = bitcast i8* %0 to <16 x i8>*
2158 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2159 %2 = zext <16 x i8> %wide.masked.load to <16 x i16>
2160 %3 = getelementptr inbounds i8, i8* %y, i32 %index
2161 %4 = bitcast i8* %3 to <16 x i8>*
2162 %wide.masked.load18 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2163 %5 = zext <16 x i8> %wide.masked.load18 to <16 x i16>
2164 %6 = mul nuw <16 x i16> %5, %2
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
2165 %7 = select <16 x i1> %active.lane.mask, <16 x i16> %6, <16 x i16> zeroinitializer
2166 %8 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %7)
2167 %9 = add i16 %8, %vec.phi
2168 %index.next = add i32 %index, 16
2169 %10 = icmp eq i32 %index.next, %n.vec
2170 br i1 %10, label %for.cond.cleanup, label %vector.body
2172 for.cond.cleanup: ; preds = %vector.body, %entry
2173 %s.0.lcssa = phi i16 [ 0, %entry ], [ %9, %vector.body ]
; add16i8: predicated i8 sum over %x with an i8 accumulator.
; Each trip loads sixteen i8 lanes under the active lane mask, zeroes the
; disabled lanes, add-reduces the vector to i8 and accumulates in an i8 running
; total. The function is declared zeroext; the expected assembly is a
; dlstp.8/letp loop with vldrb.u8 feeding vaddva.u8, followed by uxtb on the
; accumulator register.
2177 define zeroext i8 @add16i8(i8* noalias nocapture readonly %x, i32 %n) {
2178 ; CHECK-LABEL: add16i8:
2179 ; CHECK: @ %bb.0: @ %entry
2180 ; CHECK-NEXT: .save {r7, lr}
2181 ; CHECK-NEXT: push {r7, lr}
2182 ; CHECK-NEXT: cbz r1, .LBB27_4
2183 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2184 ; CHECK-NEXT: movs r2, #0
2185 ; CHECK-NEXT: dlstp.8 lr, r1
2186 ; CHECK-NEXT: .LBB27_2: @ %vector.body
2187 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2188 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2189 ; CHECK-NEXT: vaddva.u8 r2, q0
2190 ; CHECK-NEXT: letp lr, .LBB27_2
2191 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2192 ; CHECK-NEXT: uxtb r0, r2
2193 ; CHECK-NEXT: pop {r7, pc}
2194 ; CHECK-NEXT: .LBB27_4:
2195 ; CHECK-NEXT: movs r2, #0
2196 ; CHECK-NEXT: uxtb r0, r2
2197 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2199 %cmp7.not = icmp eq i32 %n, 0
2200 br i1 %cmp7.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 16 (the step of %index).
2202 vector.ph: ; preds = %entry
2203 %n.rnd.up = add i32 %n, 15
2204 %n.vec = and i32 %n.rnd.up, -16
2205 br label %vector.body
2207 vector.body: ; preds = %vector.body, %vector.ph
2208 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2209 %vec.phi = phi i8 [ 0, %vector.ph ], [ %4, %vector.body ]
2210 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2211 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2212 %1 = bitcast i8* %0 to <16 x i8>*
2213 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; before the reduction.
2214 %2 = select <16 x i1> %active.lane.mask, <16 x i8> %wide.masked.load, <16 x i8> zeroinitializer
2215 %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %2)
2216 %4 = add i8 %3, %vec.phi
2217 %index.next = add i32 %index, 16
2218 %5 = icmp eq i32 %index.next, %n.vec
2219 br i1 %5, label %for.cond.cleanup, label %vector.body
2221 for.cond.cleanup: ; preds = %vector.body, %entry
2222 %s.0.lcssa = phi i8 [ 0, %entry ], [ %4, %vector.body ]
; mla16i8: predicated i8 multiply-accumulate over %x and %y with an i8
; accumulator.
; Each trip loads sixteen i8 lanes from both arrays under the same active lane
; mask, multiplies them lane-wise in i8, zeroes the disabled lanes, add-reduces
; the products to i8 and accumulates in an i8 running total. The function is
; declared zeroext; the expected assembly is a dlstp.8/letp loop with two
; vldrb.u8 loads feeding vmlava.u8, followed by uxtb.w on the accumulator.
2226 define zeroext i8 @mla16i8(i8* noalias nocapture readonly %x, i8* noalias nocapture readonly %y, i32 %n) {
2227 ; CHECK-LABEL: mla16i8:
2228 ; CHECK: @ %bb.0: @ %entry
2229 ; CHECK-NEXT: .save {r7, lr}
2230 ; CHECK-NEXT: push {r7, lr}
2231 ; CHECK-NEXT: cbz r2, .LBB28_4
2232 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2233 ; CHECK-NEXT: mov.w r12, #0
2234 ; CHECK-NEXT: dlstp.8 lr, r2
2235 ; CHECK-NEXT: .LBB28_2: @ %vector.body
2236 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2237 ; CHECK-NEXT: vldrb.u8 q0, [r0], #16
2238 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16
2239 ; CHECK-NEXT: vmlava.u8 r12, q1, q0
2240 ; CHECK-NEXT: letp lr, .LBB28_2
2241 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2242 ; CHECK-NEXT: uxtb.w r0, r12
2243 ; CHECK-NEXT: pop {r7, pc}
2244 ; CHECK-NEXT: .LBB28_4:
2245 ; CHECK-NEXT: mov.w r12, #0
2246 ; CHECK-NEXT: uxtb.w r0, r12
2247 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2249 %cmp10.not = icmp eq i32 %n, 0
2250 br i1 %cmp10.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 16 (the step of %index).
2252 vector.ph: ; preds = %entry
2253 %n.rnd.up = add i32 %n, 15
2254 %n.vec = and i32 %n.rnd.up, -16
2255 br label %vector.body
2257 vector.body: ; preds = %vector.body, %vector.ph
2258 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2259 %vec.phi = phi i8 [ 0, %vector.ph ], [ %7, %vector.body ]
2260 %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %n)
2261 %0 = getelementptr inbounds i8, i8* %x, i32 %index
2262 %1 = bitcast i8* %0 to <16 x i8>*
2263 %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %1, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2264 %2 = getelementptr inbounds i8, i8* %y, i32 %index
2265 %3 = bitcast i8* %2 to <16 x i8>*
2266 %wide.masked.load15 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %3, i32 1, <16 x i1> %active.lane.mask, <16 x i8> undef)
2267 %4 = mul <16 x i8> %wide.masked.load15, %wide.masked.load
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
2268 %5 = select <16 x i1> %active.lane.mask, <16 x i8> %4, <16 x i8> zeroinitializer
2269 %6 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %5)
2270 %7 = add i8 %6, %vec.phi
2271 %index.next = add i32 %index, 16
2272 %8 = icmp eq i32 %index.next, %n.vec
2273 br i1 %8, label %for.cond.cleanup, label %vector.body
2275 for.cond.cleanup: ; preds = %vector.body, %entry
2276 %s.0.lcssa = phi i8 [ 0, %entry ], [ %7, %vector.body ]
; add4i64: predicated sum of i32 elements widened to i64.
; Each trip loads four i32 lanes under the active lane mask, sign-extends them
; to i64, zeroes the disabled lanes, add-reduces the <4 x i64> vector and
; accumulates in an i64 running total. The expected assembly is a dlstp.32/letp
; loop with vldrw.u32 feeding vaddlva.s32, which accumulates into the r2/r3
; register pair; that pair is copied to r0/r1 in the cleanup block.
2280 define i64 @add4i64(i32* noalias nocapture readonly %x, i32 %n) {
2281 ; CHECK-LABEL: add4i64:
2282 ; CHECK: @ %bb.0: @ %entry
2283 ; CHECK-NEXT: .save {r7, lr}
2284 ; CHECK-NEXT: push {r7, lr}
2285 ; CHECK-NEXT: cbz r1, .LBB29_3
2286 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2287 ; CHECK-NEXT: movs r2, #0
2288 ; CHECK-NEXT: mov r3, r2
2289 ; CHECK-NEXT: dlstp.32 lr, r1
2290 ; CHECK-NEXT: .LBB29_2: @ %vector.body
2291 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2292 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2293 ; CHECK-NEXT: vaddlva.s32 r2, r3, q0
2294 ; CHECK-NEXT: letp lr, .LBB29_2
2295 ; CHECK-NEXT: b .LBB29_4
2296 ; CHECK-NEXT: .LBB29_3:
2297 ; CHECK-NEXT: movs r2, #0
2298 ; CHECK-NEXT: mov r3, r2
2299 ; CHECK-NEXT: .LBB29_4: @ %for.cond.cleanup
2300 ; CHECK-NEXT: mov r0, r2
2301 ; CHECK-NEXT: mov r1, r3
2302 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2304 %cmp6.not = icmp eq i32 %n, 0
2305 br i1 %cmp6.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 4 (the step of %index).
2307 vector.ph: ; preds = %entry
2308 %n.rnd.up = add i32 %n, 3
2309 %n.vec = and i32 %n.rnd.up, -4
2310 br label %vector.body
2312 vector.body: ; preds = %vector.body, %vector.ph
2313 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2314 %vec.phi = phi i64 [ 0, %vector.ph ], [ %5, %vector.body ]
2315 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2316 %0 = getelementptr inbounds i32, i32* %x, i32 %index
2317 %1 = bitcast i32* %0 to <4 x i32>*
2318 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2319 %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
; The masked load's passthru is undef, so disabled lanes are explicitly zeroed
; (after widening) before the reduction.
2320 %3 = select <4 x i1> %active.lane.mask, <4 x i64> %2, <4 x i64> zeroinitializer
2321 %4 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
2322 %5 = add i64 %4, %vec.phi
2323 %index.next = add i32 %index, 4
2324 %6 = icmp eq i32 %index.next, %n.vec
2325 br i1 %6, label %for.cond.cleanup, label %vector.body
2327 for.cond.cleanup: ; preds = %vector.body, %entry
2328 %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %vector.body ]
; mla4i64: predicated multiply-accumulate of i32 elements widened to i64.
; Each trip loads four i32 lanes from %x and %y under the same active lane
; mask, sign-extends both to i64, multiplies lane-wise, zeroes the disabled
; lanes, add-reduces the products and accumulates in an i64 running total.
; The expected assembly is a dlstp.32/letp loop with two vldrw.u32 loads
; feeding vmlalva.s32, which accumulates into the r12/r3 register pair; that
; pair is copied to r0/r1 in the cleanup block.
2332 define i64 @mla4i64(i32* noalias nocapture readonly %x, i32* noalias nocapture readonly %y, i32 %n) {
2333 ; CHECK-LABEL: mla4i64:
2334 ; CHECK: @ %bb.0: @ %entry
2335 ; CHECK-NEXT: .save {r7, lr}
2336 ; CHECK-NEXT: push {r7, lr}
2337 ; CHECK-NEXT: cbz r2, .LBB30_3
2338 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2339 ; CHECK-NEXT: mov.w r12, #0
2340 ; CHECK-NEXT: mov r3, r12
2341 ; CHECK-NEXT: dlstp.32 lr, r2
2342 ; CHECK-NEXT: .LBB30_2: @ %vector.body
2343 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2344 ; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2345 ; CHECK-NEXT: vldrw.u32 q1, [r1], #16
2346 ; CHECK-NEXT: vmlalva.s32 r12, r3, q1, q0
2347 ; CHECK-NEXT: letp lr, .LBB30_2
2348 ; CHECK-NEXT: b .LBB30_4
2349 ; CHECK-NEXT: .LBB30_3:
2350 ; CHECK-NEXT: mov.w r12, #0
2351 ; CHECK-NEXT: mov r3, r12
2352 ; CHECK-NEXT: .LBB30_4: @ %for.cond.cleanup
2353 ; CHECK-NEXT: mov r0, r12
2354 ; CHECK-NEXT: mov r1, r3
2355 ; CHECK-NEXT: pop {r7, pc}
; %n == 0 bypasses the loop; the cleanup phi then selects 0.
2357 %cmp9.not = icmp eq i32 %n, 0
2358 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
; Loop bound for %index: %n rounded up to a multiple of 4 (the step of %index).
2360 vector.ph: ; preds = %entry
2361 %n.rnd.up = add i32 %n, 3
2362 %n.vec = and i32 %n.rnd.up, -4
2363 br label %vector.body
2365 vector.body: ; preds = %vector.body, %vector.ph
2366 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2367 %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
2368 %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
2369 %0 = getelementptr inbounds i32, i32* %x, i32 %index
2370 %1 = bitcast i32* %0 to <4 x i32>*
2371 %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2372 %2 = sext <4 x i32> %wide.masked.load to <4 x i64>
2373 %3 = getelementptr inbounds i32, i32* %y, i32 %index
2374 %4 = bitcast i32* %3 to <4 x i32>*
2375 %wide.masked.load14 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
2376 %5 = sext <4 x i32> %wide.masked.load14 to <4 x i64>
2377 %6 = mul nsw <4 x i64> %5, %2
; Both loads use an undef passthru, so the products of disabled lanes are
; replaced with zero before the reduction.
2378 %7 = select <4 x i1> %active.lane.mask, <4 x i64> %6, <4 x i64> zeroinitializer
2379 %8 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %7)
2380 %9 = add i64 %8, %vec.phi
2381 %index.next = add i32 %index, 4
2382 %10 = icmp eq i32 %index.next, %n.vec
2383 br i1 %10, label %for.cond.cleanup, label %vector.body
2385 for.cond.cleanup: ; preds = %vector.body, %entry
2386 %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
; mla8i64: i64 multiply-accumulate reduction over two i16 arrays, 8 lanes per
; iteration, written as a predicated (lane-masked) vector loop.
;   %x, %y - i16 input arrays (noalias, read-only)
;   %n     - element count; zero skips the loop and the result phi is 0
; The expected assembly below shows the loop becoming a tail-predicated
; hardware loop (dlstp.16 / letp) whose body is two vldrh.u16 loads feeding a
; single vmlalva.s16 that accumulates into the r12/r3 register pair; the pair
; is copied to r0/r1 before the pop.
2390 define i64 @mla8i64(i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y, i32 %n) {
2391 ; CHECK-LABEL: mla8i64:
2392 ; CHECK: @ %bb.0: @ %entry
2393 ; CHECK-NEXT: .save {r7, lr}
2394 ; CHECK-NEXT: push {r7, lr}
2395 ; CHECK-NEXT: cbz r2, .LBB31_3
2396 ; CHECK-NEXT: @ %bb.1: @ %vector.ph
2397 ; CHECK-NEXT: mov.w r12, #0
2398 ; CHECK-NEXT: mov r3, r12
2399 ; CHECK-NEXT: dlstp.16 lr, r2
2400 ; CHECK-NEXT: .LBB31_2: @ %vector.body
2401 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
2402 ; CHECK-NEXT: vldrh.u16 q0, [r0], #16
2403 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16
2404 ; CHECK-NEXT: vmlalva.s16 r12, r3, q1, q0
2405 ; CHECK-NEXT: letp lr, .LBB31_2
2406 ; CHECK-NEXT: b .LBB31_4
2407 ; CHECK-NEXT: .LBB31_3:
2408 ; CHECK-NEXT: mov.w r12, #0
2409 ; CHECK-NEXT: mov r3, r12
2410 ; CHECK-NEXT: .LBB31_4: @ %for.cond.cleanup
2411 ; CHECK-NEXT: mov r0, r12
2412 ; CHECK-NEXT: mov r1, r3
2413 ; CHECK-NEXT: pop {r7, pc}
; Guard: with %n == 0 branch straight to the exit block.
2415 %cmp9.not = icmp eq i32 %n, 0
2416 br i1 %cmp9.not, label %for.cond.cleanup, label %vector.ph
2418 vector.ph: ; preds = %entry
; %n.vec is %n rounded up to a multiple of 8; it is the loop's exit bound.
2419 %n.rnd.up = add i32 %n, 7
2420 %n.vec = and i32 %n.rnd.up, -8
2421 br label %vector.body
2423 vector.body: ; preds = %vector.body, %vector.ph
; %index steps by 8 per iteration; %vec.phi carries the running i64 sum.
2424 %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
2425 %vec.phi = phi i64 [ 0, %vector.ph ], [ %9, %vector.body ]
; Per-iteration predicate built from %index and %n; it gates both loads and
; the select further down.
2426 %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %n)
; Masked load of 8 x i16 from %x at %index (undef passthru), widened to i64.
2427 %0 = getelementptr inbounds i16, i16* %x, i32 %index
2428 %1 = bitcast i16* %0 to <8 x i16>*
2429 %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %1, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2430 %2 = sext <8 x i16> %wide.masked.load to <8 x i64>
; Same for %y.
2431 %3 = getelementptr inbounds i16, i16* %y, i32 %index
2432 %4 = bitcast i16* %3 to <8 x i16>*
2433 %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %active.lane.mask, <8 x i16> undef)
2434 %5 = sext <8 x i16> %wide.masked.load14 to <8 x i64>
; Lane-wise product; masked-off lanes are replaced by zero so they do not
; contribute to the reduction, then the 8 lanes are summed into the accumulator.
2435 %6 = mul nsw <8 x i64> %5, %2
2436 %7 = select <8 x i1> %active.lane.mask, <8 x i64> %6, <8 x i64> zeroinitializer
2437 %8 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %7)
2438 %9 = add i64 %8, %vec.phi
; Advance by one vector and leave the loop once the rounded-up bound is reached.
2439 %index.next = add i32 %index, 8
2440 %10 = icmp eq i32 %index.next, %n.vec
2441 br i1 %10, label %for.cond.cleanup, label %vector.body
2443 for.cond.cleanup: ; preds = %vector.body, %entry
; Result value: 0 when the loop was skipped, otherwise the final accumulator.
; NOTE(review): presumably this phi is what the function returns; the ret is
; not visible in this view - confirm.
2444 %s.0.lcssa = phi i64 [ 0, %entry ], [ %9, %vector.body ]
; Declarations of the LLVM intrinsics used by the test functions in this file.
; NOTE(review): most call sites and the definitions of attribute groups
; #1-#3 lie outside the visible part of the file - confirm before pruning or
; reordering anything here.

; Lane-mask and masked-load intrinsics for the predicated loops
; (4 x i32, 8 x i16 and 16 x i8 vectors), interleaved with the integer add
; reductions that carry attribute group #3.
2448 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
2449 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
2450 declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) #1
2451 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) #2
2452 declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) #3
2453 declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) #1
2454 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) #2
2455 declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) #3
2456 declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) #3
2457 declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) #3
2458 declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) #3
2459 declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) #3
2460 declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) #3
; Four-lane reductions declared without an attribute group:
; integer add/mul/and/or/xor, float fadd/fmul (which take a scalar start
; value as first operand), integer signed/unsigned min/max, and float min/max.
2462 declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
2463 declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
2464 declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
2465 declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
2466 declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
2467 declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
2468 declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
2469 declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
2470 declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
2471 declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
2472 declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
2473 declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
2474 declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)