1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; maxnum(fabs(a), fabs(b)) on <4 x float> should fold to a single vmaxnma.f32.
6 define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
9 ; CHECK-NEXT: vmaxnma.f32 q0, q1
11 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
12 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
13 %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
; Same as @maxf32 but with the maxnum operands commuted (%bb, %aa); the fold
; should still produce a single vmaxnma.f32 with no extra register moves.
17 define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
18 ; CHECK-LABEL: maxf32_c:
20 ; CHECK-NEXT: vmaxnma.f32 q0, q1
22 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
23 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
24 %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
; minnum(fabs(a), fabs(b)) on <4 x float> should fold to a single vminnma.f32.
28 define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
29 ; CHECK-LABEL: minf32:
31 ; CHECK-NEXT: vminnma.f32 q0, q1
33 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
34 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
35 %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
; Commuted-operand variant of @minf32; still expects a single vminnma.f32.
39 define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
40 ; CHECK-LABEL: minf32_c:
42 ; CHECK-NEXT: vminnma.f32 q0, q1
44 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
45 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
46 %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
; Predicated form: the vmaxnma.predicated intrinsic under an fcmp-derived mask
; should select a VPT block with a single predicated vmaxnmat.f32.
51 define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
52 ; CHECK-LABEL: maxpredf32:
54 ; CHECK-NEXT: vpt.f32 gt, q1, q0
55 ; CHECK-NEXT: vmaxnmat.f32 q0, q1
57 %c = fcmp olt <4 x float> %a, %b
58 %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
; Commuted-operand predicated variant (%b, %a). The predicated op is not
; commutable, so an extra vmov q0, q1 is expected to move the result back.
62 define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
63 ; CHECK-LABEL: maxpredf32_c:
65 ; CHECK-NEXT: vpt.f32 gt, q1, q0
66 ; CHECK-NEXT: vmaxnmat.f32 q1, q0
67 ; CHECK-NEXT: vmov q0, q1
69 %c = fcmp olt <4 x float> %a, %b
70 %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
; Predicated vminnma: expects a VPT block with a single vminnmat.f32.
74 define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
75 ; CHECK-LABEL: minpredf32:
77 ; CHECK-NEXT: vpt.f32 gt, q1, q0
78 ; CHECK-NEXT: vminnmat.f32 q0, q1
80 %c = fcmp olt <4 x float> %a, %b
81 %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
; Commuted-operand predicated vminnma; expects an extra vmov q0, q1 since the
; predicated op cannot be commuted.
85 define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
86 ; CHECK-LABEL: minpredf32_c:
88 ; CHECK-NEXT: vpt.f32 gt, q1, q0
89 ; CHECK-NEXT: vminnmat.f32 q1, q0
90 ; CHECK-NEXT: vmov q0, q1
92 %c = fcmp olt <4 x float> %a, %b
93 %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
; f16 version: maxnum(fabs(a), fabs(b)) on <8 x half> folds to vmaxnma.f16.
101 define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
102 ; CHECK-LABEL: maxf16:
104 ; CHECK-NEXT: vmaxnma.f16 q0, q1
106 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
107 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
108 %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
; Commuted-operand f16 variant; still a single vmaxnma.f16.
112 define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
113 ; CHECK-LABEL: maxf16_c:
115 ; CHECK-NEXT: vmaxnma.f16 q0, q1
117 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
118 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
119 %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
; f16 version: minnum(fabs(a), fabs(b)) on <8 x half> folds to vminnma.f16.
123 define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
124 ; CHECK-LABEL: minf16:
126 ; CHECK-NEXT: vminnma.f16 q0, q1
128 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
129 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
130 %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
; Commuted-operand f16 variant; still a single vminnma.f16.
134 define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
135 ; CHECK-LABEL: minf16_c:
137 ; CHECK-NEXT: vminnma.f16 q0, q1
139 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
140 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
141 %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
; Predicated f16 vmaxnma: expects a VPT block with a single vmaxnmat.f16.
145 define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
146 ; CHECK-LABEL: maxpredf16:
148 ; CHECK-NEXT: vpt.f16 gt, q1, q0
149 ; CHECK-NEXT: vmaxnmat.f16 q0, q1
151 %c = fcmp olt <8 x half> %a, %b
152 %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
; Commuted-operand predicated f16 variant; extra vmov q0, q1 expected.
156 define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
157 ; CHECK-LABEL: maxpredf16_c:
159 ; CHECK-NEXT: vpt.f16 gt, q1, q0
160 ; CHECK-NEXT: vmaxnmat.f16 q1, q0
161 ; CHECK-NEXT: vmov q0, q1
163 %c = fcmp olt <8 x half> %a, %b
164 %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
; Predicated f16 vminnma: expects a VPT block with a single vminnmat.f16.
168 define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
169 ; CHECK-LABEL: minpredf16:
171 ; CHECK-NEXT: vpt.f16 gt, q1, q0
172 ; CHECK-NEXT: vminnmat.f16 q0, q1
174 %c = fcmp olt <8 x half> %a, %b
175 %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
; Commuted-operand predicated f16 vminnma; extra vmov q0, q1 expected.
179 define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
180 ; CHECK-LABEL: minpredf16_c:
182 ; CHECK-NEXT: vpt.f16 gt, q1, q0
183 ; CHECK-NEXT: vminnmat.f16 q1, q0
184 ; CHECK-NEXT: vmov q0, q1
186 %c = fcmp olt <8 x half> %a, %b
187 %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
; Loop version: accumulates maxnum(fabs(acc), fabs(load)) over f32 vectors in a
; hardware loop (wls/le), expecting a single vmaxnma.f32 in the loop body and a
; final vmaxnmav.f32 scalar reduction after it.
194 define void @loop_absmax32(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
195 ; CHECK-LABEL: loop_absmax32:
197 ; CHECK-NEXT: .save {r7, lr}
198 ; CHECK-NEXT: push {r7, lr}
199 ; CHECK-NEXT: vmov.i32 q0, #0x0
200 ; CHECK-NEXT: lsrs r1, r1, #3
201 ; CHECK-NEXT: wls lr, r1, .LBB16_3
202 ; CHECK-NEXT: @ %bb.1: @ %.preheader
203 ; CHECK-NEXT: vmov.i32 q0, #0x0
204 ; CHECK-NEXT: .LBB16_2: @ =>This Inner Loop Header: Depth=1
205 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
206 ; CHECK-NEXT: vmaxnma.f32 q0, q1
207 ; CHECK-NEXT: le lr, .LBB16_2
208 ; CHECK-NEXT: .LBB16_3:
209 ; CHECK-NEXT: vldr s4, .LCPI16_0
210 ; CHECK-NEXT: vmov r0, s4
211 ; CHECK-NEXT: vmaxnmav.f32 r0, q0
212 ; CHECK-NEXT: vmov s0, r0
213 ; CHECK-NEXT: vstr s0, [r2]
214 ; CHECK-NEXT: pop {r7, pc}
215 ; CHECK-NEXT: .p2align 2
216 ; CHECK-NEXT: @ %bb.4:
217 ; CHECK-NEXT: .LCPI16_0:
218 ; CHECK-NEXT: .long 0x00000000 @ float 0
220 %5 = icmp eq i32 %4, 0
221 br i1 %5, label %18, label %6
224 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
225 %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
226 %9 = phi float* [ %12, %6 ], [ %0, %3 ]
227 %10 = bitcast float* %9 to <4 x float>*
228 %11 = load <4 x float>, <4 x float>* %10, align 4
229 %12 = getelementptr inbounds float, float* %9, i32 4
230 %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
231 %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
232 %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
233 %16 = add nsw i32 %7, -1
234 %17 = icmp eq i32 %16, 0
235 br i1 %17, label %18, label %6
238 %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
239 %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
240 store float %20, float* %2, align 4
; Same as @loop_absmax32 but with the maxnum operands commuted (%13, %14);
; codegen should be identical: vmaxnma.f32 in the loop, vmaxnmav.f32 after.
244 define void @loop_absmax32_c(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
245 ; CHECK-LABEL: loop_absmax32_c:
247 ; CHECK-NEXT: .save {r7, lr}
248 ; CHECK-NEXT: push {r7, lr}
249 ; CHECK-NEXT: vmov.i32 q0, #0x0
250 ; CHECK-NEXT: lsrs r1, r1, #3
251 ; CHECK-NEXT: wls lr, r1, .LBB17_3
252 ; CHECK-NEXT: @ %bb.1: @ %.preheader
253 ; CHECK-NEXT: vmov.i32 q0, #0x0
254 ; CHECK-NEXT: .LBB17_2: @ =>This Inner Loop Header: Depth=1
255 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
256 ; CHECK-NEXT: vmaxnma.f32 q0, q1
257 ; CHECK-NEXT: le lr, .LBB17_2
258 ; CHECK-NEXT: .LBB17_3:
259 ; CHECK-NEXT: vldr s4, .LCPI17_0
260 ; CHECK-NEXT: vmov r0, s4
261 ; CHECK-NEXT: vmaxnmav.f32 r0, q0
262 ; CHECK-NEXT: vmov s0, r0
263 ; CHECK-NEXT: vstr s0, [r2]
264 ; CHECK-NEXT: pop {r7, pc}
265 ; CHECK-NEXT: .p2align 2
266 ; CHECK-NEXT: @ %bb.4:
267 ; CHECK-NEXT: .LCPI17_0:
268 ; CHECK-NEXT: .long 0x00000000 @ float 0
270 %5 = icmp eq i32 %4, 0
271 br i1 %5, label %18, label %6
274 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
275 %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
276 %9 = phi float* [ %12, %6 ], [ %0, %3 ]
277 %10 = bitcast float* %9 to <4 x float>*
278 %11 = load <4 x float>, <4 x float>* %10, align 4
279 %12 = getelementptr inbounds float, float* %9, i32 4
280 %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
281 %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
282 %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
283 %16 = add nsw i32 %7, -1
284 %17 = icmp eq i32 %16, 0
285 br i1 %17, label %18, label %6
288 %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
289 %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
290 store float %20, float* %2, align 4
; Tail-predicated loop: vctp32 mask + masked load + predicated vmaxnma should
; become a dlstp/letp tail-predicated loop with an unpredicated vmaxnma.f32.
294 define void @loop_absmax32_pred(float* %0, i32 %1, float* nocapture %2) {
295 ; CHECK-LABEL: loop_absmax32_pred:
297 ; CHECK-NEXT: .save {r7, lr}
298 ; CHECK-NEXT: push {r7, lr}
299 ; CHECK-NEXT: vmov.i32 q0, #0x0
300 ; CHECK-NEXT: dlstp.32 lr, r1
301 ; CHECK-NEXT: .LBB18_1: @ =>This Inner Loop Header: Depth=1
302 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
303 ; CHECK-NEXT: vmaxnma.f32 q0, q1
304 ; CHECK-NEXT: letp lr, .LBB18_1
305 ; CHECK-NEXT: @ %bb.2:
306 ; CHECK-NEXT: vldr s4, .LCPI18_0
307 ; CHECK-NEXT: vmov r0, s4
308 ; CHECK-NEXT: vmaxnmav.f32 r0, q0
309 ; CHECK-NEXT: vmov s0, r0
310 ; CHECK-NEXT: vstr s0, [r2]
311 ; CHECK-NEXT: pop {r7, pc}
312 ; CHECK-NEXT: .p2align 2
313 ; CHECK-NEXT: @ %bb.3:
314 ; CHECK-NEXT: .LCPI18_0:
315 ; CHECK-NEXT: .long 0x00000000 @ float 0
319 %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
320 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
321 %7 = phi float* [ %0, %3 ], [ %11, %4 ]
322 %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
323 %9 = bitcast float* %7 to <4 x float>*
324 %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
325 %11 = getelementptr inbounds float, float* %7, i32 4
326 %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
327 %13 = add nsw i32 %6, -4
328 %14 = icmp sgt i32 %6, 4
329 br i1 %14, label %4, label %15
332 %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
333 store float %16, float* %2, align 4
; Commuted-operand tail-predicated variant (accumulator is the second operand
; of the predicated intrinsic); an extra vmov q0, q1 in the loop is expected.
337 define void @loop_absmax32_pred_c(float* %0, i32 %1, float* nocapture %2) {
338 ; CHECK-LABEL: loop_absmax32_pred_c:
340 ; CHECK-NEXT: .save {r7, lr}
341 ; CHECK-NEXT: push {r7, lr}
342 ; CHECK-NEXT: vmov.i32 q0, #0x0
343 ; CHECK-NEXT: dlstp.32 lr, r1
344 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1
345 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
346 ; CHECK-NEXT: vmaxnma.f32 q1, q0
347 ; CHECK-NEXT: vmov q0, q1
348 ; CHECK-NEXT: letp lr, .LBB19_1
349 ; CHECK-NEXT: @ %bb.2:
350 ; CHECK-NEXT: vldr s0, .LCPI19_0
351 ; CHECK-NEXT: vmov r0, s0
352 ; CHECK-NEXT: vmaxnmav.f32 r0, q1
353 ; CHECK-NEXT: vmov s0, r0
354 ; CHECK-NEXT: vstr s0, [r2]
355 ; CHECK-NEXT: pop {r7, pc}
356 ; CHECK-NEXT: .p2align 2
357 ; CHECK-NEXT: @ %bb.3:
358 ; CHECK-NEXT: .LCPI19_0:
359 ; CHECK-NEXT: .long 0x00000000 @ float 0
363 %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
364 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
365 %7 = phi float* [ %0, %3 ], [ %11, %4 ]
366 %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
367 %9 = bitcast float* %7 to <4 x float>*
368 %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
369 %11 = getelementptr inbounds float, float* %7, i32 4
370 %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
371 %13 = add nsw i32 %6, -4
372 %14 = icmp sgt i32 %6, 4
373 br i1 %14, label %4, label %15
376 %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
377 store float %16, float* %2, align 4
; f16 loop version of @loop_absmax32: vmaxnma.f16 in the hardware loop and a
; vmaxnmav.f16 scalar reduction after it.
386 define void @loop_absmax16(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
387 ; CHECK-LABEL: loop_absmax16:
389 ; CHECK-NEXT: .save {r7, lr}
390 ; CHECK-NEXT: push {r7, lr}
391 ; CHECK-NEXT: vmov.i32 q0, #0x0
392 ; CHECK-NEXT: lsrs r1, r1, #3
393 ; CHECK-NEXT: wls lr, r1, .LBB20_3
394 ; CHECK-NEXT: @ %bb.1: @ %.preheader
395 ; CHECK-NEXT: vmov.i32 q0, #0x0
396 ; CHECK-NEXT: .LBB20_2: @ =>This Inner Loop Header: Depth=1
397 ; CHECK-NEXT: vldrw.u32 q1, [r0], #8
398 ; CHECK-NEXT: vmaxnma.f16 q0, q1
399 ; CHECK-NEXT: le lr, .LBB20_2
400 ; CHECK-NEXT: .LBB20_3:
401 ; CHECK-NEXT: vldr.16 s4, .LCPI20_0
402 ; CHECK-NEXT: vmov r0, s4
403 ; CHECK-NEXT: vmaxnmav.f16 r0, q0
404 ; CHECK-NEXT: vmov s0, r0
405 ; CHECK-NEXT: vstr.16 s0, [r2]
406 ; CHECK-NEXT: pop {r7, pc}
407 ; CHECK-NEXT: .p2align 1
408 ; CHECK-NEXT: @ %bb.4:
409 ; CHECK-NEXT: .LCPI20_0:
410 ; CHECK-NEXT: .short 0x0000 @ half 0
412 %5 = icmp eq i32 %4, 0
413 br i1 %5, label %18, label %6
416 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
417 %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
418 %9 = phi half* [ %12, %6 ], [ %0, %3 ]
419 %10 = bitcast half* %9 to <8 x half>*
420 %11 = load <8 x half>, <8 x half>* %10, align 4
421 %12 = getelementptr inbounds half, half* %9, i32 4
422 %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
423 %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
424 %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
425 %16 = add nsw i32 %7, -1
426 %17 = icmp eq i32 %16, 0
427 br i1 %17, label %18, label %6
430 %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
431 %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
432 store half %20, half* %2, align 4
; Commuted-operand f16 loop variant (%13, %14); same codegen expected.
436 define void @loop_absmax16_c(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
437 ; CHECK-LABEL: loop_absmax16_c:
439 ; CHECK-NEXT: .save {r7, lr}
440 ; CHECK-NEXT: push {r7, lr}
441 ; CHECK-NEXT: vmov.i32 q0, #0x0
442 ; CHECK-NEXT: lsrs r1, r1, #3
443 ; CHECK-NEXT: wls lr, r1, .LBB21_3
444 ; CHECK-NEXT: @ %bb.1: @ %.preheader
445 ; CHECK-NEXT: vmov.i32 q0, #0x0
446 ; CHECK-NEXT: .LBB21_2: @ =>This Inner Loop Header: Depth=1
447 ; CHECK-NEXT: vldrw.u32 q1, [r0], #8
448 ; CHECK-NEXT: vmaxnma.f16 q0, q1
449 ; CHECK-NEXT: le lr, .LBB21_2
450 ; CHECK-NEXT: .LBB21_3:
451 ; CHECK-NEXT: vldr.16 s4, .LCPI21_0
452 ; CHECK-NEXT: vmov r0, s4
453 ; CHECK-NEXT: vmaxnmav.f16 r0, q0
454 ; CHECK-NEXT: vmov s0, r0
455 ; CHECK-NEXT: vstr.16 s0, [r2]
456 ; CHECK-NEXT: pop {r7, pc}
457 ; CHECK-NEXT: .p2align 1
458 ; CHECK-NEXT: @ %bb.4:
459 ; CHECK-NEXT: .LCPI21_0:
460 ; CHECK-NEXT: .short 0x0000 @ half 0
462 %5 = icmp eq i32 %4, 0
463 br i1 %5, label %18, label %6
466 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
467 %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
468 %9 = phi half* [ %12, %6 ], [ %0, %3 ]
469 %10 = bitcast half* %9 to <8 x half>*
470 %11 = load <8 x half>, <8 x half>* %10, align 4
471 %12 = getelementptr inbounds half, half* %9, i32 4
472 %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
473 %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
474 %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
475 %16 = add nsw i32 %7, -1
476 %17 = icmp eq i32 %16, 0
477 br i1 %17, label %18, label %6
480 %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
481 %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
482 store half %20, half* %2, align 4
; f16 tail-predicated loop: vctp16 mask + masked load + predicated vmaxnma
; should lower to a dlstp.16/letp loop with an unpredicated vmaxnma.f16.
486 define void @loop_absmax16_pred(half* %0, i32 %1, half* nocapture %2) {
487 ; CHECK-LABEL: loop_absmax16_pred:
489 ; CHECK-NEXT: .save {r7, lr}
490 ; CHECK-NEXT: push {r7, lr}
491 ; CHECK-NEXT: vmov.i32 q0, #0x0
492 ; CHECK-NEXT: dlstp.16 lr, r1
493 ; CHECK-NEXT: .LBB22_1: @ =>This Inner Loop Header: Depth=1
494 ; CHECK-NEXT: vldrh.u16 q1, [r0], #8
495 ; CHECK-NEXT: vmaxnma.f16 q0, q1
496 ; CHECK-NEXT: letp lr, .LBB22_1
497 ; CHECK-NEXT: @ %bb.2:
498 ; CHECK-NEXT: vldr.16 s4, .LCPI22_0
499 ; CHECK-NEXT: vmov r0, s4
500 ; CHECK-NEXT: vmaxnmav.f16 r0, q0
501 ; CHECK-NEXT: vmov s0, r0
502 ; CHECK-NEXT: vstr.16 s0, [r2]
503 ; CHECK-NEXT: pop {r7, pc}
504 ; CHECK-NEXT: .p2align 1
505 ; CHECK-NEXT: @ %bb.3:
506 ; CHECK-NEXT: .LCPI22_0:
507 ; CHECK-NEXT: .short 0x0000 @ half 0
511 %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
512 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
513 %7 = phi half* [ %0, %3 ], [ %11, %4 ]
514 %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
515 %9 = bitcast half* %7 to <8 x half>*
516 %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
517 %11 = getelementptr inbounds half, half* %7, i32 4
518 %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
519 %13 = add nsw i32 %6, -8
520 %14 = icmp sgt i32 %6, 8
521 br i1 %14, label %4, label %15
524 %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
525 store half %16, half* %2, align 4
; Commuted-operand f16 tail-predicated variant; extra vmov q0, q1 in the loop
; is expected since the predicated op cannot be commuted.
529 define void @loop_absmax16_pred_c(half* %0, i32 %1, half* nocapture %2) {
530 ; CHECK-LABEL: loop_absmax16_pred_c:
532 ; CHECK-NEXT: .save {r7, lr}
533 ; CHECK-NEXT: push {r7, lr}
534 ; CHECK-NEXT: vmov.i32 q0, #0x0
535 ; CHECK-NEXT: dlstp.16 lr, r1
536 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1
537 ; CHECK-NEXT: vldrh.u16 q1, [r0], #8
538 ; CHECK-NEXT: vmaxnma.f16 q1, q0
539 ; CHECK-NEXT: vmov q0, q1
540 ; CHECK-NEXT: letp lr, .LBB23_1
541 ; CHECK-NEXT: @ %bb.2:
542 ; CHECK-NEXT: vldr.16 s0, .LCPI23_0
543 ; CHECK-NEXT: vmov r0, s0
544 ; CHECK-NEXT: vmaxnmav.f16 r0, q1
545 ; CHECK-NEXT: vmov s0, r0
546 ; CHECK-NEXT: vstr.16 s0, [r2]
547 ; CHECK-NEXT: pop {r7, pc}
548 ; CHECK-NEXT: .p2align 1
549 ; CHECK-NEXT: @ %bb.3:
550 ; CHECK-NEXT: .LCPI23_0:
551 ; CHECK-NEXT: .short 0x0000 @ half 0
555 %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
556 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
557 %7 = phi half* [ %0, %3 ], [ %11, %4 ]
558 %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
559 %9 = bitcast half* %7 to <8 x half>*
560 %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
561 %11 = getelementptr inbounds half, half* %7, i32 4
562 %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
563 %13 = add nsw i32 %6, -8
564 %14 = icmp sgt i32 %6, 8
565 br i1 %14, label %4, label %15
568 %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
569 store half %16, half* %2, align 4
577 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
578 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
579 declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
580 declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
581 declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
582 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
583 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
584 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
586 declare <8 x i1> @llvm.arm.mve.vctp16(i32)
587 declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
588 declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
589 declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
590 declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
591 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
592 declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
593 declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)