1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
; maxf32: applies llvm.fabs to both %a and %b and passes the results to
; llvm.maxnum in the order (%aa, %bb). The expected assembly shown for this
; function is `vmaxnma.f32 q0, q1`.
6 define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
9 ; CHECK-NEXT: vmaxnma.f32 q0, q1
11 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
12 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
13 %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
; maxf32_c: commuted form of the fabs/fabs/maxnum pattern. The llvm.maxnum
; operands are passed as (%bb, %aa); the expected assembly is still
; `vmaxnma.f32 q0, q1`.
17 define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
18 ; CHECK-LABEL: maxf32_c:
20 ; CHECK-NEXT: vmaxnma.f32 q0, q1
22 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
23 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
24 %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
; minf32: applies llvm.fabs to both %a and %b and passes the results to
; llvm.minnum in the order (%aa, %bb). The expected assembly shown for this
; function is `vminnma.f32 q0, q1`.
28 define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
29 ; CHECK-LABEL: minf32:
31 ; CHECK-NEXT: vminnma.f32 q0, q1
33 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
34 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
35 %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
; minf32_c: commuted form of the fabs/fabs/minnum pattern. The llvm.minnum
; operands are passed as (%bb, %aa); the expected assembly is still
; `vminnma.f32 q0, q1`.
39 define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
40 ; CHECK-LABEL: minf32_c:
42 ; CHECK-NEXT: vminnma.f32 q0, q1
44 %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
45 %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
46 %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
; maxpredf32: calls the predicated vmaxnma intrinsic with operands (%a, %b)
; and a predicate computed as `fcmp olt %a, %b`. The expected assembly is a
; `vpt.f32 gt, q1, q0` followed by `vmaxnmat.f32 q0, q1`.
51 define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
52 ; CHECK-LABEL: maxpredf32:
54 ; CHECK-NEXT: vpt.f32 gt, q1, q0
55 ; CHECK-NEXT: vmaxnmat.f32 q0, q1
57 %c = fcmp olt <4 x float> %a, %b
58 %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
; maxpredf32_c: same predicate as maxpredf32 (`fcmp olt %a, %b`) but the
; predicated vmaxnma intrinsic receives its vector operands swapped, as
; (%b, %a). The expected assembly operates on q1 (`vmaxnmat.f32 q1, q0`) and
; then copies the result into q0 with `vmov q0, q1`.
62 define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
63 ; CHECK-LABEL: maxpredf32_c:
65 ; CHECK-NEXT: vpt.f32 gt, q1, q0
66 ; CHECK-NEXT: vmaxnmat.f32 q1, q0
67 ; CHECK-NEXT: vmov q0, q1
69 %c = fcmp olt <4 x float> %a, %b
70 %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
; minpredf32: calls the predicated vminnma intrinsic with operands (%a, %b)
; and a predicate computed as `fcmp olt %a, %b`. The expected assembly is a
; `vpt.f32 gt, q1, q0` followed by `vminnmat.f32 q0, q1`.
74 define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
75 ; CHECK-LABEL: minpredf32:
77 ; CHECK-NEXT: vpt.f32 gt, q1, q0
78 ; CHECK-NEXT: vminnmat.f32 q0, q1
80 %c = fcmp olt <4 x float> %a, %b
81 %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
; minpredf32_c: same predicate as minpredf32 (`fcmp olt %a, %b`) but the
; predicated vminnma intrinsic receives its vector operands swapped, as
; (%b, %a). The expected assembly operates on q1 (`vminnmat.f32 q1, q0`) and
; then copies the result into q0 with `vmov q0, q1`.
85 define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
86 ; CHECK-LABEL: minpredf32_c:
88 ; CHECK-NEXT: vpt.f32 gt, q1, q0
89 ; CHECK-NEXT: vminnmat.f32 q1, q0
90 ; CHECK-NEXT: vmov q0, q1
92 %c = fcmp olt <4 x float> %a, %b
93 %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
; maxf16: <8 x half> version of the fabs/fabs/maxnum pattern, with
; llvm.maxnum operands in the order (%aa, %bb). The expected assembly shown
; for this function is `vmaxnma.f16 q0, q1`.
101 define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
102 ; CHECK-LABEL: maxf16:
104 ; CHECK-NEXT: vmaxnma.f16 q0, q1
106 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
107 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
108 %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
; maxf16_c: commuted <8 x half> form; llvm.maxnum operands are passed as
; (%bb, %aa). The expected assembly is still `vmaxnma.f16 q0, q1`.
112 define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
113 ; CHECK-LABEL: maxf16_c:
115 ; CHECK-NEXT: vmaxnma.f16 q0, q1
117 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
118 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
119 %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
; minf16: <8 x half> version of the fabs/fabs/minnum pattern, with
; llvm.minnum operands in the order (%aa, %bb). The expected assembly shown
; for this function is `vminnma.f16 q0, q1`.
123 define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
124 ; CHECK-LABEL: minf16:
126 ; CHECK-NEXT: vminnma.f16 q0, q1
128 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
129 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
130 %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
; minf16_c: commuted <8 x half> form; llvm.minnum operands are passed as
; (%bb, %aa). The expected assembly is still `vminnma.f16 q0, q1`.
134 define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
135 ; CHECK-LABEL: minf16_c:
137 ; CHECK-NEXT: vminnma.f16 q0, q1
139 %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
140 %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
141 %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
; maxpredf16: <8 x half> predicated vmaxnma with operands (%a, %b) and a
; predicate computed as `fcmp olt %a, %b`. The expected assembly is a
; `vpt.f16 gt, q1, q0` followed by `vmaxnmat.f16 q0, q1`.
145 define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
146 ; CHECK-LABEL: maxpredf16:
148 ; CHECK-NEXT: vpt.f16 gt, q1, q0
149 ; CHECK-NEXT: vmaxnmat.f16 q0, q1
151 %c = fcmp olt <8 x half> %a, %b
152 %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
; maxpredf16_c: same predicate as maxpredf16 but the predicated vmaxnma
; intrinsic receives its vector operands swapped, as (%b, %a). The expected
; assembly operates on q1 (`vmaxnmat.f16 q1, q0`) and then copies the result
; into q0 with `vmov q0, q1`.
156 define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
157 ; CHECK-LABEL: maxpredf16_c:
159 ; CHECK-NEXT: vpt.f16 gt, q1, q0
160 ; CHECK-NEXT: vmaxnmat.f16 q1, q0
161 ; CHECK-NEXT: vmov q0, q1
163 %c = fcmp olt <8 x half> %a, %b
164 %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
; minpredf16: <8 x half> predicated vminnma with operands (%a, %b) and a
; predicate computed as `fcmp olt %a, %b`. The expected assembly is a
; `vpt.f16 gt, q1, q0` followed by `vminnmat.f16 q0, q1`.
168 define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
169 ; CHECK-LABEL: minpredf16:
171 ; CHECK-NEXT: vpt.f16 gt, q1, q0
172 ; CHECK-NEXT: vminnmat.f16 q0, q1
174 %c = fcmp olt <8 x half> %a, %b
175 %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
; minpredf16_c: same predicate as minpredf16 but the predicated vminnma
; intrinsic receives its vector operands swapped, as (%b, %a). The expected
; assembly operates on q1 (`vminnmat.f16 q1, q0`) and then copies the result
; into q0 with `vmov q0, q1`.
179 define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
180 ; CHECK-LABEL: minpredf16_c:
182 ; CHECK-NEXT: vpt.f16 gt, q1, q0
183 ; CHECK-NEXT: vminnmat.f16 q1, q0
184 ; CHECK-NEXT: vmov q0, q1
186 %c = fcmp olt <8 x half> %a, %b
187 %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
; loop_absmax32: loop over <4 x float> vectors loaded from %0. Each iteration
; takes fabs of the loaded vector (%13) and fabs of the accumulator phi (%14)
; and combines them with llvm.maxnum in the order (accumulator, loaded).
; After the loop the accumulator is reduced with llvm.arm.mve.maxnmav.f32
; (scalar operand 0.0) and the result is stored to %2.
; The expected loop body keeps a separate `vabs.f32` on the loaded vector
; followed by `vmaxnm.f32 q0, q0, q1`.
; NOTE(review): %4 (the initial loop counter) is defined on a line that is
; not visible in this excerpt.
194 define void @loop_absmax32(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
195 ; CHECK-LABEL: loop_absmax32:
197 ; CHECK-NEXT: .save {r7, lr}
198 ; CHECK-NEXT: push {r7, lr}
199 ; CHECK-NEXT: vmov.i32 q0, #0x0
200 ; CHECK-NEXT: lsrs r1, r1, #3
201 ; CHECK-NEXT: wls lr, r1, .LBB16_3
202 ; CHECK-NEXT: @ %bb.1: @ %.preheader
203 ; CHECK-NEXT: vmov.i32 q0, #0x0
204 ; CHECK-NEXT: .LBB16_2: @ =>This Inner Loop Header: Depth=1
205 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
206 ; CHECK-NEXT: vabs.f32 q1, q1
207 ; CHECK-NEXT: vmaxnm.f32 q0, q0, q1
208 ; CHECK-NEXT: le lr, .LBB16_2
209 ; CHECK-NEXT: .LBB16_3:
210 ; CHECK-NEXT: vldr s4, .LCPI16_0
211 ; CHECK-NEXT: vmov r0, s4
212 ; CHECK-NEXT: vmaxnmav.f32 r0, q0
213 ; CHECK-NEXT: vmov s0, r0
214 ; CHECK-NEXT: vstr s0, [r2]
215 ; CHECK-NEXT: pop {r7, pc}
216 ; CHECK-NEXT: .p2align 2
217 ; CHECK-NEXT: @ %bb.4:
218 ; CHECK-NEXT: .LCPI16_0:
219 ; CHECK-NEXT: .long 0x00000000 @ float 0
221 %5 = icmp eq i32 %4, 0
222 br i1 %5, label %18, label %6
225 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
226 %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
227 %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
228 %10 = bitcast ptr %9 to ptr
229 %11 = load <4 x float>, ptr %10, align 4
230 %12 = getelementptr inbounds float, ptr %9, i32 4
231 %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
232 %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
233 %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
234 %16 = add nsw i32 %7, -1
235 %17 = icmp eq i32 %16, 0
236 br i1 %17, label %18, label %6
239 %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
240 %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
241 store float %20, ptr %2, align 4
; loop_absmax32_c: commuted variant of the <4 x float> abs-max loop. The
; llvm.maxnum operands are passed as (loaded %13, accumulator %14) instead of
; (accumulator, loaded). After the loop the accumulator is reduced with
; llvm.arm.mve.maxnmav.f32 (scalar operand 0.0) and stored to %2.
; The expected loop body is `vabs.f32 q1, q1` followed by
; `vmaxnm.f32 q0, q1, q0`.
; NOTE(review): %4 (the initial loop counter) is defined on a line that is
; not visible in this excerpt.
245 define void @loop_absmax32_c(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
246 ; CHECK-LABEL: loop_absmax32_c:
248 ; CHECK-NEXT: .save {r7, lr}
249 ; CHECK-NEXT: push {r7, lr}
250 ; CHECK-NEXT: vmov.i32 q0, #0x0
251 ; CHECK-NEXT: lsrs r1, r1, #3
252 ; CHECK-NEXT: wls lr, r1, .LBB17_3
253 ; CHECK-NEXT: @ %bb.1: @ %.preheader
254 ; CHECK-NEXT: vmov.i32 q0, #0x0
255 ; CHECK-NEXT: .LBB17_2: @ =>This Inner Loop Header: Depth=1
256 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
257 ; CHECK-NEXT: vabs.f32 q1, q1
258 ; CHECK-NEXT: vmaxnm.f32 q0, q1, q0
259 ; CHECK-NEXT: le lr, .LBB17_2
260 ; CHECK-NEXT: .LBB17_3:
261 ; CHECK-NEXT: vldr s4, .LCPI17_0
262 ; CHECK-NEXT: vmov r0, s4
263 ; CHECK-NEXT: vmaxnmav.f32 r0, q0
264 ; CHECK-NEXT: vmov s0, r0
265 ; CHECK-NEXT: vstr s0, [r2]
266 ; CHECK-NEXT: pop {r7, pc}
267 ; CHECK-NEXT: .p2align 2
268 ; CHECK-NEXT: @ %bb.4:
269 ; CHECK-NEXT: .LCPI17_0:
270 ; CHECK-NEXT: .long 0x00000000 @ float 0
272 %5 = icmp eq i32 %4, 0
273 br i1 %5, label %18, label %6
276 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
277 %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
278 %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
279 %10 = bitcast ptr %9 to ptr
280 %11 = load <4 x float>, ptr %10, align 4
281 %12 = getelementptr inbounds float, ptr %9, i32 4
282 %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
283 %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
284 %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
285 %16 = add nsw i32 %7, -1
286 %17 = icmp eq i32 %16, 0
287 br i1 %17, label %18, label %6
290 %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
291 %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
292 store float %20, ptr %2, align 4
; loop_absmax32_pred: predicated <4 x float> abs-max loop. Each iteration
; builds a predicate from the remaining count with llvm.arm.mve.vctp32(%6),
; performs a masked load under that predicate, and calls the predicated
; vmaxnma intrinsic with operands (accumulator %5, loaded %10). The counter
; is decreased by 4 per iteration and the loop repeats while %6 > 4 (signed).
; After the loop the result is reduced with llvm.arm.mve.maxnmav.f32 (scalar
; operand 0.0) and stored to %2.
; The expected assembly is a `dlstp.32` / `letp` loop whose body is
; `vldrw.u32` followed by `vmaxnma.f32 q0, q1`.
296 define void @loop_absmax32_pred(ptr %0, i32 %1, ptr nocapture %2) {
297 ; CHECK-LABEL: loop_absmax32_pred:
299 ; CHECK-NEXT: .save {r7, lr}
300 ; CHECK-NEXT: push {r7, lr}
301 ; CHECK-NEXT: vmov.i32 q0, #0x0
302 ; CHECK-NEXT: dlstp.32 lr, r1
303 ; CHECK-NEXT: .LBB18_1: @ =>This Inner Loop Header: Depth=1
304 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
305 ; CHECK-NEXT: vmaxnma.f32 q0, q1
306 ; CHECK-NEXT: letp lr, .LBB18_1
307 ; CHECK-NEXT: @ %bb.2:
308 ; CHECK-NEXT: vldr s4, .LCPI18_0
309 ; CHECK-NEXT: vmov r0, s4
310 ; CHECK-NEXT: vmaxnmav.f32 r0, q0
311 ; CHECK-NEXT: vmov s0, r0
312 ; CHECK-NEXT: vstr s0, [r2]
313 ; CHECK-NEXT: pop {r7, pc}
314 ; CHECK-NEXT: .p2align 2
315 ; CHECK-NEXT: @ %bb.3:
316 ; CHECK-NEXT: .LCPI18_0:
317 ; CHECK-NEXT: .long 0x00000000 @ float 0
321 %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
322 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
323 %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
324 %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
325 %9 = bitcast ptr %7 to ptr
326 %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
327 %11 = getelementptr inbounds float, ptr %7, i32 4
328 %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
329 %13 = add nsw i32 %6, -4
330 %14 = icmp sgt i32 %6, 4
331 br i1 %14, label %4, label %15
334 %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
335 store float %16, ptr %2, align 4
; loop_absmax32_pred_c: commuted variant of the predicated <4 x float>
; abs-max loop. The predicated vmaxnma intrinsic receives its vector operands
; as (loaded %10, accumulator %5) instead of (accumulator, loaded). The
; predicate comes from llvm.arm.mve.vctp32(%6); the counter is decreased by 4
; per iteration and the loop repeats while %6 > 4 (signed).
; The expected loop body computes into q1 (`vmaxnma.f32 q1, q0`) and then
; copies it back with `vmov q0, q1`; the final `vmaxnmav.f32` reads q1.
339 define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
340 ; CHECK-LABEL: loop_absmax32_pred_c:
342 ; CHECK-NEXT: .save {r7, lr}
343 ; CHECK-NEXT: push {r7, lr}
344 ; CHECK-NEXT: vmov.i32 q0, #0x0
345 ; CHECK-NEXT: dlstp.32 lr, r1
346 ; CHECK-NEXT: .LBB19_1: @ =>This Inner Loop Header: Depth=1
347 ; CHECK-NEXT: vldrw.u32 q1, [r0], #16
348 ; CHECK-NEXT: vmaxnma.f32 q1, q0
349 ; CHECK-NEXT: vmov q0, q1
350 ; CHECK-NEXT: letp lr, .LBB19_1
351 ; CHECK-NEXT: @ %bb.2:
352 ; CHECK-NEXT: vldr s0, .LCPI19_0
353 ; CHECK-NEXT: vmov r0, s0
354 ; CHECK-NEXT: vmaxnmav.f32 r0, q1
355 ; CHECK-NEXT: vmov s0, r0
356 ; CHECK-NEXT: vstr s0, [r2]
357 ; CHECK-NEXT: pop {r7, pc}
358 ; CHECK-NEXT: .p2align 2
359 ; CHECK-NEXT: @ %bb.3:
360 ; CHECK-NEXT: .LCPI19_0:
361 ; CHECK-NEXT: .long 0x00000000 @ float 0
365 %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
366 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
367 %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
368 %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
369 %9 = bitcast ptr %7 to ptr
370 %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
371 %11 = getelementptr inbounds float, ptr %7, i32 4
372 %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
373 %13 = add nsw i32 %6, -4
374 %14 = icmp sgt i32 %6, 4
375 br i1 %14, label %4, label %15
378 %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
379 store float %16, ptr %2, align 4
; loop_absmax16: <8 x half> version of the abs-max loop. Each iteration takes
; fabs of the loaded vector (%13) and fabs of the accumulator phi (%14) and
; combines them with llvm.maxnum in the order (accumulator, loaded). After
; the loop the accumulator is reduced with llvm.arm.mve.maxnmav.f16 (scalar
; operand 0.0) and the result is stored to %2.
; The expected loop body keeps a separate `vabs.f16` on the loaded vector
; followed by `vmaxnm.f16 q0, q0, q1`.
; NOTE(review): the pointer is advanced by 4 half elements per iteration
; (matching the `#8` post-increment in the expected assembly) while each load
; reads an <8 x half>; presumably intentional for this test - confirm.
; NOTE(review): %4 (the initial loop counter) is defined on a line that is
; not visible in this excerpt.
388 define void @loop_absmax16(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
389 ; CHECK-LABEL: loop_absmax16:
391 ; CHECK-NEXT: .save {r7, lr}
392 ; CHECK-NEXT: push {r7, lr}
393 ; CHECK-NEXT: vmov.i32 q0, #0x0
394 ; CHECK-NEXT: lsrs r1, r1, #3
395 ; CHECK-NEXT: wls lr, r1, .LBB20_3
396 ; CHECK-NEXT: @ %bb.1: @ %.preheader
397 ; CHECK-NEXT: vmov.i32 q0, #0x0
398 ; CHECK-NEXT: .LBB20_2: @ =>This Inner Loop Header: Depth=1
399 ; CHECK-NEXT: vldrw.u32 q1, [r0], #8
400 ; CHECK-NEXT: vabs.f16 q1, q1
401 ; CHECK-NEXT: vmaxnm.f16 q0, q0, q1
402 ; CHECK-NEXT: le lr, .LBB20_2
403 ; CHECK-NEXT: .LBB20_3:
404 ; CHECK-NEXT: vldr.16 s4, .LCPI20_0
405 ; CHECK-NEXT: vmov r0, s4
406 ; CHECK-NEXT: vmaxnmav.f16 r0, q0
407 ; CHECK-NEXT: vmov s0, r0
408 ; CHECK-NEXT: vstr.16 s0, [r2]
409 ; CHECK-NEXT: pop {r7, pc}
410 ; CHECK-NEXT: .p2align 1
411 ; CHECK-NEXT: @ %bb.4:
412 ; CHECK-NEXT: .LCPI20_0:
413 ; CHECK-NEXT: .short 0x0000 @ half 0
415 %5 = icmp eq i32 %4, 0
416 br i1 %5, label %18, label %6
419 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
420 %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
421 %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
422 %10 = bitcast ptr %9 to ptr
423 %11 = load <8 x half>, ptr %10, align 4
424 %12 = getelementptr inbounds half, ptr %9, i32 4
425 %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
426 %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
427 %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
428 %16 = add nsw i32 %7, -1
429 %17 = icmp eq i32 %16, 0
430 br i1 %17, label %18, label %6
433 %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
434 %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
435 store half %20, ptr %2, align 4
; loop_absmax16_c: commuted variant of the <8 x half> abs-max loop. The
; llvm.maxnum operands are passed as (loaded %13, accumulator %14) instead of
; (accumulator, loaded). After the loop the accumulator is reduced with
; llvm.arm.mve.maxnmav.f16 (scalar operand 0.0) and stored to %2.
; The expected loop body is `vabs.f16 q1, q1` followed by
; `vmaxnm.f16 q0, q1, q0`.
; NOTE(review): the pointer is advanced by 4 half elements per iteration
; (matching the `#8` post-increment in the expected assembly) while each load
; reads an <8 x half>; presumably intentional for this test - confirm.
; NOTE(review): %4 (the initial loop counter) is defined on a line that is
; not visible in this excerpt.
439 define void @loop_absmax16_c(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
440 ; CHECK-LABEL: loop_absmax16_c:
442 ; CHECK-NEXT: .save {r7, lr}
443 ; CHECK-NEXT: push {r7, lr}
444 ; CHECK-NEXT: vmov.i32 q0, #0x0
445 ; CHECK-NEXT: lsrs r1, r1, #3
446 ; CHECK-NEXT: wls lr, r1, .LBB21_3
447 ; CHECK-NEXT: @ %bb.1: @ %.preheader
448 ; CHECK-NEXT: vmov.i32 q0, #0x0
449 ; CHECK-NEXT: .LBB21_2: @ =>This Inner Loop Header: Depth=1
450 ; CHECK-NEXT: vldrw.u32 q1, [r0], #8
451 ; CHECK-NEXT: vabs.f16 q1, q1
452 ; CHECK-NEXT: vmaxnm.f16 q0, q1, q0
453 ; CHECK-NEXT: le lr, .LBB21_2
454 ; CHECK-NEXT: .LBB21_3:
455 ; CHECK-NEXT: vldr.16 s4, .LCPI21_0
456 ; CHECK-NEXT: vmov r0, s4
457 ; CHECK-NEXT: vmaxnmav.f16 r0, q0
458 ; CHECK-NEXT: vmov s0, r0
459 ; CHECK-NEXT: vstr.16 s0, [r2]
460 ; CHECK-NEXT: pop {r7, pc}
461 ; CHECK-NEXT: .p2align 1
462 ; CHECK-NEXT: @ %bb.4:
463 ; CHECK-NEXT: .LCPI21_0:
464 ; CHECK-NEXT: .short 0x0000 @ half 0
466 %5 = icmp eq i32 %4, 0
467 br i1 %5, label %18, label %6
470 %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
471 %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
472 %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
473 %10 = bitcast ptr %9 to ptr
474 %11 = load <8 x half>, ptr %10, align 4
475 %12 = getelementptr inbounds half, ptr %9, i32 4
476 %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
477 %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
478 %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
479 %16 = add nsw i32 %7, -1
480 %17 = icmp eq i32 %16, 0
481 br i1 %17, label %18, label %6
484 %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
485 %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
486 store half %20, ptr %2, align 4
; loop_absmax16_pred: predicated <8 x half> abs-max loop. Each iteration
; builds a predicate from the remaining count with llvm.arm.mve.vctp16(%6),
; performs a masked load under that predicate, and calls the predicated
; vmaxnma intrinsic with operands (accumulator %5, loaded %10). The counter
; is decreased by 8 per iteration and the loop repeats while %6 > 8 (signed).
; After the loop the result is reduced with llvm.arm.mve.maxnmav.f16 (scalar
; operand 0.0) and stored to %2.
; The expected assembly is a `dlstp.16` / `letp` loop whose body is
; `vldrh.u16` followed by `vmaxnma.f16 q0, q1`.
; NOTE(review): the pointer is advanced by 4 half elements per iteration
; (matching the `#8` post-increment in the expected assembly) while the
; counter steps by 8; presumably intentional for this test - confirm.
490 define void @loop_absmax16_pred(ptr %0, i32 %1, ptr nocapture %2) {
491 ; CHECK-LABEL: loop_absmax16_pred:
493 ; CHECK-NEXT: .save {r7, lr}
494 ; CHECK-NEXT: push {r7, lr}
495 ; CHECK-NEXT: vmov.i32 q0, #0x0
496 ; CHECK-NEXT: dlstp.16 lr, r1
497 ; CHECK-NEXT: .LBB22_1: @ =>This Inner Loop Header: Depth=1
498 ; CHECK-NEXT: vldrh.u16 q1, [r0], #8
499 ; CHECK-NEXT: vmaxnma.f16 q0, q1
500 ; CHECK-NEXT: letp lr, .LBB22_1
501 ; CHECK-NEXT: @ %bb.2:
502 ; CHECK-NEXT: vldr.16 s4, .LCPI22_0
503 ; CHECK-NEXT: vmov r0, s4
504 ; CHECK-NEXT: vmaxnmav.f16 r0, q0
505 ; CHECK-NEXT: vmov s0, r0
506 ; CHECK-NEXT: vstr.16 s0, [r2]
507 ; CHECK-NEXT: pop {r7, pc}
508 ; CHECK-NEXT: .p2align 1
509 ; CHECK-NEXT: @ %bb.3:
510 ; CHECK-NEXT: .LCPI22_0:
511 ; CHECK-NEXT: .short 0x0000 @ half 0
515 %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
516 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
517 %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
518 %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
519 %9 = bitcast ptr %7 to ptr
520 %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
521 %11 = getelementptr inbounds half, ptr %7, i32 4
522 %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
523 %13 = add nsw i32 %6, -8
524 %14 = icmp sgt i32 %6, 8
525 br i1 %14, label %4, label %15
528 %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
529 store half %16, ptr %2, align 4
; loop_absmax16_pred_c: commuted variant of the predicated <8 x half>
; abs-max loop. The predicated vmaxnma intrinsic receives its vector operands
; as (loaded %10, accumulator %5) instead of (accumulator, loaded). The
; predicate comes from llvm.arm.mve.vctp16(%6); the counter is decreased by 8
; per iteration and the loop repeats while %6 > 8 (signed).
; The expected loop body computes into q1 (`vmaxnma.f16 q1, q0`) and then
; copies it back with `vmov q0, q1`; the final `vmaxnmav.f16` reads q1.
; NOTE(review): the pointer is advanced by 4 half elements per iteration
; (matching the `#8` post-increment in the expected assembly) while the
; counter steps by 8; presumably intentional for this test - confirm.
533 define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
534 ; CHECK-LABEL: loop_absmax16_pred_c:
536 ; CHECK-NEXT: .save {r7, lr}
537 ; CHECK-NEXT: push {r7, lr}
538 ; CHECK-NEXT: vmov.i32 q0, #0x0
539 ; CHECK-NEXT: dlstp.16 lr, r1
540 ; CHECK-NEXT: .LBB23_1: @ =>This Inner Loop Header: Depth=1
541 ; CHECK-NEXT: vldrh.u16 q1, [r0], #8
542 ; CHECK-NEXT: vmaxnma.f16 q1, q0
543 ; CHECK-NEXT: vmov q0, q1
544 ; CHECK-NEXT: letp lr, .LBB23_1
545 ; CHECK-NEXT: @ %bb.2:
546 ; CHECK-NEXT: vldr.16 s0, .LCPI23_0
547 ; CHECK-NEXT: vmov r0, s0
548 ; CHECK-NEXT: vmaxnmav.f16 r0, q1
549 ; CHECK-NEXT: vmov s0, r0
550 ; CHECK-NEXT: vstr.16 s0, [r2]
551 ; CHECK-NEXT: pop {r7, pc}
552 ; CHECK-NEXT: .p2align 1
553 ; CHECK-NEXT: @ %bb.3:
554 ; CHECK-NEXT: .LCPI23_0:
555 ; CHECK-NEXT: .short 0x0000 @ half 0
559 %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
560 %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
561 %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
562 %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
563 %9 = bitcast ptr %7 to ptr
564 %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
565 %11 = getelementptr inbounds half, ptr %7, i32 4
566 %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
567 %13 = add nsw i32 %6, -8
568 %14 = icmp sgt i32 %6, 8
569 br i1 %14, label %4, label %15
572 %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
573 store half %16, ptr %2, align 4
; Declarations of the intrinsics used by the <4 x float> tests in this file:
; the vctp32 predicate generator, the masked load, the predicated
; vminnma/vmaxnma MVE intrinsics, the maxnmav reduction, and the generic
; fabs/maxnum/minnum intrinsics.
581 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
582 declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
583 declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
584 declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
585 declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
586 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
587 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
588 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
; Declarations of the intrinsics used by the <8 x half> tests in this file:
; the vctp16 predicate generator, the masked load, the predicated
; vminnma/vmaxnma MVE intrinsics, the maxnmav reduction, and the generic
; fabs/maxnum/minnum intrinsics.
590 declare <8 x i1> @llvm.arm.mve.vctp16(i32)
591 declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>)
592 declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
593 declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
594 declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
595 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
596 declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
597 declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)