llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
   3
   4 ; F32
   5
   ; Baseline: maxnum(|%a|, |%b|) with the operands in argument order. The
   ; expected assembly is a single vmaxnma.f32 q0, q1 with no separate
   ; absolute-value instruction.
   6 define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
   7 ; CHECK-LABEL: maxf32:
   8 ; CHECK:       @ %bb.0:
   9 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
  10 ; CHECK-NEXT:    bx lr
  11   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  12   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  13   %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  14   ret <4 x float> %c
  15 }
  16
  ; Commuted form of maxf32: maxnum(|%b|, |%a|). The expected assembly is the
  ; same single vmaxnma.f32 q0, q1, with no extra register move.
  17 define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
  18 ; CHECK-LABEL: maxf32_c:
  19 ; CHECK:       @ %bb.0:
  20 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
  21 ; CHECK-NEXT:    bx lr
  22   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  23   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
       ; Operands are swapped relative to maxf32: |%b| first, |%a| second.
  24   %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  25   ret <4 x float> %c
  26 }
  27
  ; Baseline: minnum(|%a|, |%b|) with the operands in argument order. The
  ; expected assembly is a single vminnma.f32 q0, q1 with no separate
  ; absolute-value instruction.
  28 define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
  29 ; CHECK-LABEL: minf32:
  30 ; CHECK:       @ %bb.0:
  31 ; CHECK-NEXT:    vminnma.f32 q0, q1
  32 ; CHECK-NEXT:    bx lr
  33   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  34   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  35   %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  36   ret <4 x float> %c
  37 }
  38
  ; Commuted form of minf32: minnum(|%b|, |%a|). The expected assembly is the
  ; same single vminnma.f32 q0, q1, with no extra register move.
  39 define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
  40 ; CHECK-LABEL: minf32_c:
  41 ; CHECK:       @ %bb.0:
  42 ; CHECK-NEXT:    vminnma.f32 q0, q1
  43 ; CHECK-NEXT:    bx lr
  44   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  45   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
       ; Operands are swapped relative to minf32: |%b| first, |%a| second.
  46   %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  47   ret <4 x float> %c
  48 }
  49
  50
  ; Predicated vmaxnma intrinsic with operands (%a, %b) and mask %a < %b. The
  ; expected assembly opens a VPT block (the olt compare shows up as
  ; "gt, q1, q0") and executes vmaxnmat.f32 q0, q1 inside it.
  51 define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
  52 ; CHECK-LABEL: maxpredf32:
  53 ; CHECK:       @ %bb.0:
  54 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  55 ; CHECK-NEXT:    vmaxnmat.f32 q0, q1
  56 ; CHECK-NEXT:    bx lr
  57   %c = fcmp olt <4 x float> %a, %b
  58   %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  59   ret <4 x float> %s
  60 }
  61
  ; Same mask as maxpredf32, but the intrinsic operands are swapped to (%b, %a).
  ; The expected assembly keeps that order (vmaxnmat.f32 q1, q0) and then copies
  ; q1 into q0 for the return value, i.e. the predicated form is not commuted.
  ; NOTE(review): presumably because lanes whose mask bit is false keep the
  ; first operand, making the operands non-interchangeable -- confirm against
  ; the intrinsic's definition.
  62 define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
  63 ; CHECK-LABEL: maxpredf32_c:
  64 ; CHECK:       @ %bb.0:
  65 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  66 ; CHECK-NEXT:    vmaxnmat.f32 q1, q0
  67 ; CHECK-NEXT:    vmov q0, q1
  68 ; CHECK-NEXT:    bx lr
  69   %c = fcmp olt <4 x float> %a, %b
  70   %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  71   ret <4 x float> %s
  72 }
  73
  ; Predicated vminnma intrinsic with operands (%a, %b) and mask %a < %b. The
  ; expected assembly opens a VPT block (the olt compare shows up as
  ; "gt, q1, q0") and executes vminnmat.f32 q0, q1 inside it.
  74 define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
  75 ; CHECK-LABEL: minpredf32:
  76 ; CHECK:       @ %bb.0:
  77 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  78 ; CHECK-NEXT:    vminnmat.f32 q0, q1
  79 ; CHECK-NEXT:    bx lr
  80   %c = fcmp olt <4 x float> %a, %b
  81   %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  82   ret <4 x float> %s
  83 }
  84
  ; Same mask as minpredf32, but the intrinsic operands are swapped to (%b, %a).
  ; The expected assembly keeps that order (vminnmat.f32 q1, q0) and then copies
  ; q1 into q0 for the return value, i.e. the predicated form is not commuted.
  ; NOTE(review): presumably because lanes whose mask bit is false keep the
  ; first operand, making the operands non-interchangeable -- confirm against
  ; the intrinsic's definition.
  85 define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
  86 ; CHECK-LABEL: minpredf32_c:
  87 ; CHECK:       @ %bb.0:
  88 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  89 ; CHECK-NEXT:    vminnmat.f32 q1, q0
  90 ; CHECK-NEXT:    vmov q0, q1
  91 ; CHECK-NEXT:    bx lr
  92   %c = fcmp olt <4 x float> %a, %b
  93   %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  94   ret <4 x float> %s
  95 }
  96
  97
  98
  99 ; F16
 100
 ; Baseline for <8 x half>: maxnum(|%a|, |%b|) with the operands in argument
 ; order. The expected assembly is a single vmaxnma.f16 q0, q1 with no separate
 ; absolute-value instruction.
 101 define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
 102 ; CHECK-LABEL: maxf16:
 103 ; CHECK:       @ %bb.0:
 104 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 105 ; CHECK-NEXT:    bx lr
 106   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 107   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
 108   %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
 109   ret <8 x half> %c
 110 }
 111
 ; Commuted form of maxf16: maxnum(|%b|, |%a|). The expected assembly is the
 ; same single vmaxnma.f16 q0, q1, with no extra register move.
 112 define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
 113 ; CHECK-LABEL: maxf16_c:
 114 ; CHECK:       @ %bb.0:
 115 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 116 ; CHECK-NEXT:    bx lr
 117   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 118   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
       ; Operands are swapped relative to maxf16: |%b| first, |%a| second.
 119   %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
 120   ret <8 x half> %c
 121 }
 122
 ; Baseline for <8 x half>: minnum(|%a|, |%b|) with the operands in argument
 ; order. The expected assembly is a single vminnma.f16 q0, q1 with no separate
 ; absolute-value instruction.
 123 define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
 124 ; CHECK-LABEL: minf16:
 125 ; CHECK:       @ %bb.0:
 126 ; CHECK-NEXT:    vminnma.f16 q0, q1
 127 ; CHECK-NEXT:    bx lr
 128   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 129   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
 130   %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
 131   ret <8 x half> %c
 132 }
 133
 ; Commuted form of minf16: minnum(|%b|, |%a|). The expected assembly is the
 ; same single vminnma.f16 q0, q1, with no extra register move.
 134 define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
 135 ; CHECK-LABEL: minf16_c:
 136 ; CHECK:       @ %bb.0:
 137 ; CHECK-NEXT:    vminnma.f16 q0, q1
 138 ; CHECK-NEXT:    bx lr
 139   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 140   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
       ; Operands are swapped relative to minf16: |%b| first, |%a| second.
 141   %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
 142   ret <8 x half> %c
 143 }
 144
 ; Predicated vmaxnma intrinsic on <8 x half> with operands (%a, %b) and mask
 ; %a < %b. The expected assembly opens a VPT block (the olt compare shows up
 ; as "gt, q1, q0") and executes vmaxnmat.f16 q0, q1 inside it.
 145 define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
 146 ; CHECK-LABEL: maxpredf16:
 147 ; CHECK:       @ %bb.0:
 148 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 149 ; CHECK-NEXT:    vmaxnmat.f16 q0, q1
 150 ; CHECK-NEXT:    bx lr
 151   %c = fcmp olt <8 x half> %a, %b
 152   %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
 153   ret <8 x half> %s
 154 }
 155
 ; Same mask as maxpredf16, but the intrinsic operands are swapped to (%b, %a).
 ; The expected assembly keeps that order (vmaxnmat.f16 q1, q0) and then copies
 ; q1 into q0 for the return value, i.e. the predicated form is not commuted.
 ; NOTE(review): presumably because lanes whose mask bit is false keep the
 ; first operand, making the operands non-interchangeable -- confirm against
 ; the intrinsic's definition.
 156 define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
 157 ; CHECK-LABEL: maxpredf16_c:
 158 ; CHECK:       @ %bb.0:
 159 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 160 ; CHECK-NEXT:    vmaxnmat.f16 q1, q0
 161 ; CHECK-NEXT:    vmov q0, q1
 162 ; CHECK-NEXT:    bx lr
 163   %c = fcmp olt <8 x half> %a, %b
 164   %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
 165   ret <8 x half> %s
 166 }
 167
 ; Predicated vminnma intrinsic on <8 x half> with operands (%a, %b) and mask
 ; %a < %b. The expected assembly opens a VPT block (the olt compare shows up
 ; as "gt, q1, q0") and executes vminnmat.f16 q0, q1 inside it.
 168 define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
 169 ; CHECK-LABEL: minpredf16:
 170 ; CHECK:       @ %bb.0:
 171 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 172 ; CHECK-NEXT:    vminnmat.f16 q0, q1
 173 ; CHECK-NEXT:    bx lr
 174   %c = fcmp olt <8 x half> %a, %b
 175   %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
 176   ret <8 x half> %s
 177 }
 178
 ; Same mask as minpredf16, but the intrinsic operands are swapped to (%b, %a).
 ; The expected assembly keeps that order (vminnmat.f16 q1, q0) and then copies
 ; q1 into q0 for the return value, i.e. the predicated form is not commuted.
 ; NOTE(review): presumably because lanes whose mask bit is false keep the
 ; first operand, making the operands non-interchangeable -- confirm against
 ; the intrinsic's definition.
 179 define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
 180 ; CHECK-LABEL: minpredf16_c:
 181 ; CHECK:       @ %bb.0:
 182 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 183 ; CHECK-NEXT:    vminnmat.f16 q1, q0
 184 ; CHECK-NEXT:    vmov q0, q1
 185 ; CHECK-NEXT:    bx lr
 186   %c = fcmp olt <8 x half> %a, %b
 187   %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
 188   ret <8 x half> %s
 189 }
 190
 191
 192 ; Loops
 193
 ; Unpredicated absolute-maximum loop over <4 x float> loads, with the
 ; accumulator as the FIRST maxnum operand. The expected assembly is a wls/le
 ; loop whose body is a post-incrementing vldrw.u32 plus one vmaxnma.f32 q0, q1
 ; (no register move), followed by vmaxnmav.f32 on q0 after the loop.
 194 define void @loop_absmax32(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
 195 ; CHECK-LABEL: loop_absmax32:
 196 ; CHECK:       @ %bb.0:
 197 ; CHECK-NEXT:    .save {r7, lr}
 198 ; CHECK-NEXT:    push {r7, lr}
 199 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 200 ; CHECK-NEXT:    lsrs r1, r1, #3
 201 ; CHECK-NEXT:    wls lr, r1, .LBB16_3
 202 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 203 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 204 ; CHECK-NEXT:  .LBB16_2: @ =>This Inner Loop Header: Depth=1
 205 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 206 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
 207 ; CHECK-NEXT:    le lr, .LBB16_2
 208 ; CHECK-NEXT:  .LBB16_3:
 209 ; CHECK-NEXT:    vldr s4, .LCPI16_0
 210 ; CHECK-NEXT:    vmov r0, s4
 211 ; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 212 ; CHECK-NEXT:    vmov s0, r0
 213 ; CHECK-NEXT:    vstr s0, [r2]
 214 ; CHECK-NEXT:    pop {r7, pc}
 215 ; CHECK-NEXT:    .p2align 2
 216 ; CHECK-NEXT:  @ %bb.4:
 217 ; CHECK-NEXT:  .LCPI16_0:
 218 ; CHECK-NEXT:    .long 0x00000000 @ float 0
       ; Trip count is %1 >> 3; a zero count branches straight to the exit block.
       ; NOTE(review): each iteration consumes 4 floats, so ">> 3" rather than
       ; ">> 2" looks unintentional -- confirm whether that is intended.
 219   %4 = lshr i32 %1, 3
 220   %5 = icmp eq i32 %4, 0
 221   br i1 %5, label %18, label %6
 222
 223 6:                                                ; preds = %3, %6
       ; %7 counts the remaining iterations, %8 is the running maximum (zero on
       ; entry) and %9 is the current load address.
 224   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 225   %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
 226   %9 = phi float* [ %12, %6 ], [ %0, %3 ]
 227   %10 = bitcast float* %9 to <4 x float>*
 228   %11 = load <4 x float>, <4 x float>* %10, align 4
 229   %12 = getelementptr inbounds float, float* %9, i32 4
 230   %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
 231   %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
       ; Accumulator |%8| first, freshly loaded |%11| second.
 232   %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
 233   %16 = add nsw i32 %7, -1
 234   %17 = icmp eq i32 %16, 0
 235   br i1 %17, label %18, label %6
 236
 237 18:                                               ; preds = %6, %3
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 238   %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
 239   %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
 240   store float %20, float* %2, align 4
 241   ret void
 242 }
 243
 ; Commuted form of loop_absmax32: the freshly loaded vector is the FIRST
 ; maxnum operand and the accumulator the second. Apart from label numbers the
 ; expected assembly is the same: vmaxnma.f32 q0, q1 in the loop body with no
 ; register move.
 244 define void @loop_absmax32_c(float* nocapture readonly %0, i32 %1, float* nocapture %2) {
 245 ; CHECK-LABEL: loop_absmax32_c:
 246 ; CHECK:       @ %bb.0:
 247 ; CHECK-NEXT:    .save {r7, lr}
 248 ; CHECK-NEXT:    push {r7, lr}
 249 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 250 ; CHECK-NEXT:    lsrs r1, r1, #3
 251 ; CHECK-NEXT:    wls lr, r1, .LBB17_3
 252 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 253 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 254 ; CHECK-NEXT:  .LBB17_2: @ =>This Inner Loop Header: Depth=1
 255 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 256 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
 257 ; CHECK-NEXT:    le lr, .LBB17_2
 258 ; CHECK-NEXT:  .LBB17_3:
 259 ; CHECK-NEXT:    vldr s4, .LCPI17_0
 260 ; CHECK-NEXT:    vmov r0, s4
 261 ; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 262 ; CHECK-NEXT:    vmov s0, r0
 263 ; CHECK-NEXT:    vstr s0, [r2]
 264 ; CHECK-NEXT:    pop {r7, pc}
 265 ; CHECK-NEXT:    .p2align 2
 266 ; CHECK-NEXT:  @ %bb.4:
 267 ; CHECK-NEXT:  .LCPI17_0:
 268 ; CHECK-NEXT:    .long 0x00000000 @ float 0
       ; Trip count is %1 >> 3; a zero count branches straight to the exit block.
       ; NOTE(review): each iteration consumes 4 floats, so ">> 3" rather than
       ; ">> 2" looks unintentional -- confirm whether that is intended.
 269   %4 = lshr i32 %1, 3
 270   %5 = icmp eq i32 %4, 0
 271   br i1 %5, label %18, label %6
 272
 273 6:                                                ; preds = %3, %6
       ; %7 counts the remaining iterations, %8 is the running maximum (zero on
       ; entry) and %9 is the current load address.
 274   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 275   %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
 276   %9 = phi float* [ %12, %6 ], [ %0, %3 ]
 277   %10 = bitcast float* %9 to <4 x float>*
 278   %11 = load <4 x float>, <4 x float>* %10, align 4
 279   %12 = getelementptr inbounds float, float* %9, i32 4
 280   %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
 281   %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
       ; Freshly loaded |%11| first, accumulator |%8| second (the swapped order).
 282   %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
 283   %16 = add nsw i32 %7, -1
 284   %17 = icmp eq i32 %16, 0
 285   br i1 %17, label %18, label %6
 286
 287 18:                                               ; preds = %6, %3
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 288   %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
 289   %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
 290   store float %20, float* %2, align 4
 291   ret void
 292 }
 293
 ; Masked absolute-maximum loop over <4 x float>: a vctp32 mask guards both the
 ; load and the predicated vmaxnma intrinsic, with the accumulator as the FIRST
 ; intrinsic operand. The expected assembly is a dlstp.32/letp loop in which
 ; the intrinsic appears as a plain vmaxnma.f32 q0, q1 with no register move.
 294 define void @loop_absmax32_pred(float* %0, i32 %1, float* nocapture %2) {
 295 ; CHECK-LABEL: loop_absmax32_pred:
 296 ; CHECK:       @ %bb.0:
 297 ; CHECK-NEXT:    .save {r7, lr}
 298 ; CHECK-NEXT:    push {r7, lr}
 299 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 300 ; CHECK-NEXT:    dlstp.32 lr, r1
 301 ; CHECK-NEXT:  .LBB18_1: @ =>This Inner Loop Header: Depth=1
 302 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 303 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
 304 ; CHECK-NEXT:    letp lr, .LBB18_1
 305 ; CHECK-NEXT:  @ %bb.2:
 306 ; CHECK-NEXT:    vldr s4, .LCPI18_0
 307 ; CHECK-NEXT:    vmov r0, s4
 308 ; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 309 ; CHECK-NEXT:    vmov s0, r0
 310 ; CHECK-NEXT:    vstr s0, [r2]
 311 ; CHECK-NEXT:    pop {r7, pc}
 312 ; CHECK-NEXT:    .p2align 2
 313 ; CHECK-NEXT:  @ %bb.3:
 314 ; CHECK-NEXT:  .LCPI18_0:
 315 ; CHECK-NEXT:    .long 0x00000000 @ float 0
 316   br label %4
 317
 318 4:                                                ; preds = %4, %3
       ; %5 is the running maximum (zero on entry), %6 the element count still
       ; to process and %7 the current load address.
 319   %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
 320   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 321   %7 = phi float* [ %0, %3 ], [ %11, %4 ]
       ; Lane mask derived from the remaining count; used by the load and the max.
 322   %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
 323   %9 = bitcast float* %7 to <4 x float>*
 324   %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
 325   %11 = getelementptr inbounds float, float* %7, i32 4
       ; Accumulator %5 first, freshly loaded %10 second.
 326   %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
 327   %13 = add nsw i32 %6, -4
       ; Iterate again only if more than 4 elements were pending this time round.
 328   %14 = icmp sgt i32 %6, 4
 329   br i1 %14, label %4, label %15
 330
 331 15:                                               ; preds = %4
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 332   %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
 333   store float %16, float* %2, align 4
 334   ret void
 335 }
 336
 ; Swapped-operand form of loop_absmax32_pred: the freshly loaded vector is the
 ; FIRST operand of the predicated intrinsic and the accumulator the second.
 ; The expected assembly keeps that order (vmaxnma.f32 q1, q0), needs a
 ; vmov q0, q1 inside the loop, and reduces q1 after the loop -- i.e. the
 ; predicated intrinsic is not commuted.
 337 define void @loop_absmax32_pred_c(float* %0, i32 %1, float* nocapture %2) {
 338 ; CHECK-LABEL: loop_absmax32_pred_c:
 339 ; CHECK:       @ %bb.0:
 340 ; CHECK-NEXT:    .save {r7, lr}
 341 ; CHECK-NEXT:    push {r7, lr}
 342 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 343 ; CHECK-NEXT:    dlstp.32 lr, r1
 344 ; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
 345 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 346 ; CHECK-NEXT:    vmaxnma.f32 q1, q0
 347 ; CHECK-NEXT:    vmov q0, q1
 348 ; CHECK-NEXT:    letp lr, .LBB19_1
 349 ; CHECK-NEXT:  @ %bb.2:
 350 ; CHECK-NEXT:    vldr s0, .LCPI19_0
 351 ; CHECK-NEXT:    vmov r0, s0
 352 ; CHECK-NEXT:    vmaxnmav.f32 r0, q1
 353 ; CHECK-NEXT:    vmov s0, r0
 354 ; CHECK-NEXT:    vstr s0, [r2]
 355 ; CHECK-NEXT:    pop {r7, pc}
 356 ; CHECK-NEXT:    .p2align 2
 357 ; CHECK-NEXT:  @ %bb.3:
 358 ; CHECK-NEXT:  .LCPI19_0:
 359 ; CHECK-NEXT:    .long 0x00000000 @ float 0
 360   br label %4
 361
 362 4:                                                ; preds = %4, %3
       ; %5 is the running maximum (zero on entry), %6 the element count still
       ; to process and %7 the current load address.
 363   %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
 364   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 365   %7 = phi float* [ %0, %3 ], [ %11, %4 ]
       ; Lane mask derived from the remaining count; used by the load and the max.
 366   %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
 367   %9 = bitcast float* %7 to <4 x float>*
 368   %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
 369   %11 = getelementptr inbounds float, float* %7, i32 4
       ; Freshly loaded %10 first, accumulator %5 second (the swapped order).
 370   %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
 371   %13 = add nsw i32 %6, -4
       ; Iterate again only if more than 4 elements were pending this time round.
 372   %14 = icmp sgt i32 %6, 4
 373   br i1 %14, label %4, label %15
 374
 375 15:                                               ; preds = %4
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 376   %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
 377   store float %16, float* %2, align 4
 378   ret void
 379 }
 380
 381
 382
 383
 384
 385
 ; Unpredicated absolute-maximum loop over <8 x half> loads, with the
 ; accumulator as the FIRST maxnum operand. The expected assembly is a wls/le
 ; loop whose body is a post-incrementing vldrw.u32 plus one vmaxnma.f16 q0, q1
 ; (no register move), followed by vmaxnmav.f16 on q0 after the loop.
 386 define void @loop_absmax16(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
 387 ; CHECK-LABEL: loop_absmax16:
 388 ; CHECK:       @ %bb.0:
 389 ; CHECK-NEXT:    .save {r7, lr}
 390 ; CHECK-NEXT:    push {r7, lr}
 391 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 392 ; CHECK-NEXT:    lsrs r1, r1, #3
 393 ; CHECK-NEXT:    wls lr, r1, .LBB20_3
 394 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 395 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 396 ; CHECK-NEXT:  .LBB20_2: @ =>This Inner Loop Header: Depth=1
 397 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
 398 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 399 ; CHECK-NEXT:    le lr, .LBB20_2
 400 ; CHECK-NEXT:  .LBB20_3:
 401 ; CHECK-NEXT:    vldr.16 s4, .LCPI20_0
 402 ; CHECK-NEXT:    vmov r0, s4
 403 ; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 404 ; CHECK-NEXT:    vmov s0, r0
 405 ; CHECK-NEXT:    vstr.16 s0, [r2]
 406 ; CHECK-NEXT:    pop {r7, pc}
 407 ; CHECK-NEXT:    .p2align 1
 408 ; CHECK-NEXT:  @ %bb.4:
 409 ; CHECK-NEXT:  .LCPI20_0:
 410 ; CHECK-NEXT:    .short 0x0000 @ half 0
       ; Trip count is %1 >> 3; a zero count branches straight to the exit block.
 411   %4 = lshr i32 %1, 3
 412   %5 = icmp eq i32 %4, 0
 413   br i1 %5, label %18, label %6
 414
 415 6:                                                ; preds = %3, %6
       ; %7 counts the remaining iterations, %8 is the running maximum (zero on
       ; entry) and %9 is the current load address.
 416   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 417   %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
 418   %9 = phi half* [ %12, %6 ], [ %0, %3 ]
 419   %10 = bitcast half* %9 to <8 x half>*
 420   %11 = load <8 x half>, <8 x half>* %10, align 4
       ; NOTE(review): the pointer advances by 4 halves (8 bytes, hence the "#8"
       ; post-increment in the expected assembly) while each load reads 8 halves,
       ; so successive loads overlap. Looks carried over from the f32 variant --
       ; confirm whether that is intended.
 421   %12 = getelementptr inbounds half, half* %9, i32 4
 422   %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
 423   %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
       ; Accumulator |%8| first, freshly loaded |%11| second.
 424   %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
 425   %16 = add nsw i32 %7, -1
 426   %17 = icmp eq i32 %16, 0
 427   br i1 %17, label %18, label %6
 428
 429 18:                                               ; preds = %6, %3
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 430   %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
 431   %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
 432   store half %20, half* %2, align 4
 433   ret void
 434 }
 435
 ; Commuted form of loop_absmax16: the freshly loaded vector is the FIRST
 ; maxnum operand and the accumulator the second. Apart from label numbers the
 ; expected assembly is the same: vmaxnma.f16 q0, q1 in the loop body with no
 ; register move.
 436 define void @loop_absmax16_c(half* nocapture readonly %0, i32 %1, half* nocapture %2) {
 437 ; CHECK-LABEL: loop_absmax16_c:
 438 ; CHECK:       @ %bb.0:
 439 ; CHECK-NEXT:    .save {r7, lr}
 440 ; CHECK-NEXT:    push {r7, lr}
 441 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 442 ; CHECK-NEXT:    lsrs r1, r1, #3
 443 ; CHECK-NEXT:    wls lr, r1, .LBB21_3
 444 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 445 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 446 ; CHECK-NEXT:  .LBB21_2: @ =>This Inner Loop Header: Depth=1
 447 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
 448 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 449 ; CHECK-NEXT:    le lr, .LBB21_2
 450 ; CHECK-NEXT:  .LBB21_3:
 451 ; CHECK-NEXT:    vldr.16 s4, .LCPI21_0
 452 ; CHECK-NEXT:    vmov r0, s4
 453 ; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 454 ; CHECK-NEXT:    vmov s0, r0
 455 ; CHECK-NEXT:    vstr.16 s0, [r2]
 456 ; CHECK-NEXT:    pop {r7, pc}
 457 ; CHECK-NEXT:    .p2align 1
 458 ; CHECK-NEXT:  @ %bb.4:
 459 ; CHECK-NEXT:  .LCPI21_0:
 460 ; CHECK-NEXT:    .short 0x0000 @ half 0
       ; Trip count is %1 >> 3; a zero count branches straight to the exit block.
 461   %4 = lshr i32 %1, 3
 462   %5 = icmp eq i32 %4, 0
 463   br i1 %5, label %18, label %6
 464
 465 6:                                                ; preds = %3, %6
       ; %7 counts the remaining iterations, %8 is the running maximum (zero on
       ; entry) and %9 is the current load address.
 466   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 467   %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
 468   %9 = phi half* [ %12, %6 ], [ %0, %3 ]
 469   %10 = bitcast half* %9 to <8 x half>*
 470   %11 = load <8 x half>, <8 x half>* %10, align 4
       ; NOTE(review): the pointer advances by 4 halves (8 bytes, hence the "#8"
       ; post-increment in the expected assembly) while each load reads 8 halves,
       ; so successive loads overlap. Looks carried over from the f32 variant --
       ; confirm whether that is intended.
 471   %12 = getelementptr inbounds half, half* %9, i32 4
 472   %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
 473   %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
       ; Freshly loaded |%11| first, accumulator |%8| second (the swapped order).
 474   %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
 475   %16 = add nsw i32 %7, -1
 476   %17 = icmp eq i32 %16, 0
 477   br i1 %17, label %18, label %6
 478
 479 18:                                               ; preds = %6, %3
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 480   %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
 481   %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
 482   store half %20, half* %2, align 4
 483   ret void
 484 }
 485
 ; Masked absolute-maximum loop over <8 x half>: a vctp16 mask guards both the
 ; load and the predicated vmaxnma intrinsic, with the accumulator as the FIRST
 ; intrinsic operand. The expected assembly is a dlstp.16/letp loop in which
 ; the intrinsic appears as a plain vmaxnma.f16 q0, q1 with no register move.
 486 define void @loop_absmax16_pred(half* %0, i32 %1, half* nocapture %2) {
 487 ; CHECK-LABEL: loop_absmax16_pred:
 488 ; CHECK:       @ %bb.0:
 489 ; CHECK-NEXT:    .save {r7, lr}
 490 ; CHECK-NEXT:    push {r7, lr}
 491 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 492 ; CHECK-NEXT:    dlstp.16 lr, r1
 493 ; CHECK-NEXT:  .LBB22_1: @ =>This Inner Loop Header: Depth=1
 494 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
 495 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 496 ; CHECK-NEXT:    letp lr, .LBB22_1
 497 ; CHECK-NEXT:  @ %bb.2:
 498 ; CHECK-NEXT:    vldr.16 s4, .LCPI22_0
 499 ; CHECK-NEXT:    vmov r0, s4
 500 ; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 501 ; CHECK-NEXT:    vmov s0, r0
 502 ; CHECK-NEXT:    vstr.16 s0, [r2]
 503 ; CHECK-NEXT:    pop {r7, pc}
 504 ; CHECK-NEXT:    .p2align 1
 505 ; CHECK-NEXT:  @ %bb.3:
 506 ; CHECK-NEXT:  .LCPI22_0:
 507 ; CHECK-NEXT:    .short 0x0000 @ half 0
 508   br label %4
 509
 510 4:                                                ; preds = %4, %3
       ; %5 is the running maximum (zero on entry), %6 the element count still
       ; to process and %7 the current load address.
 511   %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
 512   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 513   %7 = phi half* [ %0, %3 ], [ %11, %4 ]
       ; Lane mask derived from the remaining count; used by the load and the max.
 514   %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
 515   %9 = bitcast half* %7 to <8 x half>*
 516   %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
       ; NOTE(review): the pointer advances by 4 halves (8 bytes, hence the "#8"
       ; post-increment in the expected assembly) although the count drops by 8
       ; per iteration, so successive loads overlap. Looks carried over from the
       ; f32 variant -- confirm whether that is intended.
 517   %11 = getelementptr inbounds half, half* %7, i32 4
       ; Accumulator %5 first, freshly loaded %10 second.
 518   %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
 519   %13 = add nsw i32 %6, -8
       ; Iterate again only if more than 8 elements were pending this time round.
 520   %14 = icmp sgt i32 %6, 8
 521   br i1 %14, label %4, label %15
 522
 523 15:                                               ; preds = %4
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 524   %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
 525   store half %16, half* %2, align 4
 526   ret void
 527 }
 528
 ; Swapped-operand form of loop_absmax16_pred: the freshly loaded vector is the
 ; FIRST operand of the predicated intrinsic and the accumulator the second.
 ; The expected assembly keeps that order (vmaxnma.f16 q1, q0), needs a
 ; vmov q0, q1 inside the loop, and reduces q1 after the loop -- i.e. the
 ; predicated intrinsic is not commuted.
 529 define void @loop_absmax16_pred_c(half* %0, i32 %1, half* nocapture %2) {
 530 ; CHECK-LABEL: loop_absmax16_pred_c:
 531 ; CHECK:       @ %bb.0:
 532 ; CHECK-NEXT:    .save {r7, lr}
 533 ; CHECK-NEXT:    push {r7, lr}
 534 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 535 ; CHECK-NEXT:    dlstp.16 lr, r1
 536 ; CHECK-NEXT:  .LBB23_1: @ =>This Inner Loop Header: Depth=1
 537 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
 538 ; CHECK-NEXT:    vmaxnma.f16 q1, q0
 539 ; CHECK-NEXT:    vmov q0, q1
 540 ; CHECK-NEXT:    letp lr, .LBB23_1
 541 ; CHECK-NEXT:  @ %bb.2:
 542 ; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
 543 ; CHECK-NEXT:    vmov r0, s0
 544 ; CHECK-NEXT:    vmaxnmav.f16 r0, q1
 545 ; CHECK-NEXT:    vmov s0, r0
 546 ; CHECK-NEXT:    vstr.16 s0, [r2]
 547 ; CHECK-NEXT:    pop {r7, pc}
 548 ; CHECK-NEXT:    .p2align 1
 549 ; CHECK-NEXT:  @ %bb.3:
 550 ; CHECK-NEXT:  .LCPI23_0:
 551 ; CHECK-NEXT:    .short 0x0000 @ half 0
 552   br label %4
 553
 554 4:                                                ; preds = %4, %3
       ; %5 is the running maximum (zero on entry), %6 the element count still
       ; to process and %7 the current load address.
 555   %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
 556   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 557   %7 = phi half* [ %0, %3 ], [ %11, %4 ]
       ; Lane mask derived from the remaining count; used by the load and the max.
 558   %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
 559   %9 = bitcast half* %7 to <8 x half>*
 560   %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
       ; NOTE(review): the pointer advances by 4 halves (8 bytes, hence the "#8"
       ; post-increment in the expected assembly) although the count drops by 8
       ; per iteration, so successive loads overlap. Looks carried over from the
       ; f32 variant -- confirm whether that is intended.
 561   %11 = getelementptr inbounds half, half* %7, i32 4
       ; Freshly loaded %10 first, accumulator %5 second (the swapped order).
 562   %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
 563   %13 = add nsw i32 %6, -8
       ; Iterate again only if more than 8 elements were pending this time round.
 564   %14 = icmp sgt i32 %6, 8
 565   br i1 %14, label %4, label %15
 566
 567 15:                                               ; preds = %4
       ; Fold the vector result into a scalar with the maxnmav intrinsic (scalar
       ; input 0.0) and store it to %2.
 568   %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
 569   store half %16, half* %2, align 4
 570   ret void
 571 }
 572
 573
 574
 575
 576
 ; Intrinsics referenced by the <4 x float> tests above: vctp32 and masked.load
 ; feed the *_pred loops, the two *.predicated intrinsics are the operations
 ; whose operand order is under test, maxnmav is the post-loop scalar fold, and
 ; fabs/maxnum/minnum build the unpredicated patterns.
 577 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 578 declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
 579 declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
 580 declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
 581 declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
 582 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
 583 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
 584 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
 585
 ; Intrinsics referenced by the <8 x half> tests above: vctp16 and masked.load
 ; feed the *_pred loops, the two *.predicated intrinsics are the operations
 ; whose operand order is under test, maxnmav is the post-loop scalar fold, and
 ; fabs/maxnum/minnum build the unpredicated patterns.
 586 declare <8 x i1> @llvm.arm.mve.vctp16(i32)
 587 declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>)
 588 declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
 589 declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
 590 declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
 591 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
 592 declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
 593 declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
 594
 595