llvm/test/CodeGen/Thumb2/mve-vmaxnma-commute.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
   3
   4 ; F32
   5
   ; maxf32: maxnum of fabs(%a) and fabs(%b) on <4 x float>.
   ; The FileCheck assertions below expect the fabs/fabs/maxnum pattern to be
   ; emitted as a single vmaxnma.f32 q0, q1 followed by the return.
   6 define arm_aapcs_vfpcc <4 x float> @maxf32(<4 x float> %a, <4 x float> %b) {
   7 ; CHECK-LABEL: maxf32:
   8 ; CHECK:       @ %bb.0:
   9 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
  10 ; CHECK-NEXT:    bx lr
  11   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  12   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  13   %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  14   ret <4 x float> %c
  15 }
  16
  ; maxf32_c: commuted form of maxf32 -- the maxnum operands are passed as
  ; (%bb, %aa) instead of (%aa, %bb).
  ; The FileCheck assertions expect the same vmaxnma.f32 q0, q1 as the
  ; uncommuted form, with no extra register move.
  17 define arm_aapcs_vfpcc <4 x float> @maxf32_c(<4 x float> %a, <4 x float> %b) {
  18 ; CHECK-LABEL: maxf32_c:
  19 ; CHECK:       @ %bb.0:
  20 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
  21 ; CHECK-NEXT:    bx lr
  22   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  23   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  24   %c = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  25   ret <4 x float> %c
  26 }
  27
  ; minf32: minnum of fabs(%a) and fabs(%b) on <4 x float>.
  ; The FileCheck assertions expect the pattern to be emitted as a single
  ; vminnma.f32 q0, q1 followed by the return.
  28 define arm_aapcs_vfpcc <4 x float> @minf32(<4 x float> %a, <4 x float> %b) {
  29 ; CHECK-LABEL: minf32:
  30 ; CHECK:       @ %bb.0:
  31 ; CHECK-NEXT:    vminnma.f32 q0, q1
  32 ; CHECK-NEXT:    bx lr
  33   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  34   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  35   %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %aa, <4 x float> %bb)
  36   ret <4 x float> %c
  37 }
  38
  ; minf32_c: commuted form of minf32 -- the minnum operands are passed as
  ; (%bb, %aa) instead of (%aa, %bb).
  ; The FileCheck assertions expect the same vminnma.f32 q0, q1 as the
  ; uncommuted form, with no extra register move.
  39 define arm_aapcs_vfpcc <4 x float> @minf32_c(<4 x float> %a, <4 x float> %b) {
  40 ; CHECK-LABEL: minf32_c:
  41 ; CHECK:       @ %bb.0:
  42 ; CHECK-NEXT:    vminnma.f32 q0, q1
  43 ; CHECK-NEXT:    bx lr
  44   %aa = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
  45   %bb = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %b)
  46   %c = tail call fast <4 x float> @llvm.minnum.v4f32(<4 x float> %bb, <4 x float> %aa)
  47   ret <4 x float> %c
  48 }
  49
  50
  ; maxpredf32: calls the predicated vmaxnma intrinsic on (%a, %b), using the
  ; result of fcmp olt %a, %b as the lane mask.
  ; The FileCheck assertions expect a vpt.f32 block (gt, q1, q0) containing
  ; vmaxnmat.f32 q0, q1, with no extra register move before the return.
  51 define arm_aapcs_vfpcc <4 x float> @maxpredf32(<4 x float> %a, <4 x float> %b) {
  52 ; CHECK-LABEL: maxpredf32:
  53 ; CHECK:       @ %bb.0:
  54 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  55 ; CHECK-NEXT:    vmaxnmat.f32 q0, q1
  56 ; CHECK-NEXT:    bx lr
  57   %c = fcmp olt <4 x float> %a, %b
  58   %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  59   ret <4 x float> %s
  60 }
  61
  ; maxpredf32_c: as maxpredf32, but the intrinsic's vector operands are
  ; swapped to (%b, %a); the mask is still fcmp olt %a, %b.
  ; The FileCheck assertions expect vmaxnmat.f32 q1, q0 followed by
  ; vmov q0, q1, i.e. the expected output keeps the swapped operand order and
  ; copies the result into the return register.
  62 define arm_aapcs_vfpcc <4 x float> @maxpredf32_c(<4 x float> %a, <4 x float> %b) {
  63 ; CHECK-LABEL: maxpredf32_c:
  64 ; CHECK:       @ %bb.0:
  65 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  66 ; CHECK-NEXT:    vmaxnmat.f32 q1, q0
  67 ; CHECK-NEXT:    vmov q0, q1
  68 ; CHECK-NEXT:    bx lr
  69   %c = fcmp olt <4 x float> %a, %b
  70   %s = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  71   ret <4 x float> %s
  72 }
  73
  ; minpredf32: calls the predicated vminnma intrinsic on (%a, %b), using the
  ; result of fcmp olt %a, %b as the lane mask.
  ; The FileCheck assertions expect a vpt.f32 block (gt, q1, q0) containing
  ; vminnmat.f32 q0, q1, with no extra register move before the return.
  74 define arm_aapcs_vfpcc <4 x float> @minpredf32(<4 x float> %a, <4 x float> %b) {
  75 ; CHECK-LABEL: minpredf32:
  76 ; CHECK:       @ %bb.0:
  77 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  78 ; CHECK-NEXT:    vminnmat.f32 q0, q1
  79 ; CHECK-NEXT:    bx lr
  80   %c = fcmp olt <4 x float> %a, %b
  81   %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %a, <4 x float> %b, <4 x i1> %c)
  82   ret <4 x float> %s
  83 }
  84
  ; minpredf32_c: as minpredf32, but the intrinsic's vector operands are
  ; swapped to (%b, %a); the mask is still fcmp olt %a, %b.
  ; The FileCheck assertions expect vminnmat.f32 q1, q0 followed by
  ; vmov q0, q1, i.e. the expected output keeps the swapped operand order and
  ; copies the result into the return register.
  85 define arm_aapcs_vfpcc <4 x float> @minpredf32_c(<4 x float> %a, <4 x float> %b) {
  86 ; CHECK-LABEL: minpredf32_c:
  87 ; CHECK:       @ %bb.0:
  88 ; CHECK-NEXT:    vpt.f32 gt, q1, q0
  89 ; CHECK-NEXT:    vminnmat.f32 q1, q0
  90 ; CHECK-NEXT:    vmov q0, q1
  91 ; CHECK-NEXT:    bx lr
  92   %c = fcmp olt <4 x float> %a, %b
  93   %s = tail call fast <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float> %b, <4 x float> %a, <4 x i1> %c)
  94   ret <4 x float> %s
  95 }
  96
  97
  98
  99 ; F16
 100
 ; maxf16: maxnum of fabs(%a) and fabs(%b) on <8 x half>.
 ; The FileCheck assertions expect the pattern to be emitted as a single
 ; vmaxnma.f16 q0, q1 followed by the return.
 101 define arm_aapcs_vfpcc <8 x half> @maxf16(<8 x half> %a, <8 x half> %b) {
 102 ; CHECK-LABEL: maxf16:
 103 ; CHECK:       @ %bb.0:
 104 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 105 ; CHECK-NEXT:    bx lr
 106   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 107   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
 108   %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %aa, <8 x half> %bb)
 109   ret <8 x half> %c
 110 }
 111
 ; maxf16_c: commuted form of maxf16 -- the maxnum operands are passed as
 ; (%bb, %aa) instead of (%aa, %bb).
 ; The FileCheck assertions expect the same vmaxnma.f16 q0, q1 as the
 ; uncommuted form, with no extra register move.
 112 define arm_aapcs_vfpcc <8 x half> @maxf16_c(<8 x half> %a, <8 x half> %b) {
 113 ; CHECK-LABEL: maxf16_c:
 114 ; CHECK:       @ %bb.0:
 115 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 116 ; CHECK-NEXT:    bx lr
 117   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 118   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
 119   %c = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %bb, <8 x half> %aa)
 120   ret <8 x half> %c
 121 }
 122
 ; minf16: minnum of fabs(%a) and fabs(%b) on <8 x half>.
 ; The FileCheck assertions expect the pattern to be emitted as a single
 ; vminnma.f16 q0, q1 followed by the return.
 123 define arm_aapcs_vfpcc <8 x half> @minf16(<8 x half> %a, <8 x half> %b) {
 124 ; CHECK-LABEL: minf16:
 125 ; CHECK:       @ %bb.0:
 126 ; CHECK-NEXT:    vminnma.f16 q0, q1
 127 ; CHECK-NEXT:    bx lr
 128   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 129   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
 130   %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %aa, <8 x half> %bb)
 131   ret <8 x half> %c
 132 }
 133
 ; minf16_c: commuted form of minf16 -- the minnum operands are passed as
 ; (%bb, %aa) instead of (%aa, %bb).
 ; The FileCheck assertions expect the same vminnma.f16 q0, q1 as the
 ; uncommuted form, with no extra register move.
 134 define arm_aapcs_vfpcc <8 x half> @minf16_c(<8 x half> %a, <8 x half> %b) {
 135 ; CHECK-LABEL: minf16_c:
 136 ; CHECK:       @ %bb.0:
 137 ; CHECK-NEXT:    vminnma.f16 q0, q1
 138 ; CHECK-NEXT:    bx lr
 139   %aa = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
 140   %bb = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %b)
 141   %c = tail call fast <8 x half> @llvm.minnum.v8f16(<8 x half> %bb, <8 x half> %aa)
 142   ret <8 x half> %c
 143 }
 144
 ; maxpredf16: calls the predicated vmaxnma intrinsic on (%a, %b) for
 ; <8 x half>, using the result of fcmp olt %a, %b as the lane mask.
 ; The FileCheck assertions expect a vpt.f16 block (gt, q1, q0) containing
 ; vmaxnmat.f16 q0, q1, with no extra register move before the return.
 145 define arm_aapcs_vfpcc <8 x half> @maxpredf16(<8 x half> %a, <8 x half> %b) {
 146 ; CHECK-LABEL: maxpredf16:
 147 ; CHECK:       @ %bb.0:
 148 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 149 ; CHECK-NEXT:    vmaxnmat.f16 q0, q1
 150 ; CHECK-NEXT:    bx lr
 151   %c = fcmp olt <8 x half> %a, %b
 152   %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
 153   ret <8 x half> %s
 154 }
 155
 ; maxpredf16_c: as maxpredf16, but the intrinsic's vector operands are
 ; swapped to (%b, %a); the mask is still fcmp olt %a, %b.
 ; The FileCheck assertions expect vmaxnmat.f16 q1, q0 followed by
 ; vmov q0, q1, i.e. the expected output keeps the swapped operand order and
 ; copies the result into the return register.
 156 define arm_aapcs_vfpcc <8 x half> @maxpredf16_c(<8 x half> %a, <8 x half> %b) {
 157 ; CHECK-LABEL: maxpredf16_c:
 158 ; CHECK:       @ %bb.0:
 159 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 160 ; CHECK-NEXT:    vmaxnmat.f16 q1, q0
 161 ; CHECK-NEXT:    vmov q0, q1
 162 ; CHECK-NEXT:    bx lr
 163   %c = fcmp olt <8 x half> %a, %b
 164   %s = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
 165   ret <8 x half> %s
 166 }
 167
 ; minpredf16: calls the predicated vminnma intrinsic on (%a, %b) for
 ; <8 x half>, using the result of fcmp olt %a, %b as the lane mask.
 ; The FileCheck assertions expect a vpt.f16 block (gt, q1, q0) containing
 ; vminnmat.f16 q0, q1, with no extra register move before the return.
 168 define arm_aapcs_vfpcc <8 x half> @minpredf16(<8 x half> %a, <8 x half> %b) {
 169 ; CHECK-LABEL: minpredf16:
 170 ; CHECK:       @ %bb.0:
 171 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 172 ; CHECK-NEXT:    vminnmat.f16 q0, q1
 173 ; CHECK-NEXT:    bx lr
 174   %c = fcmp olt <8 x half> %a, %b
 175   %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %a, <8 x half> %b, <8 x i1> %c)
 176   ret <8 x half> %s
 177 }
 178
 ; minpredf16_c: as minpredf16, but the intrinsic's vector operands are
 ; swapped to (%b, %a); the mask is still fcmp olt %a, %b.
 ; The FileCheck assertions expect vminnmat.f16 q1, q0 followed by
 ; vmov q0, q1, i.e. the expected output keeps the swapped operand order and
 ; copies the result into the return register.
 179 define arm_aapcs_vfpcc <8 x half> @minpredf16_c(<8 x half> %a, <8 x half> %b) {
 180 ; CHECK-LABEL: minpredf16_c:
 181 ; CHECK:       @ %bb.0:
 182 ; CHECK-NEXT:    vpt.f16 gt, q1, q0
 183 ; CHECK-NEXT:    vminnmat.f16 q1, q0
 184 ; CHECK-NEXT:    vmov q0, q1
 185 ; CHECK-NEXT:    bx lr
 186   %c = fcmp olt <8 x half> %a, %b
 187   %s = tail call fast <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half> %b, <8 x half> %a, <8 x i1> %c)
 188   ret <8 x half> %s
 189 }
 190
 191
 192 ; Loops
 193
 ; loop_absmax32: absolute-maximum reduction over <4 x float> vectors.
 ; The loop runs %1 >> 3 iterations and is skipped when that count is zero.
 ; A vector accumulator starts at zeroinitializer; each iteration loads
 ; <4 x float> from the current pointer, advances the pointer by 4 floats and
 ; computes maxnum(fabs(accumulator), fabs(loaded)).
 ; After the loop the accumulator is reduced with llvm.arm.mve.maxnmav (scalar
 ; seed 0.0) and the float result is stored to %2.
 ; The FileCheck assertions expect a wls/le loop whose body is
 ; vabs.f32 q1, q1 followed by vmaxnm.f32 q0, q0, q1.
 ; NOTE(review): %10 is a bitcast from ptr to ptr, which looks like a no-op;
 ; removing it would renumber every later unnamed value in this function.
 194 define void @loop_absmax32(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
 195 ; CHECK-LABEL: loop_absmax32:
 196 ; CHECK:       @ %bb.0:
 197 ; CHECK-NEXT:    .save {r7, lr}
 198 ; CHECK-NEXT:    push {r7, lr}
 199 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 200 ; CHECK-NEXT:    lsrs r1, r1, #3
 201 ; CHECK-NEXT:    wls lr, r1, .LBB16_3
 202 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 203 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 204 ; CHECK-NEXT:  .LBB16_2: @ =>This Inner Loop Header: Depth=1
 205 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 206 ; CHECK-NEXT:    vabs.f32 q1, q1
 207 ; CHECK-NEXT:    vmaxnm.f32 q0, q0, q1
 208 ; CHECK-NEXT:    le lr, .LBB16_2
 209 ; CHECK-NEXT:  .LBB16_3:
 210 ; CHECK-NEXT:    vldr s4, .LCPI16_0
 211 ; CHECK-NEXT:    vmov r0, s4
 212 ; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 213 ; CHECK-NEXT:    vmov s0, r0
 214 ; CHECK-NEXT:    vstr s0, [r2]
 215 ; CHECK-NEXT:    pop {r7, pc}
 216 ; CHECK-NEXT:    .p2align 2
 217 ; CHECK-NEXT:  @ %bb.4:
 218 ; CHECK-NEXT:  .LCPI16_0:
 219 ; CHECK-NEXT:    .long 0x00000000 @ float 0
 220   %4 = lshr i32 %1, 3
 221   %5 = icmp eq i32 %4, 0
 222   br i1 %5, label %18, label %6
 223
 224 6:                                                ; preds = %3, %6
 225   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 226   %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
 227   %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
 228   %10 = bitcast ptr %9 to ptr
 229   %11 = load <4 x float>, ptr %10, align 4
 230   %12 = getelementptr inbounds float, ptr %9, i32 4
 231   %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
 232   %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
 233   %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %14, <4 x float> %13)
 234   %16 = add nsw i32 %7, -1
 235   %17 = icmp eq i32 %16, 0
 236   br i1 %17, label %18, label %6
 237
 238 18:                                               ; preds = %6, %3
 239   %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
 240   %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
 241   store float %20, ptr %2, align 4
 242   ret void
 243 }
 244
 ; loop_absmax32_c: commuted form of loop_absmax32. The loop structure is the
 ; same (%1 >> 3 iterations, zero-initialised <4 x float> accumulator, pointer
 ; advanced by 4 floats, final llvm.arm.mve.maxnmav reduction stored to %2),
 ; but the maxnum operands are (fabs(loaded), fabs(accumulator)).
 ; The FileCheck assertions expect a wls/le loop whose body is
 ; vabs.f32 q1, q1 followed by vmaxnm.f32 q0, q1, q0.
 ; NOTE(review): %10 is a bitcast from ptr to ptr, which looks like a no-op;
 ; removing it would renumber every later unnamed value in this function.
 245 define void @loop_absmax32_c(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
 246 ; CHECK-LABEL: loop_absmax32_c:
 247 ; CHECK:       @ %bb.0:
 248 ; CHECK-NEXT:    .save {r7, lr}
 249 ; CHECK-NEXT:    push {r7, lr}
 250 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 251 ; CHECK-NEXT:    lsrs r1, r1, #3
 252 ; CHECK-NEXT:    wls lr, r1, .LBB17_3
 253 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 254 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 255 ; CHECK-NEXT:  .LBB17_2: @ =>This Inner Loop Header: Depth=1
 256 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 257 ; CHECK-NEXT:    vabs.f32 q1, q1
 258 ; CHECK-NEXT:    vmaxnm.f32 q0, q1, q0
 259 ; CHECK-NEXT:    le lr, .LBB17_2
 260 ; CHECK-NEXT:  .LBB17_3:
 261 ; CHECK-NEXT:    vldr s4, .LCPI17_0
 262 ; CHECK-NEXT:    vmov r0, s4
 263 ; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 264 ; CHECK-NEXT:    vmov s0, r0
 265 ; CHECK-NEXT:    vstr s0, [r2]
 266 ; CHECK-NEXT:    pop {r7, pc}
 267 ; CHECK-NEXT:    .p2align 2
 268 ; CHECK-NEXT:  @ %bb.4:
 269 ; CHECK-NEXT:  .LCPI17_0:
 270 ; CHECK-NEXT:    .long 0x00000000 @ float 0
 271   %4 = lshr i32 %1, 3
 272   %5 = icmp eq i32 %4, 0
 273   br i1 %5, label %18, label %6
 274
 275 6:                                                ; preds = %3, %6
 276   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 277   %8 = phi <4 x float> [ %15, %6 ], [ zeroinitializer, %3 ]
 278   %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
 279   %10 = bitcast ptr %9 to ptr
 280   %11 = load <4 x float>, ptr %10, align 4
 281   %12 = getelementptr inbounds float, ptr %9, i32 4
 282   %13 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %11)
 283   %14 = tail call fast <4 x float> @llvm.fabs.v4f32(<4 x float> %8)
 284   %15 = tail call fast <4 x float> @llvm.maxnum.v4f32(<4 x float> %13, <4 x float> %14)
 285   %16 = add nsw i32 %7, -1
 286   %17 = icmp eq i32 %16, 0
 287   br i1 %17, label %18, label %6
 288
 289 18:                                               ; preds = %6, %3
 290   %19 = phi <4 x float> [ zeroinitializer, %3 ], [ %15, %6 ]
 291   %20 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %19)
 292   store float %20, ptr %2, align 4
 293   ret void
 294 }
 295
 ; loop_absmax32_pred: predicated absolute-maximum reduction over <4 x float>.
 ; %6 is a remaining-element counter that starts at %1 and drops by 4 each
 ; iteration; the loop repeats while %6 > 4. Each iteration builds a lane mask
 ; with llvm.arm.mve.vctp32(%6), does a masked load of <4 x float> (zero
 ; passthrough), advances the pointer by 4 floats, and calls the predicated
 ; vmaxnma intrinsic on (accumulator, loaded, mask).
 ; The last vmaxnma result is reduced with llvm.arm.mve.maxnmav (scalar seed
 ; 0.0) and stored to %2.
 ; The FileCheck assertions expect a dlstp.32/letp loop whose body is the load
 ; plus an unpredicated-looking vmaxnma.f32 q0, q1, with no register move.
 ; NOTE(review): %9 is a bitcast from ptr to ptr, which looks like a no-op;
 ; removing it would renumber every later unnamed value in this function.
 296 define void @loop_absmax32_pred(ptr %0, i32 %1, ptr nocapture %2) {
 297 ; CHECK-LABEL: loop_absmax32_pred:
 298 ; CHECK:       @ %bb.0:
 299 ; CHECK-NEXT:    .save {r7, lr}
 300 ; CHECK-NEXT:    push {r7, lr}
 301 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 302 ; CHECK-NEXT:    dlstp.32 lr, r1
 303 ; CHECK-NEXT:  .LBB18_1: @ =>This Inner Loop Header: Depth=1
 304 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 305 ; CHECK-NEXT:    vmaxnma.f32 q0, q1
 306 ; CHECK-NEXT:    letp lr, .LBB18_1
 307 ; CHECK-NEXT:  @ %bb.2:
 308 ; CHECK-NEXT:    vldr s4, .LCPI18_0
 309 ; CHECK-NEXT:    vmov r0, s4
 310 ; CHECK-NEXT:    vmaxnmav.f32 r0, q0
 311 ; CHECK-NEXT:    vmov s0, r0
 312 ; CHECK-NEXT:    vstr s0, [r2]
 313 ; CHECK-NEXT:    pop {r7, pc}
 314 ; CHECK-NEXT:    .p2align 2
 315 ; CHECK-NEXT:  @ %bb.3:
 316 ; CHECK-NEXT:  .LCPI18_0:
 317 ; CHECK-NEXT:    .long 0x00000000 @ float 0
 318   br label %4
 319
 320 4:                                                ; preds = %4, %3
 321   %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
 322   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 323   %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
 324   %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
 325   %9 = bitcast ptr %7 to ptr
 326   %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
 327   %11 = getelementptr inbounds float, ptr %7, i32 4
 328   %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %5, <4 x float> %10, <4 x i1> %8)
 329   %13 = add nsw i32 %6, -4
 330   %14 = icmp sgt i32 %6, 4
 331   br i1 %14, label %4, label %15
 332
 333 15:                                               ; preds = %4
 334   %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
 335   store float %16, ptr %2, align 4
 336   ret void
 337 }
 338
 ; loop_absmax32_pred_c: commuted form of loop_absmax32_pred. The loop is the
 ; same (counter %6 from %1 dropping by 4, vctp32 mask, masked <4 x float>
 ; load, final llvm.arm.mve.maxnmav reduction stored to %2), but the predicated
 ; vmaxnma intrinsic is called on (loaded, accumulator, mask).
 ; The FileCheck assertions expect the loop body to contain
 ; vmaxnma.f32 q1, q0 followed by vmov q0, q1, and the final vmaxnmav.f32 to
 ; read q1 -- i.e. the expected output keeps the swapped operand order and
 ; pays for an extra register move per iteration.
 ; NOTE(review): %9 is a bitcast from ptr to ptr, which looks like a no-op;
 ; removing it would renumber every later unnamed value in this function.
 339 define void @loop_absmax32_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
 340 ; CHECK-LABEL: loop_absmax32_pred_c:
 341 ; CHECK:       @ %bb.0:
 342 ; CHECK-NEXT:    .save {r7, lr}
 343 ; CHECK-NEXT:    push {r7, lr}
 344 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 345 ; CHECK-NEXT:    dlstp.32 lr, r1
 346 ; CHECK-NEXT:  .LBB19_1: @ =>This Inner Loop Header: Depth=1
 347 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
 348 ; CHECK-NEXT:    vmaxnma.f32 q1, q0
 349 ; CHECK-NEXT:    vmov q0, q1
 350 ; CHECK-NEXT:    letp lr, .LBB19_1
 351 ; CHECK-NEXT:  @ %bb.2:
 352 ; CHECK-NEXT:    vldr s0, .LCPI19_0
 353 ; CHECK-NEXT:    vmov r0, s0
 354 ; CHECK-NEXT:    vmaxnmav.f32 r0, q1
 355 ; CHECK-NEXT:    vmov s0, r0
 356 ; CHECK-NEXT:    vstr s0, [r2]
 357 ; CHECK-NEXT:    pop {r7, pc}
 358 ; CHECK-NEXT:    .p2align 2
 359 ; CHECK-NEXT:  @ %bb.3:
 360 ; CHECK-NEXT:  .LCPI19_0:
 361 ; CHECK-NEXT:    .long 0x00000000 @ float 0
 362   br label %4
 363
 364 4:                                                ; preds = %4, %3
 365   %5 = phi <4 x float> [ zeroinitializer, %3 ], [ %12, %4 ]
 366   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 367   %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
 368   %8 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %6)
 369   %9 = bitcast ptr %7 to ptr
 370   %10 = tail call fast <4 x float> @llvm.masked.load.v4f32.p0(ptr %9, i32 4, <4 x i1> %8, <4 x float> zeroinitializer)
 371   %11 = getelementptr inbounds float, ptr %7, i32 4
 372   %12 = tail call fast <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float> %10, <4 x float> %5, <4 x i1> %8)
 373   %13 = add nsw i32 %6, -4
 374   %14 = icmp sgt i32 %6, 4
 375   br i1 %14, label %4, label %15
 376
 377 15:                                               ; preds = %4
 378   %16 = tail call fast float @llvm.arm.mve.maxnmav.f32.v4f32(float 0.000000e+00, <4 x float> %12)
 379   store float %16, ptr %2, align 4
 380   ret void
 381 }
 382
 383
 384
 385
 386
 387
 ; loop_absmax16: absolute-maximum reduction over <8 x half> vectors.
 ; The loop runs %1 >> 3 iterations and is skipped when that count is zero.
 ; A vector accumulator starts at zeroinitializer; each iteration loads
 ; <8 x half> from the current pointer and computes
 ; maxnum(fabs(accumulator), fabs(loaded)).
 ; After the loop the accumulator is reduced with llvm.arm.mve.maxnmav (scalar
 ; seed 0.0) and the half result is stored to %2.
 ; The FileCheck assertions expect a wls/le loop whose body is
 ; vabs.f16 q1, q1 followed by vmaxnm.f16 q0, q0, q1.
 ; NOTE(review): the pointer is advanced by 4 half elements per iteration
 ; (post-increment #8 in the expected output) while 8 halves are loaded, so
 ; consecutive loads overlap. This looks carried over from the f32 variant --
 ; confirm intent; changing it means regenerating the assertions.
 ; NOTE(review): %10 is a bitcast from ptr to ptr, which looks like a no-op.
 388 define void @loop_absmax16(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
 389 ; CHECK-LABEL: loop_absmax16:
 390 ; CHECK:       @ %bb.0:
 391 ; CHECK-NEXT:    .save {r7, lr}
 392 ; CHECK-NEXT:    push {r7, lr}
 393 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 394 ; CHECK-NEXT:    lsrs r1, r1, #3
 395 ; CHECK-NEXT:    wls lr, r1, .LBB20_3
 396 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 397 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 398 ; CHECK-NEXT:  .LBB20_2: @ =>This Inner Loop Header: Depth=1
 399 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
 400 ; CHECK-NEXT:    vabs.f16 q1, q1
 401 ; CHECK-NEXT:    vmaxnm.f16 q0, q0, q1
 402 ; CHECK-NEXT:    le lr, .LBB20_2
 403 ; CHECK-NEXT:  .LBB20_3:
 404 ; CHECK-NEXT:    vldr.16 s4, .LCPI20_0
 405 ; CHECK-NEXT:    vmov r0, s4
 406 ; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 407 ; CHECK-NEXT:    vmov s0, r0
 408 ; CHECK-NEXT:    vstr.16 s0, [r2]
 409 ; CHECK-NEXT:    pop {r7, pc}
 410 ; CHECK-NEXT:    .p2align 1
 411 ; CHECK-NEXT:  @ %bb.4:
 412 ; CHECK-NEXT:  .LCPI20_0:
 413 ; CHECK-NEXT:    .short 0x0000 @ half 0
 414   %4 = lshr i32 %1, 3
 415   %5 = icmp eq i32 %4, 0
 416   br i1 %5, label %18, label %6
 417
 418 6:                                                ; preds = %3, %6
 419   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 420   %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
 421   %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
 422   %10 = bitcast ptr %9 to ptr
 423   %11 = load <8 x half>, ptr %10, align 4
 424   %12 = getelementptr inbounds half, ptr %9, i32 4
 425   %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
 426   %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
 427   %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %14, <8 x half> %13)
 428   %16 = add nsw i32 %7, -1
 429   %17 = icmp eq i32 %16, 0
 430   br i1 %17, label %18, label %6
 431
 432 18:                                               ; preds = %6, %3
 433   %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
 434   %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
 435   store half %20, ptr %2, align 4
 436   ret void
 437 }
 438
 ; loop_absmax16_c: commuted form of loop_absmax16. The loop structure is the
 ; same (%1 >> 3 iterations, zero-initialised <8 x half> accumulator, final
 ; llvm.arm.mve.maxnmav reduction stored to %2), but the maxnum operands are
 ; (fabs(loaded), fabs(accumulator)).
 ; The FileCheck assertions expect a wls/le loop whose body is
 ; vabs.f16 q1, q1 followed by vmaxnm.f16 q0, q1, q0.
 ; NOTE(review): the pointer is advanced by 4 half elements per iteration
 ; (post-increment #8 in the expected output) while 8 halves are loaded, so
 ; consecutive loads overlap. This looks carried over from the f32 variant --
 ; confirm intent; changing it means regenerating the assertions.
 ; NOTE(review): %10 is a bitcast from ptr to ptr, which looks like a no-op.
 439 define void @loop_absmax16_c(ptr nocapture readonly %0, i32 %1, ptr nocapture %2) {
 440 ; CHECK-LABEL: loop_absmax16_c:
 441 ; CHECK:       @ %bb.0:
 442 ; CHECK-NEXT:    .save {r7, lr}
 443 ; CHECK-NEXT:    push {r7, lr}
 444 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 445 ; CHECK-NEXT:    lsrs r1, r1, #3
 446 ; CHECK-NEXT:    wls lr, r1, .LBB21_3
 447 ; CHECK-NEXT:  @ %bb.1: @ %.preheader
 448 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 449 ; CHECK-NEXT:  .LBB21_2: @ =>This Inner Loop Header: Depth=1
 450 ; CHECK-NEXT:    vldrw.u32 q1, [r0], #8
 451 ; CHECK-NEXT:    vabs.f16 q1, q1
 452 ; CHECK-NEXT:    vmaxnm.f16 q0, q1, q0
 453 ; CHECK-NEXT:    le lr, .LBB21_2
 454 ; CHECK-NEXT:  .LBB21_3:
 455 ; CHECK-NEXT:    vldr.16 s4, .LCPI21_0
 456 ; CHECK-NEXT:    vmov r0, s4
 457 ; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 458 ; CHECK-NEXT:    vmov s0, r0
 459 ; CHECK-NEXT:    vstr.16 s0, [r2]
 460 ; CHECK-NEXT:    pop {r7, pc}
 461 ; CHECK-NEXT:    .p2align 1
 462 ; CHECK-NEXT:  @ %bb.4:
 463 ; CHECK-NEXT:  .LCPI21_0:
 464 ; CHECK-NEXT:    .short 0x0000 @ half 0
 465   %4 = lshr i32 %1, 3
 466   %5 = icmp eq i32 %4, 0
 467   br i1 %5, label %18, label %6
 468
 469 6:                                                ; preds = %3, %6
 470   %7 = phi i32 [ %16, %6 ], [ %4, %3 ]
 471   %8 = phi <8 x half> [ %15, %6 ], [ zeroinitializer, %3 ]
 472   %9 = phi ptr [ %12, %6 ], [ %0, %3 ]
 473   %10 = bitcast ptr %9 to ptr
 474   %11 = load <8 x half>, ptr %10, align 4
 475   %12 = getelementptr inbounds half, ptr %9, i32 4
 476   %13 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %11)
 477   %14 = tail call fast <8 x half> @llvm.fabs.v8f16(<8 x half> %8)
 478   %15 = tail call fast <8 x half> @llvm.maxnum.v8f16(<8 x half> %13, <8 x half> %14)
 479   %16 = add nsw i32 %7, -1
 480   %17 = icmp eq i32 %16, 0
 481   br i1 %17, label %18, label %6
 482
 483 18:                                               ; preds = %6, %3
 484   %19 = phi <8 x half> [ zeroinitializer, %3 ], [ %15, %6 ]
 485   %20 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %19)
 486   store half %20, ptr %2, align 4
 487   ret void
 488 }
 489
 ; loop_absmax16_pred: predicated absolute-maximum reduction over <8 x half>.
 ; %6 is a remaining-element counter that starts at %1 and drops by 8 each
 ; iteration; the loop repeats while %6 > 8. Each iteration builds a lane mask
 ; with llvm.arm.mve.vctp16(%6), does a masked load of <8 x half> (zero
 ; passthrough), and calls the predicated vmaxnma intrinsic on
 ; (accumulator, loaded, mask).
 ; The last vmaxnma result is reduced with llvm.arm.mve.maxnmav (scalar seed
 ; 0.0) and stored to %2.
 ; The FileCheck assertions expect a dlstp.16/letp loop whose body is the load
 ; plus vmaxnma.f16 q0, q1, with no register move.
 ; NOTE(review): the pointer is advanced by 4 half elements per iteration
 ; (post-increment #8 in the expected output) while the counter drops by 8 and
 ; 8 halves are loaded. This looks carried over from the f32 variant --
 ; confirm intent; changing it means regenerating the assertions.
 ; NOTE(review): %9 is a bitcast from ptr to ptr, which looks like a no-op.
 490 define void @loop_absmax16_pred(ptr %0, i32 %1, ptr nocapture %2) {
 491 ; CHECK-LABEL: loop_absmax16_pred:
 492 ; CHECK:       @ %bb.0:
 493 ; CHECK-NEXT:    .save {r7, lr}
 494 ; CHECK-NEXT:    push {r7, lr}
 495 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 496 ; CHECK-NEXT:    dlstp.16 lr, r1
 497 ; CHECK-NEXT:  .LBB22_1: @ =>This Inner Loop Header: Depth=1
 498 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
 499 ; CHECK-NEXT:    vmaxnma.f16 q0, q1
 500 ; CHECK-NEXT:    letp lr, .LBB22_1
 501 ; CHECK-NEXT:  @ %bb.2:
 502 ; CHECK-NEXT:    vldr.16 s4, .LCPI22_0
 503 ; CHECK-NEXT:    vmov r0, s4
 504 ; CHECK-NEXT:    vmaxnmav.f16 r0, q0
 505 ; CHECK-NEXT:    vmov s0, r0
 506 ; CHECK-NEXT:    vstr.16 s0, [r2]
 507 ; CHECK-NEXT:    pop {r7, pc}
 508 ; CHECK-NEXT:    .p2align 1
 509 ; CHECK-NEXT:  @ %bb.3:
 510 ; CHECK-NEXT:  .LCPI22_0:
 511 ; CHECK-NEXT:    .short 0x0000 @ half 0
 512   br label %4
 513
 514 4:                                                ; preds = %4, %3
 515   %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
 516   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 517   %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
 518   %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
 519   %9 = bitcast ptr %7 to ptr
 520   %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
 521   %11 = getelementptr inbounds half, ptr %7, i32 4
 522   %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %5, <8 x half> %10, <8 x i1> %8)
 523   %13 = add nsw i32 %6, -8
 524   %14 = icmp sgt i32 %6, 8
 525   br i1 %14, label %4, label %15
 526
 527 15:                                               ; preds = %4
 528   %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
 529   store half %16, ptr %2, align 4
 530   ret void
 531 }
 532
 ; loop_absmax16_pred_c: commuted form of loop_absmax16_pred. The loop is the
 ; same (counter %6 from %1 dropping by 8, vctp16 mask, masked <8 x half>
 ; load, final llvm.arm.mve.maxnmav reduction stored to %2), but the predicated
 ; vmaxnma intrinsic is called on (loaded, accumulator, mask).
 ; The FileCheck assertions expect the loop body to contain
 ; vmaxnma.f16 q1, q0 followed by vmov q0, q1, and the final vmaxnmav.f16 to
 ; read q1 -- i.e. the expected output keeps the swapped operand order and
 ; pays for an extra register move per iteration.
 ; NOTE(review): the pointer is advanced by 4 half elements per iteration
 ; (post-increment #8 in the expected output) while the counter drops by 8 and
 ; 8 halves are loaded. This looks carried over from the f32 variant --
 ; confirm intent; changing it means regenerating the assertions.
 ; NOTE(review): %9 is a bitcast from ptr to ptr, which looks like a no-op.
 533 define void @loop_absmax16_pred_c(ptr %0, i32 %1, ptr nocapture %2) {
 534 ; CHECK-LABEL: loop_absmax16_pred_c:
 535 ; CHECK:       @ %bb.0:
 536 ; CHECK-NEXT:    .save {r7, lr}
 537 ; CHECK-NEXT:    push {r7, lr}
 538 ; CHECK-NEXT:    vmov.i32 q0, #0x0
 539 ; CHECK-NEXT:    dlstp.16 lr, r1
 540 ; CHECK-NEXT:  .LBB23_1: @ =>This Inner Loop Header: Depth=1
 541 ; CHECK-NEXT:    vldrh.u16 q1, [r0], #8
 542 ; CHECK-NEXT:    vmaxnma.f16 q1, q0
 543 ; CHECK-NEXT:    vmov q0, q1
 544 ; CHECK-NEXT:    letp lr, .LBB23_1
 545 ; CHECK-NEXT:  @ %bb.2:
 546 ; CHECK-NEXT:    vldr.16 s0, .LCPI23_0
 547 ; CHECK-NEXT:    vmov r0, s0
 548 ; CHECK-NEXT:    vmaxnmav.f16 r0, q1
 549 ; CHECK-NEXT:    vmov s0, r0
 550 ; CHECK-NEXT:    vstr.16 s0, [r2]
 551 ; CHECK-NEXT:    pop {r7, pc}
 552 ; CHECK-NEXT:    .p2align 1
 553 ; CHECK-NEXT:  @ %bb.3:
 554 ; CHECK-NEXT:  .LCPI23_0:
 555 ; CHECK-NEXT:    .short 0x0000 @ half 0
 556   br label %4
 557
 558 4:                                                ; preds = %4, %3
 559   %5 = phi <8 x half> [ zeroinitializer, %3 ], [ %12, %4 ]
 560   %6 = phi i32 [ %1, %3 ], [ %13, %4 ]
 561   %7 = phi ptr [ %0, %3 ], [ %11, %4 ]
 562   %8 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %6)
 563   %9 = bitcast ptr %7 to ptr
 564   %10 = tail call fast <8 x half> @llvm.masked.load.v8f16.p0(ptr %9, i32 4, <8 x i1> %8, <8 x half> zeroinitializer)
 565   %11 = getelementptr inbounds half, ptr %7, i32 4
 566   %12 = tail call fast <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half> %10, <8 x half> %5, <8 x i1> %8)
 567   %13 = add nsw i32 %6, -8
 568   %14 = icmp sgt i32 %6, 8
 569   br i1 %14, label %4, label %15
 570
 571 15:                                               ; preds = %4
 572   %16 = tail call fast half @llvm.arm.mve.maxnmav.f16.v8f16(half 0.000000e+00, <8 x half> %12)
 573   store half %16, ptr %2, align 4
 574   ret void
 575 }
 576
 577
 578
 579
 580
 581 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 582 declare <4 x float> @llvm.masked.load.v4f32.p0(ptr, i32 immarg, <4 x i1>, <4 x float>)
 583 declare <4 x float> @llvm.arm.mve.vminnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
 584 declare <4 x float> @llvm.arm.mve.vmaxnma.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>)
 585 declare float @llvm.arm.mve.maxnmav.f32.v4f32(float, <4 x float>)
 586 declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
 587 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
 588 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
 589
 590 declare <8 x i1> @llvm.arm.mve.vctp16(i32)
 591 declare <8 x half> @llvm.masked.load.v8f16.p0(ptr, i32 immarg, <8 x i1>, <8 x half>)
 592 declare <8 x half> @llvm.arm.mve.vminnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
 593 declare <8 x half> @llvm.arm.mve.vmaxnma.predicated.v8f16.v8i1(<8 x half>, <8 x half>, <8 x i1>)
 594 declare half @llvm.arm.mve.maxnmav.f16.v8f16(half, <8 x half>)
 595 declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
 596 declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
 597 declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)
 598
 599