llvm/test/CodeGen/Thumb2/LowOverheadLoops/lsr-profitable-chain.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: llc -O3 -tail-predication=enabled -mtriple=thumbv8.1m.main -mattr=+mve,+mve.fp %s -o - | FileCheck %s
   3
   4 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
   5 target triple = "thumbv8.1m-arm-none-eabi"
   6
   7 ; Tests that LSR will not interfere with the VCTP intrinsic,
   8 ; and that this loop will correctly become tail-predicated.
   9
  10 define arm_aapcs_vfpcc float @vctpi32(ptr %0, i32 %1) {
  11 ; CHECK-LABEL: vctpi32:
  12 ; CHECK:       @ %bb.0:
  13 ; CHECK-NEXT:    push {r4, lr}
  14 ; CHECK-NEXT:    mvn r3, #31
  15 ; CHECK-NEXT:    vmov.32 q2[0], r0
  16 ; CHECK-NEXT:    movs r4, #0
  17 ; CHECK-NEXT:    subs r2, r1, #1
  18 ; CHECK-NEXT:    vadd.i32 q2, q2, r3
  19 ; CHECK-NEXT:    vidup.u32 q1, r4, #8
  20 ; CHECK-NEXT:    vmov r0, s8
  21 ; CHECK-NEXT:    vadd.i32 q1, q1, r0
  22 ; CHECK-NEXT:    vmov.i32 q0, #0x0
  23 ; CHECK-NEXT:    dlstp.32 lr, r2
  24 ; CHECK-NEXT:  .LBB0_1: @ =>This Inner Loop Header: Depth=1
  25 ; CHECK-NEXT:    vldrw.u32 q2, [q1, #32]!
  26 ; CHECK-NEXT:    vadd.f32 q0, q0, q2
  27 ; CHECK-NEXT:    letp lr, .LBB0_1
  28 ; CHECK-NEXT:  @ %bb.2:
  29 ; CHECK-NEXT:    bl vecAddAcrossF32Mve
  30 ; CHECK-NEXT:    vmov s0, r0
  31 ; CHECK-NEXT:    vcvt.f32.s32 s0, s0
  32 ; CHECK-NEXT:    vabs.f32 s0, s0
  33 ; CHECK-NEXT:    pop {r4, pc}
       ; The expected assembly above is autogenerated (see the NOTE at the top of
       ; the file); regenerate it with utils/update_llc_test_checks.py instead of
       ; editing it by hand. It wraps the loop in dlstp.32 / letp, i.e. the
       ; tail-predicated form the header comment says this test expects.
       ;
       ; Entry block (implicitly %2): build the initial gather base vector and the
       ; starting counter before entering the loop.
       ; %4 is field 0 (the <4 x i32> value) of the vidup intrinsic called with (0, 8).
  34   %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  35   %4 = extractvalue { <4 x i32>, i32 } %3, 0
       ; %5 = %1 - 1 is the starting value of the counter that feeds vctp32.
  36   %5 = add nsw i32 %1, -1
       ; Put (%0 as i32) into lane 0, subtract 32 from lane 0, then splat lane 0
       ; to all four lanes with the all-zero shuffle mask: every lane of %9 is
       ; %0 - 32.
  37   %6 = ptrtoint ptr %0 to i32
  38   %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  39   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  40   %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Initial base vector for the gather. The -32 bias pairs with the i32 32
       ; offset passed to the write-back gather in the loop.
       ; NOTE(review): presumably this makes the first access land at %0 plus the
       ; vidup lane value -- confirm against the MVE intrinsic documentation.
  41   %10 = add <4 x i32> %4, %9
  42   br label %11
  43
       ; Loop block: predicated gather load plus predicated accumulate.
  44 11:
       ; %12: counter consumed by vctp32 (starts at %1 - 1, drops by 4 per iteration).
       ; %13: running <4 x float> accumulator, starts at zero.
       ; %14: gather base vector, replaced each iteration by the written-back bases.
  45   %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  46   %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  47   %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
       ; The predicate derived from %12 gates both the gather and the add below.
  48   %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
  49   %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
       ; Field 1 of the gather result feeds the %14 phi (next base vector);
       ; field 0 is the loaded <4 x float> data.
  50   %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  51   %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
       ; Predicated add of accumulator and loaded data. %13 is also passed as the
       ; last operand.
       ; NOTE(review): presumably that operand supplies the inactive lanes -- confirm.
  52   %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
       ; Step the counter by 4; keep looping while the pre-decrement value was
       ; greater than 4 (signed compare).
  53   %20 = add nsw i32 %12, -4
  54   %21 = icmp sgt i32 %12, 4
  55   br i1 %21, label %11, label %22
  56
       ; Exit block: pass the final accumulator to @vecAddAcrossF32Mve, convert
       ; its i32 result to float, and return the absolute value.
  57 22:
  58   %23 = tail call arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(<4 x float> %19)
  59   %24 = sitofp i32 %23 to float
  60   %25 = tail call float @llvm.fabs.f32(float %24)
  61   ret float %25
  62 }
  63
  64 declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32) ; called once in @vctpi32 with (0, 8); only field 0 of the result is used
  65 declare <4 x i1> @llvm.arm.mve.vctp32(i32) ; produces the <4 x i1> loop predicate from the i32 counter
  66 declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) ; called as (base vector, i32 32, predicate); result fields are (loaded data, updated bases)
  67 declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>) ; called as (accumulator, loaded data, predicate, accumulator)
  68 declare arm_aapcs_vfpcc i32 @vecAddAcrossF32Mve(...) ; declared variadic, but the only call site passes a single <4 x float>
  69 declare float @llvm.fabs.f32(float) ; applied to the sitofp-converted result of @vecAddAcrossF32Mve