llvm/test/Transforms/LoopStrengthReduce/ARM/vctp-chains.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -mtriple=thumbv8.1m.main -mattr=+mve %s -S -loop-reduce -o - | FileCheck %s
     ; The command above applies opt's -loop-reduce pass to this file for a
     ; thumbv8.1m.main target with the MVE attribute enabled, prints textual IR
     ; (-S) to stdout, and pipes it to the assertion matcher.
   3 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
   4 target triple = "thumbv8.1m-arm-none-eabi"
   5
   6 define float @vctp8(float* %0, i32 %1) {
     ; Loop whose predicate is produced by @llvm.arm.mve.vctp8 from the loop
     ; counter. The <16 x i1> result is passed through @v16i1_to_v4i1 (declared
     ; without a body in this file) to obtain the <4 x i1> mask used by the
     ; predicated gather load and the predicated add.
     ; The expected output below matches the input IR except in the loop latch:
     ; the exit compare is emitted before the counter decrement, and the
     ; decrement no longer carries the nsw flag.
   7 ; CHECK-LABEL: @vctp8(
   8 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
   9 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
  10 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
  11 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
  12 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
  13 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
  14 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
  15 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
  16 ; CHECK-NEXT:    br label [[TMP11:%.*]]
  17 ; CHECK:       11:
  18 ; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
  19 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
  20 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
  21 ; CHECK-NEXT:    [[TMP15:%.*]] = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 [[TMP12]])
  22 ; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> [[TMP15]])
  23 ; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
  24 ; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
  25 ; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
  26 ; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
  27 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
  28 ; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
  29 ; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
  30 ; CHECK:       22:
  31 ; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
  32 ; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
  33 ; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
  34 ; CHECK-NEXT:    ret float [[TMP25]]
  35 ;
       ; Preheader: %5 = %1 - 1 is the initial loop counter. %10, the initial
       ; address vector, is field 0 of the vidup result plus a splat of
       ; (ptrtoint %0) - 32.
       ; NOTE(review): the -32 presumably offsets the gather's immediate of 32
       ; on the first iteration - confirm against the intrinsic's definition.
  36   %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  37   %4 = extractvalue { <4 x i32>, i32 } %3, 0
  38   %5 = add nsw i32 %1, -1
  39   %6 = ptrtoint float* %0 to i32
  40   %7 = insertelement <4 x i32> undef, i32 %6, i32 0
  41   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
  42   %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
  43   %10 = add <4 x i32> %4, %9
  44   br label %11
  45
  46 11:                                               ; preds = %11, %2
       ; Loop-carried values: %12 is the counter, %13 the previous predicated
       ; add result (zero on entry), %14 the gather address vector.
  47   %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
  48   %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
  49   %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
       ; Predicate for this iteration, computed from the counter and then
       ; converted from <16 x i1> to the <4 x i1> mask.
  50   %15 = tail call <16 x i1> @llvm.arm.mve.vctp8(i32 %12)
  51   %mask = tail call <4 x i1> @v16i1_to_v4i1(<16 x i1> %15)
       ; Predicated gather with writeback: field 1 of the result feeds the %14
       ; phi for the next iteration, field 0 is the loaded <4 x float>.
  52   %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
  53   %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
  54   %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
  55   %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
       ; Latch: the counter steps down by 4; the loop repeats while the
       ; pre-decrement counter is greater than 4 (signed compare).
  56   %20 = add nsw i32 %12, -4
  57   %21 = icmp sgt i32 %12, 4
  58   br i1 %21, label %11, label %22
  59
  60 22:                                               ; preds = %11
       ; Exit: the final vector is passed to @vecAddAcrossF32Mve, called through
       ; a bitcast of its variadic declaration; the i32 result is converted to
       ; float and its absolute value is returned.
  61   %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
  62   %24 = sitofp i32 %23 to float
  63   %25 = tail call float @llvm.fabs.f32(float %24)
  64   ret float %25
  65 }
  66
  67 define float @vctp16(float* %0, i32 %1) {
     ; Loop whose predicate is produced by @llvm.arm.mve.vctp16 from the loop
     ; counter. The <8 x i1> result is passed through @v8i1_to_v4i1 (declared
     ; without a body in this file) to obtain the <4 x i1> mask used by the
     ; predicated gather load and the predicated add.
     ; The expected output below matches the input IR except in the loop latch:
     ; the exit compare is emitted before the counter decrement, and the
     ; decrement no longer carries the nsw flag.
  68 ; CHECK-LABEL: @vctp16(
  69 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  70 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
  71 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
  72 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
  73 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
  74 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
  75 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
  76 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
  77 ; CHECK-NEXT:    br label [[TMP11:%.*]]
  78 ; CHECK:       11:
  79 ; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
  80 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
  81 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
  82 ; CHECK-NEXT:    [[TMP15:%.*]] = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 [[TMP12]])
  83 ; CHECK-NEXT:    [[MASK:%.*]] = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> [[TMP15]])
  84 ; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[MASK]])
  85 ; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
  86 ; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
  87 ; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[MASK]], <4 x float> [[TMP13]])
  88 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
  89 ; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
  90 ; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
  91 ; CHECK:       22:
  92 ; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
  93 ; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
  94 ; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
  95 ; CHECK-NEXT:    ret float [[TMP25]]
  96 ;
       ; Preheader: %5 = %1 - 1 is the initial loop counter. %10, the initial
       ; address vector, is field 0 of the vidup result plus a splat of
       ; (ptrtoint %0) - 32.
  97   %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
  98   %4 = extractvalue { <4 x i32>, i32 } %3, 0
  99   %5 = add nsw i32 %1, -1
 100   %6 = ptrtoint float* %0 to i32
 101   %7 = insertelement <4 x i32> undef, i32 %6, i32 0
 102   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
 103   %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
 104   %10 = add <4 x i32> %4, %9
 105   br label %11
 106
 107 11:                                               ; preds = %11, %2
       ; Loop-carried values: %12 is the counter, %13 the previous predicated
       ; add result (zero on entry), %14 the gather address vector.
 108   %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
 109   %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
 110   %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
       ; Predicate for this iteration, computed from the counter and then
       ; converted from <8 x i1> to the <4 x i1> mask.
 111   %15 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %12)
 112   %mask = tail call <4 x i1> @v8i1_to_v4i1(<8 x i1> %15)
       ; Predicated gather with writeback: field 1 of the result feeds the %14
       ; phi for the next iteration, field 0 is the loaded <4 x float>.
 113   %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %mask)
 114   %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
 115   %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
 116   %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %mask, <4 x float> %13)
       ; Latch: the counter steps down by 4; the loop repeats while the
       ; pre-decrement counter is greater than 4 (signed compare).
 117   %20 = add nsw i32 %12, -4
 118   %21 = icmp sgt i32 %12, 4
 119   br i1 %21, label %11, label %22
 120
 121 22:                                               ; preds = %11
       ; Exit: the final vector is passed to @vecAddAcrossF32Mve, called through
       ; a bitcast of its variadic declaration; the i32 result is converted to
       ; float and its absolute value is returned.
 122   %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
 123   %24 = sitofp i32 %23 to float
 124   %25 = tail call float @llvm.fabs.f32(float %24)
 125   ret float %25
 126 }
 127
 128 define float @vctpi32(float* %0, i32 %1) {
     ; Loop whose predicate is produced by @llvm.arm.mve.vctp32 from the loop
     ; counter. Its <4 x i1> result is used directly as the mask of the
     ; predicated gather load and the predicated add; no conversion call is
     ; involved.
     ; The expected output below matches the input IR except in the loop latch:
     ; the exit compare is emitted before the counter decrement, and the
     ; decrement no longer carries the nsw flag.
 129 ; CHECK-LABEL: @vctpi32(
 130 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
 131 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
 132 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
 133 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
 134 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
 135 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
 136 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
 137 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
 138 ; CHECK-NEXT:    br label [[TMP11:%.*]]
 139 ; CHECK:       11:
 140 ; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
 141 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
 142 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
 143 ; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP12]])
 144 ; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
 145 ; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
 146 ; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
 147 ; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
 148 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
 149 ; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
 150 ; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
 151 ; CHECK:       22:
 152 ; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
 153 ; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
 154 ; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
 155 ; CHECK-NEXT:    ret float [[TMP25]]
 156 ;
       ; Preheader: %5 = %1 - 1 is the initial loop counter. %10, the initial
       ; address vector, is field 0 of the vidup result plus a splat of
       ; (ptrtoint %0) - 32.
 157   %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
 158   %4 = extractvalue { <4 x i32>, i32 } %3, 0
 159   %5 = add nsw i32 %1, -1
 160   %6 = ptrtoint float* %0 to i32
 161   %7 = insertelement <4 x i32> undef, i32 %6, i32 0
 162   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
 163   %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
 164   %10 = add <4 x i32> %4, %9
 165   br label %11
 166
 167 11:                                               ; preds = %11, %2
       ; Loop-carried values: %12 is the counter, %13 the previous predicated
       ; add result (zero on entry), %14 the gather address vector.
 168   %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
 169   %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
 170   %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
       ; Predicate for this iteration, computed from the counter; already
       ; <4 x i1>, so it is consumed as-is below.
 171   %15 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %12)
       ; Predicated gather with writeback: field 1 of the result feeds the %14
       ; phi for the next iteration, field 0 is the loaded <4 x float>.
 172   %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
 173   %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
 174   %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
 175   %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
       ; Latch: the counter steps down by 4; the loop repeats while the
       ; pre-decrement counter is greater than 4 (signed compare).
 176   %20 = add nsw i32 %12, -4
 177   %21 = icmp sgt i32 %12, 4
 178   br i1 %21, label %11, label %22
 179
 180 22:                                               ; preds = %11
       ; Exit: the final vector is passed to @vecAddAcrossF32Mve, called through
       ; a bitcast of its variadic declaration; the i32 result is converted to
       ; float and its absolute value is returned.
 181   %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
 182   %24 = sitofp i32 %23 to float
 183   %25 = tail call float @llvm.fabs.f32(float %24)
 184   ret float %25
 185 }
 186
 187
 188 define float @vctpi64(float* %0, i32 %1) {
     ; Loop whose predicate is produced by @llvm.arm.mve.vctp64 from the loop
     ; counter. Its result, declared in this file as <4 x i1>, is used directly
     ; as the mask of the predicated gather load and the predicated add.
     ; The expected output below matches the input IR except in the loop latch:
     ; the exit compare is emitted before the counter decrement, and the
     ; decrement no longer carries the nsw flag.
     ; NOTE(review): the <4 x i1> return type of vctp64 is taken from this
     ; file's declaration; other LLVM releases may declare the intrinsic with a
     ; different predicate type - confirm against the targeted LLVM version.
 189 ; CHECK-LABEL: @vctpi64(
 190 ; CHECK-NEXT:    [[TMP3:%.*]] = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
 191 ; CHECK-NEXT:    [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0
 192 ; CHECK-NEXT:    [[TMP5:%.*]] = add nsw i32 [[TMP1:%.*]], -1
 193 ; CHECK-NEXT:    [[TMP6:%.*]] = ptrtoint float* [[TMP0:%.*]] to i32
 194 ; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> undef, i32 [[TMP6]], i32 0
 195 ; CHECK-NEXT:    [[TMP8:%.*]] = add <4 x i32> [[TMP7]], <i32 -32, i32 undef, i32 undef, i32 undef>
 196 ; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> undef, <4 x i32> zeroinitializer
 197 ; CHECK-NEXT:    [[TMP10:%.*]] = add <4 x i32> [[TMP4]], [[TMP9]]
 198 ; CHECK-NEXT:    br label [[TMP11:%.*]]
 199 ; CHECK:       11:
 200 ; CHECK-NEXT:    [[TMP12:%.*]] = phi i32 [ [[TMP5]], [[TMP2:%.*]] ], [ [[TMP21:%.*]], [[TMP11]] ]
 201 ; CHECK-NEXT:    [[TMP13:%.*]] = phi <4 x float> [ zeroinitializer, [[TMP2]] ], [ [[TMP19:%.*]], [[TMP11]] ]
 202 ; CHECK-NEXT:    [[TMP14:%.*]] = phi <4 x i32> [ [[TMP10]], [[TMP2]] ], [ [[TMP17:%.*]], [[TMP11]] ]
 203 ; CHECK-NEXT:    [[TMP15:%.*]] = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 [[TMP12]])
 204 ; CHECK-NEXT:    [[TMP16:%.*]] = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP14]], i32 32, <4 x i1> [[TMP15]])
 205 ; CHECK-NEXT:    [[TMP17]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 1
 206 ; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP16]], 0
 207 ; CHECK-NEXT:    [[TMP19]] = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> [[TMP13]], <4 x float> [[TMP18]], <4 x i1> [[TMP15]], <4 x float> [[TMP13]])
 208 ; CHECK-NEXT:    [[TMP20:%.*]] = icmp sgt i32 [[TMP12]], 4
 209 ; CHECK-NEXT:    [[TMP21]] = add i32 [[TMP12]], -4
 210 ; CHECK-NEXT:    br i1 [[TMP20]], label [[TMP11]], label [[TMP22:%.*]]
 211 ; CHECK:       22:
 212 ; CHECK-NEXT:    [[TMP23:%.*]] = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> [[TMP19]])
 213 ; CHECK-NEXT:    [[TMP24:%.*]] = sitofp i32 [[TMP23]] to float
 214 ; CHECK-NEXT:    [[TMP25:%.*]] = tail call float @llvm.fabs.f32(float [[TMP24]])
 215 ; CHECK-NEXT:    ret float [[TMP25]]
 216 ;
       ; Preheader: %5 = %1 - 1 is the initial loop counter. %10, the initial
       ; address vector, is field 0 of the vidup result plus a splat of
       ; (ptrtoint %0) - 32.
 217   %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 0, i32 8)
 218   %4 = extractvalue { <4 x i32>, i32 } %3, 0
 219   %5 = add nsw i32 %1, -1
 220   %6 = ptrtoint float* %0 to i32
 221   %7 = insertelement <4 x i32> undef, i32 %6, i32 0
 222   %8 = add <4 x i32> %7, <i32 -32, i32 undef, i32 undef, i32 undef>
 223   %9 = shufflevector <4 x i32> %8, <4 x i32> undef, <4 x i32> zeroinitializer
 224   %10 = add <4 x i32> %4, %9
 225   br label %11
 226
 227 11:                                               ; preds = %11, %2
       ; Loop-carried values: %12 is the counter, %13 the previous predicated
       ; add result (zero on entry), %14 the gather address vector.
 228   %12 = phi i32 [ %5, %2 ], [ %20, %11 ]
 229   %13 = phi <4 x float> [ zeroinitializer, %2 ], [ %19, %11 ]
 230   %14 = phi <4 x i32> [ %10, %2 ], [ %17, %11 ]
       ; Predicate for this iteration, computed from the counter and consumed
       ; as-is below.
 231   %15 = tail call <4 x i1> @llvm.arm.mve.vctp64(i32 %12)
       ; Predicated gather with writeback: field 1 of the result feeds the %14
       ; phi for the next iteration, field 0 is the loaded <4 x float>.
 232   %16 = tail call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %14, i32 32, <4 x i1> %15)
 233   %17 = extractvalue { <4 x float>, <4 x i32> } %16, 1
 234   %18 = extractvalue { <4 x float>, <4 x i32> } %16, 0
 235   %19 = tail call <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float> %13, <4 x float> %18, <4 x i1> %15, <4 x float> %13)
       ; Latch: the counter steps down by 4; the loop repeats while the
       ; pre-decrement counter is greater than 4 (signed compare).
 236   %20 = add nsw i32 %12, -4
 237   %21 = icmp sgt i32 %12, 4
 238   br i1 %21, label %11, label %22
 239
 240 22:                                               ; preds = %11
       ; Exit: the final vector is passed to @vecAddAcrossF32Mve, called through
       ; a bitcast of its variadic declaration; the i32 result is converted to
       ; float and its absolute value is returned.
 241   %23 = tail call i32 bitcast (i32 (...)* @vecAddAcrossF32Mve to i32 (<4 x float>)*)(<4 x float> %19)
 242   %24 = sitofp i32 %23 to float
 243   %25 = tail call float @llvm.fabs.f32(float %24)
 244   ret float %25
 245 }
 246
 247 declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
     ; Predicate-producing intrinsics, one per test function above; each takes
     ; the i32 loop counter as its only operand.
 248 declare <16 x i1> @llvm.arm.mve.vctp8(i32)
 249 declare <8 x i1> @llvm.arm.mve.vctp16(i32)
 250 declare <4 x i1> @llvm.arm.mve.vctp32(i32)
 251 declare <4 x i1> @llvm.arm.mve.vctp64(i32)
     ; Predicated gather-with-writeback load and predicated add used in every
     ; loop body of this file.
 252 declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>)
 253 declare <4 x float> @llvm.arm.mve.add.predicated.v4f32.v4i1(<4 x float>, <4 x float>, <4 x i1>, <4 x float>)
     ; Non-intrinsic functions declared without definitions here: the variadic
     ; callee invoked in each exit block through a bitcast, and the two helpers
     ; that take an <8 x i1> / <16 x i1> predicate and return <4 x i1>.
 254 declare i32 @vecAddAcrossF32Mve(...)
 255 declare <4 x i1> @v8i1_to_v4i1(<8 x i1>)
 256 declare <4 x i1> @v16i1_to_v4i1(<16 x i1>)
 257 declare float @llvm.fabs.f32(float)