llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
   3
   4 define dso_local void @foo(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Positive test: the loop runs a constant 8001 iterations and its lane mask is
     ; computed from %index and the constant 32003. The expected output replaces
     ; @llvm.get.active.lane.mask with @llvm.arm.mve.vctp32 on a counter that
     ; starts at 32003 and is decremented by 4 per iteration, and it no longer
     ; contains the %index phi or its increment.
   5 ; CHECK-LABEL: @foo(
   6 ; CHECK-NEXT:  entry:
   7 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   8 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
   9 ; CHECK:       vector.body:
  10 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
  11 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
  12 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
  13 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
  14 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
  15 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
  16 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
  17 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
  18 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
  19 ; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
  20 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
  21 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
  22 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
  23 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP4]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP2]])
  24 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
  25 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
  26 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
  27 ; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
  28 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
  29 ; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
  30 ; CHECK:       for.cond.cleanup:
  31 ; CHECK-NEXT:    ret void
  32 ;
  33 entry:
  34   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  35   br label %vector.body
  36
  37 vector.body:
  38   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  39   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  40   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
  41   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  42   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  43   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  44   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
  45   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
       ; This is the call that the expected output replaces with vctp32; its
       ; result predicates both masked loads and the masked store below.
  46   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  47   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
  48   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
  49   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  50   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
       ; %index advances by 4 per iteration, matching the <4 x i32> vector width.
  51   %index.next = add i32 %index, 4
  52   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  53   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
  54   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  55   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  56   %4 = icmp ne i32 %3, 0
  57   br i1 %4, label %vector.body, label %for.cond.cleanup
  58
  59 for.cond.cleanup:
  60   ret void
  61 }
  62
  63 ; Silly test case: the loop count is constant and a multiple of the vectorisation
  64 ; factor. So, the vectoriser should not produce masked loads/stores and there's
  65 ; nothing to tail-predicate here, just checking.
  66 define dso_local void @foo2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; The input loop runs a constant 2000 iterations, uses plain (unmasked)
     ; vector loads and stores, and has no lane mask. The expected output mirrors
     ; the input instruction for instruction.
  67 ; CHECK-LABEL: @foo2(
  68 ; CHECK-NEXT:  entry:
  69 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  70 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  71 ; CHECK:       vector.body:
  72 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
  73 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
  74 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
  75 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
  76 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
  77 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
  78 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
  79 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV10]], align 4
  80 ; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, <4 x i32>* [[LSR_IV1113]], align 4
  81 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
  82 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[LSR_IV1416]], align 4
  83 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
  84 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
  85 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
  86 ; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
  87 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
  88 ; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
  89 ; CHECK:       for.cond.cleanup:
  90 ; CHECK-NEXT:    ret void
  91 ;
  92 entry:
  93   %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  94   br label %vector.body
  95
  96 vector.body:
  97   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
  98   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
  99   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 100   %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
 101   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 102   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 103   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
       ; Unpredicated accesses: there is no mask operand anywhere in this loop.
 104   %wide.load = load <4 x i32>, <4 x i32>* %lsr.iv10, align 4
 105   %wide.load9 = load <4 x i32>, <4 x i32>* %lsr.iv1113, align 4
 106   %1 = add nsw <4 x i32> %wide.load9, %wide.load
 107   store <4 x i32> %1, <4 x i32>* %lsr.iv1416, align 4
 108   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 109   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 110   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 111   %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 112   %3 = icmp ne i32 %2, 0
 113   br i1 %3, label %vector.body, label %for.cond.cleanup
 114
 115 for.cond.cleanup:
 116   ret void
 117 }
 118
 119 ; Negative test: the lane mask is built with an icmp ugt rather than a ult, and the expected output leaves the loop unchanged (no vctp32).
 120 define dso_local void @foo3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 121 ; CHECK-LABEL: @foo3(
 122 ; CHECK-NEXT:  entry:
 123 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 124 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 125 ; CHECK:       vector.body:
 126 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 127 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 128 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 129 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 130 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 131 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 132 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 133 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
 134 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 135 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 136 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 137 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], <i32 32002, i32 32002, i32 32002, i32 32002>
 138 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 139 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 140 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
 141 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
 142 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 143 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
 144 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
 145 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
 146 ; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
 147 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 148 ; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
 149 ; CHECK:       for.cond.cleanup:
 150 ; CHECK-NEXT:    ret void
 151 ;
 152 entry:
 153   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 154   br label %vector.body
 155
 156 vector.body:
 157   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 158   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 159   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 160   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 161   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 162   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 163   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 164   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
       ; Splat %index across all four lanes, then add <0, 1, 2, 3> to obtain the
       ; per-lane element indices.
 165   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 166   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 167   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 168
 169 ; UGT here (the expected output keeps this compare as the mask):
 170   %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
 171
 172   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 173   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 174   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 175   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 176   %index.next = add i32 %index, 4
 177   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 178   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 179   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 180   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 181   %4 = icmp ne i32 %3, 0
 182   br i1 %4, label %vector.body, label %for.cond.cleanup
 183
 184 for.cond.cleanup:
 185   ret void
 186 }
 187
 188 define dso_local void @foo5(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the lane mask is an icmp ult, but its right-hand operand
     ; <0, 3200, 32002, 32002> is not a splat. The expected output keeps the icmp
     ; as the mask and contains no vctp32 call.
 189 ; CHECK-LABEL: @foo5(
 190 ; CHECK-NEXT:  entry:
 191 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 192 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 193 ; CHECK:       vector.body:
 194 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi i32* [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 195 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi i32* [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 196 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi i32* [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 197 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 198 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 199 ; CHECK-NEXT:    [[LSR_IV1416:%.*]] = bitcast i32* [[LSR_IV14]] to <4 x i32>*
 200 ; CHECK-NEXT:    [[LSR_IV1113:%.*]] = bitcast i32* [[LSR_IV11]] to <4 x i32>*
 201 ; CHECK-NEXT:    [[LSR_IV10:%.*]] = bitcast i32* [[LSR_IV]] to <4 x i32>*
 202 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 203 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 204 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 205 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
 206 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV10]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 207 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[LSR_IV1113]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 208 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
 209 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32>* [[LSR_IV1416]], i32 4, <4 x i1> [[TMP1]])
 210 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 211 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, i32* [[LSR_IV]], i32 4
 212 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, i32* [[LSR_IV11]], i32 4
 213 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, i32* [[LSR_IV14]], i32 4
 214 ; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
 215 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 216 ; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
 217 ; CHECK:       for.cond.cleanup:
 218 ; CHECK-NEXT:    ret void
 219 ;
 220 entry:
 221   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 222   br label %vector.body
 223
 224 vector.body:
 225   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 226   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 227   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 228   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 229   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 230   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 231   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 232   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
 233   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 234   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 235   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
       ; ult compare against a constant vector whose lanes differ (not a splat):
 236   %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
 237   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 238   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 239   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 240   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 241   %index.next = add i32 %index, 4
 242   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 243   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 244   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 245   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 246   %4 = icmp ne i32 %3, 0
 247   br i1 %4, label %vector.body, label %for.cond.cleanup
 248
 249 for.cond.cleanup:
 250   ret void
 251 }
 252
 253 ; CHECK-LABEL: @inconsistent_tripcounts(
 254 ; CHECK:       vector.body:
 255 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 256 ; CHECK:       @llvm.get.active.lane.mask
 257 ; CHECK:       ret void
 258 ;
 259 define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the loop iteration count is 8001 while the lane mask's
     ; second operand is 4294967295, so the two do not describe the same trip
     ; count. The lane mask call is expected to survive, with no vctp32 emitted
     ; ahead of it in vector.body.
 260 entry:
 261   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 262   br label %vector.body
 263
 264 vector.body:
 265   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 266   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 267   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 268   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 269   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 270   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 271   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 272   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
 273 ; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
 274   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
 275   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 276   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 277   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 278   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 279   %index.next = add i32 %index, 4
 280   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 281   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 282   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 283   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 284   %4 = icmp ne i32 %3, 0
 285   br i1 %4, label %vector.body, label %for.cond.cleanup
 286
 287 for.cond.cleanup:
 288   ret void
 289 }
 290
 291 ; CHECK-LABEL: @overflow_in_sub(
 292 ; CHECK:       vector.body:
 293 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 294 ; CHECK:       @llvm.get.active.lane.mask
 295 ; CHECK:       ret void
 296 ;
 297 define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the loop iteration count is 1073741824 (2^30) while the lane
     ; mask's second operand is only 32003. Multiplying 2^30 by the vector width
     ; of 4 gives 2^32, which does not fit in an i32. The lane mask call is
     ; expected to survive, with no vctp32 emitted ahead of it in vector.body.
     ; NOTE(review): going by the function name, this presumably makes a
     ; subtraction in the pass's trip-count consistency check overflow; confirm
     ; against the MVE tail-predication pass.
 298 entry:
 299   %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
 300   br label %vector.body
 301
 302 vector.body:
 303   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 304   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 305   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 306   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 307   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 308   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 309   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 310   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
 311   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
 312   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 313   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 314   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 315   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 316   %index.next = add i32 %index, 4
 317   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 318   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 319   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 320   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 321   %4 = icmp ne i32 %3, 0
 322   br i1 %4, label %vector.body, label %for.cond.cleanup
 323
 324 for.cond.cleanup:
 325   ret void
 326 }
 327
 328
 329 ; CHECK-LABEL: @IV_not_an_induction(
 330 ; CHECK:       vector.body:
 331 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 332 ; CHECK:       @llvm.get.active.lane.mask
 333 ; CHECK:       ret void
 334 ;
 335 define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the lane mask's first operand is the function argument %N
     ; instead of the loop's %index phi. The lane mask call is expected to
     ; survive, with no vctp32 emitted ahead of it in vector.body.
 336 entry:
 337   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 338   br label %vector.body
 339
 340 vector.body:
 341   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 342   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 343   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 344   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 345   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 346   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 347   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 348   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
 349 ; The first operand, %N, is a function argument and not an induction variable:
 350   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
 351   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 352   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 353   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 354   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 355   %index.next = add i32 %index, 4
 356   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 357   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 358   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 359   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 360   %4 = icmp ne i32 %3, 0
 361   br i1 %4, label %vector.body, label %for.cond.cleanup
 362
 363 for.cond.cleanup:
 364   ret void
 365 }
 366
 367 ; CHECK-LABEL: @IV_wrong_step(
 368 ; CHECK:       vector.body:
 369 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 370 ; CHECK:       @llvm.get.active.lane.mask
 371 ; CHECK:       ret void
 372 ;
 373 define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: %index feeds the lane mask but is stepped by 3 per
     ; iteration, which does not match the 4-lane vectors used by the loads and
     ; the store. The lane mask call is expected to survive, with no vctp32
     ; emitted ahead of it in vector.body.
 374 entry:
 375   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 376   br label %vector.body
 377
 378 vector.body:
 379   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 380   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 381   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 382   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 383   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 384   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 385   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 386   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
 387   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
 388   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 389   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 390   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 391   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 392
 393 ; %index is incremented by 3 and not by 4, which is the vectorisation factor
 394 ; that we expect here:
 395   %index.next = add i32 %index, 3
 396
 397   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 398   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 399   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 400   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 401   %4 = icmp ne i32 %3, 0
 402   br i1 %4, label %vector.body, label %for.cond.cleanup
 403
 404 for.cond.cleanup:
 405   ret void
 406 }
 407
 408 ; CHECK-LABEL: @IV_step_not_constant(
 409 ; CHECK:       vector.body:
 410 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 411 ; CHECK:       @llvm.get.active.lane.mask
 412 ; CHECK:       ret void
 413 ;
 414 define dso_local void @IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: %index feeds the lane mask but is stepped by the function
     ; argument %N rather than by a constant. The lane mask call is expected to
     ; survive, with no vctp32 emitted ahead of it in vector.body.
 415 entry:
 416   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 417   br label %vector.body
 418
 419 vector.body:
 420   %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
 421   %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
 422   %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
 423   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 424   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 425   %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
 426   %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
 427   %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
 428   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
 429   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
 430   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
 431   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 432   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
 433
 434 ; %index is incremented by a runtime value (%N), i.e. not by a constant:
 435   %index.next = add i32 %index, %N
 436
 437   %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
 438   %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
 439   %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
 440   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 441   %4 = icmp ne i32 %3, 0
 442   br i1 %4, label %vector.body, label %for.cond.cleanup
 443
 444 for.cond.cleanup:
 445   ret void
 446 }
 447
 448 ; CHECK-LABEL: @outerloop_phi(
 449 ; CHECK:       vector.body:
 450 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 451 ; CHECK:       @llvm.get.active.lane.mask
 452 ; CHECK:       ret void
 453 ;
 454 define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
     ; Negative test on a two-level loop nest: the inner loop (vector.body) has
     ; its own %index phi, but its lane mask is computed from %j.025, the counter
     ; phi of the outer loop (vector.ph / for.cond.cleanup3). The lane mask call
     ; is expected to survive, with no vctp32 emitted ahead of it in vector.body.
 455 entry:
       ; Skip both loops entirely when %N is zero.
 456   %cmp24 = icmp eq i32 %N, 0
 457   br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader
 458
 459 vector.ph.preheader:                              ; preds = %entry
 460   br label %vector.ph
 461
     ; Outer loop header: %j.025 counts outer iterations from 0, and the three
     ; pointers advance by one i32 per outer iteration (see for.cond.cleanup3).
 462 vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
 463   %lsr.iv36 = phi i32* [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
 464   %lsr.iv31 = phi i32* [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
 465   %lsr.iv = phi i32* [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
 466   %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
 467   %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
 468   br label %vector.body
 469
     ; Inner loop: runs the constant 1025 iterations set up in vector.ph.
 470 vector.body:                                      ; preds = %vector.body, %vector.ph
 471   %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
 472   %lsr.iv33 = phi i32* [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
 473   %lsr.iv28 = phi i32* [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
 474   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 475   %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
 476   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
 477   %lsr.iv3335 = bitcast i32* %lsr.iv33 to <4 x i32>*
 478   %lsr.iv2830 = bitcast i32* %lsr.iv28 to <4 x i32>*
 479 ; It's using %j.025, the induction variable from its outer loop:
 480   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
 481   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 482   %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv3335, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 483   %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
 484   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %1, <4 x i32>* %lsr.iv2830, i32 4, <4 x i1> %active.lane.mask)
       ; %index is still stepped by 4, but nothing other than this increment
       ; reads it.
 485   %index.next = add i32 %index, 4
 486   %scevgep29 = getelementptr i32, i32* %lsr.iv28, i32 4
 487   %scevgep34 = getelementptr i32, i32* %lsr.iv33, i32 4
 488   %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 4
 489   %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 490   %3 = icmp ne i32 %2, 0
 491   br i1 %3, label %vector.body, label %for.cond.cleanup3
 492
 493 for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
 494   ret void
 495
     ; Outer loop latch: bump %j.025 and the base pointers, exit once %inc11 == %N.
 496 for.cond.cleanup3:                                ; preds = %vector.body
 497   %inc11 = add nuw i32 %j.025, 1
 498   %scevgep = getelementptr i32, i32* %lsr.iv, i32 1
 499   %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 1
 500   %scevgep37 = getelementptr i32, i32* %lsr.iv36, i32 1
 501   %exitcond26 = icmp eq i32 %inc11, %N
 502   br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
 503 }
 504
 505
 506 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 507 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
 508 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
 509 declare i32 @llvm.start.loop.iterations.i32(i32)
 510 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)