llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
   3
   4 define dso_local void @foo(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Positive test with a constant element count. The input loop runs 8001
     ; iterations and predicates its masked loads/store with a lane mask built
     ; from %index and the constant 32003 (8000 * 4 < 32003 <= 8001 * 4).
     ; The assertions below expect the lane mask call to be replaced by
     ; @llvm.arm.mve.vctp32 on an element counter that starts at 32003 and is
     ; decremented by 4 per iteration, and expect the %index phi and its
     ; increment to be gone from the loop body.
   5 ; CHECK-LABEL: @foo(
   6 ; CHECK-NEXT:  entry:
   7 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
   8 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
   9 ; CHECK:       vector.body:
  10 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
  11 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
  12 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
  13 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
  14 ; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
  15 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
  16 ; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
  17 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
  18 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
  19 ; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
  20 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP2]])
  21 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
  22 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
  23 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
  24 ; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
  25 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
  26 ; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
  27 ; CHECK:       for.cond.cleanup:
  28 ; CHECK-NEXT:    ret void
  29 ;
  30 entry:
  31   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  32   br label %vector.body
  33
  34 vector.body:
  35   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  36   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  37   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  38   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  39   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
     ; Predicate for the four lanes starting at %index, against the constant
     ; element count 32003. This is the call expected to become a vctp32:
  40   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  41   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  42   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  43   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  44   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
     ; %index advances by 4, matching the 4 lanes of the <4 x i1> mask:
  45   %index.next = add i32 %index, 4
  46   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  47   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  48   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  49   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  50   %4 = icmp ne i32 %3, 0
  51   br i1 %4, label %vector.body, label %for.cond.cleanup
  52
  53 for.cond.cleanup:
  54   ret void
  55 }
  56
  57 ; Silly test case: the loop count is constant and a multiple of the vectorisation
  58 ; factor. So, the vectoriser should not produce masked loads/stores and there's
  59 ; nothing to tail-predicate here, just checking.
  60 define dso_local void @foo2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; The loop body uses plain (unmasked) <4 x i32> loads and a plain store and
     ; contains no lane mask; it is set up for 2000 iterations. The assertions
     ; below expect the function to come out of the pass unchanged.
  61 ; CHECK-LABEL: @foo2(
  62 ; CHECK-NEXT:  entry:
  63 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  64 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  65 ; CHECK:       vector.body:
  66 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
  67 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
  68 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
  69 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
  70 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[LSR_IV]], align 4
  71 ; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[LSR_IV11]], align 4
  72 ; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
  73 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[LSR_IV14]], align 4
  74 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
  75 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
  76 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
  77 ; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
  78 ; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
  79 ; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
  80 ; CHECK:       for.cond.cleanup:
  81 ; CHECK-NEXT:    ret void
  82 ;
  83 entry:
  84   %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  85   br label %vector.body
  86
  87 vector.body:
  88   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  89   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  90   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  91   %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
     ; Unpredicated vector memory accesses: there is no mask operand to rewrite.
  92   %wide.load = load <4 x i32>, ptr %lsr.iv, align 4
  93   %wide.load9 = load <4 x i32>, ptr %lsr.iv11, align 4
  94   %1 = add nsw <4 x i32> %wide.load9, %wide.load
  95   store <4 x i32> %1, ptr %lsr.iv14, align 4
  96   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  97   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  98   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  99   %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 100   %3 = icmp ne i32 %2, 0
 101   br i1 %3, label %vector.body, label %for.cond.cleanup
 102
 103 for.cond.cleanup:
 104   ret void
 105 }
 106
 107 ; Check that the icmp has to be a ult: the ugt compare below is expected to be left unchanged
 108 define dso_local void @foo3(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; The predicate of the masked loads/store is computed by hand (a splat of
     ; %index plus <0, 1, 2, 3>, compared against a constant) instead of by
     ; @llvm.get.active.lane.mask, and the comparison is an unsigned
     ; greater-than. The assertions below expect the whole loop body, including
     ; that icmp ugt, to be left unchanged.
 109 ; CHECK-LABEL: @foo3(
 110 ; CHECK-NEXT:  entry:
 111 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 112 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 113 ; CHECK:       vector.body:
 114 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 115 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 116 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 117 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 118 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 119 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 120 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 121 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 122 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], <i32 32002, i32 32002, i32 32002, i32 32002>
 123 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 124 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 125 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
 126 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
 127 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 128 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
 129 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
 130 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
 131 ; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
 132 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 133 ; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
 134 ; CHECK:       for.cond.cleanup:
 135 ; CHECK-NEXT:    ret void
 136 ;
 137 entry:
 138   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 139   br label %vector.body
 140
 141 vector.body:
 142   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 143   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 144   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 145   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 146   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
     ; Build the per-lane index vector <%index, %index+1, %index+2, %index+3>:
 147   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 148   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 149   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 150
 151 ; UGT here: lanes are enabled when their index is above 32002, i.e. this is
     ; not the "index below a bound" form, and it is expected to stay as it is:
 152   %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
 153
 154   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 155   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 156   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 157   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 158   %index.next = add i32 %index, 4
 159   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 160   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 161   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 162   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 163   %4 = icmp ne i32 %3, 0
 164   br i1 %4, label %vector.body, label %for.cond.cleanup
 165
 166 for.cond.cleanup:
 167   ret void
 168 }
 169
 170 define dso_local void @foo5(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; The predicate is a hand-written icmp ult of the per-lane index vector
     ; against a constant vector whose elements differ
     ; (<0, 3200, 32002, 32002>), i.e. not a splat of a single bound. The
     ; assertions below expect the loop body, including that compare, to be
     ; left unchanged.
 171 ; CHECK-LABEL: @foo5(
 172 ; CHECK-NEXT:  entry:
 173 ; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 174 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 175 ; CHECK:       vector.body:
 176 ; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
 177 ; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
 178 ; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
 179 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 180 ; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 181 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 182 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 183 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
 184 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
 185 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 186 ; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
 187 ; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
 188 ; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
 189 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 190 ; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
 191 ; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
 192 ; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
 193 ; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
 194 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
 195 ; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
 196 ; CHECK:       for.cond.cleanup:
 197 ; CHECK-NEXT:    ret void
 198 ;
 199 entry:
 200   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 201   br label %vector.body
 202
 203 vector.body:
 204   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 205   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 206   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 207   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 208   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
     ; Build the per-lane index vector <%index, %index+1, %index+2, %index+3>:
 209   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 210   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 211   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
     ; ULT, but each lane is compared against a different constant:
 212   %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
 213   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 214   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 215   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 216   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 217   %index.next = add i32 %index, 4
 218   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 219   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 220   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 221   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 222   %4 = icmp ne i32 %3, 0
 223   br i1 %4, label %vector.body, label %for.cond.cleanup
 224
 225 for.cond.cleanup:
 226   ret void
 227 }
 228
 229 ; CHECK-LABEL: @inconsistent_tripcounts(
 230 ; CHECK:       vector.body:
 231 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 232 ; CHECK:       @llvm.get.active.lane.mask
 233 ; CHECK:       ret void
 234 ;
 235 define dso_local void @inconsistent_tripcounts(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the assertions for this function expect the
     ; @llvm.get.active.lane.mask call to remain in vector.body and no
     ; @llvm.arm.mve.vctp32 to be introduced.
 236 entry:
 237   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 238   br label %vector.body
 239
 240 vector.body:
 241   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 242   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 243   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 244   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 245   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 246 ; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow. The constant
     ; 4294967295 also does not agree with the 8001 iterations requested in the
     ; entry block (8001 iterations of 4 lanes cover at most 32004 elements):
 247   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
 248   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 249   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 250   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 251   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 252   %index.next = add i32 %index, 4
 253   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 254   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 255   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 256   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 257   %4 = icmp ne i32 %3, 0
 258   br i1 %4, label %vector.body, label %for.cond.cleanup
 259
 260 for.cond.cleanup:
 261   ret void
 262 }
 263
 264 ; CHECK-LABEL: @overflow_in_sub(
 265 ; CHECK:       vector.body:
 266 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 267 ; CHECK:       @llvm.get.active.lane.mask
 268 ; CHECK:       ret void
 269 ;
 270 define dso_local void @overflow_in_sub(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the assertions for this function expect the
     ; @llvm.get.active.lane.mask call to remain in vector.body and no
     ; @llvm.arm.mve.vctp32 to be introduced.
     ; NOTE(review): the "sub" in the name presumably refers to a subtraction
     ; the pass computes from the two constants below - confirm against the
     ; MVETailPredication sources.
 271 entry:
     ; 1073741824 iterations of 4 lanes is 2^32 elements, which does not fit in
     ; an i32, while the lane mask below uses the element count 32003:
 272   %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
 273   br label %vector.body
 274
 275 vector.body:
 276   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 277   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 278   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 279   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 280   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 281   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
 282   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 283   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 284   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 285   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 286   %index.next = add i32 %index, 4
 287   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 288   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 289   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 290   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 291   %4 = icmp ne i32 %3, 0
 292   br i1 %4, label %vector.body, label %for.cond.cleanup
 293
 294 for.cond.cleanup:
 295   ret void
 296 }
 297
 298
 299 ; CHECK-LABEL: @IV_not_an_induction(
 300 ; CHECK:       vector.body:
 301 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 302 ; CHECK:       @llvm.get.active.lane.mask
 303 ; CHECK:       ret void
 304 ;
 305 define dso_local void @IV_not_an_induction(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the assertions for this function expect the
     ; @llvm.get.active.lane.mask call to remain in vector.body and no
     ; @llvm.arm.mve.vctp32 to be introduced.
 306 entry:
 307   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 308   br label %vector.body
 309
 310 vector.body:
 311   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 312   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 313   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 314   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 315   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 316 ; The first lane-mask operand is the function argument %N, which is not an
     ; induction variable of this loop (%index is not used by the mask at all):
 317   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
 318   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 319   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 320   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 321   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 322   %index.next = add i32 %index, 4
 323   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 324   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 325   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 326   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 327   %4 = icmp ne i32 %3, 0
 328   br i1 %4, label %vector.body, label %for.cond.cleanup
 329
 330 for.cond.cleanup:
 331   ret void
 332 }
 333
 334 ; CHECK-LABEL: @IV_wrong_step(
 335 ; CHECK:       vector.body:
 336 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 337 ; CHECK:       @llvm.get.active.lane.mask
 338 ; CHECK:       ret void
 339 ;
 340 define dso_local void @IV_wrong_step(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the assertions for this function expect the
     ; @llvm.get.active.lane.mask call to remain in vector.body and no
     ; @llvm.arm.mve.vctp32 to be introduced.
 341 entry:
 342   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 343   br label %vector.body
 344
 345 vector.body:
 346   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 347   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 348   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 349   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 350   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 351   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
 352   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 353   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 354   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 355   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 356
 357 ; %index is incremented by 3 and not by 4, which is the vectorisation factor
 358 ; that we expect here (the mask has 4 lanes and the pointers advance by 4):
 359   %index.next = add i32 %index, 3
 360
 361   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 362   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 363   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 364   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 365   %4 = icmp ne i32 %3, 0
 366   br i1 %4, label %vector.body, label %for.cond.cleanup
 367
 368 for.cond.cleanup:
 369   ret void
 370 }
 371
 372 ; CHECK-LABEL: @IV_step_not_constant(
 373 ; CHECK:       vector.body:
 374 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 375 ; CHECK:       @llvm.get.active.lane.mask
 376 ; CHECK:       ret void
 377 ;
 378 define dso_local void @IV_step_not_constant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
     ; Negative test: the assertions for this function expect the
     ; @llvm.get.active.lane.mask call to remain in vector.body and no
     ; @llvm.arm.mve.vctp32 to be introduced.
 379 entry:
 380   %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
 381   br label %vector.body
 382
 383 vector.body:
 384   %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
 385   %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
 386   %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
 387   %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
 388   %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
 389   %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
 390   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
 391   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
 392   %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
 393   call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
 394
 395 ; %index is incremented by the function argument %N, a runtime value, i.e. not a constant:
 396   %index.next = add i32 %index, %N
 397
 398   %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
 399   %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
 400   %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
 401   %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 402   %4 = icmp ne i32 %3, 0
 403   br i1 %4, label %vector.body, label %for.cond.cleanup
 404
 405 for.cond.cleanup:
 406   ret void
 407 }
 408
 409 ; CHECK-LABEL: @outerloop_phi(
 410 ; CHECK:       vector.body:
 411 ; CHECK-NOT:   @llvm.arm.mve.vctp32
 412 ; CHECK:       @llvm.get.active.lane.mask
 413 ; CHECK:       ret void
 414 ;
 415 define dso_local void @outerloop_phi(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
     ; Negative test on a two-level loop nest: vector.ph / for.cond.cleanup3 form
     ; the outer loop (counter %j.025, exits when it reaches %N) and vector.body
     ; is the inner loop. The assertions for this function expect the
     ; @llvm.get.active.lane.mask call to remain in vector.body and no
     ; @llvm.arm.mve.vctp32 to be introduced.
 416 entry:
     ; Skip the whole nest when %N is zero.
 417   %cmp24 = icmp eq i32 %N, 0
 418   br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader
 419
 420 vector.ph.preheader:                              ; preds = %entry
 421   br label %vector.ph
 422
 423 vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
 424   %lsr.iv36 = phi ptr [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
 425   %lsr.iv31 = phi ptr [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
 426   %lsr.iv = phi ptr [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
 427   %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
     ; The inner loop is set up afresh for 1025 iterations on every outer iteration.
 428   %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
 429   br label %vector.body
 430
 431 vector.body:                                      ; preds = %vector.body, %vector.ph
 432   %lsr.iv38 = phi ptr [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
 433   %lsr.iv33 = phi ptr [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
 434   %lsr.iv28 = phi ptr [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
 435   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 436   %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
 437 ; The lane mask uses %j.025, the induction variable of the outer loop, rather
     ; than %index, the induction variable of this (inner) loop:
 438   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
 439   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv38, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 440   %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv33, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
 441   %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
 442   call void @llvm.masked.store.v4i32.p0(<4 x i32> %1, ptr %lsr.iv28, i32 4, <4 x i1> %active.lane.mask)
 443   %index.next = add i32 %index, 4
 444   %scevgep29 = getelementptr i32, ptr %lsr.iv28, i32 4
 445   %scevgep34 = getelementptr i32, ptr %lsr.iv33, i32 4
 446   %scevgep39 = getelementptr i32, ptr %lsr.iv38, i32 4
 447   %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
 448   %3 = icmp ne i32 %2, 0
 449   br i1 %3, label %vector.body, label %for.cond.cleanup3
 450
 451 for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
 452   ret void
 453
 454 for.cond.cleanup3:                                ; preds = %vector.body
     ; Outer-loop latch: bump %j.025 and advance each base pointer by one i32.
 455   %inc11 = add nuw i32 %j.025, 1
 456   %scevgep = getelementptr i32, ptr %lsr.iv, i32 1
 457   %scevgep32 = getelementptr i32, ptr %lsr.iv31, i32 1
 458   %scevgep37 = getelementptr i32, ptr %lsr.iv36, i32 1
 459   %exitcond26 = icmp eq i32 %inc11, %N
 460   br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
 461 }
 462
 463
 464 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1 ; (pointer, alignment, mask, passthru)
 465 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2 ; (value, pointer, alignment, mask)
 466 declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 ) ; hardware-loop counter minus the second operand; result feeds the counter phi
 467 declare i32 @llvm.start.loop.iterations.i32(i32) ; hardware-loop setup: takes the iteration count, result seeds the counter phi
 468 declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) ; (base index, element count): lane i is active iff base + i is below the count (unsigned)