; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
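;
; The tests below exercise the MVE tail-predication pass. In outline (a
; sketch for orientation, not FileCheck-verified), the pass replaces a lane
; mask such as:
;
;   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
;
; with a VCTP intrinsic driven by the number of elements still to process:
;
;   %elems = phi i32 [ %N, %vector.ph ], [ %remaining, %vector.body ]
;   %mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elems)
;   %remaining = sub i32 %elems, 4
;
; The %elems and %remaining names here are illustrative only.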
; CHECK-LABEL: mul_v16i8
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i8, i8* %a, i32 %index
  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i8* %tmp to <16 x i8>*
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
  %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v8i16
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index
  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
  %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: mul_v4i32
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

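; Here the loaded vectors are split into <2 x i32> halves, operated on, and
; recombined. Only the masked loads and the masked store consume the lane
; mask, so the conversion to vctp (checked below) is still expected.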
; CHECK-LABEL: split_vector
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; One of the loads now uses ult predicate.
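; The pass still converts the users of the intrinsic mask to the vctp
; predicate; the hand-written compare (%wrong) and its load are left
; untouched, as the LD1 check below shows.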
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; The store now uses ult predicate.
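; As above, the intrinsic-based masks are converted to the vctp predicate,
; while the store keeps the hand-written %wrong compare.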
; CHECK-LABEL: mismatch_store_pred
; CHECK-NOT: %num.elements = add i32 %trip.count.minus.1, 1
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph: ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; TODO: Multiple intrinsics not yet supported.
; This is currently rejected, because if the vector body is unrolled, the step
; is not what we expect:
;
;   Step value 16 doesn't match vector width 4
;
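; That is, the loop below is interleaved by 4: it computes four <4 x i1>
; masks per iteration, at %index, %index+4 (%v7), %index+8 (%v8) and
; %index+12 (%v9), while %index itself advances by 16, so the masks cannot
; currently be folded into a single vctp.
;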
; CHECK-LABEL: interleave4
; CHECK: vector.body:
; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)

define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %v0 = add i32 %N, 15
  %v1 = lshr i32 %v0, 4
  %v2 = shl nuw i32 %v1, 4
  %v3 = add i32 %v2, -16
  %v4 = lshr i32 %v3, 4
  %v5 = add nuw nsw i32 %v4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %scevgep = getelementptr i32, i32* %A, i32 8
  %scevgep30 = getelementptr i32, i32* %C, i32 8
  %scevgep37 = getelementptr i32, i32* %B, i32 8
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %v5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ]
  %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ]
  %v6 = phi i32 [ %start, %vector.ph ], [ %v15, %vector.body ]
  %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
  %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
  %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %v7 = add i32 %index, 4
  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
  %v8 = add i32 %index, 8
  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
  %v9 = add i32 %index, 12
  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
  %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
  %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1
  %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2
  %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1
  %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef)
  %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef)
  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1
  %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef)
  %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load
  %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18
  %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19
  %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20
  %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask)
  %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15)
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16)
  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17)
  %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16
  %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16
  %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16
  %v14 = add i32 %v9, 4
  %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1)
  %v16 = icmp ne i32 %v15, 0
  br i1 %v16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: const_expected_in_set_loop
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
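;
; The trip-count operand of the lane mask below is the constant 42 rather
; than a value derived from the loop trip count %N, so the loop keeps using
; @llvm.get.active.lane.mask.
;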
define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  br i1 %9, label %vector.body, label %for.cond.cleanup

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: tripcount_arg_not_invariant
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
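;
; The trip-count operand of the lane mask below is %index itself, which is
; not loop-invariant, so no vctp is formed.
;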
define dso_local void @tripcount_arg_not_invariant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %index)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

; CHECK-LABEL: addrec_base_not_zero
; CHECK: call <4 x i1> @llvm.get.active.lane.mask
; CHECK-NOT: vctp
; CHECK: ret void
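;
; The %index induction variable below starts at 1 rather than 0, so its
; AddRec base is not zero and the transform is not applied.
;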
define dso_local void @addrec_base_not_zero(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
entry:
  %cmp8 = icmp sgt i32 %N, 0
  %0 = add i32 %N, 3
  %1 = lshr i32 %0, 2
  %2 = shl nuw i32 %1, 2
  %3 = add i32 %2, -4
  %4 = lshr i32 %3, 2
  %5 = add nuw nsw i32 %4, 1
  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup

vector.ph: ; preds = %entry
  %start = call i32 @llvm.start.loop.iterations.i32(i32 %5)
  br label %vector.body

vector.body: ; preds = %vector.body, %vector.ph
  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]

  ; AddRec base is not 0:
  %index = phi i32 [ 1, %vector.ph ], [ %index.next, %vector.body ]

  %6 = phi i32 [ %start, %vector.ph ], [ %8, %vector.body ]
  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
  %9 = icmp ne i32 %8, 0
  ;br i1 %9, label %vector.body, label %for.cond.cleanup
  br i1 %9, label %vector.body, label %vector.ph

for.cond.cleanup: ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare i32 @llvm.start.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32(i32, i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32)
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32)