; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
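
; Check that the MVE tail-predication pass converts the vectorizer's
; icmp-ule lane masks into calls to the @llvm.arm.vctp* intrinsics on the
; number of elements still to be processed, and that the masked loads and
; stores are rewritten to use the VCTP-generated predicate.
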
; CHECK-LABEL: mul_v16i8
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
; CHECK: tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> {{.*}}, <16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v16i8(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  ; %tmp13 is the hardware-loop iteration count, i.e. ceil(N / 16).
  %tmp8 = add i32 %N, 15
  %tmp9 = lshr i32 %tmp8, 4
  %tmp10 = shl nuw i32 %tmp9, 4
  %tmp11 = add i32 %tmp10, -16
  %tmp12 = lshr i32 %tmp11, 4
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <16 x i32> %broadcast.splatinsert10, <16 x i32> undef, <16 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer
  %induction = add <16 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %tmp = getelementptr inbounds i8, i8* %a, i32 %index
  %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i8* %tmp to <16 x i8>*
  %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index
  %tmp4 = bitcast i8* %tmp3 to <16 x i8>*
  %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %tmp1, <16 x i8> undef)
  %mul = mul nsw <16 x i8> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index
  %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
  tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %tmp1)
  %index.next = add i32 %index, 16
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
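
; The same multiply at 8 x i16: the expected intrinsic is now vctp16, with
; the element count reduced by 8 per iteration.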
; CHECK-LABEL: mul_v8i16
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
; CHECK: tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> {{.*}}, <8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v8i16(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i16* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 7
  %tmp9 = lshr i32 %tmp8, 3
  %tmp10 = shl nuw i32 %tmp9, 3
  %tmp11 = add i32 %tmp10, -8
  %tmp12 = lshr i32 %tmp11, 3
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %tmp = getelementptr inbounds i16, i16* %a, i32 %index
  %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i16* %tmp to <8 x i16>*
  %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  %mul = mul nsw <8 x i16> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i16, i16* %c, i32 %index
  %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
  tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %tmp1)
  %index.next = add i32 %index, 8
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
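
; And at 4 x i32: the expected intrinsic is vctp32, consuming 4 elements
; per iteration.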
; CHECK-LABEL: mul_v4i32
; CHECK: vector.body:
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mul_v4i32(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %mul = mul nsw <4 x i32> %wide.masked.load2, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
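
; A plain copy through a masked load and store at 2 x i64, expected to use
; vctp64 with two elements consumed per iteration.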
; CHECK-LABEL: copy_v2i64
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 1
  %tmp9 = lshr i32 %tmp8, 1
  %tmp10 = shl nuw i32 %tmp9, 1
  %tmp11 = add i32 %tmp10, -2
  %tmp12 = lshr i32 %tmp11, 1
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <2 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <2 x i32> %broadcast.splatinsert10, <2 x i32> undef, <2 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
  %induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
  %tmp1 = icmp ule <2 x i32> %induction, %broadcast.splat11
  %tmp = getelementptr inbounds i64, i64* %a, i32 %index
  %tmp2 = bitcast i64* %tmp to <2 x i64>*
  %wide.masked.load = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %tmp2, i32 4, <2 x i1> %tmp1, <2 x i64> undef)
  %tmp3 = getelementptr inbounds i64, i64* %b, i32 %index
  %tmp7 = bitcast i64* %tmp3 to <2 x i64>*
  tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %wide.masked.load, <2 x i64>* %tmp7, i32 4, <2 x i1> %tmp1)
  %index.next = add i32 %index, 2
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
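
; The loaded vectors are split into low and high halves with shufflevectors
; and recombined before the store. The loop should still be tail-predicated,
; with every masked memory operation rewritten to use the vctp predicate.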
; CHECK-LABEL: split_vector
; CHECK: vector.body:
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @split_vector(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.1.high = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %extract.2.low = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
  %extract.2.high = shufflevector <4 x i32> %wide.masked.load2, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  %mul = mul nsw <2 x i32> %extract.1.low, %extract.2.low
  %sub = sub nsw <2 x i32> %extract.1.high, %extract.2.high
  %combine = shufflevector <2 x i32> %mul, <2 x i32> %sub, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; One of the loads now uses the ult predicate.
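; That mask is not equivalent to the loop predicate, so it should be left
; untouched while the matching uses are still rewritten to the vctp.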
; CHECK-LABEL: mismatch_load_pred
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]])
define dso_local arm_aapcs_vfpcc void @mismatch_load_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %wrong, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

; The store now uses the ult predicate.
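; As above, but with the mismatch on the store: the loads should get the
; vctp predicate while the store keeps %wrong.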
; CHECK-LABEL: mismatch_store_pred
; CHECK: %index = phi i32
; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong)
define dso_local arm_aapcs_vfpcc void @mismatch_store_pred(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
entry:
  %cmp8 = icmp eq i32 %N, 0
  %tmp8 = add i32 %N, 3
  %tmp9 = lshr i32 %tmp8, 2
  %tmp10 = shl nuw i32 %tmp9, 2
  %tmp11 = add i32 %tmp10, -4
  %tmp12 = lshr i32 %tmp11, 2
  %tmp13 = add nuw nsw i32 %tmp12, 1
  br i1 %cmp8, label %for.cond.cleanup, label %vector.ph

vector.ph:                                        ; preds = %entry
  %trip.count.minus.1 = add i32 %N, -1
  %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
  %tmp2 = bitcast i32* %tmp to <4 x i32>*
  %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong)
  %index.next = add i32 %index, 4
  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  %tmp16 = icmp ne i32 %tmp15, 0
  br i1 %tmp16, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v2i64.p0v2i64(<2 x i64>, <2 x i64>*, i32 immarg, <2 x i1>)
declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>)
declare void @llvm.set.loop.iterations.i32(i32)
declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32)