test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll

   1 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
   2
   3 ; The following functions should all fail to become tail-predicated.
     ; The VCTP intrinsic returns a predicate vector (for example <4 x i1>), not
     ; an i32, so the negative pattern below deliberately omits the call's return
     ; type; a pattern that spells "call i32" can never match and would let the
     ; test pass vacuously. The regex also tolerates an extra name component
     ; between "arm." and "vctp".
   4 ; CHECK-NOT: @llvm.arm.{{.*}}vctp
   5
   6 ; Negative test: trip.count.minus.1 has been inserted into element 1, not 0.
     ; The zero shuffle mask in vector.ph therefore broadcasts lane 0 of the
     ; insert result, which is still undef.
   7 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
   8 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
   9   %cmp8 = icmp eq i32 %N, 0
  10   %tmp8 = add i32 %N, 3
  11   %tmp9 = lshr i32 %tmp8, 2
  12   %tmp10 = shl nuw i32 %tmp9, 2
  13   %tmp11 = add i32 %tmp10, -4
  14   %tmp12 = lshr i32 %tmp11, 2
  15   %tmp13 = add nuw nsw i32 %tmp12, 1
  16   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  17
  18 vector.ph:                                        ; preds = %entry
  19   %trip.count.minus.1 = add i32 %N, -1
       ; Deviation under test: the element index is 1 instead of 0.
  20   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1
  21   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  22   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  23   br label %vector.body
  24
  25 vector.body:                                      ; preds = %vector.body, %vector.ph
  26   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  27   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Splat %index, add the per-lane offsets, and compare against the splat
       ; from vector.ph; the resulting mask %tmp1 predicates both masked loads
       ; and the masked store.
  28   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  29   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  30   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  31   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  32   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  33   %tmp2 = bitcast i32* %tmp to <4 x i32>*
  34   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  35   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  36   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  37   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  38   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  39   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  40   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  41   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  42   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
  43   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  44   %tmp16 = icmp ne i32 %tmp15, 0
  45   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  46
  47 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  48   ret void
  49 }
  50
  51 ; Negative test: the insert isn't using an undef for operand 0; it inserts
     ; into the constant vector <1, 1, 1, 1> instead.
  52 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
  53 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
  54   %cmp8 = icmp eq i32 %N, 0
  55   %tmp8 = add i32 %N, 3
  56   %tmp9 = lshr i32 %tmp8, 2
  57   %tmp10 = shl nuw i32 %tmp9, 2
  58   %tmp11 = add i32 %tmp10, -4
  59   %tmp12 = lshr i32 %tmp11, 2
  60   %tmp13 = add nuw nsw i32 %tmp12, 1
  61   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  62
  63 vector.ph:                                        ; preds = %entry
  64   %trip.count.minus.1 = add i32 %N, -1
       ; Deviation under test: the base vector is a defined constant, not undef.
  65   %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0
  66   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  67   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  68   br label %vector.body
  69
  70 vector.body:                                      ; preds = %vector.body, %vector.ph
  71   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  72   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Splat %index, add the per-lane offsets, and compare against the splat
       ; from vector.ph; the resulting mask %tmp1 predicates both masked loads
       ; and the masked store.
  73   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  74   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  75   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  76   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  77   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  78   %tmp2 = bitcast i32* %tmp to <4 x i32>*
  79   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  80   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  81   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  82   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  83   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  84   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  85   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  86   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  87   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
  88   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  89   %tmp16 = icmp ne i32 %tmp15, 0
  90   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  91
  92 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  93   ret void
  94 }
  95
  96 ; Negative test: the shuffle uses a defined value (<1, 1, 1, 1>) for
     ; operand 1 instead of undef.
  97 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
  98 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
  99   %cmp8 = icmp eq i32 %N, 0
 100   %tmp8 = add i32 %N, 3
 101   %tmp9 = lshr i32 %tmp8, 2
 102   %tmp10 = shl nuw i32 %tmp9, 2
 103   %tmp11 = add i32 %tmp10, -4
 104   %tmp12 = lshr i32 %tmp11, 2
 105   %tmp13 = add nuw nsw i32 %tmp12, 1
 106   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 107
 108 vector.ph:                                        ; preds = %entry
 109   %trip.count.minus.1 = add i32 %N, -1
 110   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
       ; Deviation under test: the second shuffle operand is a constant vector.
 111   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer
 112   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 113   br label %vector.body
 114
 115 vector.body:                                      ; preds = %vector.body, %vector.ph
 116   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 117   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Splat %index, add the per-lane offsets, and compare against the splat
       ; from vector.ph; the resulting mask %tmp1 predicates both masked loads
       ; and the masked store.
 118   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 119   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 120   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 121   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 122   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 123   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 124   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 125   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 126   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 127   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 128   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 129   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 130   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 131   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 132   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 133   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 134   %tmp16 = icmp ne i32 %tmp15, 0
 135   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 136
 137 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 138   ret void
 139 }
 140
 141 ; Negative test: the shuffle uses a non-zero mask (<1, 1, 1, 1>) for
     ; operand 2, so it broadcasts lane 1 rather than the lane that was inserted.
 142 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 143 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 144   %cmp8 = icmp eq i32 %N, 0
 145   %tmp8 = add i32 %N, 3
 146   %tmp9 = lshr i32 %tmp8, 2
 147   %tmp10 = shl nuw i32 %tmp9, 2
 148   %tmp11 = add i32 %tmp10, -4
 149   %tmp12 = lshr i32 %tmp11, 2
 150   %tmp13 = add nuw nsw i32 %tmp12, 1
 151   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 152
 153 vector.ph:                                        ; preds = %entry
 154   %trip.count.minus.1 = add i32 %N, -1
 155   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
       ; Deviation under test: the shuffle mask is all ones, not zeroinitializer.
 156   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
 157   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 158   br label %vector.body
 159
 160 vector.body:                                      ; preds = %vector.body, %vector.ph
 161   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 162   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Splat %index, add the per-lane offsets, and compare against the splat
       ; from vector.ph; the resulting mask %tmp1 predicates both masked loads
       ; and the masked store.
 163   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 164   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 165   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 166   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 167   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 168   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 169   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 170   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 171   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 172   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 173   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 174   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 175   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 176   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 177   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 178   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 179   %tmp16 = icmp ne i32 %tmp15, 0
 180   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 181
 182 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 183   ret void
 184 }
 185
 186 ; Negative test: the limit splatted in vector.ph is %N - 2 rather than
     ; %N - 1. Every other part of the splat (undef base, element 0, zero shuffle
     ; mask) is kept well-formed so that the wrong limit is the only deviation
     ; this function exercises.
 187 define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 188 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 189   %cmp8 = icmp eq i32 %N, 0
 190   %tmp8 = add i32 %N, 3
 191   %tmp9 = lshr i32 %tmp8, 2
 192   %tmp10 = shl nuw i32 %tmp9, 2
 193   %tmp11 = add i32 %tmp10, -4
 194   %tmp12 = lshr i32 %tmp11, 2
 195   %tmp13 = add nuw nsw i32 %tmp12, 1
 196   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 197
 198 vector.ph:                                        ; preds = %entry
       ; Deviation under test: the addend is -2 instead of -1.
 199   %trip.count.minus.2 = add i32 %N, -2
       ; Insert into element 0 so the splat itself is valid; inserting into any
       ; other lane would add a second, unrelated reason for rejection.
 200   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0
 201   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 202   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 203   br label %vector.body
 204
 205 vector.body:                                      ; preds = %vector.body, %vector.ph
 206   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 207   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Splat %index, add the per-lane offsets, and compare against the splat
       ; from vector.ph; the resulting mask %tmp1 predicates both masked loads
       ; and the masked store.
 208   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 209   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 210   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 211   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 212   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 213   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 214   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 215   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 216   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 217   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 218   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 219   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 220   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 221   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 222   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 223   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 224   %tmp16 = icmp ne i32 %tmp15, 0
 225   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 226
 227 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 228   ret void
 229 }
 230
 231 ; Negative test: inside the loop, index has been inserted at element 1,
     ; not 0, so the zero-mask shuffle broadcasts lane 0, which is still undef.
 232 define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 233 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 234   %cmp8 = icmp eq i32 %N, 0
 235   %tmp8 = add i32 %N, 3
 236   %tmp9 = lshr i32 %tmp8, 2
 237   %tmp10 = shl nuw i32 %tmp9, 2
 238   %tmp11 = add i32 %tmp10, -4
 239   %tmp12 = lshr i32 %tmp11, 2
 240   %tmp13 = add nuw nsw i32 %tmp12, 1
 241   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 242
 243 vector.ph:                                        ; preds = %entry
       ; The preheader splat of %N - 1 is well-formed in this function.
 244   %trip.count.minus.1 = add i32 %N, -1
 245   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 246   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 247   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 248   br label %vector.body
 249
 250 vector.body:                                      ; preds = %vector.body, %vector.ph
 251   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 252   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Deviation under test: %index is inserted at element 1 instead of 0.
 253   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1
 254   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 255   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 256   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
       ; The mask %tmp1 predicates both masked loads and the masked store.
 257   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 258   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 259   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 260   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 261   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 262   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 263   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 264   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 265   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 266   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 267   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 268   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 269   %tmp16 = icmp ne i32 %tmp15, 0
 270   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 271
 272 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 273   ret void
 274 }
 275
     ; Negative test: the value splatted inside the loop is %index + 1
     ; (%incorrect), not the induction phi %index itself.
 276 define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 277 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 278   %cmp8 = icmp eq i32 %N, 0
 279   %tmp8 = add i32 %N, 3
 280   %tmp9 = lshr i32 %tmp8, 2
 281   %tmp10 = shl nuw i32 %tmp9, 2
 282   %tmp11 = add i32 %tmp10, -4
 283   %tmp12 = lshr i32 %tmp11, 2
 284   %tmp13 = add nuw nsw i32 %tmp12, 1
 285   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 286
 287 vector.ph:                                        ; preds = %entry
       ; The preheader splat of %N - 1 is well-formed in this function.
 288   %trip.count.minus.1 = add i32 %N, -1
 289   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 290   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 291   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 292   br label %vector.body
 293
 294 vector.body:                                      ; preds = %vector.body, %vector.ph
 295   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 296   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Deviation under test: the splat source is offset by one from %index,
       ; while the address computations below still use %index.
 297   %incorrect = add i32 %index, 1
 298   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
 299   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 300   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 301   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
       ; The mask %tmp1 predicates both masked loads and the masked store.
 302   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 303   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 304   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 305   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 306   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 307   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 308   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 309   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 310   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 311   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 312   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 313   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 314   %tmp16 = icmp ne i32 %tmp15, 0
 315   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 316
 317 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 318   ret void
 319 }
 320
 321 ; Negative test: the vector icmp that forms the lane mask now uses ult,
     ; not ule.
 322 define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 323 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 324   %cmp8 = icmp eq i32 %N, 0
 325   %tmp8 = add i32 %N, 3
 326   %tmp9 = lshr i32 %tmp8, 2
 327   %tmp10 = shl nuw i32 %tmp9, 2
 328   %tmp11 = add i32 %tmp10, -4
 329   %tmp12 = lshr i32 %tmp11, 2
 330   %tmp13 = add nuw nsw i32 %tmp12, 1
 331   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 332
 333 vector.ph:                                        ; preds = %entry
       ; The preheader splat of %N - 1 is well-formed in this function.
 334   %trip.count.minus.1 = add i32 %N, -1
 335   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 336   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 337   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 338   br label %vector.body
 339
 340 vector.body:                                      ; preds = %vector.body, %vector.ph
 341   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 342   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
 343   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 344   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 345   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 346   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
       ; Deviation under test: strict unsigned less-than against the %N - 1
       ; splat. The mask %tmp1 predicates both masked loads and the masked store.
 347   %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11
 348   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 349   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 350   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 351   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 352   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 353   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 354   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 355   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 356   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 357   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 358   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 359   %tmp16 = icmp ne i32 %tmp15, 0
 360   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 361
 362 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 363   ret void
 364 }
 365
 366 ; Negative test: the add in the body uses the lane offsets <1, 2, 3, 4>
     ; instead of the <0, 1, 2, 3> used by the other functions in this file.
 367 define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 368 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 369   %cmp8 = icmp eq i32 %N, 0
 370   %tmp8 = add i32 %N, 3
 371   %tmp9 = lshr i32 %tmp8, 2
 372   %tmp10 = shl nuw i32 %tmp9, 2
 373   %tmp11 = add i32 %tmp10, -4
 374   %tmp12 = lshr i32 %tmp11, 2
 375   %tmp13 = add nuw nsw i32 %tmp12, 1
 376   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 377
 378 vector.ph:                                        ; preds = %entry
       ; The preheader splat of %N - 1 is well-formed in this function.
 379   %trip.count.minus.1 = add i32 %N, -1
 380   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 381   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 382   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 383   br label %vector.body
 384
 385 vector.body:                                      ; preds = %vector.body, %vector.ph
 386   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 387   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
 388   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 389   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Deviation under test: the per-lane offsets start at 1 instead of 0.
 390   %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4>
 391   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
       ; The mask %tmp1 predicates both masked loads and the masked store.
 392   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 393   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 394   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 395   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 396   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 397   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 398   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 399   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 400   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 401   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 402   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 403   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 404   %tmp16 = icmp ne i32 %tmp15, 0
 405   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 406
 407 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 408   ret void
 409 }
 410
 411 ; Negative test: using a variable (the %offsets argument) rather than a
     ; constant vector for the per-lane offsets added in the loop body.
 412 define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
 413 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 414   %cmp8 = icmp eq i32 %N, 0
 415   %tmp8 = add i32 %N, 3
 416   %tmp9 = lshr i32 %tmp8, 2
 417   %tmp10 = shl nuw i32 %tmp9, 2
 418   %tmp11 = add i32 %tmp10, -4
 419   %tmp12 = lshr i32 %tmp11, 2
 420   %tmp13 = add nuw nsw i32 %tmp12, 1
 421   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 422
 423 vector.ph:                                        ; preds = %entry
       ; The preheader splat of %N - 1 is well-formed in this function.
 424   %trip.count.minus.1 = add i32 %N, -1
 425   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 426   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 427   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 428   br label %vector.body
 429
 430 vector.body:                                      ; preds = %vector.body, %vector.ph
 431   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 432   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
 433   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 434   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Deviation under test: the offsets come from a function argument, so
       ; their values are unknown at compile time.
 435   %induction = add <4 x i32> %broadcast.splat, %offsets
 436   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
       ; The mask %tmp1 predicates both masked loads and the masked store.
 437   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 438   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 439   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 440   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 441   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 442   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 443   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 444   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 445   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 446   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 447   %index.next = add i32 %index, 4
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 448   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 449   %tmp16 = icmp ne i32 %tmp15, 0
 450   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 451
 452 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 453   ret void
 454 }
 455
 456 ; Negative test: adding 5, instead of 4, to index on each iteration, so
     ; the step no longer equals the number of lanes in the <4 x i32> vectors.
 457 define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 458 entry:
       ; %tmp8..%tmp13 derive from %N the value handed to
       ; llvm.set.loop.iterations in vector.ph; the loop is skipped when %N is 0.
 459   %cmp8 = icmp eq i32 %N, 0
 460   %tmp8 = add i32 %N, 3
 461   %tmp9 = lshr i32 %tmp8, 2
 462   %tmp10 = shl nuw i32 %tmp9, 2
 463   %tmp11 = add i32 %tmp10, -4
 464   %tmp12 = lshr i32 %tmp11, 2
 465   %tmp13 = add nuw nsw i32 %tmp12, 1
 466   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 467
 468 vector.ph:                                        ; preds = %entry
       ; The preheader splat of %N - 1 is well-formed in this function.
 469   %trip.count.minus.1 = add i32 %N, -1
 470   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 471   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 472   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 473   br label %vector.body
 474
 475 vector.body:                                      ; preds = %vector.body, %vector.ph
 476   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 477   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Splat %index, add the per-lane offsets, and compare against the splat
       ; from vector.ph; the resulting mask %tmp1 predicates both masked loads
       ; and the masked store.
 478   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 479   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 480   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 481   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 482   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 483   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 484   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 485   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 486   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 487   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 488   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 489   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 490   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 491   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
       ; Deviation under test: the induction step is 5 instead of 4.
 492   %index.next = add i32 %index, 5
       ; Loop control: decrement the counter seeded with %tmp13 and exit at 0.
 493   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 494   %tmp16 = icmp ne i32 %tmp15, 0
 495   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 496
 497 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 498   ret void
 499 }
 500
     ; Intrinsics called by the functions above: the masked vector load and
     ; store, plus the loop-iteration setup and decrement calls.
     ; NOTE(review): attribute groups #1, #2 and #3 are referenced here but no
     ; matching "attributes" definitions are visible in this file - confirm they
     ; are intentionally omitted.
 501 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1
 502 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2
 503 declare void @llvm.set.loop.iterations.i32(i32) #3
 504 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3
 505