llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll

   1 ; NOTE: The negative check below is hand-written; it was not produced by an update_*_test_checks.py script.
   2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
   3
   4 ; The following functions should all fail to become tail-predicated, so the output must not mention any MVE VCTP intrinsic (these are named llvm.arm.mve.vctp* and return a predicate vector, not an i32).
   5 ; CHECK-NOT: @llvm.arm.mve.vctp
   6
   7 ; trip.count.minus.1 has been inserted into element 1, not 0, so the zero-mask shuffle in the preheader splats an undef lane instead of %N - 1.
   8 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
   9 entry:
  10   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
  11   %tmp8 = add i32 %N, 3
  12   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
  13   %tmp10 = shl nuw i32 %tmp9, 2
  14   %tmp11 = add i32 %tmp10, -4
  15   %tmp12 = lshr i32 %tmp11, 2
  16   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
  17   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  18
  19 vector.ph:                                        ; preds = %entry
  20   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
  21   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1 ; deviation under test: written to lane 1, not lane 0
  22   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; broadcasts lane 0 of the insert result
  23   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  24   br label %vector.body
  25
  26 vector.body:                                      ; preds = %vector.body, %vector.ph
  27   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
  28   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
  29   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  30   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  31   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
  32   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  33   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
  34   %tmp2 = bitcast i32* %tmp to <4 x i32>*
  35   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  36   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  37   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  38   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  39   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
  40   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  41   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  42   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  43   %index.next = add i32 %index, 4
  44   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  45   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
  46   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  47
  48 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  49   ret void
  50 }
  51
  52 ; The insert isn't using an undef for operand 0: %N - 1 is inserted into the constant vector <1, 1, 1, 1> instead.
  53 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
  54 entry:
  55   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
  56   %tmp8 = add i32 %N, 3
  57   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
  58   %tmp10 = shl nuw i32 %tmp9, 2
  59   %tmp11 = add i32 %tmp10, -4
  60   %tmp12 = lshr i32 %tmp11, 2
  61   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
  62   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  63
  64 vector.ph:                                        ; preds = %entry
  65   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
  66   %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0 ; deviation under test: base vector is a constant, not undef
  67   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  68   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  69   br label %vector.body
  70
  71 vector.body:                                      ; preds = %vector.body, %vector.ph
  72   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
  73   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
  74   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  75   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  76   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
  77   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
  78   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
  79   %tmp2 = bitcast i32* %tmp to <4 x i32>*
  80   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  81   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
  82   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
  83   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  84   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
  85   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  86   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
  87   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
  88   %index.next = add i32 %index, 4
  89   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  90   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
  91   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  92
  93 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  94   ret void
  95 }
  96
  97 ; The shuffle uses a defined value for operand 1: the second shuffle input is the constant <1, 1, 1, 1> rather than undef.
  98 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
  99 entry:
 100   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 101   %tmp8 = add i32 %N, 3
 102   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 103   %tmp10 = shl nuw i32 %tmp9, 2
 104   %tmp11 = add i32 %tmp10, -4
 105   %tmp12 = lshr i32 %tmp11, 2
 106   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 107   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 108
 109 vector.ph:                                        ; preds = %entry
 110   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 111   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 112   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer ; deviation under test: second input is a constant, not undef
 113   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 114   br label %vector.body
 115
 116 vector.body:                                      ; preds = %vector.body, %vector.ph
 117   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 118   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 119   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 120   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 121   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
 122   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 123   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 124   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 125   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 126   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 127   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 128   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 129   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 130   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 131   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 132   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 133   %index.next = add i32 %index, 4
 134   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 135   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 136   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 137
 138 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 139   ret void
 140 }
 141
 142 ; The shuffle uses a non zero value for operand 2: the mask is <1, 1, 1, 1>, so lane 1 of the insert result is broadcast, not lane 0.
 143 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 144 entry:
 145   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 146   %tmp8 = add i32 %N, 3
 147   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 148   %tmp10 = shl nuw i32 %tmp9, 2
 149   %tmp11 = add i32 %tmp10, -4
 150   %tmp12 = lshr i32 %tmp11, 2
 151   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 152   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 153
 154 vector.ph:                                        ; preds = %entry
 155   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 156   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 157   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; deviation under test: mask selects lane 1, not lane 0
 158   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 159   br label %vector.body
 160
 161 vector.body:                                      ; preds = %vector.body, %vector.ph
 162   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 163   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 164   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 165   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 166   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
 167   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 168   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 169   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 170   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 171   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 172   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 173   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 174   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 175   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 176   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 177   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 178   %index.next = add i32 %index, 4
 179   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 180   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 181   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 182
 183 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 184   ret void
 185 }
 186
 187 ; %N - 2: the preheader splats %N - 2 rather than %N - 1. The insert (undef base, lane 0) and shuffle (undef, zero mask) are otherwise a plain splat, so the wrong scalar is the only deviation.
 188 define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 189 entry:
 190   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 191   %tmp8 = add i32 %N, 3
 192   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 193   %tmp10 = shl nuw i32 %tmp9, 2
 194   %tmp11 = add i32 %tmp10, -4
 195   %tmp12 = lshr i32 %tmp11, 2
 196   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 197   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 198
 199 vector.ph:                                        ; preds = %entry
 200   %trip.count.minus.2 = add i32 %N, -2 ; deviation under test: N - 2 instead of N - 1
 201   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0 ; lane 0, so the shuffle below really broadcasts N - 2
 202   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 203   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 204   br label %vector.body
 205
 206 vector.body:                                      ; preds = %vector.body, %vector.ph
 207   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 208   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 209   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 210   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 211   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
 212   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 213   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 214   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 215   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 216   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 217   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 218   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 219   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 220   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 221   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 222   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 223   %index.next = add i32 %index, 4
 224   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 225   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 226   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 227
 228 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 229   ret void
 230 }
 231
 232 ; index has been inserted at element 1, not 0, so the zero-mask shuffle in the loop body broadcasts an undef lane instead of %index.
 233 define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 234 entry:
 235   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 236   %tmp8 = add i32 %N, 3
 237   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 238   %tmp10 = shl nuw i32 %tmp9, 2
 239   %tmp11 = add i32 %tmp10, -4
 240   %tmp12 = lshr i32 %tmp11, 2
 241   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 242   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 243
 244 vector.ph:                                        ; preds = %entry
 245   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 246   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 247   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
 248   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 249   br label %vector.body
 250
 251 vector.body:                                      ; preds = %vector.body, %vector.ph
 252   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 253   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 254   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1 ; deviation under test: written to lane 1, not lane 0
 255   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer ; broadcasts lane 0 of the insert result
 256   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 257   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 258   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 259   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 260   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 261   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 262   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 263   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 264   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 265   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 266   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 267   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 268   %index.next = add i32 %index, 4
 269   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 270   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 271   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 272
 273 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 274   ret void
 275 }
 276
 277 define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 278 entry:
 279   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 280   %tmp8 = add i32 %N, 3
 281   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 282   %tmp10 = shl nuw i32 %tmp9, 2
 283   %tmp11 = add i32 %tmp10, -4
 284   %tmp12 = lshr i32 %tmp11, 2
 285   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 286   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 287
 288 vector.ph:                                        ; preds = %entry
 289   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 290   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 291   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
 292   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 293   br label %vector.body
 294
 295 vector.body:                                      ; preds = %vector.body, %vector.ph
 296   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 297   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 298   %incorrect = add i32 %index, 1 ; deviation under test: the value splatted below is index + 1, not the %index phi itself
 299   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0
 300   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 301   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane values: (index + 1) + <0, 1, 2, 3>
 302   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 303   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 304   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 305   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 306   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 307   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 308   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 309   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 310   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 311   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 312   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 313   %index.next = add i32 %index, 4
 314   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 315   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 316   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 317
 318 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 319   ret void
 320 }
 321
 322 ; Now using ult, not ule for the vector icmp that compares the per-lane indices against the splat of %N - 1.
 323 define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 324 entry:
 325   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 326   %tmp8 = add i32 %N, 3
 327   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 328   %tmp10 = shl nuw i32 %tmp9, 2
 329   %tmp11 = add i32 %tmp10, -4
 330   %tmp12 = lshr i32 %tmp11, 2
 331   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 332   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 333
 334 vector.ph:                                        ; preds = %entry
 335   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 336   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 337   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
 338   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 339   br label %vector.body
 340
 341 vector.body:                                      ; preds = %vector.body, %vector.ph
 342   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 343   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 344   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 345   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 346   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
 347   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 348   %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11 ; deviation under test: strict ult instead of ule
 349   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 350   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 351   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 352   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 353   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 354   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 355   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 356   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 357   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 358   %index.next = add i32 %index, 4
 359   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 360   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 361   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 362
 363 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 364   ret void
 365 }
 366
 367 ; The add in the body uses 1, 2, 3, 4 as the per-lane offsets added to the index splat, instead of 0, 1, 2, 3.
 368 define void @wrong_body_broadcast_splat(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 369 entry:
 370   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 371   %tmp8 = add i32 %N, 3
 372   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 373   %tmp10 = shl nuw i32 %tmp9, 2
 374   %tmp11 = add i32 %tmp10, -4
 375   %tmp12 = lshr i32 %tmp11, 2
 376   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 377   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 378
 379 vector.ph:                                        ; preds = %entry
 380   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 381   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 382   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
 383   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 384   br label %vector.body
 385
 386 vector.body:                                      ; preds = %vector.body, %vector.ph
 387   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 388   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 389   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 390   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 391   %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4> ; deviation under test: offsets start at 1 instead of 0
 392   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 393   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 394   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 395   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 396   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 397   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 398   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 399   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 400   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 401   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 402   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 403   %index.next = add i32 %index, 4
 404   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 405   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 406   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 407
 408 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 409   ret void
 410 }
 411
 412 ; Using a variable for the loop body broadcast: the per-lane offsets added to the index splat come from the %offsets argument rather than a constant vector.
 413 define void @wrong_body_broadcast_splat_2(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
 414 entry:
 415   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 416   %tmp8 = add i32 %N, 3
 417   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 418   %tmp10 = shl nuw i32 %tmp9, 2
 419   %tmp11 = add i32 %tmp10, -4
 420   %tmp12 = lshr i32 %tmp11, 2
 421   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 422   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 423
 424 vector.ph:                                        ; preds = %entry
 425   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 426   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 427   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
 428   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 429   br label %vector.body
 430
 431 vector.body:                                      ; preds = %vector.body, %vector.ph
 432   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 433   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 434   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 435   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 436   %induction = add <4 x i32> %broadcast.splat, %offsets ; deviation under test: offsets are a function argument, not <0, 1, 2, 3>
 437   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 438   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 439   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 440   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 441   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 442   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 443   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 444   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 445   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 446   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 447   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 448   %index.next = add i32 %index, 4
 449   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 450   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 451   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 452
 453 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 454   ret void
 455 }
 456
 457 ; adding 5, instead of 4, to index: the scalar index step no longer equals the number of lanes in the <4 x i32> vectors used by the loop.
 458 define void @wrong_index_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 459 entry:
 460   %cmp8 = icmp eq i32 %N, 0 ; N == 0: branch straight to the exit block
 461   %tmp8 = add i32 %N, 3
 462   %tmp9 = lshr i32 %tmp8, 2 ; (N + 3) >> 2
 463   %tmp10 = shl nuw i32 %tmp9, 2
 464   %tmp11 = add i32 %tmp10, -4
 465   %tmp12 = lshr i32 %tmp11, 2
 466   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count passed to llvm.start.loop.iterations
 467   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 468
 469 vector.ph:                                        ; preds = %entry
 470   %trip.count.minus.1 = add i32 %N, -1 ; N - 1
 471   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 472   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer ; splat of N - 1
 473   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 474   br label %vector.body
 475
 476 vector.body:                                      ; preds = %vector.body, %vector.ph
 477   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] ; scalar element index, stepped by %index.next
 478   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ] ; loop counter, updated by llvm.loop.decrement.reg
 479   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 480   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 481   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> ; per-lane indices: index + <0, 1, 2, 3>
 482   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 483   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11 ; lane mask used by both masked loads and the masked store
 484   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 485   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 486   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 487   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 488   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 489   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load ; product of the values loaded from %b and %a
 490   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
 491   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 492   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %tmp1)
 493   %index.next = add i32 %index, 5 ; deviation under test: step of 5 instead of 4
 494   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 495   %tmp16 = icmp ne i32 %tmp15, 0 ; loop again while the counter is non-zero
 496   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 497
 498 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 499   ret void
 500 }
 501
 502 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 ; called above as (pointer, i32 4, lane mask, undef)
 503 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 ; called above as (value, pointer, i32 4, lane mask)
 504 declare i32 @llvm.start.loop.iterations.i32(i32) #3 ; its result seeds the %tmp14 counter phi in every function above
 505 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 ; called above with (%tmp14, 1); result is compared against 0 for the back-edge. NOTE(review): attribute groups #1-#3 are not defined in the visible part of this file - confirm they exist further down or are intentionally left dangling.
 506