llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-pattern-fail.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
   2 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve,+lob %s -S -o - | FileCheck %s
   3
   4 ; The following functions should all fail to become tail-predicated, so no MVE VCTP intrinsic (call or declaration) may appear anywhere in the output.
   5 ; CHECK-NOT: @llvm.arm.mve.vctp
   6
   7 ; Negative test: trip.count.minus.1 is inserted into element 1 of the vector, not element 0.
   8 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_0(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
   9 entry:
  10   %cmp8 = icmp eq i32 %N, 0
  11   %tmp8 = add i32 %N, 3
  12   %tmp9 = lshr i32 %tmp8, 2
  13   %tmp10 = shl nuw i32 %tmp9, 2
  14   %tmp11 = add i32 %tmp10, -4
  15   %tmp12 = lshr i32 %tmp11, 2
  16   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
  17   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  18
  19 vector.ph:                                        ; preds = %entry
  20   %trip.count.minus.1 = add i32 %N, -1
  21   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 1 ; written to lane 1, but the shuffle below splats lane 0
  22   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  23   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  24   br label %vector.body
  25
  26 vector.body:                                      ; preds = %vector.body, %vector.ph
  27   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  28   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  29   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  30   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  31   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  32   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  33   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  34   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  35   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  36   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  37   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  38   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  39   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  40   %index.next = add i32 %index, 4
  41   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  42   %tmp16 = icmp ne i32 %tmp15, 0
  43   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  44
  45 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  46   ret void
  47 }
  48
  49 ; Negative test: the insertelement's base vector (operand 0) is a defined constant, not undef.
  50 define dso_local arm_aapcs_vfpcc void @wrong_ph_insert_def(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
  51 entry:
  52   %cmp8 = icmp eq i32 %N, 0
  53   %tmp8 = add i32 %N, 3
  54   %tmp9 = lshr i32 %tmp8, 2
  55   %tmp10 = shl nuw i32 %tmp9, 2
  56   %tmp11 = add i32 %tmp10, -4
  57   %tmp12 = lshr i32 %tmp11, 2
  58   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
  59   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  60
  61 vector.ph:                                        ; preds = %entry
  62   %trip.count.minus.1 = add i32 %N, -1
  63   %broadcast.splatinsert10 = insertelement <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i32 %trip.count.minus.1, i32 0 ; base vector is all-ones constants instead of undef
  64   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
  65   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
  66   br label %vector.body
  67
  68 vector.body:                                      ; preds = %vector.body, %vector.ph
  69   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  70   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
  71   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  72   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  73   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  74   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
  75   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
  76   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  77   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
  78   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
  79   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
  80   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
  81   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
  82   %index.next = add i32 %index, 4
  83   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  84   %tmp16 = icmp ne i32 %tmp15, 0
  85   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  86
  87 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  88   ret void
  89 }
  90
  91 ; Negative test: the preheader shufflevector's second vector operand is a defined constant, not undef.
  92 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_1(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
  93 entry:
  94   %cmp8 = icmp eq i32 %N, 0
  95   %tmp8 = add i32 %N, 3
  96   %tmp9 = lshr i32 %tmp8, 2
  97   %tmp10 = shl nuw i32 %tmp9, 2
  98   %tmp11 = add i32 %tmp10, -4
  99   %tmp12 = lshr i32 %tmp11, 2
 100   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 101   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 102
 103 vector.ph:                                        ; preds = %entry
 104   %trip.count.minus.1 = add i32 %N, -1
 105   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 106   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, <4 x i32> zeroinitializer ; second vector operand is a constant instead of undef
 107   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 108   br label %vector.body
 109
 110 vector.body:                                      ; preds = %vector.body, %vector.ph
 111   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 112   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 113   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 114   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 115   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 116   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 117   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 118   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 119   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 120   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 121   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 122   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 123   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 124   %index.next = add i32 %index, 4
 125   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 126   %tmp16 = icmp ne i32 %tmp15, 0
 127   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 128
 129 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 130   ret void
 131 }
 132
 133 ; Negative test: the preheader shufflevector's mask (operand 2) is all ones rather than zeroinitializer.
 134 define dso_local arm_aapcs_vfpcc void @wrong_ph_shuffle_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
 135 entry:
 136   %cmp8 = icmp eq i32 %N, 0
 137   %tmp8 = add i32 %N, 3
 138   %tmp9 = lshr i32 %tmp8, 2
 139   %tmp10 = shl nuw i32 %tmp9, 2
 140   %tmp11 = add i32 %tmp10, -4
 141   %tmp12 = lshr i32 %tmp11, 2
 142   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 143   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 144
 145 vector.ph:                                        ; preds = %entry
 146   %trip.count.minus.1 = add i32 %N, -1
 147   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 148   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; mask selects lane 1, but the value was inserted into lane 0
 149   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 150   br label %vector.body
 151
 152 vector.body:                                      ; preds = %vector.body, %vector.ph
 153   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 154   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 155   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 156   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 157   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 158   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 159   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 160   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 161   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 162   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 163   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 164   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 165   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 166   %index.next = add i32 %index, 4
 167   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 168   %tmp16 = icmp ne i32 %tmp15, 0
 169   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 170
 171 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 172   ret void
 173 }
 174
 175 ; Negative test: the splatted bound is %N - 2 rather than %N - 1; this is the only deviation in this function.
 176 define dso_local arm_aapcs_vfpcc void @trip_count_minus_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
 177 entry:
 178   %cmp8 = icmp eq i32 %N, 0
 179   %tmp8 = add i32 %N, 3
 180   %tmp9 = lshr i32 %tmp8, 2
 181   %tmp10 = shl nuw i32 %tmp9, 2
 182   %tmp11 = add i32 %tmp10, -4
 183   %tmp12 = lshr i32 %tmp11, 2
 184   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 185   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 186
 187 vector.ph:                                        ; preds = %entry
 188   %trip.count.minus.2 = add i32 %N, -2 ; off by one: the bound compared against below is N - 2
 189   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.2, i32 0 ; lane 0, so the insert/splat shape itself is well formed
 190   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 191   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 192   br label %vector.body
 193
 194 vector.body:                                      ; preds = %vector.body, %vector.ph
 195   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 196   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 197   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 198   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 199   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 200   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 201   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 202   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 203   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 204   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 205   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 206   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 207   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 208   %index.next = add i32 %index, 4
 209   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 210   %tmp16 = icmp ne i32 %tmp15, 0
 211   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 212
 213 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 214   ret void
 215 }
 216
 217 ; Negative test: inside the loop, %index is inserted into element 1 of the vector, not element 0.
 218 define dso_local arm_aapcs_vfpcc void @wrong_loop_insert(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
 219 entry:
 220   %cmp8 = icmp eq i32 %N, 0
 221   %tmp8 = add i32 %N, 3
 222   %tmp9 = lshr i32 %tmp8, 2
 223   %tmp10 = shl nuw i32 %tmp9, 2
 224   %tmp11 = add i32 %tmp10, -4
 225   %tmp12 = lshr i32 %tmp11, 2
 226   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 227   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 228
 229 vector.ph:                                        ; preds = %entry
 230   %trip.count.minus.1 = add i32 %N, -1
 231   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 232   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 233   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 234   br label %vector.body
 235
 236 vector.body:                                      ; preds = %vector.body, %vector.ph
 237   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 238   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 239   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 1 ; written to lane 1, but the shuffle below splats lane 0
 240   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 241   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 242   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 243   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 244   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 245   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 246   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 247   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 248   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 249   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 250   %index.next = add i32 %index, 4
 251   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 252   %tmp16 = icmp ne i32 %tmp15, 0
 253   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 254
 255 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 256   ret void
 257 }
 258
 259 define dso_local arm_aapcs_vfpcc void @wrong_loop_invalid_index_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) { ; Negative test: the scalar splatted in the loop body is %index + 1, not %index.
 260 entry:
 261   %cmp8 = icmp eq i32 %N, 0
 262   %tmp8 = add i32 %N, 3
 263   %tmp9 = lshr i32 %tmp8, 2
 264   %tmp10 = shl nuw i32 %tmp9, 2
 265   %tmp11 = add i32 %tmp10, -4
 266   %tmp12 = lshr i32 %tmp11, 2
 267   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 268   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 269
 270 vector.ph:                                        ; preds = %entry
 271   %trip.count.minus.1 = add i32 %N, -1
 272   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 273   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 274   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 275   br label %vector.body
 276
 277 vector.body:                                      ; preds = %vector.body, %vector.ph
 278   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 279   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 280   %incorrect = add i32 %index, 1 ; deliberately offset copy of the induction variable
 281   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %incorrect, i32 0 ; splats %index + 1, while the addresses below still use %index
 282   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 283   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 284   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 285   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 286   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 287   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 288   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 289   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 290   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 291   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 292   %index.next = add i32 %index, 4
 293   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 294   %tmp16 = icmp ne i32 %tmp15, 0
 295   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 296
 297 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 298   ret void
 299 }
 300
 301 ; Negative test: the vector icmp that builds the lane predicate uses ult instead of ule.
 302 define dso_local arm_aapcs_vfpcc void @wrong_pred_opcode(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
 303 entry:
 304   %cmp8 = icmp eq i32 %N, 0
 305   %tmp8 = add i32 %N, 3
 306   %tmp9 = lshr i32 %tmp8, 2
 307   %tmp10 = shl nuw i32 %tmp9, 2
 308   %tmp11 = add i32 %tmp10, -4
 309   %tmp12 = lshr i32 %tmp11, 2
 310   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 311   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 312
 313 vector.ph:                                        ; preds = %entry
 314   %trip.count.minus.1 = add i32 %N, -1
 315   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 316   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 317   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 318   br label %vector.body
 319
 320 vector.body:                                      ; preds = %vector.body, %vector.ph
 321   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 322   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 323   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 324   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 325   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 326   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 327   %tmp1 = icmp ult <4 x i32> %induction, %broadcast.splat11 ; strict ult against N - 1 rather than ule
 328   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 329   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 330   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 331   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 332   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 333   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 334   %index.next = add i32 %index, 4
 335   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 336   %tmp16 = icmp ne i32 %tmp15, 0
 337   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 338
 339 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 340   ret void
 341 }
 342
 343 ; Negative test: the per-lane offsets added to the splatted index are <1, 2, 3, 4> instead of <0, 1, 2, 3>.
 344 define void @wrong_body_broadcast_splat(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
 345 entry:
 346   %cmp8 = icmp eq i32 %N, 0
 347   %tmp8 = add i32 %N, 3
 348   %tmp9 = lshr i32 %tmp8, 2
 349   %tmp10 = shl nuw i32 %tmp9, 2
 350   %tmp11 = add i32 %tmp10, -4
 351   %tmp12 = lshr i32 %tmp11, 2
 352   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 353   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 354
 355 vector.ph:                                        ; preds = %entry
 356   %trip.count.minus.1 = add i32 %N, -1
 357   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 358   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 359   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 360   br label %vector.body
 361
 362 vector.body:                                      ; preds = %vector.body, %vector.ph
 363   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 364   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 365   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 366   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 367   %induction = add <4 x i32> %broadcast.splat, <i32 1, i32 2, i32 3, i32 4> ; lane offsets start at 1 instead of 0
 368   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 369   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 370   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 371   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 372   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 373   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 374   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 375   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 376   %index.next = add i32 %index, 4
 377   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 378   %tmp16 = icmp ne i32 %tmp15, 0
 379   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 380
 381 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 382   ret void
 383 }
 384
 385 ; Negative test: the per-lane offsets added to the splatted index come from the %offsets argument, not a constant vector.
 386 define void @wrong_body_broadcast_splat_2(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N, <4 x i32> %offsets) {
 387 entry:
 388   %cmp8 = icmp eq i32 %N, 0
 389   %tmp8 = add i32 %N, 3
 390   %tmp9 = lshr i32 %tmp8, 2
 391   %tmp10 = shl nuw i32 %tmp9, 2
 392   %tmp11 = add i32 %tmp10, -4
 393   %tmp12 = lshr i32 %tmp11, 2
 394   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 395   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 396
 397 vector.ph:                                        ; preds = %entry
 398   %trip.count.minus.1 = add i32 %N, -1
 399   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 400   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 401   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 402   br label %vector.body
 403
 404 vector.body:                                      ; preds = %vector.body, %vector.ph
 405   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 406   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 407   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 408   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 409   %induction = add <4 x i32> %broadcast.splat, %offsets ; lane offsets are a runtime value, not <0, 1, 2, 3>
 410   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 411   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 412   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 413   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 414   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 415   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 416   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 417   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 418   %index.next = add i32 %index, 4
 419   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 420   %tmp16 = icmp ne i32 %tmp15, 0
 421   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 422
 423 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 424   ret void
 425 }
 426
 427 ; Negative test: %index is advanced by 5 per iteration instead of 4, the number of vector lanes.
 428 define void @wrong_index_add(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture %c, i32 %N) {
 429 entry:
 430   %cmp8 = icmp eq i32 %N, 0
 431   %tmp8 = add i32 %N, 3
 432   %tmp9 = lshr i32 %tmp8, 2
 433   %tmp10 = shl nuw i32 %tmp9, 2
 434   %tmp11 = add i32 %tmp10, -4
 435   %tmp12 = lshr i32 %tmp11, 2
 436   %tmp13 = add nuw nsw i32 %tmp12, 1 ; iteration count: (N + 3) / 4, assuming N + 3 does not wrap
 437   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 438
 439 vector.ph:                                        ; preds = %entry
 440   %trip.count.minus.1 = add i32 %N, -1
 441   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 442   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
 443   %start = call i32 @llvm.start.loop.iterations.i32(i32 %tmp13)
 444   br label %vector.body
 445
 446 vector.body:                                      ; preds = %vector.body, %vector.ph
 447   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 448   %tmp14 = phi i32 [ %start, %vector.ph ], [ %tmp15, %vector.body ]
 449   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 450   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 451   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 452   %tmp = getelementptr inbounds i32, ptr %a, i32 %index
 453   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 454   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 455   %tmp3 = getelementptr inbounds i32, ptr %b, i32 %index
 456   %wide.masked.load12 = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %tmp3, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 457   %tmp5 = mul nsw <4 x i32> %wide.masked.load12, %wide.masked.load
 458   %tmp6 = getelementptr inbounds i32, ptr %c, i32 %index
 459   tail call void @llvm.masked.store.v4i32.p0(<4 x i32> %tmp5, ptr %tmp6, i32 4, <4 x i1> %tmp1)
 460   %index.next = add i32 %index, 5 ; steps by 5 although each iteration handles a 4-lane vector
 461   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 462   %tmp16 = icmp ne i32 %tmp15, 0
 463   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 464
 465 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 466   ret void
 467 }
 468
 469 declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1 ; predicated vector load (ptr, alignment, mask, passthru)
 470 declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2 ; predicated vector store (value, ptr, alignment, mask)
 471 declare i32 @llvm.start.loop.iterations.i32(i32) #3 ; called in each vector.ph with the iteration count; its result seeds the loop counter phi
 472 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 ; called in each vector.body with (counter, 1); the result is compared against 0 to exit
 473