test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-widen.ll

   1 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
   2
   3 ; CHECK-LABEL: expand_v8i16_v8i32
   4 ; CHECK-NOT: call <{{[0-9]+}} x i1> @llvm.arm.vctp
     ; The negative pattern above spells out the <N x i1> result type, because
     ; that is how a vctp call is printed (compare the positive vctp16 pattern
     ; used for expand_v8i16_v4i32 in this file). A pattern written with an
     ; i32 result type could never match and would make this test vacuous.
     ;
     ; expand_v8i16_v8i32: each iteration loads two <8 x i16> vectors from %a
     ; and %b under a lane mask, zero-extends both to <8 x i32>, multiplies
     ; them and stores the <8 x i32> product to %c under the same <8 x i1>
     ; mask. The directive above expects the output to contain no vctp call
     ; for this function.
     ; NOTE(review): presumably the pass rejects this loop because the stored
     ; vector type is wider than the loaded one - confirm against the pass.
   5 define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
   6 entry:
       ; Guard value: when N == 0 the branch below skips the loop entirely.
   7   %cmp8 = icmp eq i32 %N, 0
       ; Loop iteration count: round N up to a multiple of 8 (%tmp10), then
       ; compute (rounded - 8) / 8 + 1 (%tmp13).
   8   %tmp8 = add i32 %N, 7
   9   %tmp9 = lshr i32 %tmp8, 3
  10   %tmp10 = shl nuw i32 %tmp9, 3
  11   %tmp11 = add i32 %tmp10, -8
  12   %tmp12 = lshr i32 %tmp11, 3
  13   %tmp13 = add nuw nsw i32 %tmp12, 1
  14   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  15
  16 vector.ph:                                        ; preds = %entry
       ; Splat N - 1 across 8 lanes; the loop compares lane indices against it.
  17   %trip.count.minus.1 = add i32 %N, -1
  18   %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  19   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
       ; Hand the iteration count computed in entry to the loop intrinsic.
  20   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  21   br label %vector.body
  22
  23 vector.body:                                      ; preds = %vector.body, %vector.ph
       ; %index is the element offset (advances by 8); %tmp14 is the remaining
       ; iteration counter fed by llvm.loop.decrement.reg.
  24   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  25   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Per-lane element indices: splat(%index) + <0..7>.
  26   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  27   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  28   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  29   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
       ; Lane mask: a lane is active while its element index is <= N - 1.
  30   %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  31   %tmp2 = bitcast i16* %tmp to <8 x i16>*
  32   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  33   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  34   %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  35   %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
       ; Widen both operands from i16 to i32 lanes before multiplying.
  36   %expand.1 = zext <8 x i16> %wide.masked.load to <8 x i32>
  37   %expand.2 = zext <8 x i16> %wide.masked.load2 to <8 x i32>
  38   %mul = mul nsw <8 x i32> %expand.2, %expand.1
  39   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %index
  40   %tmp7 = bitcast i32* %tmp6 to <8 x i32>*
       ; The <8 x i32> store reuses the mask that guarded the <8 x i16> loads.
  41   tail call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %mul, <8 x i32>* %tmp7, i32 4, <8 x i1> %tmp1)
  42   %index.next = add i32 %index, 8
       ; Decrement the iteration counter and loop while it is non-zero.
  43   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
  44   %tmp16 = icmp ne i32 %tmp15, 0
  45   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
  46
  47 for.cond.cleanup:                                 ; preds = %vector.body, %entry
  48   ret void
  49 }
  50
  51 ; CHECK-LABEL: expand_v8i16_v4i32
  52 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
  53 ; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
  54 ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
  55 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
  56 ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
  57 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
  58 ; CHECK: tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> {{.*}}, <4 x i32>* {{.*}}, i32 4, <4 x i1> %store.pred)
     ; expand_v8i16_v4i32: each iteration loads two <8 x i16> vectors under an
     ; <8 x i1> lane mask, splits the second into low and high <4 x i16>
     ; halves, zero-extends each half to <4 x i32>, and stores two <4 x i32>
     ; products to %c and %d under a separately computed <4 x i1> mask.
     ; The directives above expect the output to have: an element-count phi
     ; seeded with %N, a vctp16 call on that count, the count reduced by 8, a
     ; masked load predicated on the vctp16 result, and both stores still
     ; predicated on the original %store.pred compare.
  59 define void @expand_v8i16_v4i32(i16* readonly %a, i16* readonly %b, i32* %c, i32* %d, i32 %N) {
  60 entry:
       ; Guard value: when N == 0 the branch below skips the loop entirely.
  61   %cmp8 = icmp eq i32 %N, 0
       ; Loop iteration count: round N up to a multiple of 8 (%tmp10), then
       ; compute (rounded - 8) / 8 + 1 (%tmp13).
  62   %tmp8 = add i32 %N, 7
  63   %tmp9 = lshr i32 %tmp8, 3
  64   %tmp10 = shl nuw i32 %tmp9, 3
  65   %tmp11 = add i32 %tmp10, -8
  66   %tmp12 = lshr i32 %tmp11, 3
  67   %tmp13 = add nuw nsw i32 %tmp12, 1
  68   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
  69
  70 vector.ph:                                        ; preds = %entry
       ; N - 1 is splatted twice: 8 lanes for the load mask and 4 lanes for
       ; the store mask.
  71   %trip.count.minus.1 = add i32 %N, -1
  72   %broadcast.splatinsert10 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0
  73   %broadcast.splat11 = shufflevector <8 x i32> %broadcast.splatinsert10, <8 x i32> undef, <8 x i32> zeroinitializer
  74   %broadcast.splatinsert10.store = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
  75   %broadcast.splat11.store = shufflevector <4 x i32> %broadcast.splatinsert10.store, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Hand the iteration count computed in entry to the loop intrinsic.
  76   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
  77   br label %vector.body
  78
  79 vector.body:                                      ; preds = %vector.body, %vector.ph
       ; %index advances by 8 (load offset), %store.idx advances by 4 (store
       ; offset), %tmp14 is the remaining iteration counter.
  80   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  81   %store.idx = phi i32 [ 0, %vector.ph ], [ %store.idx.next, %vector.body ]
  82   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Per-lane load indices: splat(%index) + <0..7>.
  83   %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0
  84   %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
  85   %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  86   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
       ; Load mask: a lane is active while its element index is <= N - 1.
  87   %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
  88   %tmp2 = bitcast i16* %tmp to <8 x i16>*
       ; NOTE(review): %wide.masked.load (from %a) has no later use in this
       ; function; only %wide.masked.load2 feeds the arithmetic - confirm
       ; this is intentional.
  89   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
  90   %tmp3 = getelementptr inbounds i16, i16* %b, i32 %index
  91   %tmp4 = bitcast i16* %tmp3 to <8 x i16>*
  92   %wide.masked.load2 = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> %tmp1, <8 x i16> undef)
       ; Split the second load into lanes 0-3 and lanes 4-7.
  93   %extract.2.low = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> <i32 0, i32 1, i32 2, i32 3>
  94   %extract.2.high = shufflevector <8 x i16> %wide.masked.load2, <8 x i16> undef, < 4 x i32> <i32 4, i32 5, i32 6, i32 7>
       ; Widen each half from i16 to i32 lanes.
  95   %expand.1 = zext <4 x i16> %extract.2.low to <4 x i32>
  96   %expand.2 = zext <4 x i16> %extract.2.high to <4 x i32>
       ; NOTE(review): %sub is computed with mul, so %mul and %sub are the
       ; same product with the operands swapped - confirm this is intentional.
  97   %mul = mul nsw <4 x i32> %expand.2, %expand.1
  98   %sub = mul nsw <4 x i32> %expand.1, %expand.2
       ; Store mask: splat(%store.idx) + <0..3>, compared against N - 1. It is
       ; independent of the 8-lane load mask %tmp1.
  99   %broadcast.splatinsert.store = insertelement <4 x i32> undef, i32 %store.idx, i32 0
 100   %broadcast.splat.store = shufflevector <4 x i32> %broadcast.splatinsert.store, <4 x i32> undef, <4 x i32> zeroinitializer
 101   %induction.store = add <4 x i32> %broadcast.splat.store, <i32 0, i32 1, i32 2, i32 3>
 102   %store.pred = icmp ule <4 x i32> %induction.store, %broadcast.splat11.store
 103   %tmp6 = getelementptr inbounds i32, i32* %c, i32 %store.idx
 104   %tmp7 = bitcast i32* %tmp6 to <4 x i32>*
 105   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %store.pred)
 106   %gep = getelementptr inbounds i32, i32* %d, i32 %store.idx
 107   %cast.gep = bitcast i32* %gep to <4 x i32>*
 108   tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sub, <4 x i32>* %cast.gep, i32 4, <4 x i1> %store.pred)
 109   %store.idx.next = add i32 %store.idx, 4
 110   %index.next = add i32 %index, 8
       ; Decrement the iteration counter and loop while it is non-zero.
 111   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 112   %tmp16 = icmp ne i32 %tmp15, 0
 113   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 114
 115 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 116   ret void
 117 }
 118
 119 ; CHECK-LABEL: expand_v4i32_v4i64
 120 ; CHECK-NOT: call <{{[0-9]+}} x i1> @llvm.arm.vctp
     ; The negative pattern above spells out the <N x i1> result type, because
     ; that is how a vctp call is printed (compare the positive vctp16 pattern
     ; used for expand_v8i16_v4i32 in this file). A pattern written with an
     ; i32 result type could never match and would make this test vacuous.
     ; It is anchored on "call" so that an intrinsic declaration printed at
     ; the end of the module cannot trigger it.
     ;
     ; expand_v4i32_v4i64: each iteration loads two <4 x i32> vectors from %a
     ; and %b under a lane mask, zero-extends both to <4 x i64>, multiplies
     ; them and stores the <4 x i64> product to %c under the same <4 x i1>
     ; mask. The directive above expects the output to contain no vctp call
     ; for this function.
     ; NOTE(review): presumably the pass rejects this loop because the stored
     ; vector type is wider than the loaded one - confirm against the pass.
 121 define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) {
 122 entry:
       ; Guard value: when N == 0 the branch below skips the loop entirely.
 123   %cmp8 = icmp eq i32 %N, 0
       ; Loop iteration count: round N up to a multiple of 4 (%tmp10), then
       ; compute (rounded - 4) / 4 + 1 (%tmp13).
 124   %tmp8 = add i32 %N, 3
 125   %tmp9 = lshr i32 %tmp8, 2
 126   %tmp10 = shl nuw i32 %tmp9, 2
 127   %tmp11 = add i32 %tmp10, -4
 128   %tmp12 = lshr i32 %tmp11, 2
 129   %tmp13 = add nuw nsw i32 %tmp12, 1
 130   br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
 131
 132 vector.ph:                                        ; preds = %entry
       ; Splat N - 1 across 4 lanes; the loop compares lane indices against it.
 133   %trip.count.minus.1 = add i32 %N, -1
 134   %broadcast.splatinsert10 = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0
 135   %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Hand the iteration count computed in entry to the loop intrinsic.
 136   call void @llvm.set.loop.iterations.i32(i32 %tmp13)
 137   br label %vector.body
 138
 139 vector.body:                                      ; preds = %vector.body, %vector.ph
       ; %index is the element offset (advances by 4); %tmp14 is the remaining
       ; iteration counter fed by llvm.loop.decrement.reg.
 140   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 141   %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
       ; Per-lane element indices: splat(%index) + <0..3>.
 142   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
 143   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
 144   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 145   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
       ; Lane mask: a lane is active while its element index is <= N - 1.
 146   %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
 147   %tmp2 = bitcast i32* %tmp to <4 x i32>*
 148   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
 149   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
 150   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
 151   %wide.masked.load2 = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp4, i32 4, <4 x i1> %tmp1, <4 x i32> undef)
       ; Widen both operands from i32 to i64 lanes before multiplying.
 152   %expand.1 = zext <4 x i32> %wide.masked.load to <4 x i64>
 153   %expand.2 = zext <4 x i32> %wide.masked.load2 to <4 x i64>
 154   %mul = mul nsw <4 x i64> %expand.2, %expand.1
 155   %tmp6 = getelementptr inbounds i64, i64* %c, i32 %index
 156   %tmp7 = bitcast i64* %tmp6 to <4 x i64>*
       ; The <4 x i64> store reuses the mask that guarded the <4 x i32> loads.
 157   tail call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %mul, <4 x i64>* %tmp7, i32 4, <4 x i1> %tmp1)
 158   %index.next = add i32 %index, 4
       ; Decrement the iteration counter and loop while it is non-zero.
 159   %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
 160   %tmp16 = icmp ne i32 %tmp15, 0
 161   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 162
 163 for.cond.cleanup:                                 ; preds = %vector.body, %entry
 164   ret void
 165 }
 166
 167 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) ; masked loads in expand_v8i16_v8i32 and expand_v8i16_v4i32
 168 declare void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>, <8 x i32>*, i32 immarg, <8 x i1>) ; widened store in expand_v8i16_v8i32
 169 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) ; masked loads in expand_v4i32_v4i64
 170 declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) ; the two stores in expand_v8i16_v4i32
 171 declare void @llvm.masked.store.v4i64.p0v4i64(<4 x i64>, <4 x i64>*, i32 immarg, <4 x i1>) ; widened store in expand_v4i32_v4i64
 172 declare void @llvm.set.loop.iterations.i32(i32) ; called once in every vector.ph block with the iteration count
 173 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) ; called in every vector.body; its result feeds the backedge compare