llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll

   1 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
   2 ; RUN: opt --arm-mve-gather-scatter-lowering -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -S -o - | FileCheck %s
   3
   4 define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
     ; Test: a constant add (+6) on the vector induction variable feeds the gather
     ; address, and both live in %lower.block, a conditionally entered block of
     ; the loop rather than the loop header.
     ; Expected output (assertions below): the +6 is applied once to the phi's
     ; start value in vector.ph (PUSHEDOUTADD), and the masked gather is replaced
     ; by an llvm.arm.mve.vldr.gather.offset call that takes the phi directly as
     ; its offset vector.
   5 ; CHECK-LABEL: @push_out_add_sub_block(
   6 ; CHECK-NEXT:  vector.ph:
   7 ; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 6, i32 6, i32 6, i32 6>
   8 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
   9 ; CHECK:       vector.body:
  10 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
  11 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
  12 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
  13 ; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
  14 ; CHECK:       lower.block:
  15 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
  16 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
  17 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
  18 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
  19 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
  20 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
  21 ; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
  22 ; CHECK:       vector.body.end:
  23 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
  24 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
  25 ; CHECK:       end:
  26 ; CHECK-NEXT:    ret void
  27 ;
  28
  29 vector.ph:
  30   br label %vector.body
  31
  32 vector.body:                                      ; preds = %vector.body.end, %vector.ph
  33   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
  34   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
  35   %0 = icmp eq i32 %index, 48
  36   br i1 %0, label %lower.block, label %end
  37
  38 lower.block:                             ; preds = %vector.body
  39   %1 = add <4 x i32> %vec.ind, <i32 6, i32 6, i32 6, i32 6> ; constant add expected to be pushed out to vector.ph
  40   %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
  41   %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  42   %3 = getelementptr inbounds i32, i32* %dst, i32 %index
  43   %4 = bitcast i32* %3 to <4 x i32>*
  44   store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
  45   %index.next = add i32 %index, 4
  46   %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
  47   br label %vector.body.end
  48
  49 vector.body.end:                             ; preds = %lower.block
  50   %5 = icmp eq i32 %index.next, %n.vec
  51   br i1 %5, label %end, label %vector.body
  52
  53 end:                                         ; preds = %vector.body.end, %vector.body
  54   ret void;
  55 }
  56
  57 define arm_aapcs_vfpcc void @push_out_mul_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
     ; Test: the gather offsets are (%vec.ind * 3) + 6, with the mul and add both
     ; located in %lower.block, a conditionally entered block of the loop.
     ; Expected output (assertions below): vector.ph computes the start value
     ; times 3 (PUSHEDOUTMUL), the loop increment 8 times 3 (PRODUCT), and the
     ; start value plus 6 (PUSHEDOUTADD); the phi starts from PUSHEDOUTADD and is
     ; advanced by PRODUCT in vector.body.end; the gather becomes an
     ; llvm.arm.mve.vldr.gather.offset call using the phi as its offset vector.
  58 ; CHECK-LABEL: @push_out_mul_sub_block(
  59 ; CHECK-NEXT:  vector.ph:
  60 ; CHECK-NEXT:    [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> <i32 0, i32 2, i32 4, i32 6>, <i32 3, i32 3, i32 3, i32 3>
  61 ; CHECK-NEXT:    [[PRODUCT:%.*]] = mul <4 x i32> <i32 8, i32 8, i32 8, i32 8>, <i32 3, i32 3, i32 3, i32 3>
  62 ; CHECK-NEXT:    [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]], <i32 6, i32 6, i32 6, i32 6>
  63 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
  64 ; CHECK:       vector.body:
  65 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
  66 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
  67 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 48
  68 ; CHECK-NEXT:    br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
  69 ; CHECK:       lower.block:
  70 ; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
  71 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
  72 ; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
  73 ; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
  74 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
  75 ; CHECK-NEXT:    br label [[VECTOR_BODY_END]]
  76 ; CHECK:       vector.body.end:
  77 ; CHECK-NEXT:    [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
  78 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
  79 ; CHECK-NEXT:    br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
  80 ; CHECK:       end:
  81 ; CHECK-NEXT:    ret void
  82 ;
  83
  84 vector.ph:
  85   br label %vector.body
  86
  87 vector.body:                                      ; preds = %vector.body.end, %vector.ph
  88   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
  89   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
  90   %0 = icmp eq i32 %index, 48
  91   br i1 %0, label %lower.block, label %end
  92
  93 lower.block:                             ; preds = %vector.body
  94   %1 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3> ; constant mul expected to be pushed out to vector.ph
  95   %2 = add <4 x i32> %1, <i32 6, i32 6, i32 6, i32 6> ; constant add expected to be pushed out to vector.ph
  96   %3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2
  97   %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  98   %4 = getelementptr inbounds i32, i32* %dst, i32 %index
  99   %5 = bitcast i32* %4 to <4 x i32>*
 100   store <4 x i32> %wide.masked.gather, <4 x i32>* %5, align 4
 101   %index.next = add i32 %index, 4
 102   %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8> ; expected to be superseded by the scaled increment in vector.body.end
 103   br label %vector.body.end
 104
 105 vector.body.end:                             ; preds = %lower.block
 106   %6 = icmp eq i32 %index.next, %n.vec
 107   br i1 %6, label %end, label %vector.body
 108
 109 end:                                         ; preds = %vector.body.end, %vector.body
 110   ret void;
 111 }
 112
 113
 114 define arm_aapcs_vfpcc void @push_out_mul_sub_loop(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
     ; Test: the gather offsets are (%vec.ind * 3) + 6, but the mul, add and
     ; gather live in an inner loop (vector.2.body / vector.2.body.end) nested
     ; inside the loop that owns the %vec.ind phi.
     ; Expected output (assertions below): the mul and add stay in vector.2.body
     ; and the %vec.ind phi keeps its original start value and increment; only the
     ; masked gather is replaced by an llvm.arm.mve.vldr.gather.offset call, which
     ; takes the add result as its offset vector.
 115 ; CHECK-LABEL: @push_out_mul_sub_loop(
 116 ; CHECK-NEXT:  vector.ph:
 117 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 118 ; CHECK:       vector.body:
 119 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
 120 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
 121 ; CHECK-NEXT:    br label [[VECTOR_2_PH:%.*]]
 122 ; CHECK:       vector.2.ph:
 123 ; CHECK-NEXT:    br label [[VECTOR_2_BODY:%.*]]
 124 ; CHECK:       vector.2.body:
 125 ; CHECK-NEXT:    [[TMP0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
 126 ; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[TMP0]], <i32 6, i32 6, i32 6, i32 6>
 127 ; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
 128 ; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
 129 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
 130 ; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
 131 ; CHECK-NEXT:    br label [[VECTOR_2_BODY_END:%.*]]
 132 ; CHECK:       vector.2.body.end:
 133 ; CHECK-NEXT:    [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
 134 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 16
 135 ; CHECK-NEXT:    br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
 136 ; CHECK:       vector.body.end:
 137 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 138 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
 139 ; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 140 ; CHECK-NEXT:    br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
 141 ; CHECK:       end:
 142 ; CHECK-NEXT:    ret void
 143 ;
 144
 145 vector.ph:
 146   br label %vector.body
 147
 148 vector.body:                                      ; preds = %vector.body.end, %vector.ph
 149   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body.end ]
 150   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body.end ]
 151   br label %vector.2.ph
 152
 153 vector.2.ph:                               ; preds = %vector.body
 154   br label %vector.2.body
 155
 156 vector.2.body:                             ; preds = %vector.2.body.end, %vector.2.ph
 157   %0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3> ; expected to remain in the inner loop
 158   %1 = add <4 x i32> %0, <i32 6, i32 6, i32 6, i32 6> ; expected to remain in the inner loop
 159   %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1
 160   %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 161   %3 = getelementptr inbounds i32, i32* %dst, i32 %index
 162   %4 = bitcast i32* %3 to <4 x i32>*
 163   store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4
 164   br label %vector.2.body.end
 165
 166 vector.2.body.end:                             ; preds = %vector.2.body
     ; NOTE(review): %index.2.next is derived from the outer %index; the inner
     ; loop has no phi of its own. Presumably acceptable for this test's purpose,
     ; which only needs the nested-loop shape - confirm before reusing this IR.
 167   %index.2.next = add i32 %index, 4
 168   %5 = icmp eq i32 %index.2.next, 16
 169   br i1 %5, label %vector.body.end, label %vector.2.body
 170
 171 vector.body.end:                             ; preds = %vector.2.body.end
 172   %index.next = add i32 %index, 4
 173   %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
 174   %6 = icmp eq i32 %index.next, %n.vec
 175   br i1 %6, label %end, label %vector.body
 176
 177 end:                                         ; preds = %vector.body.end
 178   ret void;
 179 }
 180
 181 define arm_aapcs_vfpcc void @invariant_add(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
     ; Test: single-block loop whose gather offsets are (%vec.ind * 3) + %vec.ind,
     ; i.e. the add's second operand is the induction phi itself rather than a
     ; constant vector.
     ; Expected output (assertions below): the mul and add stay in the loop body
     ; and the phi is left unchanged; only the masked gather is replaced by an
     ; llvm.arm.mve.vldr.gather.offset call taking %l1 as its offset vector.
 182 ; CHECK-LABEL: @invariant_add(
 183 ; CHECK-NEXT:  vector.ph:
 184 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 185 ; CHECK:       vector.body:
 186 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 187 ; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 188 ; CHECK-NEXT:    [[L0:%.*]] = mul <4 x i32> [[VEC_IND]], <i32 3, i32 3, i32 3, i32 3>
 189 ; CHECK-NEXT:    [[L1:%.*]] = add <4 x i32> [[L0]], [[VEC_IND]]
 190 ; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[L1]], i32 32, i32 2, i32 1)
 191 ; CHECK-NEXT:    [[L3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
 192 ; CHECK-NEXT:    [[L4:%.*]] = bitcast i32* [[L3]] to <4 x i32>*
 193 ; CHECK-NEXT:    store <4 x i32> [[TMP0]], <4 x i32>* [[L4]], align 4
 194 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 195 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8>
 196 ; CHECK-NEXT:    [[L5:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC:%.*]]
 197 ; CHECK-NEXT:    br i1 [[L5]], label [[END:%.*]], label [[VECTOR_BODY]]
 198 ; CHECK:       end:
 199 ; CHECK-NEXT:    ret void
 200 ;
 201
 202 vector.ph:
 203   br label %vector.body
 204
 205 vector.body:                                      ; preds = %vector.body, %vector.ph
 206   %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 207   %vec.ind = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %vector.ph ], [ %vec.ind.next, %vector.body ]
 208   %l0 = mul <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
 209   %l1 = add <4 x i32> %l0, %vec.ind ; second operand is the phi, not a constant; expected to stay in the loop
 210   %l2 = getelementptr inbounds i32, i32* %data, <4 x i32> %l1
 211   %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %l2, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
 212   %l3 = getelementptr inbounds i32, i32* %dst, i32 %index
 213   %l4 = bitcast i32* %l3 to <4 x i32>*
 214   store <4 x i32> %wide.masked.gather, <4 x i32>* %l4, align 4
 215   %index.next = add i32 %index, 4
 216   %vec.ind.next = add <4 x i32> %vec.ind, <i32 8, i32 8, i32 8, i32 8>
 217   %l5 = icmp eq i32 %index.next, %n.vec
 218   br i1 %l5, label %end, label %vector.body
 219
 220 end:                                              ; preds = %vector.body
 221   ret void;
 222 }
 223
 224
 225 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) ; masked gather intrinsic called by each test function in this file