llvm/test/Analysis/CostModel/X86/masked-interleaved-load-i16.ll

   1 ; RUN: opt -loop-vectorize -enable-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=DISABLED_MASKED_STRIDED
   2 ; RUN: opt -loop-vectorize -enable-interleaved-mem-accesses -enable-masked-interleaved-mem-accesses -prefer-predicate-over-epilogue=predicate-dont-vectorize -S -mcpu=skx --debug-only=loop-vectorize < %s 2>&1 | FileCheck %s --check-prefix=ENABLED_MASKED_STRIDED
   3 ; REQUIRES: asserts
   4
   5 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
   6 target triple = "x86_64-unknown-linux-gnu"
   7
   8 ; (1) Interleave-group with factor 4, loading only 2 members out of the 4.
   9 ; Check that when we allow masked-memops to support interleave-group with gaps,
  10 ; the load is vectorized using a wide masked load, with a 1,1,0,0,1,1,0,0,... mask.
  11 ; Check that when we don't allow masked-memops to support interleave-group with gaps,
  12 ; the load is scalarized.
  13 ; The input IR was generated from this source:
  14 ;     for(i=0;i<1024;i++){
  15 ;       x[i] = points[i*4];
  16 ;       y[i] = points[i*4 + 1];
  17 ;     }
  18 ; (relates to the testcase in PR50566)
  19
  20 ; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test1"
  21 ;
  22 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  23 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  24 ;
  25 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  26 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 6 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  27 ;
  28 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  29 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  30 ;
  31 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 30 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  32 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 30 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  33 ;
  34 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 62 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  35 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 62 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  36
  37 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test1"
  38 ;
  39 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  40 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  41 ;
  42 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  43 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  44 ;
  45 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  46 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  47 ;
  48 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  49 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  50 ;
  51 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  52 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  53
  54 define void @test1(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
     ; Fixed-trip-count loop (exits when the incremented index equals 1024).
     ; Each iteration loads the i16 elements %points[4*i] and %points[4*i + 1]
     ; and stores them to %x[i] and %y[i], i.e. only elements 0 and 1 of every
     ; group of 4 consecutive %points elements are read.
     ;
     ; The FileCheck lines for "test1" match the two load instructions by their
     ; exact printed text (%i2 / %arrayidx2 and %i4 / %arrayidx7), so these value
     ; names must not be changed without updating the checks.
     ;
     ; NOTE(review): %x and %y carry `readonly` although they are the store
     ; destinations below, while %points (only loaded from) does not. The
     ; attributes look swapped; confirm whether this is intentional.
  55 entry:
  56   br label %for.body
  57
  58 for.body:
       ; %indvars.iv is the loop index i, starting at 0.
  59   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
       ; %i1 = 4 * i.
  60   %i1 = shl nuw nsw i64 %indvars.iv, 2
  61   %arrayidx2 = getelementptr inbounds i16, i16* %points, i64 %i1
       ; First checked load: %points[4*i].
  62   %i2 = load i16, i16* %arrayidx2, align 2
       ; %i3 = 4*i + 1: the low two bits of %i1 are zero after the shift by 2,
       ; so or-ing in 1 is the same as adding 1.
  63   %i3 = or i64 %i1, 1
  64   %arrayidx7 = getelementptr inbounds i16, i16* %points, i64 %i3
       ; Second checked load: %points[4*i + 1].
  65   %i4 = load i16, i16* %arrayidx7, align 2
       ; Unit-stride stores of the two loaded values to %x[i] and %y[i].
  66   %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
  67   store i16 %i2, i16* %arrayidx, align 2
  68   %arrayidx4 = getelementptr inbounds i16, i16* %y, i64 %indvars.iv
  69   store i16 %i4, i16* %arrayidx4, align 2
  70   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
       ; Leave the loop once i + 1 == 1024.
  71   %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
  72   br i1 %exitcond.not, label %for.end, label %for.body
  73
  74 for.end:
  75   ret void
  76 }
  77
  78 ; (2) Same as above, but this time the gaps mask of the load is also And-ed with the
  79 ; fold-tail mask. If using masked memops to vectorize interleaved-group with gaps is
  80 ; not allowed, the load is scalarized and predicated.
  81 ; The input IR was generated from this source:
  82 ;     for(i=0;i<numPoints;i++){
  83 ;       x[i] = points[i*4];
  84 ;       y[i] = points[i*4 + 1];
  85 ;     }
  86
  87 ; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
  88 ;
  89 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  90 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  91 ;
  92 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  93 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  94 ;
  95 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  96 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
  97 ;
  98 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
  99 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 100 ;
 101 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 102 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 103
 104 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test2"
 105 ;
 106 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 107 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 108 ;
 109 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 8 for VF 2 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 110 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 111 ;
 112 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 4 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 113 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 114 ;
 115 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 11 for VF 8 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 116 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 117 ;
 118 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 17 for VF 16 For instruction:   %i2 = load i16, i16* %arrayidx2, align 2
 119 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 0 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx7, align 2
 120
 121 define void @test2(i16* noalias nocapture %points, i32 %numPoints, i16* noalias nocapture readonly %x, i16* noalias nocapture readonly %y) {
     ; Same loop body as @test1 (loads %points[4*i] and %points[4*i + 1], stores
     ; them to %x[i] and %y[i]), but the trip count is the runtime value
     ; %numPoints instead of a constant. The loop is skipped entirely when
     ; %numPoints is not greater than 0 (signed compare).
     ;
     ; The FileCheck lines for "test2" match the two load instructions by their
     ; exact printed text (%i2 / %arrayidx2 and %i4 / %arrayidx7), so these value
     ; names must not be changed without updating the checks.
     ;
     ; NOTE(review): %x and %y carry `readonly` although they are the store
     ; destinations below, while %points (only loaded from) does not. The
     ; attributes look swapped; confirm whether this is intentional.
 122 entry:
       ; Guard: only enter the loop for a positive %numPoints.
 123   %cmp15 = icmp sgt i32 %numPoints, 0
 124   br i1 %cmp15, label %for.body.preheader, label %for.end
 125
 126 for.body.preheader:
       ; Widen the (known positive) trip count to the i64 index type.
 127   %wide.trip.count = zext i32 %numPoints to i64
 128   br label %for.body
 129
 130 for.body:
       ; %indvars.iv is the loop index i, starting at 0.
 131   %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
       ; %i1 = 4 * i.
 132   %i1 = shl nuw nsw i64 %indvars.iv, 2
 133   %arrayidx2 = getelementptr inbounds i16, i16* %points, i64 %i1
       ; First checked load: %points[4*i].
 134   %i2 = load i16, i16* %arrayidx2, align 2
       ; %i3 = 4*i + 1: the low two bits of %i1 are zero after the shift by 2,
       ; so or-ing in 1 is the same as adding 1.
 135   %i3 = or i64 %i1, 1
 136   %arrayidx7 = getelementptr inbounds i16, i16* %points, i64 %i3
       ; Second checked load: %points[4*i + 1].
 137   %i4 = load i16, i16* %arrayidx7, align 2
       ; Unit-stride stores of the two loaded values to %x[i] and %y[i].
 138   %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
 139   store i16 %i2, i16* %arrayidx, align 2
 140   %arrayidx4 = getelementptr inbounds i16, i16* %y, i64 %indvars.iv
 141   store i16 %i4, i16* %arrayidx4, align 2
 142   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
       ; Leave the loop once i + 1 equals the runtime trip count.
 143   %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
 144   br i1 %exitcond.not, label %for.end.loopexit, label %for.body
 145
 146 for.end.loopexit:
 147   br label %for.end
 148
 149 for.end:
 150   ret void
 151 }
 152
 153 ; (3) Testing a scenario of a conditional load. The gaps mask of the load is also
 154 ; And-ed with the condition mask (x[i] > 0).
 155 ; If using masked memops to vectorize interleaved-group with gaps is
 156 ; not allowed, the load is scalarized and predicated.
 157 ; Here the Interleave-group is with factor 3, loading only 1 member out of the 3.
 158 ; The input IR was generated from this source:
 159 ;     for(i=0;i<1024;i++){
 160 ;       if (x[i] > 0)
 161 ;         x[i] = points[i*3];
 162 ;     }
 163
 164 ; DISABLED_MASKED_STRIDED: LV: Checking a loop in "test"
 165 ;
 166 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 167 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 168 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 169 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 170 ; DISABLED_MASKED_STRIDED: LV: Found an estimated cost of 3000000 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 171
 172 ; ENABLED_MASKED_STRIDED: LV: Checking a loop in "test"
 173 ;
 174 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 1 for VF 1 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 175 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 7 for VF 2 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 176 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 4 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 177 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 9 for VF 8 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 178 ; ENABLED_MASKED_STRIDED: LV: Found an estimated cost of 14 for VF 16 For instruction:   %i4 = load i16, i16* %arrayidx6, align 2
 179
 180 define void @test(i16* noalias nocapture %points, i16* noalias nocapture readonly %x, i16* noalias nocapture readnone %y) {
     ; Fixed-trip-count loop (exits when the incremented index equals 1024).
     ; Each iteration reads %x[i]; only when that value is greater than 0
     ; (signed compare) does it load %points[3*i] and store it back to %x[i].
     ; The load from %points therefore sits in a conditionally executed block
     ; and reads one element out of every group of 3.
     ;
     ; The FileCheck lines for "test" match the conditional load by its exact
     ; printed text (%i4 / %arrayidx6), so these value names must not be
     ; changed without updating the checks.
     ;
     ; %y is not used anywhere in the body.
     ; NOTE(review): %x carries `readonly` although it is stored through in
     ; %if.then; confirm whether this is intentional.
 181 entry:
 182   br label %for.body
 183
 184 for.body:
       ; %indvars.iv is the loop index i, starting at 0; the back edge comes
       ; from %for.inc because the body is split by the condition.
 185   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ]
 186   %arrayidx = getelementptr inbounds i16, i16* %x, i64 %indvars.iv
 187   %i2 = load i16, i16* %arrayidx, align 2
       ; Condition x[i] > 0 that guards the strided load and the store.
 188   %cmp1 = icmp sgt i16 %i2, 0
 189   br i1 %cmp1, label %if.then, label %for.inc
 190
 191 if.then:
       ; %i1 = 3 * i.
 192   %i1 = mul nuw nsw i64 %indvars.iv, 3
 193   %arrayidx6 = getelementptr inbounds i16, i16* %points, i64 %i1
       ; Checked load: %points[3*i], executed only when the condition holds.
 194   %i4 = load i16, i16* %arrayidx6, align 2
       ; Store back to the same %x[i] address that was read in %for.body.
 195   store i16 %i4, i16* %arrayidx, align 2
 196   br label %for.inc
 197
 198 for.inc:
 199   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
       ; Leave the loop once i + 1 == 1024.
 200   %exitcond.not = icmp eq i64 %indvars.iv.next, 1024
 201   br i1 %exitcond.not, label %for.end, label %for.body
 202
 203 for.end:
 204   ret void
 205 }