test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll

   1 ; RUN: opt -S < %s -basicaa -loop-vectorize -force-vector-interleave=1 2>&1 | FileCheck %s
   2
     ; The command above runs the loop vectorizer (with basic alias analysis) over
     ; this file with the interleave count forced to 1, and pipes the textual IR,
     ; together with anything written to stderr, into FileCheck. The module is
     ; built for the "aarch64" triple declared below.
   3 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
   4 target triple = "aarch64"
   5
   6 ; CHECK-LABEL: @add_a(
   7 ; CHECK: load <16 x i8>, <16 x i8>*
   8 ; CHECK: add <16 x i8>
   9 ; CHECK: store <16 x i8>
     ; q[i] = (i8)((i32)p[i] + 2) for i in [0, %len).
     ; The scalar loop zero-extends each loaded i8 to i32, adds 2 and truncates
     ; back to i8. The checks above expect the vectorized load, add and store to
     ; all operate on <16 x i8>, and the vector add to carry no nuw/nsw flags.
  10 ; Function Attrs: nounwind
  11 define void @add_a(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
  12 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
  13   %cmp8 = icmp sgt i32 %len, 0
  14   br i1 %cmp8, label %for.body, label %for.cond.cleanup
  15
  16 for.cond.cleanup:                                 ; preds = %for.body, %entry
  17   ret void
  18
  19 for.body:                                         ; preds = %entry, %for.body
  20   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  21   %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  22   %0 = load i8, i8* %arrayidx
       ; Widen to i32, add 2, then narrow back to the stored element type.
  23   %conv = zext i8 %0 to i32
  24   %add = add nuw nsw i32 %conv, 2
  25   %conv1 = trunc i32 %add to i8
  26   %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  27   store i8 %conv1, i8* %arrayidx3
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
  28   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  29   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  30   %exitcond = icmp eq i32 %lftr.wideiv, %len
  31   br i1 %exitcond, label %for.cond.cleanup, label %for.body
  32 }
  33
  34 ; Ensure that we preserve nuw/nsw if we're not shrinking the values we're
  35 ; working with.
     ; q[i] = p[i] + 2 for i in [0, %len), computed directly in i8: unlike @add_a
     ; there is no zext/trunc pair around the add, and the checks below expect
     ; the nuw nsw flags to appear on the <16 x i8> vector add.
  36 ; CHECK-LABEL: @add_a1(
  37 ; CHECK: load <16 x i8>, <16 x i8>*
  38 ; CHECK: add nuw nsw <16 x i8>
  39 ; CHECK: store <16 x i8>
  40 ; Function Attrs: nounwind
  41 define void @add_a1(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
  42 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
  43   %cmp8 = icmp sgt i32 %len, 0
  44   br i1 %cmp8, label %for.body, label %for.cond.cleanup
  45
  46 for.cond.cleanup:                                 ; preds = %for.body, %entry
  47   ret void
  48
  49 for.body:                                         ; preds = %entry, %for.body
  50   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  51   %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
  52   %0 = load i8, i8* %arrayidx
       ; The add is already performed at the element width (i8).
  53   %add = add nuw nsw i8 %0, 2
  54   %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
  55   store i8 %add, i8* %arrayidx3
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
  56   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  57   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  58   %exitcond = icmp eq i32 %lftr.wideiv, %len
  59   br i1 %exitcond, label %for.cond.cleanup, label %for.body
  60 }
  61
  62 ; CHECK-LABEL: @add_b(
  63 ; CHECK: load <8 x i16>, <8 x i16>*
  64 ; CHECK: add <8 x i16>
  65 ; CHECK: store <8 x i16>
     ; q[i] = (i16)((i32)p[i] + 2) for i in [0, %len), with i16 elements on both
     ; sides. The scalar add is done in i32 between a zext and a trunc; the checks
     ; above expect the vectorized load, add and store to operate on <8 x i16>,
     ; and the vector add to carry no nuw/nsw flags.
  66 ; Function Attrs: nounwind
  67 define void @add_b(i16* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
  68 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
  69   %cmp9 = icmp sgt i32 %len, 0
  70   br i1 %cmp9, label %for.body, label %for.cond.cleanup
  71
  72 for.cond.cleanup:                                 ; preds = %for.body, %entry
  73   ret void
  74
  75 for.body:                                         ; preds = %entry, %for.body
  76   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
  77   %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
  78   %0 = load i16, i16* %arrayidx
       ; Widen to i32, add 2, then narrow back to the stored element type.
  79   %conv8 = zext i16 %0 to i32
  80   %add = add nuw nsw i32 %conv8, 2
  81   %conv1 = trunc i32 %add to i16
  82   %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
  83   store i16 %conv1, i16* %arrayidx3
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
  84   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  85   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  86   %exitcond = icmp eq i32 %lftr.wideiv, %len
  87   br i1 %exitcond, label %for.cond.cleanup, label %for.body
  88 }
  89
  90 ; CHECK-LABEL: @add_c(
  91 ; CHECK: load <8 x i8>, <8 x i8>*
  92 ; CHECK: add <8 x i16>
  93 ; CHECK: store <8 x i16>
     ; q[i] = (i16)((i32)p[i] + 2) for i in [0, %len), reading i8 elements and
     ; writing i16 elements. The checks above expect an <8 x i8> load and the add
     ; and store to be done on <8 x i16>, i.e. at the width of the stored type
     ; rather than the i32 used by the scalar add.
  94 ; Function Attrs: nounwind
  95 define void @add_c(i8* noalias nocapture readonly %p, i16* noalias nocapture %q, i32 %len) #0 {
  96 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
  97   %cmp8 = icmp sgt i32 %len, 0
  98   br i1 %cmp8, label %for.body, label %for.cond.cleanup
  99
 100 for.cond.cleanup:                                 ; preds = %for.body, %entry
 101   ret void
 102
 103 for.body:                                         ; preds = %entry, %for.body
 104   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
 105   %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
 106   %0 = load i8, i8* %arrayidx
       ; Widen the i8 to i32, add 2, then narrow to the i16 that is stored.
 107   %conv = zext i8 %0 to i32
 108   %add = add nuw nsw i32 %conv, 2
 109   %conv1 = trunc i32 %add to i16
 110   %arrayidx3 = getelementptr inbounds i16, i16* %q, i64 %indvars.iv
 111   store i16 %conv1, i16* %arrayidx3
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
 112   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 113   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 114   %exitcond = icmp eq i32 %lftr.wideiv, %len
 115   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 116 }
 117
 118 ; CHECK-LABEL: @add_d(
 119 ; CHECK: load <4 x i16>
 120 ; CHECK: add nsw <4 x i32>
 121 ; CHECK: store <4 x i32>
     ; q[i] = (i32)p[i] + 2 for i in [0, %len), reading i16 elements (sign
     ; extended) and writing i32 elements. The i32 result is stored without any
     ; truncation, and the checks above expect the add to stay at <4 x i32> with
     ; its nsw flag.
 122 define void @add_d(i16* noalias nocapture readonly %p, i32* noalias nocapture %q, i32 %len) #0 {
 123 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
 124   %cmp7 = icmp sgt i32 %len, 0
 125   br i1 %cmp7, label %for.body, label %for.cond.cleanup
 126
 127 for.cond.cleanup:                                 ; preds = %for.body, %entry
 128   ret void
 129
 130 for.body:                                         ; preds = %entry, %for.body
 131   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
 132   %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
 133   %0 = load i16, i16* %arrayidx
       ; Sign-extend to i32 and add 2; the full i32 value is what gets stored.
 134   %conv = sext i16 %0 to i32
 135   %add = add nsw i32 %conv, 2
 136   %arrayidx2 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
 137   store i32 %add, i32* %arrayidx2
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
 138   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 139   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 140   %exitcond = icmp eq i32 %lftr.wideiv, %len
 141   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 142 }
 143
 144 ; CHECK-LABEL: @add_e(
 145 ; CHECK: load <16 x i8>
 146 ; CHECK: shl <16 x i8>
 147 ; CHECK: add <16 x i8>
 148 ; CHECK: or <16 x i8>
 149 ; CHECK: mul <16 x i8>
 150 ; CHECK: and <16 x i8>
 151 ; CHECK: xor <16 x i8>
 152 ; CHECK: mul <16 x i8>
 153 ; CHECK: store <16 x i8>
     ; A longer chain of i32 operations (shl, add, or, mul, and, xor, mul) sits
     ; between a zext from i8 and a trunc back to i8. The checks above expect
     ; the operations they list to appear on <16 x i8>, in that order.
     ; Note that several value names do not match their opcodes: %add is a shl
     ; and %conv2 is an add.
 154 define void @add_e(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 155 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
 156   %cmp.32 = icmp sgt i32 %len, 0
 157   br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
 158
 159 for.body.lr.ph:                                   ; preds = %entry
       ; The two i8 arguments are zero-extended to i32 once, outside the loop.
 160   %conv11 = zext i8 %arg2 to i32
 161   %conv13 = zext i8 %arg1 to i32
 162   br label %for.body
 163
 164 for.cond.cleanup:                                 ; preds = %for.body, %entry
 165   ret void
 166
 167 for.body:                                         ; preds = %for.body, %for.body.lr.ph
 168   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
 169   %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
 170   %0 = load i8, i8* %arrayidx
 171   %conv = zext i8 %0 to i32
       ; %and = ((p[i] << 4) + 32) & arg1
 172   %add = shl i32 %conv, 4
 173   %conv2 = add nuw nsw i32 %add, 32
       ; %conv17 = (((p[i] | 51) * 60) & 252) ^ arg2
 174   %or = or i32 %conv, 51
 175   %mul = mul nuw nsw i32 %or, 60
 176   %and = and i32 %conv2, %conv13
 177   %mul.masked = and i32 %mul, 252
 178   %conv17 = xor i32 %mul.masked, %conv11
       ; The product of the two sub-expressions is narrowed to i8 and stored.
 179   %mul18 = mul nuw nsw i32 %conv17, %and
 180   %conv19 = trunc i32 %mul18 to i8
 181   %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
 182   store i8 %conv19, i8* %arrayidx21
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
 183   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 184   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 185   %exitcond = icmp eq i32 %lftr.wideiv, %len
 186   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 187 }
 188
 189 ; CHECK-LABEL: @add_f(
 190 ; CHECK: load <8 x i16>
 191 ; CHECK: trunc <8 x i16>
 192 ; CHECK: shl <8 x i8>
 193 ; CHECK: add <8 x i8>
 194 ; CHECK: or <8 x i8>
 195 ; CHECK: mul <8 x i8>
 196 ; CHECK: and <8 x i8>
 197 ; CHECK: xor <8 x i8>
 198 ; CHECK: mul <8 x i8>
 199 ; CHECK: store <8 x i8>
     ; Same shape as @add_e, but the input elements are i16 (sign-extended to
     ; i32) and there is an extra `and ..., 204` ahead of the `or ..., 51`. The
     ; checks above expect the <8 x i16> load to be truncated and the listed
     ; operations to appear on <8 x i8>, in that order.
     ; The label above ends in "(" like every other label in this file, so it
     ; can only anchor on the definition of @add_f itself and not on a function
     ; whose name merely starts with "add_f".
     ; Note that several value names do not match their opcodes: %add is a shl,
     ; %conv2 is an add, %or is an and and %conv8 is an or.
 200 define void @add_f(i16* noalias nocapture readonly %p, i8* noalias nocapture %q, i8 %arg1, i8 %arg2, i32 %len) #0 {
 201 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
 202   %cmp.32 = icmp sgt i32 %len, 0
 203   br i1 %cmp.32, label %for.body.lr.ph, label %for.cond.cleanup
 204
 205 for.body.lr.ph:                                   ; preds = %entry
       ; The two i8 arguments are zero-extended to i32 once, outside the loop.
 206   %conv11 = zext i8 %arg2 to i32
 207   %conv13 = zext i8 %arg1 to i32
 208   br label %for.body
 209
 210 for.cond.cleanup:                                 ; preds = %for.body, %entry
 211   ret void
 212
 213 for.body:                                         ; preds = %for.body, %for.body.lr.ph
 214   %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
 215   %arrayidx = getelementptr inbounds i16, i16* %p, i64 %indvars.iv
 216   %0 = load i16, i16* %arrayidx
 217   %conv = sext i16 %0 to i32
       ; %and = ((p[i] << 4) + 32) & arg1
 218   %add = shl i32 %conv, 4
 219   %conv2 = add nsw i32 %add, 32
       ; %conv17 = ((((p[i] & 204) | 51) * 60) & 252) ^ arg2
 220   %or = and i32 %conv, 204
 221   %conv8 = or i32 %or, 51
 222   %mul = mul nuw nsw i32 %conv8, 60
 223   %and = and i32 %conv2, %conv13
 224   %mul.masked = and i32 %mul, 252
 225   %conv17 = xor i32 %mul.masked, %conv11
       ; The product of the two sub-expressions is narrowed to i8 and stored.
 226   %mul18 = mul nuw nsw i32 %conv17, %and
 227   %conv19 = trunc i32 %mul18 to i8
 228   %arrayidx21 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
 229   store i8 %conv19, i8* %arrayidx21
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
 230   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 231   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 232   %exitcond = icmp eq i32 %lftr.wideiv, %len
 233   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 234 }
 235
 236 ; CHECK-LABEL: @add_phifail(
 237 ; CHECK: load <16 x i8>, <16 x i8>*
 238 ; CHECK: add nuw nsw <16 x i32>
 239 ; CHECK: store <16 x i8>
     ; Same loop body as @add_a plus %a_phi, a phi that carries the i32 value
     ; %conv from one iteration to the next and has no other use in this
     ; function. The checks above expect the load and store on <16 x i8> but the
     ; add to remain at <16 x i32> with its nuw nsw flags.
     ; NOTE(review): presumably the phi user of %conv is what keeps the add from
     ; being narrowed (hence the function name) -- confirm against the
     ; vectorizer's minimum-bitwidth analysis.
 240 ; Function Attrs: nounwind
 241 define void @add_phifail(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
 242 entry:
       ; Skip the loop entirely unless %len > 0 (signed compare).
 243   %cmp8 = icmp sgt i32 %len, 0
 244   br i1 %cmp8, label %for.body, label %for.cond.cleanup
 245
 246 for.cond.cleanup:                                 ; preds = %for.body, %entry
 247   ret void
 248
 249 for.body:                                         ; preds = %entry, %for.body
 250   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
       ; Previous iteration's %conv (0 on the first iteration).
 251   %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
 252   %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
 253   %0 = load i8, i8* %arrayidx
       ; Widen to i32, add 2, then narrow back to the stored element type.
 254   %conv = zext i8 %0 to i32
 255   %add = add nuw nsw i32 %conv, 2
 256   %conv1 = trunc i32 %add to i8
 257   %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
 258   store i8 %conv1, i8* %arrayidx3
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
 259   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 260   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 261   %exitcond = icmp eq i32 %lftr.wideiv, %len
 262   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 263 }
 264
 265 ; Function Attrs: nounwind
 266 ; When this loop is vectorized, the value of %a_phi used after the loop must
 267 ; still be correct when %len is an exact multiple of VF. The checks below expect
 268 ; middle.block to extract the second-to-last lane (index 14) of the widened
 269 ; %conv and pass it to for.cond.cleanup, so the result is a_phi = p[len - 2].
 270 define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
 271 ; CHECK-LABEL: @add_phifail2(
 272 ; CHECK: vector.body:
 273 ; CHECK:   %wide.load = load <16 x i8>, <16 x i8>*
 274 ; CHECK:   %[[L1:.+]] = zext <16 x i8> %wide.load to <16 x i32>
 275 ; CHECK:   add nuw nsw <16 x i32>
 276 ; CHECK:   store <16 x i8>
 277 ; CHECK:   add i64 %index, 16
 278 ; CHECK:   icmp eq i64 %index.next, %n.vec
 279 ; CHECK: middle.block:
 280 ; CHECK:   %vector.recur.extract = extractelement <16 x i32> %[[L1]], i32 15
 281 ; CHECK:   %vector.recur.extract.for.phi = extractelement <16 x i32> %[[L1]], i32 14
 282 ; CHECK: for.cond.cleanup:
 283 ; CHECK:   %a_phi.lcssa = phi i32 [ %scalar.recur, %for.body ], [ %vector.recur.extract.for.phi, %middle.block ]
 284 ; CHECK:   %ret = trunc i32 %a_phi.lcssa to i8
 285 ; CHECK:   ret i8 %ret
 286 entry:
       ; Unlike the other functions in this file there is no %len > 0 guard:
       ; control enters the loop unconditionally.
 287   br label %for.body
 288
 289 for.cond.cleanup:                                 ; preds = %for.body
       ; %a_phi is used outside the loop here and returned, truncated to i8.
 290   %ret = trunc i32 %a_phi to i8
 291   ret i8 %ret
 292
 293 for.body:                                         ; preds = %entry, %for.body
 294   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
       ; Previous iteration's %conv (0 on the first iteration).
 295   %a_phi = phi i32 [ %conv, %for.body ], [ 0, %entry ]
 296   %arrayidx = getelementptr inbounds i8, i8* %p, i64 %indvars.iv
 297   %0 = load i8, i8* %arrayidx
       ; Widen to i32, add 2, then narrow back to the stored element type.
 298   %conv = zext i8 %0 to i32
 299   %add = add nuw nsw i32 %conv, 2
 300   %conv1 = trunc i32 %add to i8
 301   %arrayidx3 = getelementptr inbounds i8, i8* %q, i64 %indvars.iv
 302   store i8 %conv1, i8* %arrayidx3
       ; Exit once the incremented 64-bit counter, truncated to i32, equals %len.
 303   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 304   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
 305   %exitcond = icmp eq i32 %lftr.wideiv, %len
 306   br i1 %exitcond, label %for.cond.cleanup, label %for.body
 307 }
 308
 309 attributes #0 = { nounwind }
     ; Attribute group #0 is the one referenced by every function definition in
     ; this file.
 310