llvm/test/Transforms/LoopVectorize/AArch64/extend-vectorization-factor-for-unprofitable-memops.ll

   1 ; RUN: opt -loop-vectorize -mtriple=arm64-apple-darwin -S %s | FileCheck %s
   2
   3 ; Test cases for extending the vectorization factor when small memory
   4 ; operations are not profitable.
   5
   6 ; Test with a loop that contains memory accesses of i8 and i32 types. The
   7 ; default maximum VF for NEON is 4. While there is no instruction to load
   8 ; 4 x i8, vectorization might still be profitable.
   9 define void @test_load_i8_store_i32(i8* noalias %src, i32* noalias %dst, i32 %off, i64 %N) {
  10 ; CHECK-LABEL: @test_load_i8_store_i32(
  11 ; CHECK:       <4 x i8>
  12 ;
  13 entry:
  14   br label %loop
  15
  16 loop:
  17   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, starts at 0
  18   %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv
  19   %lv = load i8, i8* %gep.src, align 1 ; narrow (i8) load from src[iv]
  20   %lv.ext = zext i8 %lv to i32 ; widen to the type that is stored
  21   %add = add i32 %lv.ext, %off ; add the loop-invariant offset
  22   %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv
  23   store i32 %add, i32* %gep.dst ; wide (i32) store to dst[iv]
  24   %iv.next = add nuw nsw i64 %iv, 1
  25   %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next equals N
  26   br i1 %exitcond.not, label %exit, label %loop
  27
  28 exit:
  29   ret void
  30 }
  31
  32 ; Same as test_load_i8_store_i32, but with types flipped: i32 load, i8 store.
  33 define void @test_load_i32_store_i8(i32* noalias %src, i8* noalias %dst, i32 %off, i64 %N) {
  34 ; CHECK-LABEL: @test_load_i32_store_i8(
  35 ; CHECK:     <4 x i8>
  36 ;
  37 entry:
  38   br label %loop
  39
  40 loop:
  41   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, starts at 0
  42   %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
  43   %lv = load i32, i32* %gep.src, align 1 ; wide (i32) load from src[iv]
  44   %add = add i32 %lv, %off ; add the loop-invariant offset in i32
  45   %add.trunc = trunc i32 %add to i8 ; narrow to the type that is stored
  46   %gep.dst = getelementptr inbounds i8, i8* %dst, i64 %iv
  47   store i8 %add.trunc, i8* %gep.dst ; narrow (i8) store to dst[iv]
  48   %iv.next = add nuw nsw i64 %iv, 1
  49   %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next equals N
  50   br i1 %exitcond.not, label %exit, label %loop
  51
  52 exit:
  53   ret void
  54 }
  55
  56 ; All memory operations use i32, so all of them are profitable with VF 4.
  57 define void @test_load_i32_store_i32(i32* noalias %src, i32* noalias %dst, i8 %off, i64 %N) {
  58 ; CHECK-LABEL: @test_load_i32_store_i32(
  59 ; CHECK: vector.body:
  60 ; CHECK:   <4 x i32>
  61 ;
  62 entry:
  63   br label %loop
  64
  65 loop:
  66   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, starts at 0
  67   %gep.src = getelementptr inbounds i32, i32* %src, i64 %iv
  68   %lv = load i32, i32* %gep.src, align 1 ; i32 load from src[iv]
  69   %lv.trunc = trunc i32 %lv to i8 ; the arithmetic below is done in i8 ...
  70   %add = add i8 %lv.trunc, %off ; ... using the i8 loop-invariant offset
  71   %add.ext = zext i8 %add to i32 ; widen back so the store is i32 again
  72   %gep.dst = getelementptr inbounds i32, i32* %dst, i64 %iv
  73   store i32 %add.ext, i32* %gep.dst ; i32 store to dst[iv]
  74   %iv.next = add nuw nsw i64 %iv, 1
  75   %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next equals N
  76   br i1 %exitcond.not, label %exit, label %loop
  77
  78 exit:
  79   ret void
  80 }
  81
  82 ; Test with loop body that requires a large number of vector registers if the
  83 ; vectorization factor is large. Make sure the register estimates limit the
  84 ; vectorization factor.
  85 define void @test_load_i8_store_i64_large(i8* noalias %src, i64* noalias %dst, i64* noalias %dst.2, i64* noalias %dst.3, i64* noalias %dst.4, i64* noalias %dst.5, i64%off, i64 %off.2, i64 %N) {
  86 ; CHECK-LABEL: @test_load_i8_store_i64_large
  87 ; CHECK: <2 x i64>
  88 ;
  89 entry:
  90   br label %loop
  91
  92 loop:
  93   %iv = phi i64 [ %iv.next, %loop ], [ 0, %entry ] ; induction variable, starts at 0
  94   %gep.src = getelementptr inbounds i8, i8* %src, i64 %iv
  95   %gep.dst.3 = getelementptr inbounds i64, i64* %dst.3, i64 %iv
  96   %lv.dst.3 = load i64, i64* %gep.dst.3, align 1 ; i64 load from dst.3[iv]
  97   %gep.dst.5 = getelementptr inbounds i64, i64* %dst.5, i64 %iv ; only used by the final store below
  98   %lv.dst.5 = load i64, i64* %gep.dst.3, align 1 ; NOTE(review): reads %gep.dst.3, not %gep.dst.5 - confirm this is intended
   99
  100   %lv = load i8, i8* %gep.src, align 1 ; narrow (i8) load from src[iv]
  101   %lv.ext = zext i8 %lv to i64 ; widen to i64, the type of every store
  102   %add = add i64 %lv.ext, %off
  103   %add.2 = add i64 %add, %off.2
  104   %gep.dst = getelementptr inbounds i64, i64* %dst, i64 %iv
  105   %gep.dst.2 = getelementptr inbounds i64, i64* %dst.2, i64 %iv
  106
  107   %add.3 = add i64 %add.2, %lv.dst.3
  108   %add.4 = add i64 %add.3, %add
  109   %gep.dst.4 = getelementptr inbounds i64, i64* %dst.4, i64 %iv
  110   %add.5 = add i64 %add.2, %lv.dst.5
  111   store i64 %add.2, i64* %gep.dst.2 ; five i64 stores, one per destination pointer
  112   store i64 %add, i64* %gep.dst
  113   store i64 %add.3, i64* %gep.dst.3
  114   store i64 %add.4, i64* %gep.dst.4
  115   store i64 %add.5, i64* %gep.dst.5
  116
  117   %iv.next = add nuw nsw i64 %iv, 1
  118   %exitcond.not = icmp eq i64 %iv.next, %N ; leave the loop once iv.next equals N
  119   br i1 %exitcond.not, label %exit, label %loop
  120
  121 exit:
  122   ret void
  123 }