test/Transforms/LoopVectorize/X86/tail_folding_and_assume_safety.ll

   1 ; RUN: opt -mcpu=skx -S -loop-vectorize -instcombine -force-vector-width=8 -force-vector-interleave=1 < %s | FileCheck %s
   2
   3 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" ; data layout string paired with the triple on the next line
   4 target triple = "x86_64-pc-linux-gnu" ; the module is built for x86-64 Linux
   5
   6 ; Case1: With pragma predicate to force tail-folding.
   7 ; All memory operations are masked.
   8 ;void fold_tail(int * restrict p, int * restrict q1, int * restrict q2, int guard) {
   9 ;   #pragma clang loop vectorize_predicate(enable)
  10 ;   for(int ix=0; ix < 1021; ++ix) {
  11 ;     if (ix > guard) {
  12 ;       p[ix] = q1[ix] + q2[ix];
  13 ;     }
  14 ;   }
  15 ;}
  16
  17 ;CHECK-LABEL: @fold_tail
  18 ;CHECK: vector.body:
  19 ;CHECK: call <8 x i32> @llvm.masked.load
  20 ;CHECK: call <8 x i32> @llvm.masked.load
  21 ;CHECK: call void @llvm.masked.store
  22
  23 ; Function Attrs: nofree norecurse nounwind uwtable
  24 define dso_local void @fold_tail(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2,
  25 i32 %guard) local_unnamed_addr #0 { ; Case1 input: scalar loop storing p[i] = q1[i] + q2[i] only when i > guard
  26 entry:
  27   %0 = sext i32 %guard to i64 ; widen guard once so it can be compared with the i64 induction variable
  28   br label %for.body
  29
  30 for.cond.cleanup: ; single exit block
  31   ret void
  32
  33 for.body: ; loop header
  34   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] ; induction variable, starts at 0
  35   %cmp1 = icmp sgt i64 %indvars.iv, %0 ; signed test: i > guard
  36   br i1 %cmp1, label %if.then, label %for.inc ; the memory accesses below run only when the test is true
  37
  38 if.then: ; conditionally executed block holding both loads and the store
  39   %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv ; &q1[i]
  40   %1 = load i32, i32* %arrayidx, align 4, !tbaa !2
  41   %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv ; &q2[i]
  42   %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2
  43   %add = add nsw i32 %2, %1 ; q2[i] + q1[i]
  44   %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv ; &p[i]
  45   store i32 %add, i32* %arrayidx5, align 4, !tbaa !2
  46   br label %for.inc
  47
  48 for.inc: ; loop latch
  49   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  50   %exitcond = icmp eq i64 %indvars.iv.next, 1021 ; exits after i = 1020, i.e. 1021 iterations
  51   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !8 ; loop ID !8 carries llvm.loop.vectorize.predicate.enable
  52 }
  53
  54 ; Case2: With pragma assume_safety, only the store is masked (the loads stay unmasked).
  55 ; void assume_safety(int * p, int * q1, int * q2, int guard) {
  56 ;   #pragma clang loop vectorize(assume_safety)
  57 ;   for(int ix=0; ix < 1021; ++ix) {
  58 ;     if (ix > guard) {
  59 ;       p[ix] = q1[ix] + q2[ix];
  60 ;     }
  61 ;   }
  62 ;}
  63
  64 ;CHECK-LABEL: @assume_safety
  65 ;CHECK: vector.body:
  66 ;CHECK-NOT: @llvm.masked.load
  67 ;CHECK:  call void @llvm.masked.store
  68
  69 ; Function Attrs: norecurse nounwind uwtable
  70 define void @assume_safety(i32* nocapture, i32* nocapture readonly, i32* nocapture readonly, i32) local_unnamed_addr #0 { ; Case2 input; the unnamed arguments are %0..%3, so the entry block is implicitly %4
  71   %5 = sext i32 %3 to i64 ; widen the fourth argument to i64 for the comparison against the induction variable
  72   br label %7
  73
  74 ; <label>:6: exit block
  75   ret void
  76
  77 ; <label>:7: loop header
  78   %8 = phi i64 [ 0, %4 ], [ %18, %17 ] ; induction variable: 0 from the entry block, %18 from the latch
  79   %9 = icmp sgt i64 %8, %5 ; signed test: index > widened fourth argument
  80   br i1 %9, label %10, label %17 ; the memory accesses below run only when the test is true
  81
  82 ; <label>:10: conditionally executed block holding both loads and the store
  83   %11 = getelementptr inbounds i32, i32* %1, i64 %8 ; element address in the second pointer argument
  84   %12 = load i32, i32* %11, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6 ; annotation names loop ID !6, the ID on this loop's latch branch
  85   %13 = getelementptr inbounds i32, i32* %2, i64 %8 ; element address in the third pointer argument
  86   %14 = load i32, i32* %13, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  87   %15 = add nsw i32 %14, %12
  88   %16 = getelementptr inbounds i32, i32* %0, i64 %8 ; element address in the first pointer argument
  89   store i32 %15, i32* %16, align 4, !tbaa !2, !llvm.mem.parallel_loop_access !6
  90   br label %17
  91
  92 ; <label>:17: loop latch
  93   %18 = add nuw nsw i64 %8, 1
  94   %19 = icmp eq i64 %18, 1021 ; exits after index 1020, i.e. 1021 iterations
  95   br i1 %19, label %6, label %7, !llvm.loop !6 ; loop ID !6 carries llvm.loop.vectorize.enable
  96 }
  97
  98 ; Case3: With pragma assume_safety and pragma predicate, both the store and the
  99 ; loads are masked.
 100 ; void fold_tail_and_assume_safety(int * p, int * q1, int * q2, int guard) {
 101 ;   #pragma clang loop vectorize(assume_safety) vectorize_predicate(enable)
 102 ;   for(int ix=0; ix < 1021; ++ix) {
 103 ;     if (ix > guard) {
 104 ;       p[ix] = q1[ix] + q2[ix];
 105 ;     }
 106 ;   }
 107 ;}
 108
 109 ;CHECK-LABEL: @fold_tail_and_assume_safety
 110 ;CHECK: vector.body:
 111 ;CHECK: call <8 x i32> @llvm.masked.load
 112 ;CHECK: call <8 x i32> @llvm.masked.load
 113 ;CHECK: call void @llvm.masked.store
 114
 115 ; Function Attrs: nofree norecurse nounwind uwtable
 116 define dso_local void @fold_tail_and_assume_safety(i32* noalias nocapture %p, i32* noalias nocapture readonly %q1, i32* noalias nocapture readonly %q2,
 117 i32 %guard) local_unnamed_addr #0 { ; Case3 input: same loop body as @fold_tail, plus access-group metadata on every memory operation
 118 entry:
 119   %0 = sext i32 %guard to i64 ; widen guard once so it can be compared with the i64 induction variable
 120   br label %for.body
 121
 122 for.cond.cleanup: ; single exit block
 123   ret void
 124
 125 for.body: ; loop header
 126   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.inc ] ; induction variable, starts at 0
 127   %cmp1 = icmp sgt i64 %indvars.iv, %0 ; signed test: i > guard
 128   br i1 %cmp1, label %if.then, label %for.inc ; the memory accesses below run only when the test is true
 129
 130 if.then: ; conditionally executed block holding both loads and the store
 131   %arrayidx = getelementptr inbounds i32, i32* %q1, i64 %indvars.iv ; &q1[i]
 132   %1 = load i32, i32* %arrayidx, align 4, !tbaa !2, !llvm.access.group !10 ; member of access group !10
 133   %arrayidx3 = getelementptr inbounds i32, i32* %q2, i64 %indvars.iv ; &q2[i]
 134   %2 = load i32, i32* %arrayidx3, align 4, !tbaa !2, !llvm.access.group !10
 135   %add = add nsw i32 %2, %1 ; q2[i] + q1[i]
 136   %arrayidx5 = getelementptr inbounds i32, i32* %p, i64 %indvars.iv ; &p[i]
 137   store i32 %add, i32* %arrayidx5, align 4, !tbaa !2, !llvm.access.group !10
 138   br label %for.inc
 139
 140 for.inc: ; loop latch
 141   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
 142   %exitcond = icmp eq i64 %indvars.iv.next, 1021 ; exits after i = 1020, i.e. 1021 iterations
 143   br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !11 ; loop ID !11 lists group !10 as parallel and enables predication
 144 }
 145
 146 attributes #0 = { norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } ; attribute group shared by all three test functions
 147
 148 !llvm.module.flags = !{!0}
 149 !llvm.ident = !{!1}
 150
 151 !0 = !{i32 1, !"wchar_size", i32 4} ; the only module flag
 152 !1 = !{!"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"} ; producer identification string
 153 !2 = !{!3, !3, i64 0} ; TBAA tag attached to every load and store in this file; refers to the "int" node !3
 154 !3 = !{!"int", !4, i64 0} ; "int" type node, child of !4
 155 !4 = !{!"omnipotent char", !5, i64 0} ; "omnipotent char" node, child of the root !5
 156 !5 = !{!"Simple C/C++ TBAA"} ; TBAA root
 157 !6 = distinct !{!6, !7} ; loop ID used by @assume_safety (latch branch and its parallel_loop_access annotations)
 158 !7 = !{!"llvm.loop.vectorize.enable", i1 true}
 159
 160 !8 = distinct !{!8, !9} ; loop ID used by @fold_tail
 161 !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
 162
 163 !10 = distinct !{} ; access group attached to the memory operations in @fold_tail_and_assume_safety
 164 !11 = distinct !{!11, !12, !13} ; loop ID used by @fold_tail_and_assume_safety
 165 !12 = !{!"llvm.loop.parallel_accesses", !10} ; names access group !10 as parallel for loop !11
 166 !13 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}