; RUN: opt -passes=loop-vectorize -force-vector-width=4 -enable-vplan-native-path -S %s | FileCheck %s

; Vectorize the explicitly marked outer loop using the vplan native path. The
; inner loop contains a simple double add reduction. The IR is compiled and
; modified by hand from the following C code:
; void inner_loop_reduction(const double* restrict in_a, const double* restrict in_b, double* restrict out)
; {
;   #pragma clang loop vectorize(enable)
;   for (int i = 0; i < 1000; ++i) {
;     double a = in_a[i];
;     double b = in_b[i];
;     for (int j = 0; j < 10000; ++j) {
;       a = a + b;
;     }
;     out[i] = a;
;   }
; }

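; With -force-vector-width=4 the outer loop is widened by 4: the loads of
; in_a[i] and in_b[i] become masked gathers, the inner reduction becomes a
; <4 x double> fadd, and the final sums are written back with a masked scatter.
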
define void @inner_loop_reduction(ptr noalias nocapture readonly %a.in, ptr noalias nocapture readonly %b.in, ptr noalias nocapture %c.out) {
; CHECK-LABEL: @inner_loop_reduction(

; CHECK: vector.body:
; CHECK-NEXT: %[[FOR1_INDEX:.*]] = phi i64 [ 0, %[[LABEL_PR:.*]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH:.*]] ]
; CHECK: %[[VEC_INDEX:.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %[[LABEL_PR]] ], [ %{{.*}}, %[[LABEL_FOR1_LATCH]] ]
; CHECK-NEXT: %[[A_PTR:.*]] = getelementptr inbounds double, ptr %a.in, <4 x i64> %[[VEC_INDEX]]
; CHECK-NEXT: %[[MASKED_GATHER1:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[A_PTR]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison)
; CHECK-NEXT: %[[B_PTR:.*]] = getelementptr inbounds double, ptr %b.in, <4 x i64> %[[VEC_INDEX]]
; CHECK-NEXT: %[[MASKED_GATHER2:.*]] = call <4 x double> @llvm.masked.gather.v4f64.v4p0(<4 x ptr> %[[B_PTR]], i32 8, <4 x i1> splat (i1 true), <4 x double> poison)
; CHECK-NEXT: br label %[[FOR2_HEADER:.*]]

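; The inner reduction loop operates on whole vectors; since the trip count is
; the same for every lane, the exit condition is extracted from lane 0.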
; CHECK: [[FOR2_HEADER]]:
; CHECK-NEXT: %[[FOR2_INDEX:.*]] = phi <4 x i32> [ zeroinitializer, %vector.body ], [ %[[FOR2_INDEX_NEXT:.*]], %[[FOR2_HEADER]] ]
; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[MASKED_GATHER1]], %vector.body ], [ %[[REDUCTION_NEXT:.*]], %[[FOR2_HEADER]] ]
; CHECK-NEXT: %[[REDUCTION_NEXT]] = fadd <4 x double> %[[MASKED_GATHER2]], %[[REDUCTION]]
; CHECK-NEXT: %[[FOR2_INDEX_NEXT]] = add nuw nsw <4 x i32> %[[FOR2_INDEX]], splat (i32 1)
; CHECK-NEXT: %[[VEC_PTR:.*]] = icmp eq <4 x i32> %[[FOR2_INDEX_NEXT]], splat (i32 10000)
; CHECK-NEXT: %[[EXIT_COND:.*]] = extractelement <4 x i1> %[[VEC_PTR]], i32 0
; CHECK-NEXT: br i1 %[[EXIT_COND]], label %[[FOR1_LATCH:.*]], label %{{.*}}

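; In the outer-loop latch the final reduction vector is scattered to %c.out,
; and both the scalar and the vector induction variables step by the VF of 4.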
; CHECK: [[FOR1_LATCH]]:
; CHECK-NEXT: %[[REDUCTION:.*]] = phi <4 x double> [ %[[REDUCTION_NEXT]], %[[FOR2_HEADER]] ]
; CHECK-NEXT: %[[C_PTR:.*]] = getelementptr inbounds double, ptr %c.out, <4 x i64> %[[VEC_INDEX]]
; CHECK-NEXT: call void @llvm.masked.scatter.v4f64.v4p0(<4 x double> %[[REDUCTION]], <4 x ptr> %[[C_PTR]], i32 8, <4 x i1> splat (i1 true))
; CHECK-NEXT: %[[FOR1_INDEX_NEXT:.*]] = add nuw i64 %[[FOR1_INDEX]], 4
; CHECK-NEXT: %{{.*}} = add <4 x i64> %[[VEC_INDEX]], splat (i64 4)
; CHECK-NEXT: %[[EXIT_COND:.*]] = icmp eq i64 %[[FOR1_INDEX_NEXT]], 1000
; CHECK-NEXT: br i1 %[[EXIT_COND]], label %{{.*}}, label %vector.body

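; Scalar input IR: the outer loop loads a and b, the inner loop accumulates b
; into the running sum 10000 times, and the latch stores the result to c.out.
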
entry:
  br label %for1.header

for1.header:                                      ; preds = %entry, %for1.latch
  %indvar1 = phi i64 [ 0, %entry ], [ %indvar11, %for1.latch ]
  %a.ptr = getelementptr inbounds double, ptr %a.in, i64 %indvar1
  %a = load double, ptr %a.ptr, align 8
  %b.ptr = getelementptr inbounds double, ptr %b.in, i64 %indvar1
  %b = load double, ptr %b.ptr, align 8
  br label %for2.header

for2.header:                                      ; preds = %for1.header, %for2.header
  %indvar2 = phi i32 [ 0, %for1.header ], [ %indvar21, %for2.header ]
  %a.reduction = phi double [ %a, %for1.header ], [ %a.reduction1, %for2.header ]
  %a.reduction1 = fadd double %b, %a.reduction
  %indvar21 = add nuw nsw i32 %indvar2, 1
  %for2.cond = icmp eq i32 %indvar21, 10000
  br i1 %for2.cond, label %for1.latch, label %for2.header

for1.latch:                                       ; preds = %for2.header
  %c.ptr = getelementptr inbounds double, ptr %c.out, i64 %indvar1
  store double %a.reduction1, ptr %c.ptr, align 8
  %indvar11 = add nuw nsw i64 %indvar1, 1
  %for1.cond = icmp eq i64 %indvar11, 1000
  br i1 %for1.cond, label %exit, label %for1.header, !llvm.loop !0

exit:                                             ; preds = %for1.latch
  ret void
}

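; Loop metadata attached to the outer-loop latch branch; it is what marks the
; outer loop for vectorization through the vplan native path.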
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.enable", i1 true}