; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; RUN: opt < %s -passes=loop-vectorize -mtriple=x86_64-unknown-linux -S -mcpu=slm | FileCheck %s

; This test should not be vectorized on the X86 SLM arch.
; Vectorizing the 64-bit multiply in this case is wrong, since it can be done
; in a lower bit width (note that the sources are 16-bit).
; Also, addq/subq (quad word) has a high cost on the SLM arch.
; This test shows bad performance (a regression of -70%) if vectorized on the SLM arch.
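;
; For reference only (an illustrative C sketch, not part of the test, with names
; mirroring the IR operands below): the loop is roughly
;
;   int64_t acc = 0;
;   for (int32_t i = 0; i < LastIndex; ++i)
;     acc += ((int64_t)InputData[i] * (int64_t)InputData[i + lag]) >> Scale;
;   return (int32_t)acc;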

target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

define i32 @no_vec(i32 %LastIndex, ptr nocapture readonly %InputData, i16 signext %lag, i16 signext %Scale) {
; CHECK-LABEL: define i32 @no_vec
; CHECK-SAME: (i32 [[LASTINDEX:%.*]], ptr nocapture readonly [[INPUTDATA:%.*]], i16 signext [[LAG:%.*]], i16 signext [[SCALE:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP17:%.*]] = icmp sgt i32 [[LASTINDEX]], 0
; CHECK-NEXT:    br i1 [[CMP17]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.body.lr.ph:
; CHECK-NEXT:    [[CONV5:%.*]] = sext i16 [[SCALE]] to i64
; CHECK-NEXT:    [[SH_PROM:%.*]] = and i64 [[CONV5]], 4294967295
; CHECK-NEXT:    [[TMP0:%.*]] = sext i16 [[LAG]] to i64
; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[LASTINDEX]] to i64
; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
; CHECK:       for.cond.cleanup.loopexit:
; CHECK-NEXT:    [[ADD7_LCSSA:%.*]] = phi i64 [ [[ADD7:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[CONV8:%.*]] = trunc i64 [[ADD7_LCSSA]] to i32
; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    [[ACCUMULATOR_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[CONV8]], [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] ]
; CHECK-NEXT:    ret i32 [[ACCUMULATOR_0_LCSSA]]
; CHECK:       for.body:
; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ACCUMULATOR_018:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[ADD7]], [[FOR_BODY]] ]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i64 [[INDVARS_IV]]
; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP1]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX3]], align 2
; CHECK-NEXT:    [[CONV4:%.*]] = sext i16 [[TMP3]] to i64
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV4]], [[CONV]]
; CHECK-NEXT:    [[SHR:%.*]] = ashr i64 [[MUL]], [[SH_PROM]]
; CHECK-NEXT:    [[ADD7]] = add i64 [[SHR]], [[ACCUMULATOR_018]]
; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
;
entry:
  %cmp17 = icmp sgt i32 %LastIndex, 0
  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup

for.body.lr.ph:                                   ; preds = %entry
  %conv5 = sext i16 %Scale to i64
  %sh_prom = and i64 %conv5, 4294967295
  %0 = sext i16 %lag to i64
  %wide.trip.count = zext i32 %LastIndex to i64
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  %conv8 = trunc i64 %add7 to i32
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  %Accumulator.0.lcssa = phi i32 [ 0, %entry ], [ %conv8, %for.cond.cleanup.loopexit ]
  ret i32 %Accumulator.0.lcssa

for.body:                                         ; preds = %for.body, %for.body.lr.ph
  %indvars.iv = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
  %Accumulator.018 = phi i64 [ 0, %for.body.lr.ph ], [ %add7, %for.body ]
  %arrayidx = getelementptr inbounds i16, ptr %InputData, i64 %indvars.iv
  %1 = load i16, ptr %arrayidx, align 2
  %conv = sext i16 %1 to i64
  %2 = add nsw i64 %indvars.iv, %0
  %arrayidx3 = getelementptr inbounds i16, ptr %InputData, i64 %2
  %3 = load i16, ptr %arrayidx3, align 2
  %conv4 = sext i16 %3 to i64
  %mul = mul nsw i64 %conv4, %conv
  %shr = ashr i64 %mul, %sh_prom
  %add7 = add i64 %shr, %Accumulator.018
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}