gcc/tree-vect-loop.cc

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2024 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #define INCLUDE_ALGORITHM
  23 #define INCLUDE_MEMORY
  24 #include "config.h"
  25 #include "system.h"
  26 #include "coretypes.h"
  27 #include "backend.h"
  28 #include "target.h"
  29 #include "rtl.h"
  30 #include "tree.h"
  31 #include "gimple.h"
  32 #include "cfghooks.h"
  33 #include "tree-pass.h"
  34 #include "ssa.h"
  35 #include "optabs-tree.h"
  36 #include "memmodel.h"
  37 #include "optabs.h"
  38 #include "diagnostic-core.h"
  39 #include "fold-const.h"
  40 #include "stor-layout.h"
  41 #include "cfganal.h"
  42 #include "gimplify.h"
  43 #include "gimple-iterator.h"
  44 #include "gimplify-me.h"
  45 #include "tree-ssa-loop-ivopts.h"
  46 #include "tree-ssa-loop-manip.h"
  47 #include "tree-ssa-loop-niter.h"
  48 #include "tree-ssa-loop.h"
  49 #include "cfgloop.h"
  50 #include "tree-scalar-evolution.h"
  51 #include "tree-vectorizer.h"
  52 #include "gimple-fold.h"
  53 #include "cgraph.h"
  54 #include "tree-cfg.h"
  55 #include "tree-if-conv.h"
  56 #include "internal-fn.h"
  57 #include "tree-vector-builder.h"
  58 #include "vec-perm-indices.h"
  59 #include "tree-eh.h"
  60 #include "case-cfn-macros.h"
  61 #include "langhooks.h"
  62
  63 /* Loop Vectorization Pass.
  64
  65    This pass tries to vectorize loops.
  66
  67    For example, the vectorizer transforms the following simple loop:
  68
  69         short a[N]; short b[N]; short c[N]; int i;
  70
  71         for (i=0; i<N; i++){
  72           a[i] = b[i] + c[i];
  73         }
  74
  75    as if it was manually vectorized by rewriting the source code into:
  76
  77         typedef int __attribute__((mode(V8HI))) v8hi;
  78         short a[N];  short b[N]; short c[N];   int i;
  79         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  80         v8hi va, vb, vc;
  81
  82         for (i=0; i<N/8; i++){
  83           vb = pb[i];
  84           vc = pc[i];
  85           va = vb + vc;
  86           pa[i] = va;
  87         }
  88
  89         The main entry to this pass is vectorize_loops(), in which
  90    the vectorizer applies a set of analyses on a given set of loops,
  91    followed by the actual vectorization transformation for the loops that
  92    had successfully passed the analysis phase.
  93         Throughout this pass we make a distinction between two types of
  94    data: scalars (which are represented by SSA_NAMES), and memory references
  95    ("data-refs").  These two types of data require different handling both
  96    during analysis and transformation. The types of data-refs that the
  97    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  98    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  99    accesses are required to have a simple (consecutive) access pattern.
 100
 101    Analysis phase:
 102    ===============
 103         The driver for the analysis phase is vect_analyze_loop().
 104    It applies a set of analyses, some of which rely on the scalar evolution
 105    analyzer (scev) developed by Sebastian Pop.
 106
 107         During the analysis phase the vectorizer records some information
 108    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 109    loop, as well as general information about the loop as a whole, which is
 110    recorded in a "loop_vec_info" struct attached to each loop.
 111
 112    Transformation phase:
 113    =====================
 114         The loop transformation phase scans all the stmts in the loop, and
 115    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 116    the loop that needs to be vectorized.  It inserts the vector code sequence
 117    just before the scalar stmt S, and records a pointer to the vector code
 118    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 119    attached to S).  This pointer will be used for the vectorization of following
 120    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 121    otherwise, we rely on dead code elimination for removing it.
 122
 123         For example, say stmt S1 was vectorized into stmt VS1:
 124
 125    VS1: vb = px[i];
 126    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 127    S2:  a = b;
 128
 129    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 130    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 131    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 132    resulting sequence would be:
 133
 134    VS1: vb = px[i];
 135    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 136    VS2: va = vb;
 137    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 138
 139         Operands that are not SSA_NAMEs, are data-refs that appear in
 140    load/store operations (like 'x[i]' in S1), and are handled differently.
 141
 142    Target modeling:
 143    =================
 144         Currently the only target specific information that is used is the
 145    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 146    Targets that can support different sizes of vectors, for now will need
 147    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 148    flexibility will be added in the future.
 149
 150         Since we only vectorize operations which vector form can be
 151    expressed using existing tree codes, to verify that an operation is
 152    supported, the vectorizer checks the relevant optab at the relevant
 153    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 154    the value found is CODE_FOR_nothing, then there's no target support, and
 155    we can't vectorize the stmt.
 156
 157    For additional information on this project see:
 158    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 159 */
 160
 161 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
 162                                                 unsigned *);
 163 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
 164                                                bool *, bool *, bool);
 165
 166 /* Subroutine of vect_determine_vf_for_stmt that handles only one
 167    statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
 168    may already be set for general statements (not just data refs).  */
 169
 170 static opt_result
 171 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
 172                               bool vectype_maybe_set_p,
 173                               poly_uint64 *vf)
 174 {
 175   gimple *stmt = stmt_info->stmt;
 176
 177   if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 178        && !STMT_VINFO_LIVE_P (stmt_info))
 179       || gimple_clobber_p (stmt))
 180     {
 181       if (dump_enabled_p ())
 182         dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 183       return opt_result::success ();
 184     }
 185
 186   tree stmt_vectype, nunits_vectype;
 187   opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
 188                                                    &stmt_vectype,
 189                                                    &nunits_vectype);
 190   if (!res)
 191     return res;
 192
 193   if (stmt_vectype)
 194     {
 195       if (STMT_VINFO_VECTYPE (stmt_info))
 196         /* The only case when a vectype had been already set is for stmts
 197            that contain a data ref, or for "pattern-stmts" (stmts generated
 198            by the vectorizer to represent/replace a certain idiom).  */
 199         gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
 200                      || vectype_maybe_set_p)
 201                     && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
 202       else
 203         STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
 204     }
 205
 206   if (nunits_vectype)
 207     vect_update_max_nunits (vf, nunits_vectype);
 208
 209   return opt_result::success ();
 210 }
 211
 212 /* Subroutine of vect_determine_vectorization_factor.  Set the vector
 213    types of STMT_INFO and all attached pattern statements and update
 214    the vectorization factor VF accordingly.  Return true on success
 215    or false if something prevented vectorization.  */
 216
 217 static opt_result
 218 vect_determine_vf_for_stmt (vec_info *vinfo,
 219                             stmt_vec_info stmt_info, poly_uint64 *vf)
 220 {
 221   if (dump_enabled_p ())
 222     dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
 223                      stmt_info->stmt);
 224   opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
 225   if (!res)
 226     return res;
 227
 228   if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 229       && STMT_VINFO_RELATED_STMT (stmt_info))
 230     {
 231       gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 232       stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
 233
 234       /* If a pattern statement has def stmts, analyze them too.  */
 235       for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
 236            !gsi_end_p (si); gsi_next (&si))
 237         {
 238           stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
 239           if (dump_enabled_p ())
 240             dump_printf_loc (MSG_NOTE, vect_location,
 241                              "==> examining pattern def stmt: %G",
 242                              def_stmt_info->stmt);
 243           res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
 244           if (!res)
 245             return res;
 246         }
 247
 248       if (dump_enabled_p ())
 249         dump_printf_loc (MSG_NOTE, vect_location,
 250                          "==> examining pattern statement: %G",
 251                          stmt_info->stmt);
 252       res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
 253       if (!res)
 254         return res;
 255     }
 256
 257   return opt_result::success ();
 258 }
 259
 260 /* Function vect_determine_vectorization_factor
 261
 262    Determine the vectorization factor (VF).  VF is the number of data elements
 263    that are operated upon in parallel in a single iteration of the vectorized
 264    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 265    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 266    elements can fit in a single vector register.
 267
 268    We currently support vectorization of loops in which all types operated upon
 269    are of the same size.  Therefore this function currently sets VF according to
 270    the size of the types operated upon, and fails if there are multiple sizes
 271    in the loop.
 272
 273    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 274    original loop:
 275         for (i=0; i<N; i++){
 276           a[i] = b[i] + c[i];
 277         }
 278
 279    vectorized loop:
 280         for (i=0; i<N; i+=VF){
 281           a[i:VF] = b[i:VF] + c[i:VF];
 282         }
 283 */
 284
 285 static opt_result
 286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 287 {
 288   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 289   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 290   unsigned nbbs = loop->num_nodes;
 291   poly_uint64 vectorization_factor = 1;
 292   tree scalar_type = NULL_TREE;
 293   gphi *phi;
 294   tree vectype;
 295   stmt_vec_info stmt_info;
 296   unsigned i;
 297
 298   DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
 299
 300   for (i = 0; i < nbbs; i++)
 301     {
 302       basic_block bb = bbs[i];
 303
 304       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 305            gsi_next (&si))
 306         {
 307           phi = si.phi ();
 308           stmt_info = loop_vinfo->lookup_stmt (phi);
 309           if (dump_enabled_p ())
 310             dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
 311                              (gimple *) phi);
 312
 313           gcc_assert (stmt_info);
 314
 315           if (STMT_VINFO_RELEVANT_P (stmt_info)
 316               || STMT_VINFO_LIVE_P (stmt_info))
 317             {
 318               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 319               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 320
 321               if (dump_enabled_p ())
 322                 dump_printf_loc (MSG_NOTE, vect_location,
 323                                  "get vectype for scalar type:  %T\n",
 324                                  scalar_type);
 325
 326               vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 327               if (!vectype)
 328                 return opt_result::failure_at (phi,
 329                                                "not vectorized: unsupported "
 330                                                "data-type %T\n",
 331                                                scalar_type);
 332               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 333
 334               if (dump_enabled_p ())
 335                 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
 336                                  vectype);
 337
 338               if (dump_enabled_p ())
 339                 {
 340                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 341                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 342                   dump_printf (MSG_NOTE, "\n");
 343                 }
 344
 345               vect_update_max_nunits (&vectorization_factor, vectype);
 346             }
 347         }
 348
 349       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 350            gsi_next (&si))
 351         {
 352           if (is_gimple_debug (gsi_stmt (si)))
 353             continue;
 354           stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
 355           opt_result res
 356             = vect_determine_vf_for_stmt (loop_vinfo,
 357                                           stmt_info, &vectorization_factor);
 358           if (!res)
 359             return res;
 360         }
 361     }
 362
 363   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 364   if (dump_enabled_p ())
 365     {
 366       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 367       dump_dec (MSG_NOTE, vectorization_factor);
 368       dump_printf (MSG_NOTE, "\n");
 369     }
 370
 371   if (known_le (vectorization_factor, 1U))
 372     return opt_result::failure_at (vect_location,
 373                                    "not vectorized: unsupported data-type\n");
 374   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 375   return opt_result::success ();
 376 }
 377
 378
 379 /* Function vect_is_simple_iv_evolution.
 380
 381    FORNOW: A simple evolution of an induction variables in the loop is
 382    considered a polynomial evolution.  */
 383
 384 static bool
 385 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 386                              tree * step)
 387 {
 388   tree init_expr;
 389   tree step_expr;
 390   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 391   basic_block bb;
 392
 393   /* When there is no evolution in this loop, the evolution function
 394      is not "simple".  */
 395   if (evolution_part == NULL_TREE)
 396     return false;
 397
 398   /* When the evolution is a polynomial of degree >= 2
 399      the evolution function is not "simple".  */
 400   if (tree_is_chrec (evolution_part))
 401     return false;
 402
 403   step_expr = evolution_part;
 404   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 405
 406   if (dump_enabled_p ())
 407     dump_printf_loc (MSG_NOTE, vect_location, "step: %T,  init: %T\n",
 408                      step_expr, init_expr);
 409
 410   *init = init_expr;
 411   *step = step_expr;
 412
 413   if (TREE_CODE (step_expr) != INTEGER_CST
 414       && (TREE_CODE (step_expr) != SSA_NAME
 415           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 416               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 417           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 418               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 419                   || !flag_associative_math)))
 420       && (TREE_CODE (step_expr) != REAL_CST
 421           || !flag_associative_math))
 422     {
 423       if (dump_enabled_p ())
 424         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 425                          "step unknown.\n");
 426       return false;
 427     }
 428
 429   return true;
 430 }
 431
 432 /* Function vect_is_nonlinear_iv_evolution
 433
 434    Only support nonlinear induction for integer type
 435    1. neg
 436    2. mul by constant
 437    3. lshift/rshift by constant.
 438
 439    For neg induction, return a fake step as integer -1.  */
 440 static bool
 441 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
 442                                 gphi* loop_phi_node, tree *init, tree *step)
 443 {
 444   tree init_expr, ev_expr, result, op1, op2;
 445   gimple* def;
 446
 447   if (gimple_phi_num_args (loop_phi_node) != 2)
 448     return false;
 449
 450   init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
 451   ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
 452
 453   /* Support nonlinear induction only for integer type.  */
 454   if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
 455     return false;
 456
 457   *init = init_expr;
 458   result = PHI_RESULT (loop_phi_node);
 459
 460   if (TREE_CODE (ev_expr) != SSA_NAME
 461       || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
 462       || !is_gimple_assign (def))
 463     return false;
 464
 465   enum tree_code t_code = gimple_assign_rhs_code (def);
 466   switch (t_code)
 467     {
 468     case NEGATE_EXPR:
 469       if (gimple_assign_rhs1 (def) != result)
 470         return false;
 471       *step = build_int_cst (TREE_TYPE (init_expr), -1);
 472       STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
 473       break;
 474
 475     case RSHIFT_EXPR:
 476     case LSHIFT_EXPR:
 477     case MULT_EXPR:
 478       op1 = gimple_assign_rhs1 (def);
 479       op2 = gimple_assign_rhs2 (def);
 480       if (TREE_CODE (op2) != INTEGER_CST
 481           || op1 != result)
 482         return false;
 483       *step = op2;
 484       if (t_code == LSHIFT_EXPR)
 485         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
 486       else if (t_code == RSHIFT_EXPR)
 487         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
 488       /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
 489       else
 490         STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
 491       break;
 492
 493     default:
 494       return false;
 495     }
 496
 497   STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
 498   STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
 499
 500   return true;
 501 }
 502
 503 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
 504    what we are assuming is a double reduction.  For example, given
 505    a structure like this:
 506
 507       outer1:
 508         x_1 = PHI <x_4(outer2), ...>;
 509         ...
 510
 511       inner:
 512         x_2 = PHI <x_1(outer1), ...>;
 513         ...
 514         x_3 = ...;
 515         ...
 516
 517       outer2:
 518         x_4 = PHI <x_3(inner)>;
 519         ...
 520
 521    outer loop analysis would treat x_1 as a double reduction phi and
 522    this function would then return true for x_2.  */
 523
 524 static bool
 525 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
 526 {
 527   use_operand_p use_p;
 528   ssa_op_iter op_iter;
 529   FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
 530     if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
 531       if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
 532         return true;
 533   return false;
 534 }
 535
 536 /* Returns true if Phi is a first-order recurrence. A first-order
 537    recurrence is a non-reduction recurrence relation in which the value of
 538    the recurrence in the current loop iteration equals a value defined in
 539    the previous iteration.  */
 540
 541 static bool
 542 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
 543                                    gphi *phi)
 544 {
 545   /* A nested cycle isn't vectorizable as first order recurrence.  */
 546   if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
 547     return false;
 548
 549   /* Ensure the loop latch definition is from within the loop.  */
 550   edge latch = loop_latch_edge (loop);
 551   tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
 552   if (TREE_CODE (ldef) != SSA_NAME
 553       || SSA_NAME_IS_DEFAULT_DEF (ldef)
 554       || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
 555       || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
 556     return false;
 557
 558   tree def = gimple_phi_result (phi);
 559
 560   /* Ensure every use_stmt of the phi node is dominated by the latch
 561      definition.  */
 562   imm_use_iterator imm_iter;
 563   use_operand_p use_p;
 564   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
 565     if (!is_gimple_debug (USE_STMT (use_p))
 566         && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
 567             || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
 568                                             USE_STMT (use_p))))
 569       return false;
 570
 571   /* First-order recurrence autovectorization needs shuffle vector.  */
 572   tree scalar_type = TREE_TYPE (def);
 573   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
 574   if (!vectype)
 575     return false;
 576
 577   return true;
 578 }
 579
 580 /* Function vect_analyze_scalar_cycles_1.
 581
 582    Examine the cross iteration def-use cycles of scalar variables
 583    in LOOP.  LOOP_VINFO represents the loop that is now being
 584    considered for vectorization (can be LOOP, or an outer-loop
 585    enclosing LOOP).  SLP indicates there will be some subsequent
 586    slp analyses or not.  */
 587
 588 static void
 589 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
 590                               bool slp)
 591 {
 592   basic_block bb = loop->header;
 593   tree init, step;
 594   auto_vec<stmt_vec_info, 64> worklist;
 595   gphi_iterator gsi;
 596   bool double_reduc, reduc_chain;
 597
 598   DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
 599
 600   /* First - identify all inductions.  Reduction detection assumes that all the
 601      inductions have been identified, therefore, this order must not be
 602      changed.  */
 603   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 604     {
 605       gphi *phi = gsi.phi ();
 606       tree access_fn = NULL;
 607       tree def = PHI_RESULT (phi);
 608       stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
 609
 610       if (dump_enabled_p ())
 611         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 612                          (gimple *) phi);
 613
 614       /* Skip virtual phi's.  The data dependences that are associated with
 615          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 616       if (virtual_operand_p (def))
 617         continue;
 618
 619       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 620
 621       /* Analyze the evolution function.  */
 622       access_fn = analyze_scalar_evolution (loop, def);
 623       if (access_fn)
 624         {
 625           STRIP_NOPS (access_fn);
 626           if (dump_enabled_p ())
 627             dump_printf_loc (MSG_NOTE, vect_location,
 628                              "Access function of PHI: %T\n", access_fn);
 629           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 630             = initial_condition_in_loop_num (access_fn, loop->num);
 631           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 632             = evolution_part_in_loop_num (access_fn, loop->num);
 633         }
 634
 635       if ((!access_fn
 636            || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
 637            || !vect_is_simple_iv_evolution (loop->num, access_fn,
 638                                             &init, &step)
 639            || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 640                && TREE_CODE (step) != INTEGER_CST))
 641           /* Only handle nonlinear iv for same loop.  */
 642           && (LOOP_VINFO_LOOP (loop_vinfo) != loop
 643               || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
 644                                                   phi, &init, &step)))
 645         {
 646           worklist.safe_push (stmt_vinfo);
 647           continue;
 648         }
 649
 650       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 651                   != NULL_TREE);
 652       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 653
 654       if (dump_enabled_p ())
 655         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 656       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 657     }
 658
 659
 660   /* Second - identify all reductions and nested cycles.  */
 661   while (worklist.length () > 0)
 662     {
 663       stmt_vec_info stmt_vinfo = worklist.pop ();
 664       gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
 665       tree def = PHI_RESULT (phi);
 666
 667       if (dump_enabled_p ())
 668         dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
 669                          (gimple *) phi);
 670
 671       gcc_assert (!virtual_operand_p (def)
 672                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 673
 674       stmt_vec_info reduc_stmt_info
 675         = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
 676                                     &reduc_chain, slp);
 677       if (reduc_stmt_info)
 678         {
 679           STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
 680           STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
 681           if (double_reduc)
 682             {
 683               if (dump_enabled_p ())
 684                 dump_printf_loc (MSG_NOTE, vect_location,
 685                                  "Detected double reduction.\n");
 686
 687               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 688               STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
 689               /* Make it accessible for SLP vectorization.  */
 690               LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt_info);
 691             }
 692           else
 693             {
 694               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 695                 {
 696                   if (dump_enabled_p ())
 697                     dump_printf_loc (MSG_NOTE, vect_location,
 698                                      "Detected vectorizable nested cycle.\n");
 699
 700                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 701                 }
 702               else
 703                 {
 704                   if (dump_enabled_p ())
 705                     dump_printf_loc (MSG_NOTE, vect_location,
 706                                      "Detected reduction.\n");
 707
 708                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 709                   STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
 710                   /* Store the reduction cycles for possible vectorization in
 711                      loop-aware SLP if it was not detected as reduction
 712                      chain.  */
 713                   if (! reduc_chain)
 714                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
 715                       (reduc_stmt_info);
 716                 }
 717             }
 718         }
 719       else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
 720         STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
 721       else
 722         if (dump_enabled_p ())
 723           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 724                            "Unknown def-use cycle pattern.\n");
 725     }
 726 }
 727
 728
 729 /* Function vect_analyze_scalar_cycles.
 730
 731    Examine the cross iteration def-use cycles of scalar variables, by
 732    analyzing the loop-header PHIs of scalar variables.  Classify each
 733    cycle as one of the following: invariant, induction, reduction, unknown.
 734    We do that for the loop represented by LOOP_VINFO, and also to its
 735    inner-loop, if exists.
 736    Examples for scalar cycles:
 737
 738    Example1: reduction:
 739
 740               loop1:
 741               for (i=0; i<N; i++)
 742                  sum += a[i];
 743
 744    Example2: induction:
 745
 746               loop2:
 747               for (i=0; i<N; i++)
 748                  a[i] = i;  */
 749
 750 static void
 751 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
 752 {
 753   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 754
 755   vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
 756
 757   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 758      Reductions in such inner-loop therefore have different properties than
 759      the reductions in the nest that gets vectorized:
 760      1. When vectorized, they are executed in the same order as in the original
 761         scalar loop, so we can't change the order of computation when
 762         vectorizing them.
 763      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 764         current checks are too strict.  */
 765
 766   if (loop->inner)
 767     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
 768 }
 769
 770 /* Transfer group and reduction information from STMT_INFO to its
 771    pattern stmt.  */
 772
 773 static void
 774 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
 775 {
 776   stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
 777   stmt_vec_info stmtp;
 778   gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
 779               && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
 780   REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
 781   do
 782     {
 783       stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
 784       gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
 785                            == STMT_VINFO_DEF_TYPE (stmt_info));
 786       REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
 787       stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
 788       if (stmt_info)
 789         REDUC_GROUP_NEXT_ELEMENT (stmtp)
 790           = STMT_VINFO_RELATED_STMT (stmt_info);
 791     }
 792   while (stmt_info);
 793 }
 794
 795 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 796
 797 static void
 798 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 799 {
 800   stmt_vec_info first;
 801   unsigned i;
 802
 803   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 804     {
 805       stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
 806       while (next)
 807         {
 808           if ((STMT_VINFO_IN_PATTERN_P (next)
 809                != STMT_VINFO_IN_PATTERN_P (first))
 810               || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
 811             break;
 812           next = REDUC_GROUP_NEXT_ELEMENT (next);
 813         }
 814       /* If all reduction chain members are well-formed patterns adjust
 815          the group to group the pattern stmts instead.  */
 816       if (! next
 817           && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
 818         {
 819           if (STMT_VINFO_IN_PATTERN_P (first))
 820             {
 821               vect_fixup_reduc_chain (first);
 822               LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 823                 = STMT_VINFO_RELATED_STMT (first);
 824             }
 825         }
 826       /* If not all stmt in the chain are patterns or if we failed
 827          to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
 828          it as regular reduction instead.  */
 829       else
 830         {
 831           stmt_vec_info vinfo = first;
 832           stmt_vec_info last = NULL;
 833           while (vinfo)
 834             {
 835               next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
 836               REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
 837               REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
 838               last = vinfo;
 839               vinfo = next;
 840             }
 841           STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
 842             = vect_internal_def;
 843           loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
 844           LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
 845           --i;
 846         }
 847     }
 848 }
 849
 850 /* Function vect_get_loop_niters.
 851
 852    Determine how many iterations the loop is executed and place it
 853    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 854    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
 855    niter information holds in ASSUMPTIONS.
 856
 857    Return the loop exit conditions.  */
 858
 859
 860 static vec<gcond *>
 861 vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
 862                       tree *number_of_iterations, tree *number_of_iterationsm1)
 863 {
 864   auto_vec<edge> exits = get_loop_exit_edges (loop);
 865   vec<gcond *> conds;
 866   conds.create (exits.length ());
 867   class tree_niter_desc niter_desc;
 868   tree niter_assumptions, niter, may_be_zero;
 869
 870   *assumptions = boolean_true_node;
 871   *number_of_iterationsm1 = chrec_dont_know;
 872   *number_of_iterations = chrec_dont_know;
 873
 874   DUMP_VECT_SCOPE ("get_loop_niters");
 875
 876   if (exits.is_empty ())
 877     return conds;
 878
 879   if (dump_enabled_p ())
 880     dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
 881                      exits.length ());
 882
 883   edge exit;
 884   unsigned int i;
 885   FOR_EACH_VEC_ELT (exits, i, exit)
 886     {
 887       gcond *cond = get_loop_exit_condition (exit);
 888       if (cond)
 889         conds.safe_push (cond);
 890
 891       if (dump_enabled_p ())
 892         dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);
 893
 894       if (exit != main_exit)
 895         continue;
 896
 897       may_be_zero = NULL_TREE;
 898       if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 899           || chrec_contains_undetermined (niter_desc.niter))
 900         continue;
 901
 902       niter_assumptions = niter_desc.assumptions;
 903       may_be_zero = niter_desc.may_be_zero;
 904       niter = niter_desc.niter;
 905
 906       if (may_be_zero && integer_zerop (may_be_zero))
 907         may_be_zero = NULL_TREE;
 908
 909       if (may_be_zero)
 910         {
 911           if (COMPARISON_CLASS_P (may_be_zero))
 912             {
 913               /* Try to combine may_be_zero with assumptions, this can simplify
 914                  computation of niter expression.  */
 915               if (niter_assumptions && !integer_nonzerop (niter_assumptions))
 916                 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
 917                                                  niter_assumptions,
 918                                                  fold_build1 (TRUTH_NOT_EXPR,
 919                                                               boolean_type_node,
 920                                                               may_be_zero));
 921               else
 922                 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
 923                                      build_int_cst (TREE_TYPE (niter), 0),
 924                                      rewrite_to_non_trapping_overflow (niter));
 925
 926               may_be_zero = NULL_TREE;
 927             }
 928           else if (integer_nonzerop (may_be_zero))
 929             {
 930               *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
 931               *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
 932               continue;
 933             }
 934           else
 935             continue;
 936        }
 937
 938       /* Loop assumptions are based off the normal exit.  */
 939       *assumptions = niter_assumptions;
 940       *number_of_iterationsm1 = niter;
 941
 942       /* We want the number of loop header executions which is the number
 943          of latch executions plus one.
 944          ???  For UINT_MAX latch executions this number overflows to zero
 945          for loops like do { n++; } while (n != 0);  */
 946       if (niter && !chrec_contains_undetermined (niter))
 947         {
 948           niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
 949                                unshare_expr (niter),
 950                                build_int_cst (TREE_TYPE (niter), 1));
 951           if (TREE_CODE (niter) == INTEGER_CST
 952               && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
 953             {
 954               /* If we manage to fold niter + 1 into INTEGER_CST even when
 955                  niter is some complex expression, ensure back
 956                  *number_of_iterationsm1 is an INTEGER_CST as well.  See
 957                  PR113210.  */
 958               *number_of_iterationsm1
 959                 = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
 960                                build_minus_one_cst (TREE_TYPE (niter)));
 961             }
 962         }
 963       *number_of_iterations = niter;
 964     }
 965
 966   if (dump_enabled_p ())
 967     dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");
 968
 969   return conds;
 970 }
 971
 972 /*  Determine the main loop exit for the vectorizer.  */
 973
 974 edge
 975 vec_init_loop_exit_info (class loop *loop)
 976 {
 977   /* Before we begin we must first determine which exit is the main one and
 978      which are auxilary exits.  */
 979   auto_vec<edge> exits = get_loop_exit_edges (loop);
 980   if (exits.length () == 1)
 981     return exits[0];
 982
 983   /* If we have multiple exits we only support counting IV at the moment.
 984      Analyze all exits and return the last one we can analyze.  */
 985   class tree_niter_desc niter_desc;
 986   edge candidate = NULL;
 987   for (edge exit : exits)
 988     {
 989       if (!get_loop_exit_condition (exit))
 990         continue;
 991
 992       if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
 993           && !chrec_contains_undetermined (niter_desc.niter))
 994         {
 995           tree may_be_zero = niter_desc.may_be_zero;
 996           if ((integer_zerop (may_be_zero)
 997                /* As we are handling may_be_zero that's not false by
 998                   rewriting niter to may_be_zero ? 0 : niter we require
 999                   an empty latch.  */
1000                || (single_pred_p (loop->latch)
1001                    && exit->src == single_pred (loop->latch)
1002                    && (integer_nonzerop (may_be_zero)
1003                        || COMPARISON_CLASS_P (may_be_zero))))
1004               && (!candidate
1005                   || dominated_by_p (CDI_DOMINATORS, exit->src,
1006                                      candidate->src)))
1007             candidate = exit;
1008         }
1009     }
1010
1011   return candidate;
1012 }
1013
1014 /* Function bb_in_loop_p
1015
1016    Used as predicate for dfs order traversal of the loop bbs.  */
1017
1018 static bool
1019 bb_in_loop_p (const_basic_block bb, const void *data)
1020 {
1021   const class loop *const loop = (const class loop *)data;
1022   if (flow_bb_inside_loop_p (loop, bb))
1023     return true;
1024   return false;
1025 }
1026
1027
1028 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1029    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1030
1031 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
1032   : vec_info (vec_info::loop, shared),
1033     loop (loop_in),
1034     num_itersm1 (NULL_TREE),
1035     num_iters (NULL_TREE),
1036     num_iters_unchanged (NULL_TREE),
1037     num_iters_assumptions (NULL_TREE),
1038     vector_costs (nullptr),
1039     scalar_costs (nullptr),
1040     th (0),
1041     versioning_threshold (0),
1042     vectorization_factor (0),
1043     main_loop_edge (nullptr),
1044     skip_main_loop_edge (nullptr),
1045     skip_this_loop_edge (nullptr),
1046     reusable_accumulators (),
1047     suggested_unroll_factor (1),
1048     max_vectorization_factor (0),
1049     mask_skip_niters (NULL_TREE),
1050     rgroup_compare_type (NULL_TREE),
1051     simd_if_cond (NULL_TREE),
1052     partial_vector_style (vect_partial_vectors_none),
1053     unaligned_dr (NULL),
1054     peeling_for_alignment (0),
1055     ptr_mask (0),
1056     ivexpr_map (NULL),
1057     scan_map (NULL),
1058     slp_unrolling_factor (1),
1059     inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
1060     vectorizable (false),
1061     can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
1062     using_partial_vectors_p (false),
1063     using_decrementing_iv_p (false),
1064     using_select_vl_p (false),
1065     epil_using_partial_vectors_p (false),
1066     partial_load_store_bias (0),
1067     peeling_for_gaps (false),
1068     peeling_for_niter (false),
1069     early_breaks (false),
1070     no_data_dependencies (false),
1071     has_mask_store (false),
1072     scalar_loop_scaling (profile_probability::uninitialized ()),
1073     scalar_loop (NULL),
1074     orig_loop_info (NULL),
1075     vec_loop_iv_exit (NULL),
1076     vec_epilogue_loop_iv_exit (NULL),
1077     scalar_loop_iv_exit (NULL)
1078 {
1079   /* CHECKME: We want to visit all BBs before their successors (except for
1080      latch blocks, for which this assertion wouldn't hold).  In the simple
1081      case of the loop forms we allow, a dfs order of the BBs would the same
1082      as reversed postorder traversal, so we are safe.  */
1083
1084   bbs = XCNEWVEC (basic_block, loop->num_nodes);
1085   nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, bbs,
1086                              loop->num_nodes, loop);
1087   gcc_assert (nbbs == loop->num_nodes);
1088
1089   for (unsigned int i = 0; i < nbbs; i++)
1090     {
1091       basic_block bb = bbs[i];
1092       gimple_stmt_iterator si;
1093
1094       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1095         {
1096           gimple *phi = gsi_stmt (si);
1097           gimple_set_uid (phi, 0);
1098           add_stmt (phi);
1099         }
1100
1101       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1102         {
1103           gimple *stmt = gsi_stmt (si);
1104           gimple_set_uid (stmt, 0);
1105           if (is_gimple_debug (stmt))
1106             continue;
1107           add_stmt (stmt);
1108           /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1109              third argument is the #pragma omp simd if (x) condition, when 0,
1110              loop shouldn't be vectorized, when non-zero constant, it should
1111              be vectorized normally, otherwise versioned with vectorized loop
1112              done if the condition is non-zero at runtime.  */
1113           if (loop_in->simduid
1114               && is_gimple_call (stmt)
1115               && gimple_call_internal_p (stmt)
1116               && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1117               && gimple_call_num_args (stmt) >= 3
1118               && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1119               && (loop_in->simduid
1120                   == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1121             {
1122               tree arg = gimple_call_arg (stmt, 2);
1123               if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1124                 simd_if_cond = arg;
1125               else
1126                 gcc_assert (integer_nonzerop (arg));
1127             }
1128         }
1129     }
1130
1131   epilogue_vinfos.create (6);
1132 }
1133
1134 /* Free all levels of rgroup CONTROLS.  */
1135
1136 void
1137 release_vec_loop_controls (vec<rgroup_controls> *controls)
1138 {
1139   rgroup_controls *rgc;
1140   unsigned int i;
1141   FOR_EACH_VEC_ELT (*controls, i, rgc)
1142     rgc->controls.release ();
1143   controls->release ();
1144 }
1145
1146 /* Free all memory used by the _loop_vec_info, as well as all the
1147    stmt_vec_info structs of all the stmts in the loop.  */
1148
1149 _loop_vec_info::~_loop_vec_info ()
1150 {
1151   free (bbs);
1152
1153   release_vec_loop_controls (&masks.rgc_vec);
1154   release_vec_loop_controls (&lens);
1155   delete ivexpr_map;
1156   delete scan_map;
1157   epilogue_vinfos.release ();
1158   delete scalar_costs;
1159   delete vector_costs;
1160
1161   /* When we release an epiloge vinfo that we do not intend to use
1162      avoid clearing AUX of the main loop which should continue to
1163      point to the main loop vinfo since otherwise we'll leak that.  */
1164   if (loop->aux == this)
1165     loop->aux = NULL;
1166 }
1167
1168 /* Return an invariant or register for EXPR and emit necessary
1169    computations in the LOOP_VINFO loop preheader.  */
1170
1171 tree
1172 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1173 {
1174   if (is_gimple_reg (expr)
1175       || is_gimple_min_invariant (expr))
1176     return expr;
1177
1178   if (! loop_vinfo->ivexpr_map)
1179     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1180   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1181   if (! cached)
1182     {
1183       gimple_seq stmts = NULL;
1184       cached = force_gimple_operand (unshare_expr (expr),
1185                                      &stmts, true, NULL_TREE);
1186       if (stmts)
1187         {
1188           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1189           gsi_insert_seq_on_edge_immediate (e, stmts);
1190         }
1191     }
1192   return cached;
1193 }
1194
1195 /* Return true if we can use CMP_TYPE as the comparison type to produce
1196    all masks required to mask LOOP_VINFO.  */
1197
1198 static bool
1199 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1200 {
1201   rgroup_controls *rgm;
1202   unsigned int i;
1203   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1204     if (rgm->type != NULL_TREE
1205         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1206                                             cmp_type, rgm->type,
1207                                             OPTIMIZE_FOR_SPEED))
1208       return false;
1209   return true;
1210 }
1211
1212 /* Calculate the maximum number of scalars per iteration for every
1213    rgroup in LOOP_VINFO.  */
1214
1215 static unsigned int
1216 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1217 {
1218   unsigned int res = 1;
1219   unsigned int i;
1220   rgroup_controls *rgm;
1221   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1222     res = MAX (res, rgm->max_nscalars_per_iter);
1223   return res;
1224 }
1225
1226 /* Calculate the minimum precision necessary to represent:
1227
1228       MAX_NITERS * FACTOR
1229
1230    as an unsigned integer, where MAX_NITERS is the maximum number of
1231    loop header iterations for the original scalar form of LOOP_VINFO.  */
1232
1233 static unsigned
1234 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1235 {
1236   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1237
1238   /* Get the maximum number of iterations that is representable
1239      in the counter type.  */
1240   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1241   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1242
1243   /* Get a more refined estimate for the number of iterations.  */
1244   widest_int max_back_edges;
1245   if (max_loop_iterations (loop, &max_back_edges))
1246     max_ni = wi::smin (max_ni, max_back_edges + 1);
1247
1248   /* Work out how many bits we need to represent the limit.  */
1249   return wi::min_precision (max_ni * factor, UNSIGNED);
1250 }
1251
1252 /* True if the loop needs peeling or partial vectors when vectorized.  */
1253
1254 static bool
1255 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1256 {
1257   unsigned HOST_WIDE_INT const_vf;
1258   HOST_WIDE_INT max_niter
1259     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1260
1261   unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1262   if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1263     th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1264                                           (loop_vinfo));
1265
1266   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1267       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1268     {
1269       /* Work out the (constant) number of iterations that need to be
1270          peeled for reasons other than niters.  */
1271       unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1272       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1273         peel_niter += 1;
1274       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1275                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1276         return true;
1277     }
1278   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1279       /* ??? When peeling for gaps but not alignment, we could
1280          try to check whether the (variable) niters is known to be
1281          VF * N + 1.  That's something of a niche case though.  */
1282       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1283       || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1284       || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1285            < (unsigned) exact_log2 (const_vf))
1286           /* In case of versioning, check if the maximum number of
1287              iterations is greater than th.  If they are identical,
1288              the epilogue is unnecessary.  */
1289           && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1290               || ((unsigned HOST_WIDE_INT) max_niter
1291                   /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1292                      but that's only computed later based on our result.
1293                      The following is the most conservative approximation.  */
1294                   > (std::max ((unsigned HOST_WIDE_INT) th,
1295                                const_vf) / const_vf) * const_vf))))
1296     return true;
1297
1298   return false;
1299 }
1300
1301 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1302    whether we can actually generate the masks required.  Return true if so,
1303    storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */
1304
1305 static bool
1306 vect_verify_full_masking (loop_vec_info loop_vinfo)
1307 {
1308   unsigned int min_ni_width;
1309
1310   /* Use a normal loop if there are no statements that need masking.
1311      This only happens in rare degenerate cases: it means that the loop
1312      has no loads, no stores, and no live-out values.  */
1313   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1314     return false;
1315
1316   /* Produce the rgroup controls.  */
1317   for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1318     {
1319       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1320       tree vectype = mask.first;
1321       unsigned nvectors = mask.second;
1322
1323       if (masks->rgc_vec.length () < nvectors)
1324         masks->rgc_vec.safe_grow_cleared (nvectors, true);
1325       rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1326       /* The number of scalars per iteration and the number of vectors are
1327          both compile-time constants.  */
1328       unsigned int nscalars_per_iter
1329           = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1330                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1331
1332       if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1333         {
1334           rgm->max_nscalars_per_iter = nscalars_per_iter;
1335           rgm->type = truth_type_for (vectype);
1336           rgm->factor = 1;
1337         }
1338     }
1339
1340   unsigned int max_nscalars_per_iter
1341     = vect_get_max_nscalars_per_iter (loop_vinfo);
1342
1343   /* Work out how many bits we need to represent the limit.  */
1344   min_ni_width
1345     = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1346
1347   /* Find a scalar mode for which WHILE_ULT is supported.  */
1348   opt_scalar_int_mode cmp_mode_iter;
1349   tree cmp_type = NULL_TREE;
1350   tree iv_type = NULL_TREE;
1351   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1352   unsigned int iv_precision = UINT_MAX;
1353
1354   if (iv_limit != -1)
1355     iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1356                                       UNSIGNED);
1357
1358   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1359     {
1360       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1361       if (cmp_bits >= min_ni_width
1362           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1363         {
1364           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1365           if (this_type
1366               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1367             {
1368               /* Although we could stop as soon as we find a valid mode,
1369                  there are at least two reasons why that's not always the
1370                  best choice:
1371
1372                  - An IV that's Pmode or wider is more likely to be reusable
1373                    in address calculations than an IV that's narrower than
1374                    Pmode.
1375
1376                  - Doing the comparison in IV_PRECISION or wider allows
1377                    a natural 0-based IV, whereas using a narrower comparison
1378                    type requires mitigations against wrap-around.
1379
1380                  Conversely, if the IV limit is variable, doing the comparison
1381                  in a wider type than the original type can introduce
1382                  unnecessary extensions, so picking the widest valid mode
1383                  is not always a good choice either.
1384
1385                  Here we prefer the first IV type that's Pmode or wider,
1386                  and the first comparison type that's IV_PRECISION or wider.
1387                  (The comparison type must be no wider than the IV type,
1388                  to avoid extensions in the vector loop.)
1389
1390                  ??? We might want to try continuing beyond Pmode for ILP32
1391                  targets if CMP_BITS < IV_PRECISION.  */
1392               iv_type = this_type;
1393               if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1394                 cmp_type = this_type;
1395               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1396                 break;
1397             }
1398         }
1399     }
1400
1401   if (!cmp_type)
1402     {
1403       LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1404       return false;
1405     }
1406
1407   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1408   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1409   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1410   return true;
1411 }
1412
1413 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1414    whether we can actually generate AVX512 style masks.  Return true if so,
1415    storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */
1416
1417 static bool
1418 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1419 {
1420   /* Produce differently organized rgc_vec and differently check
1421      we can produce masks.  */
1422
1423   /* Use a normal loop if there are no statements that need masking.
1424      This only happens in rare degenerate cases: it means that the loop
1425      has no loads, no stores, and no live-out values.  */
1426   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1427     return false;
1428
1429   /* For the decrementing IV we need to represent all values in
1430      [0, niter + niter_skip] where niter_skip is the elements we
1431      skip in the first iteration for prologue peeling.  */
1432   tree iv_type = NULL_TREE;
1433   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1434   unsigned int iv_precision = UINT_MAX;
1435   if (iv_limit != -1)
1436     iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1437
1438   /* First compute the type for the IV we use to track the remaining
1439      scalar iterations.  */
1440   opt_scalar_int_mode cmp_mode_iter;
1441   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1442     {
1443       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1444       if (cmp_bits >= iv_precision
1445           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1446         {
1447           iv_type = build_nonstandard_integer_type (cmp_bits, true);
1448           if (iv_type)
1449             break;
1450         }
1451     }
1452   if (!iv_type)
1453     return false;
1454
1455   /* Produce the rgroup controls.  */
1456   for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1457     {
1458       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1459       tree vectype = mask.first;
1460       unsigned nvectors = mask.second;
1461
1462       /* The number of scalars per iteration and the number of vectors are
1463          both compile-time constants.  */
1464       unsigned int nscalars_per_iter
1465         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1466                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1467
1468       /* We index the rgroup_controls vector with nscalars_per_iter
1469          which we keep constant and instead have a varying nvectors,
1470          remembering the vector mask with the fewest nV.  */
1471       if (masks->rgc_vec.length () < nscalars_per_iter)
1472         masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1473       rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1474
1475       if (!rgm->type || rgm->factor > nvectors)
1476         {
1477           rgm->type = truth_type_for (vectype);
1478           rgm->compare_type = NULL_TREE;
1479           rgm->max_nscalars_per_iter = nscalars_per_iter;
1480           rgm->factor = nvectors;
1481           rgm->bias_adjusted_ctrl = NULL_TREE;
1482         }
1483     }
1484
1485   /* There is no fixed compare type we are going to use but we have to
1486      be able to get at one for each mask group.  */
1487   unsigned int min_ni_width
1488     = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1489
1490   bool ok = true;
1491   for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1492     {
1493       tree mask_type = rgc.type;
1494       if (!mask_type)
1495         continue;
1496
1497       /* For now vect_get_loop_mask only supports integer mode masks
1498          when we need to split it.  */
1499       if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
1500           || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1501         {
1502           ok = false;
1503           break;
1504         }
1505
1506       /* If iv_type is usable as compare type use that - we can elide the
1507          saturation in that case.   */
1508       if (TYPE_PRECISION (iv_type) >= min_ni_width)
1509         {
1510           tree cmp_vectype
1511             = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1512           if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1513             rgc.compare_type = cmp_vectype;
1514         }
1515       if (!rgc.compare_type)
1516         FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1517           {
1518             unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1519             if (cmp_bits >= min_ni_width
1520                 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1521               {
1522                 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1523                 if (!cmp_type)
1524                   continue;
1525
1526                 /* Check whether we can produce the mask with cmp_type.  */
1527                 tree cmp_vectype
1528                   = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1529                 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1530                   {
1531                     rgc.compare_type = cmp_vectype;
1532                     break;
1533                   }
1534               }
1535         }
1536       if (!rgc.compare_type)
1537         {
1538           ok = false;
1539           break;
1540         }
1541     }
1542   if (!ok)
1543     {
1544       release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1545       return false;
1546     }
1547
1548   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1549   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1550   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1551   return true;
1552 }
1553
1554 /* Check whether we can use vector access with length based on precison
1555    comparison.  So far, to keep it simple, we only allow the case that the
1556    precision of the target supported length is larger than the precision
1557    required by loop niters.  */
1558
1559 static bool
1560 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1561 {
1562   if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1563     return false;
1564
1565   machine_mode len_load_mode, len_store_mode;
1566   if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
1567          .exists (&len_load_mode))
1568     return false;
1569   if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
1570          .exists (&len_store_mode))
1571     return false;
1572
1573   signed char partial_load_bias = internal_len_load_store_bias
1574     (IFN_LEN_LOAD, len_load_mode);
1575
1576   signed char partial_store_bias = internal_len_load_store_bias
1577     (IFN_LEN_STORE, len_store_mode);
1578
1579   gcc_assert (partial_load_bias == partial_store_bias);
1580
1581   if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1582     return false;
1583
1584   /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1585      len_loads with a length of zero.  In order to avoid that we prohibit
1586      more than one loop length here.  */
1587   if (partial_load_bias == -1
1588       && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1589     return false;
1590
1591   LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1592
1593   unsigned int max_nitems_per_iter = 1;
1594   unsigned int i;
1595   rgroup_controls *rgl;
1596   /* Find the maximum number of items per iteration for every rgroup.  */
1597   FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1598     {
1599       unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1600       max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1601     }
1602
1603   /* Work out how many bits we need to represent the length limit.  */
1604   unsigned int min_ni_prec
1605     = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1606
1607   /* Now use the maximum of below precisions for one suitable IV type:
1608      - the IV's natural precision
1609      - the precision needed to hold: the maximum number of scalar
1610        iterations multiplied by the scale factor (min_ni_prec above)
1611      - the Pmode precision
1612
1613      If min_ni_prec is less than the precision of the current niters,
1614      we perfer to still use the niters type.  Prefer to use Pmode and
1615      wider IV to avoid narrow conversions.  */
1616
1617   unsigned int ni_prec
1618     = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1619   min_ni_prec = MAX (min_ni_prec, ni_prec);
1620   min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1621
1622   tree iv_type = NULL_TREE;
1623   opt_scalar_int_mode tmode_iter;
1624   FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1625     {
1626       scalar_mode tmode = tmode_iter.require ();
1627       unsigned int tbits = GET_MODE_BITSIZE (tmode);
1628
1629       /* ??? Do we really want to construct one IV whose precision exceeds
1630          BITS_PER_WORD?  */
1631       if (tbits > BITS_PER_WORD)
1632         break;
1633
1634       /* Find the first available standard integral type.  */
1635       if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1636         {
1637           iv_type = build_nonstandard_integer_type (tbits, true);
1638           break;
1639         }
1640     }
1641
1642   if (!iv_type)
1643     {
1644       if (dump_enabled_p ())
1645         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646                          "can't vectorize with length-based partial vectors"
1647                          " because there is no suitable iv type.\n");
1648       return false;
1649     }
1650
1651   LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1652   LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1653   LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1654
1655   return true;
1656 }
1657
1658 /* Calculate the cost of one scalar iteration of the loop.  */
1659 static void
1660 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1661 {
1662   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1664   int nbbs = loop->num_nodes, factor;
1665   int innerloop_iters, i;
1666
1667   DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1668
1669   /* Gather costs for statements in the scalar loop.  */
1670
1671   /* FORNOW.  */
1672   innerloop_iters = 1;
1673   if (loop->inner)
1674     innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1675
1676   for (i = 0; i < nbbs; i++)
1677     {
1678       gimple_stmt_iterator si;
1679       basic_block bb = bbs[i];
1680
1681       if (bb->loop_father == loop->inner)
1682         factor = innerloop_iters;
1683       else
1684         factor = 1;
1685
1686       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1687         {
1688           gimple *stmt = gsi_stmt (si);
1689           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1690
1691           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1692             continue;
1693
1694           /* Skip stmts that are not vectorized inside the loop.  */
1695           stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1696           if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1697               && (!STMT_VINFO_LIVE_P (vstmt_info)
1698                   || !VECTORIZABLE_CYCLE_DEF
1699                         (STMT_VINFO_DEF_TYPE (vstmt_info))))
1700             continue;
1701
1702           vect_cost_for_stmt kind;
1703           if (STMT_VINFO_DATA_REF (stmt_info))
1704             {
1705               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1706                kind = scalar_load;
1707              else
1708                kind = scalar_store;
1709             }
1710           else if (vect_nop_conversion_p (stmt_info))
1711             continue;
1712           else
1713             kind = scalar_stmt;
1714
1715           /* We are using vect_prologue here to avoid scaling twice
1716              by the inner loop factor.  */
1717           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1718                             factor, kind, stmt_info, 0, vect_prologue);
1719         }
1720     }
1721
1722   /* Now accumulate cost.  */
1723   loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1724   add_stmt_costs (loop_vinfo->scalar_costs,
1725                   &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1726   loop_vinfo->scalar_costs->finish_cost (nullptr);
1727 }
1728
1729 /* Function vect_analyze_loop_form.
1730
1731    Verify that certain CFG restrictions hold, including:
1732    - the loop has a pre-header
1733    - the loop has a single entry
1734    - nested loops can have only a single exit.
1735    - the loop exit condition is simple enough
1736    - the number of iterations can be analyzed, i.e, a countable loop.  The
1737      niter could be analyzed under some assumptions.  */
1738
1739 opt_result
1740 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1741 {
1742   DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1743
1744   edge exit_e = vec_init_loop_exit_info (loop);
1745   if (!exit_e)
1746     return opt_result::failure_at (vect_location,
1747                                    "not vectorized:"
1748                                    " could not determine main exit from"
1749                                    " loop with multiple exits.\n");
1750   info->loop_exit = exit_e;
1751   if (dump_enabled_p ())
1752       dump_printf_loc (MSG_NOTE, vect_location,
1753                        "using as main loop exit: %d -> %d [AUX: %p]\n",
1754                        exit_e->src->index, exit_e->dest->index, exit_e->aux);
1755
1756   /* Check if we have any control flow that doesn't leave the loop.  */
1757   class loop *v_loop = loop->inner ? loop->inner : loop;
1758   basic_block *bbs = get_loop_body (v_loop);
1759   for (unsigned i = 0; i < v_loop->num_nodes; i++)
1760     if (EDGE_COUNT (bbs[i]->succs) != 1
1761         && (EDGE_COUNT (bbs[i]->succs) != 2
1762             || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
1763       {
1764         free (bbs);
1765         return opt_result::failure_at (vect_location,
1766                                        "not vectorized:"
1767                                        " unsupported control flow in loop.\n");
1768       }
1769   free (bbs);
1770
1771   /* Different restrictions apply when we are considering an inner-most loop,
1772      vs. an outer (nested) loop.
1773      (FORNOW. May want to relax some of these restrictions in the future).  */
1774
1775   info->inner_loop_cond = NULL;
1776   if (!loop->inner)
1777     {
1778       /* Inner-most loop.  */
1779
1780       if (empty_block_p (loop->header))
1781         return opt_result::failure_at (vect_location,
1782                                        "not vectorized: empty loop.\n");
1783     }
1784   else
1785     {
1786       class loop *innerloop = loop->inner;
1787       edge entryedge;
1788
1789       /* Nested loop. We currently require that the loop is doubly-nested,
1790          contains a single inner loop with a single exit to the block
1791          with the single exit condition in the outer loop.
1792          Vectorizable outer-loops look like this:
1793
1794                         (pre-header)
1795                            |
1796                           header <---+
1797                            |         |
1798                           inner-loop |
1799                            |         |
1800                           tail ------+
1801                            |
1802                         (exit-bb)
1803
1804          The inner-loop also has the properties expected of inner-most loops
1805          as described above.  */
1806
1807       if ((loop->inner)->inner || (loop->inner)->next)
1808         return opt_result::failure_at (vect_location,
1809                                        "not vectorized:"
1810                                        " multiple nested loops.\n");
1811
1812       entryedge = loop_preheader_edge (innerloop);
1813       if (entryedge->src != loop->header
1814           || !single_exit (innerloop)
1815           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1816         return opt_result::failure_at (vect_location,
1817                                        "not vectorized:"
1818                                        " unsupported outerloop form.\n");
1819
1820       /* Analyze the inner-loop.  */
1821       vect_loop_form_info inner;
1822       opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1823       if (!res)
1824         {
1825           if (dump_enabled_p ())
1826             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1827                              "not vectorized: Bad inner loop.\n");
1828           return res;
1829         }
1830
1831       /* Don't support analyzing niter under assumptions for inner
1832          loop.  */
1833       if (!integer_onep (inner.assumptions))
1834         return opt_result::failure_at (vect_location,
1835                                        "not vectorized: Bad inner loop.\n");
1836
1837       if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1838         return opt_result::failure_at (vect_location,
1839                                        "not vectorized: inner-loop count not"
1840                                        " invariant.\n");
1841
1842       if (dump_enabled_p ())
1843         dump_printf_loc (MSG_NOTE, vect_location,
1844                          "Considering outer-loop vectorization.\n");
1845       info->inner_loop_cond = inner.conds[0];
1846     }
1847
1848   if (EDGE_COUNT (loop->header->preds) != 2)
1849     return opt_result::failure_at (vect_location,
1850                                    "not vectorized:"
1851                                    " too many incoming edges.\n");
1852
1853   /* We assume that the latch is empty.  */
1854   if (!empty_block_p (loop->latch)
1855       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1856     return opt_result::failure_at (vect_location,
1857                                    "not vectorized: latch block not empty.\n");
1858
1859   /* Make sure there is no abnormal exit.  */
1860   auto_vec<edge> exits = get_loop_exit_edges (loop);
1861   for (edge e : exits)
1862     {
1863       if (e->flags & EDGE_ABNORMAL)
1864         return opt_result::failure_at (vect_location,
1865                                        "not vectorized:"
1866                                        " abnormal loop exit edge.\n");
1867     }
1868
1869   info->conds
1870     = vect_get_loop_niters (loop, exit_e, &info->assumptions,
1871                             &info->number_of_iterations,
1872                             &info->number_of_iterationsm1);
1873   if (info->conds.is_empty ())
1874     return opt_result::failure_at
1875       (vect_location,
1876        "not vectorized: complicated exit condition.\n");
1877
1878   /* Determine what the primary and alternate exit conds are.  */
1879   for (unsigned i = 0; i < info->conds.length (); i++)
1880     {
1881       gcond *cond = info->conds[i];
1882       if (exit_e->src == gimple_bb (cond))
1883         std::swap (info->conds[0], info->conds[i]);
1884     }
1885
1886   if (integer_zerop (info->assumptions)
1887       || !info->number_of_iterations
1888       || chrec_contains_undetermined (info->number_of_iterations))
1889     return opt_result::failure_at
1890       (info->conds[0],
1891        "not vectorized: number of iterations cannot be computed.\n");
1892
1893   if (integer_zerop (info->number_of_iterations))
1894     return opt_result::failure_at
1895       (info->conds[0],
1896        "not vectorized: number of iterations = 0.\n");
1897
1898   if (!(tree_fits_shwi_p (info->number_of_iterations)
1899         && tree_to_shwi (info->number_of_iterations) > 0))
1900     {
1901       if (dump_enabled_p ())
1902         {
1903           dump_printf_loc (MSG_NOTE, vect_location,
1904                            "Symbolic number of iterations is ");
1905           dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1906           dump_printf (MSG_NOTE, "\n");
1907         }
1908     }
1909
1910   return opt_result::success ();
1911 }
1912
1913 /* Create a loop_vec_info for LOOP with SHARED and the
1914    vect_analyze_loop_form result.  */
1915
1916 loop_vec_info
1917 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1918                         const vect_loop_form_info *info,
1919                         loop_vec_info main_loop_info)
1920 {
1921   loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1922   LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1923   LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1924   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1925   LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1926   /* Also record the assumptions for versioning.  */
1927   if (!integer_onep (info->assumptions) && !main_loop_info)
1928     LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1929
1930   for (gcond *cond : info->conds)
1931     {
1932       stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
1933       STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1934       /* Mark the statement as a condition.  */
1935       STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
1936     }
1937
1938   for (unsigned i = 1; i < info->conds.length (); i ++)
1939     LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
1940   LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];
1941
1942   LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;
1943
1944   /* Check to see if we're vectorizing multiple exits.  */
1945   LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
1946     = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();
1947
1948   if (info->inner_loop_cond)
1949     {
1950       stmt_vec_info inner_loop_cond_info
1951         = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1952       STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1953       /* If we have an estimate on the number of iterations of the inner
1954          loop use that to limit the scale for costing, otherwise use
1955          --param vect-inner-loop-cost-factor literally.  */
1956       widest_int nit;
1957       if (estimated_stmt_executions (loop->inner, &nit))
1958         LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1959           = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1960     }
1961
1962   return loop_vinfo;
1963 }
1964
1965
1966
1967 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1968    statements update the vectorization factor.  */
1969
1970 static void
1971 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1972 {
1973   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1974   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1975   int nbbs = loop->num_nodes;
1976   poly_uint64 vectorization_factor;
1977   int i;
1978
1979   DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1980
1981   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1982   gcc_assert (known_ne (vectorization_factor, 0U));
1983
1984   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1985      vectorization factor of the loop is the unrolling factor required by
1986      the SLP instances.  If that unrolling factor is 1, we say, that we
1987      perform pure SLP on loop - cross iteration parallelism is not
1988      exploited.  */
1989   bool only_slp_in_loop = true;
1990   for (i = 0; i < nbbs; i++)
1991     {
1992       basic_block bb = bbs[i];
1993       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1994            gsi_next (&si))
1995         {
1996           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1997           if (!stmt_info)
1998             continue;
1999           if ((STMT_VINFO_RELEVANT_P (stmt_info)
2000                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2001               && !PURE_SLP_STMT (stmt_info))
2002             /* STMT needs both SLP and loop-based vectorization.  */
2003             only_slp_in_loop = false;
2004         }
2005       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2006            gsi_next (&si))
2007         {
2008           if (is_gimple_debug (gsi_stmt (si)))
2009             continue;
2010           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2011           stmt_info = vect_stmt_to_vectorize (stmt_info);
2012           if ((STMT_VINFO_RELEVANT_P (stmt_info)
2013                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2014               && !PURE_SLP_STMT (stmt_info))
2015             /* STMT needs both SLP and loop-based vectorization.  */
2016             only_slp_in_loop = false;
2017         }
2018     }
2019
2020   if (only_slp_in_loop)
2021     {
2022       if (dump_enabled_p ())
2023         dump_printf_loc (MSG_NOTE, vect_location,
2024                          "Loop contains only SLP stmts\n");
2025       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2026     }
2027   else
2028     {
2029       if (dump_enabled_p ())
2030         dump_printf_loc (MSG_NOTE, vect_location,
2031                          "Loop contains SLP and non-SLP stmts\n");
2032       /* Both the vectorization factor and unroll factor have the form
2033          GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2034          so they must have a common multiple.  */
2035       vectorization_factor
2036         = force_common_multiple (vectorization_factor,
2037                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2038     }
2039
2040   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2041   if (dump_enabled_p ())
2042     {
2043       dump_printf_loc (MSG_NOTE, vect_location,
2044                        "Updating vectorization factor to ");
2045       dump_dec (MSG_NOTE, vectorization_factor);
2046       dump_printf (MSG_NOTE, ".\n");
2047     }
2048 }
2049
2050 /* Return true if STMT_INFO describes a double reduction phi and if
2051    the other phi in the reduction is also relevant for vectorization.
2052    This rejects cases such as:
2053
2054       outer1:
2055         x_1 = PHI <x_3(outer2), ...>;
2056         ...
2057
2058       inner:
2059         x_2 = ...;
2060         ...
2061
2062       outer2:
2063         x_3 = PHI <x_2(inner)>;
2064
2065    if nothing in x_2 or elsewhere makes x_1 relevant.  */
2066
2067 static bool
2068 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2069 {
2070   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2071     return false;
2072
2073   return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2074 }
2075
2076 /* Function vect_analyze_loop_operations.
2077
2078    Scan the loop stmts and make sure they are all vectorizable.  */
2079
2080 static opt_result
2081 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
2082 {
2083   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2084   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2085   int nbbs = loop->num_nodes;
2086   int i;
2087   stmt_vec_info stmt_info;
2088   bool need_to_vectorize = false;
2089   bool ok;
2090
2091   DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
2092
2093   auto_vec<stmt_info_for_cost> cost_vec;
2094
2095   for (i = 0; i < nbbs; i++)
2096     {
2097       basic_block bb = bbs[i];
2098
2099       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
2100            gsi_next (&si))
2101         {
2102           gphi *phi = si.phi ();
2103           ok = true;
2104
2105           stmt_info = loop_vinfo->lookup_stmt (phi);
2106           if (dump_enabled_p ())
2107             dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
2108                              (gimple *) phi);
2109           if (virtual_operand_p (gimple_phi_result (phi)))
2110             continue;
2111
2112           /* Inner-loop loop-closed exit phi in outer-loop vectorization
2113              (i.e., a phi in the tail of the outer-loop).  */
2114           if (! is_loop_header_bb_p (bb))
2115             {
2116               /* FORNOW: we currently don't support the case that these phis
2117                  are not used in the outerloop (unless it is double reduction,
2118                  i.e., this phi is vect_reduction_def), cause this case
2119                  requires to actually do something here.  */
2120               if (STMT_VINFO_LIVE_P (stmt_info)
2121                   && !vect_active_double_reduction_p (stmt_info))
2122                 return opt_result::failure_at (phi,
2123                                                "Unsupported loop-closed phi"
2124                                                " in outer-loop.\n");
2125
2126               /* If PHI is used in the outer loop, we check that its operand
2127                  is defined in the inner loop.  */
2128               if (STMT_VINFO_RELEVANT_P (stmt_info))
2129                 {
2130                   tree phi_op;
2131
2132                   if (gimple_phi_num_args (phi) != 1)
2133                     return opt_result::failure_at (phi, "unsupported phi");
2134
2135                   phi_op = PHI_ARG_DEF (phi, 0);
2136                   stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2137                   if (!op_def_info)
2138                     return opt_result::failure_at (phi, "unsupported phi\n");
2139
2140                   if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2141                       && (STMT_VINFO_RELEVANT (op_def_info)
2142                           != vect_used_in_outer_by_reduction))
2143                     return opt_result::failure_at (phi, "unsupported phi\n");
2144
2145                   if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2146                        || (STMT_VINFO_DEF_TYPE (stmt_info)
2147                            == vect_double_reduction_def))
2148                       && !vectorizable_lc_phi (loop_vinfo,
2149                                                stmt_info, NULL, NULL))
2150                     return opt_result::failure_at (phi, "unsupported phi\n");
2151                 }
2152
2153               continue;
2154             }
2155
2156           gcc_assert (stmt_info);
2157
2158           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2159                || STMT_VINFO_LIVE_P (stmt_info))
2160               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2161               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2162             /* A scalar-dependence cycle that we don't support.  */
2163             return opt_result::failure_at (phi,
2164                                            "not vectorized:"
2165                                            " scalar dependence cycle.\n");
2166
2167           if (STMT_VINFO_RELEVANT_P (stmt_info))
2168             {
2169               need_to_vectorize = true;
2170               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2171                   && ! PURE_SLP_STMT (stmt_info))
2172                 ok = vectorizable_induction (loop_vinfo,
2173                                              stmt_info, NULL, NULL,
2174                                              &cost_vec);
2175               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2176                         || (STMT_VINFO_DEF_TYPE (stmt_info)
2177                             == vect_double_reduction_def)
2178                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2179                        && ! PURE_SLP_STMT (stmt_info))
2180                 ok = vectorizable_reduction (loop_vinfo,
2181                                              stmt_info, NULL, NULL, &cost_vec);
2182               else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2183                         == vect_first_order_recurrence)
2184                        && ! PURE_SLP_STMT (stmt_info))
2185                 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2186                                            &cost_vec);
2187             }
2188
2189           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
2190           if (ok
2191               && STMT_VINFO_LIVE_P (stmt_info)
2192               && !PURE_SLP_STMT (stmt_info))
2193             ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
2194                                               -1, false, &cost_vec);
2195
2196           if (!ok)
2197             return opt_result::failure_at (phi,
2198                                            "not vectorized: relevant phi not "
2199                                            "supported: %G",
2200                                            static_cast <gimple *> (phi));
2201         }
2202
2203       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2204            gsi_next (&si))
2205         {
2206           gimple *stmt = gsi_stmt (si);
2207           if (!gimple_clobber_p (stmt)
2208               && !is_gimple_debug (stmt))
2209             {
2210               opt_result res
2211                 = vect_analyze_stmt (loop_vinfo,
2212                                      loop_vinfo->lookup_stmt (stmt),
2213                                      &need_to_vectorize,
2214                                      NULL, NULL, &cost_vec);
2215               if (!res)
2216                 return res;
2217             }
2218         }
2219     } /* bbs */
2220
2221   add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2222
2223   /* All operations in the loop are either irrelevant (deal with loop
2224      control, or dead), or only used outside the loop and can be moved
2225      out of the loop (e.g. invariants, inductions).  The loop can be
2226      optimized away by scalar optimizations.  We're better off not
2227      touching this loop.  */
2228   if (!need_to_vectorize)
2229     {
2230       if (dump_enabled_p ())
2231         dump_printf_loc (MSG_NOTE, vect_location,
2232                          "All the computation can be taken out of the loop.\n");
2233       return opt_result::failure_at
2234         (vect_location,
2235          "not vectorized: redundant loop. no profit to vectorize.\n");
2236     }
2237
2238   return opt_result::success ();
2239 }
2240
2241 /* Return true if we know that the iteration count is smaller than the
2242    vectorization factor.  Return false if it isn't, or if we can't be sure
2243    either way.  */
2244
2245 static bool
2246 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2247 {
2248   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2249
2250   HOST_WIDE_INT max_niter;
2251   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2252     max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2253   else
2254     max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2255
2256   if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2257     return true;
2258
2259   return false;
2260 }
2261
2262 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
2263    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
2264    definitely no, or -1 if it's worth retrying.  */
2265
2266 static int
2267 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2268                            unsigned *suggested_unroll_factor)
2269 {
2270   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2271   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2272
2273   /* Only loops that can handle partially-populated vectors can have iteration
2274      counts less than the vectorization factor.  */
2275   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2276       && vect_known_niters_smaller_than_vf (loop_vinfo))
2277     {
2278       if (dump_enabled_p ())
2279         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                          "not vectorized: iteration count smaller than "
2281                          "vectorization factor.\n");
2282       return 0;
2283     }
2284
2285   /* If we know the number of iterations we can do better, for the
2286      epilogue we can also decide whether the main loop leaves us
2287      with enough iterations, prefering a smaller vector epilog then
2288      also possibly used for the case we skip the vector loop.  */
2289   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2290     {
2291       widest_int scalar_niters
2292         = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2293       if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2294         {
2295           loop_vec_info orig_loop_vinfo
2296             = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2297           unsigned lowest_vf
2298             = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2299           int prolog_peeling = 0;
2300           if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2301             prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2302           if (prolog_peeling >= 0
2303               && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2304                            lowest_vf))
2305             {
2306               unsigned gap
2307                 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2308               scalar_niters = ((scalar_niters - gap - prolog_peeling)
2309                                % lowest_vf + gap);
2310             }
2311         }
2312       /* Reject vectorizing for a single scalar iteration, even if
2313          we could in principle implement that using partial vectors.  */
2314       unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2315       if (scalar_niters <= peeling_gap + 1)
2316         {
2317           if (dump_enabled_p ())
2318             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319                              "not vectorized: loop only has a single "
2320                              "scalar iteration.\n");
2321           return 0;
2322         }
2323
2324       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2325         {
2326           /* Check that the loop processes at least one full vector.  */
2327           poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2328           if (known_lt (scalar_niters, vf))
2329             {
2330               if (dump_enabled_p ())
2331                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2332                                  "loop does not have enough iterations "
2333                                  "to support vectorization.\n");
2334               return 0;
2335             }
2336
2337           /* If we need to peel an extra epilogue iteration to handle data
2338              accesses with gaps, check that there are enough scalar iterations
2339              available.
2340
2341              The check above is redundant with this one when peeling for gaps,
2342              but the distinction is useful for diagnostics.  */
2343           if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2344               && known_le (scalar_niters, vf))
2345             {
2346               if (dump_enabled_p ())
2347                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348                                  "loop does not have enough iterations "
2349                                  "to support peeling for gaps.\n");
2350               return 0;
2351             }
2352         }
2353     }
2354
2355   /* If using the "very cheap" model. reject cases in which we'd keep
2356      a copy of the scalar code (even if we might be able to vectorize it).  */
2357   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2358       && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2359           || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2360           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2361     {
2362       if (dump_enabled_p ())
2363         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2364                          "some scalar iterations would need to be peeled\n");
2365       return 0;
2366     }
2367
2368   int min_profitable_iters, min_profitable_estimate;
2369   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2370                                       &min_profitable_estimate,
2371                                       suggested_unroll_factor);
2372
2373   if (min_profitable_iters < 0)
2374     {
2375       if (dump_enabled_p ())
2376         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2377                          "not vectorized: vectorization not profitable.\n");
2378       if (dump_enabled_p ())
2379         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2380                          "not vectorized: vector version will never be "
2381                          "profitable.\n");
2382       return -1;
2383     }
2384
2385   int min_scalar_loop_bound = (param_min_vect_loop_bound
2386                                * assumed_vf);
2387
2388   /* Use the cost model only if it is more conservative than user specified
2389      threshold.  */
2390   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2391                                     min_profitable_iters);
2392
2393   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2394
2395   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2396       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2397     {
2398       if (dump_enabled_p ())
2399         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2400                          "not vectorized: vectorization not profitable.\n");
2401       if (dump_enabled_p ())
2402         dump_printf_loc (MSG_NOTE, vect_location,
2403                          "not vectorized: iteration count smaller than user "
2404                          "specified loop bound parameter or minimum profitable "
2405                          "iterations (whichever is more conservative).\n");
2406       return 0;
2407     }
2408
2409   /* The static profitablity threshold min_profitable_estimate includes
2410      the cost of having to check at runtime whether the scalar loop
2411      should be used instead.  If it turns out that we don't need or want
2412      such a check, the threshold we should use for the static estimate
2413      is simply the point at which the vector loop becomes more profitable
2414      than the scalar loop.  */
2415   if (min_profitable_estimate > min_profitable_iters
2416       && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2417       && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2418       && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2419       && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2420     {
2421       if (dump_enabled_p ())
2422         dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2423                          " choice between the scalar and vector loops\n");
2424       min_profitable_estimate = min_profitable_iters;
2425     }
2426
2427   /* If the vector loop needs multiple iterations to be beneficial then
2428      things are probably too close to call, and the conservative thing
2429      would be to stick with the scalar code.  */
2430   if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2431       && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2432     {
2433       if (dump_enabled_p ())
2434         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2435                          "one iteration of the vector loop would be"
2436                          " more expensive than the equivalent number of"
2437                          " iterations of the scalar loop\n");
2438       return 0;
2439     }
2440
2441   HOST_WIDE_INT estimated_niter;
2442
2443   /* If we are vectorizing an epilogue then we know the maximum number of
2444      scalar iterations it will cover is at least one lower than the
2445      vectorization factor of the main loop.  */
2446   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2447     estimated_niter
2448       = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2449   else
2450     {
2451       estimated_niter = estimated_stmt_executions_int (loop);
2452       if (estimated_niter == -1)
2453         estimated_niter = likely_max_stmt_executions_int (loop);
2454     }
2455   if (estimated_niter != -1
2456       && ((unsigned HOST_WIDE_INT) estimated_niter
2457           < MAX (th, (unsigned) min_profitable_estimate)))
2458     {
2459       if (dump_enabled_p ())
2460         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2461                          "not vectorized: estimated iteration count too "
2462                          "small.\n");
2463       if (dump_enabled_p ())
2464         dump_printf_loc (MSG_NOTE, vect_location,
2465                          "not vectorized: estimated iteration count smaller "
2466                          "than specified loop bound parameter or minimum "
2467                          "profitable iterations (whichever is more "
2468                          "conservative).\n");
2469       return -1;
2470     }
2471
2472   return 1;
2473 }
2474
2475 static opt_result
2476 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2477                            vec<data_reference_p> *datarefs,
2478                            unsigned int *n_stmts)
2479 {
2480   *n_stmts = 0;
2481   for (unsigned i = 0; i < loop->num_nodes; i++)
2482     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2483          !gsi_end_p (gsi); gsi_next (&gsi))
2484       {
2485         gimple *stmt = gsi_stmt (gsi);
2486         if (is_gimple_debug (stmt))
2487           continue;
2488         ++(*n_stmts);
2489         opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2490                                                         NULL, 0);
2491         if (!res)
2492           {
2493             if (is_gimple_call (stmt) && loop->safelen)
2494               {
2495                 tree fndecl = gimple_call_fndecl (stmt), op;
2496                 if (fndecl == NULL_TREE
2497                     && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2498                   {
2499                     fndecl = gimple_call_arg (stmt, 0);
2500                     gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2501                     fndecl = TREE_OPERAND (fndecl, 0);
2502                     gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2503                   }
2504                 if (fndecl != NULL_TREE)
2505                   {
2506                     cgraph_node *node = cgraph_node::get (fndecl);
2507                     if (node != NULL && node->simd_clones != NULL)
2508                       {
2509                         unsigned int j, n = gimple_call_num_args (stmt);
2510                         for (j = 0; j < n; j++)
2511                           {
2512                             op = gimple_call_arg (stmt, j);
2513                             if (DECL_P (op)
2514                                 || (REFERENCE_CLASS_P (op)
2515                                     && get_base_address (op)))
2516                               break;
2517                           }
2518                         op = gimple_call_lhs (stmt);
2519                         /* Ignore #pragma omp declare simd functions
2520                            if they don't have data references in the
2521                            call stmt itself.  */
2522                         if (j == n
2523                             && !(op
2524                                  && (DECL_P (op)
2525                                      || (REFERENCE_CLASS_P (op)
2526                                          && get_base_address (op)))))
2527                           continue;
2528                       }
2529                   }
2530               }
2531             return res;
2532           }
2533         /* If dependence analysis will give up due to the limit on the
2534            number of datarefs stop here and fail fatally.  */
2535         if (datarefs->length ()
2536             > (unsigned)param_loop_max_datarefs_for_datadeps)
2537           return opt_result::failure_at (stmt, "exceeded param "
2538                                          "loop-max-datarefs-for-datadeps\n");
2539       }
2540   return opt_result::success ();
2541 }
2542
2543 /* Look for SLP-only access groups and turn each individual access into its own
2544    group.  */
2545 static void
2546 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2547 {
2548   unsigned int i;
2549   struct data_reference *dr;
2550
2551   DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2552
2553   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2554   FOR_EACH_VEC_ELT (datarefs, i, dr)
2555     {
2556       gcc_assert (DR_REF (dr));
2557       stmt_vec_info stmt_info
2558         = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));
2559
2560       /* Check if the load is a part of an interleaving chain.  */
2561       if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2562         {
2563           stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2564           dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2565           unsigned int group_size = DR_GROUP_SIZE (first_element);
2566
2567           /* Check if SLP-only groups.  */
2568           if (!STMT_SLP_TYPE (stmt_info)
2569               && STMT_VINFO_SLP_VECT_ONLY (first_element))
2570             {
2571               /* Dissolve the group.  */
2572               STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2573
2574               stmt_vec_info vinfo = first_element;
2575               while (vinfo)
2576                 {
2577                   stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2578                   DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2579                   DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2580                   DR_GROUP_SIZE (vinfo) = 1;
2581                   if (STMT_VINFO_STRIDED_P (first_element)
2582                       /* We cannot handle stores with gaps.  */
2583                       || DR_IS_WRITE (dr_info->dr))
2584                     {
2585                       STMT_VINFO_STRIDED_P (vinfo) = true;
2586                       DR_GROUP_GAP (vinfo) = 0;
2587                     }
2588                   else
2589                     DR_GROUP_GAP (vinfo) = group_size - 1;
2590                   /* Duplicate and adjust alignment info, it needs to
2591                      be present on each group leader, see dr_misalignment.  */
2592                   if (vinfo != first_element)
2593                     {
2594                       dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2595                       dr_info2->target_alignment = dr_info->target_alignment;
2596                       int misalignment = dr_info->misalignment;
2597                       if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2598                         {
2599                           HOST_WIDE_INT diff
2600                             = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2601                                - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2602                           unsigned HOST_WIDE_INT align_c
2603                             = dr_info->target_alignment.to_constant ();
2604                           misalignment = (misalignment + diff) % align_c;
2605                         }
2606                       dr_info2->misalignment = misalignment;
2607                     }
2608                   vinfo = next;
2609                 }
2610             }
2611         }
2612     }
2613 }
2614
2615 /* Determine if operating on full vectors for LOOP_VINFO might leave
2616    some scalar iterations still to do.  If so, decide how we should
2617    handle those scalar iterations.  The possibilities are:
2618
2619    (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2620        In this case:
2621
2622          LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2623          LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2624          LOOP_VINFO_PEELING_FOR_NITER == false
2625
2626    (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2627        to handle the remaining scalar iterations.  In this case:
2628
2629          LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2630          LOOP_VINFO_PEELING_FOR_NITER == true
2631
2632        There are two choices:
2633
2634        (2a) Consider vectorizing the epilogue loop at the same VF as the
2635             main loop, but using partial vectors instead of full vectors.
2636             In this case:
2637
2638               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2639
2640        (2b) Consider vectorizing the epilogue loop at lower VFs only.
2641             In this case:
2642
2643               LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2644  */
2645
2646 opt_result
2647 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2648 {
2649   /* Determine whether there would be any scalar iterations left over.  */
2650   bool need_peeling_or_partial_vectors_p
2651     = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2652
2653   /* Decide whether to vectorize the loop with partial vectors.  */
2654   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2655   LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2656   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2657       && need_peeling_or_partial_vectors_p)
2658     {
2659       /* For partial-vector-usage=1, try to push the handling of partial
2660          vectors to the epilogue, with the main loop continuing to operate
2661          on full vectors.
2662
2663          If we are unrolling we also do not want to use partial vectors. This
2664          is to avoid the overhead of generating multiple masks and also to
2665          avoid having to execute entire iterations of FALSE masked instructions
2666          when dealing with one or less full iterations.
2667
2668          ??? We could then end up failing to use partial vectors if we
2669          decide to peel iterations into a prologue, and if the main loop
2670          then ends up processing fewer than VF iterations.  */
2671       if ((param_vect_partial_vector_usage == 1
2672            || loop_vinfo->suggested_unroll_factor > 1)
2673           && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2674           && !vect_known_niters_smaller_than_vf (loop_vinfo))
2675         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2676       else
2677         LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2678     }
2679
2680   if (dump_enabled_p ())
2681     dump_printf_loc (MSG_NOTE, vect_location,
2682                      "operating on %s vectors%s.\n",
2683                      LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2684                      ? "partial" : "full",
2685                      LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2686                      ? " for epilogue loop" : "");
2687
2688   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2689     = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2690        && need_peeling_or_partial_vectors_p);
2691
2692   /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2693      analysis that we don't know whether the loop is vectorized by partial
2694      vectors (More details see tree-vect-loop-manip.cc).
2695
2696      However, SELECT_VL vectorizaton style should only applied on partial
2697      vectorization since SELECT_VL is the GIMPLE IR that calculates the
2698      number of elements to be process for each iteration.
2699
2700      After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2701      if it is not partial vectorized loop.  */
2702   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2703     LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2704
2705   return opt_result::success ();
2706 }
2707
2708 /* Function vect_analyze_loop_2.
2709
2710    Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2711    analyses will record information in some members of LOOP_VINFO.  FATAL
2712    indicates if some analysis meets fatal error.  If one non-NULL pointer
2713    SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2714    worked out suggested unroll factor, while one NULL pointer shows it's
2715    going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
2716    is to hold the slp decision when the suggested unroll factor is worked
2717    out.  */
2718 static opt_result
2719 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2720                      unsigned *suggested_unroll_factor,
2721                      bool& slp_done_for_suggested_uf)
2722 {
2723   opt_result ok = opt_result::success ();
2724   int res;
2725   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2726   poly_uint64 min_vf = 2;
2727   loop_vec_info orig_loop_vinfo = NULL;
2728
2729   /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2730      loop_vec_info of the first vectorized loop.  */
2731   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2732     orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2733   else
2734     orig_loop_vinfo = loop_vinfo;
2735   gcc_assert (orig_loop_vinfo);
2736
2737   /* The first group of checks is independent of the vector size.  */
2738   fatal = true;
2739
2740   if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2741       && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2742     return opt_result::failure_at (vect_location,
2743                                    "not vectorized: simd if(0)\n");
2744
2745   /* Find all data references in the loop (which correspond to vdefs/vuses)
2746      and analyze their evolution in the loop.  */
2747
2748   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2749
2750   /* Gather the data references and count stmts in the loop.  */
2751   if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2752     {
2753       opt_result res
2754         = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2755                                      &LOOP_VINFO_DATAREFS (loop_vinfo),
2756                                      &LOOP_VINFO_N_STMTS (loop_vinfo));
2757       if (!res)
2758         {
2759           if (dump_enabled_p ())
2760             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2761                              "not vectorized: loop contains function "
2762                              "calls or data references that cannot "
2763                              "be analyzed\n");
2764           return res;
2765         }
2766       loop_vinfo->shared->save_datarefs ();
2767     }
2768   else
2769     loop_vinfo->shared->check_datarefs ();
2770
2771   /* Analyze the data references and also adjust the minimal
2772      vectorization factor according to the loads and stores.  */
2773
2774   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2775   if (!ok)
2776     {
2777       if (dump_enabled_p ())
2778         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779                          "bad data references.\n");
2780       return ok;
2781     }
2782
2783   /* Check if we are applying unroll factor now.  */
2784   bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2785   gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2786
2787   /* If the slp decision is false when suggested unroll factor is worked
2788      out, and we are applying suggested unroll factor, we can simply skip
2789      all slp related analyses this time.  */
2790   bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2791
2792   /* Classify all cross-iteration scalar data-flow cycles.
2793      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2794   vect_analyze_scalar_cycles (loop_vinfo, slp);
2795
2796   vect_pattern_recog (loop_vinfo);
2797
2798   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2799
2800   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2801      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2802
2803   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2804   if (!ok)
2805     {
2806       if (dump_enabled_p ())
2807         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2808                          "bad data access.\n");
2809       return ok;
2810     }
2811
2812   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2813
2814   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2815   if (!ok)
2816     {
2817       if (dump_enabled_p ())
2818         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2819                          "unexpected pattern.\n");
2820       return ok;
2821     }
2822
2823   /* While the rest of the analysis below depends on it in some way.  */
2824   fatal = false;
2825
2826   /* Analyze data dependences between the data-refs in the loop
2827      and adjust the maximum vectorization factor according to
2828      the dependences.
2829      FORNOW: fail at the first data dependence that we encounter.  */
2830
2831   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2832   if (!ok)
2833     {
2834       if (dump_enabled_p ())
2835         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2836                          "bad data dependence.\n");
2837       return ok;
2838     }
2839   if (max_vf != MAX_VECTORIZATION_FACTOR
2840       && maybe_lt (max_vf, min_vf))
2841     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2842   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2843
2844   ok = vect_determine_vectorization_factor (loop_vinfo);
2845   if (!ok)
2846     {
2847       if (dump_enabled_p ())
2848         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2849                          "can't determine vectorization factor.\n");
2850       return ok;
2851     }
2852
2853   /* Compute the scalar iteration cost.  */
2854   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2855
2856   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2857
2858   if (slp)
2859     {
2860       /* Check the SLP opportunities in the loop, analyze and build
2861          SLP trees.  */
2862       ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2863       if (!ok)
2864         return ok;
2865
2866       /* If there are any SLP instances mark them as pure_slp.  */
2867       slp = vect_make_slp_decision (loop_vinfo);
2868       if (slp)
2869         {
2870           /* Find stmts that need to be both vectorized and SLPed.  */
2871           vect_detect_hybrid_slp (loop_vinfo);
2872
2873           /* Update the vectorization factor based on the SLP decision.  */
2874           vect_update_vf_for_slp (loop_vinfo);
2875
2876           /* Optimize the SLP graph with the vectorization factor fixed.  */
2877           vect_optimize_slp (loop_vinfo);
2878
2879           /* Gather the loads reachable from the SLP graph entries.  */
2880           vect_gather_slp_loads (loop_vinfo);
2881         }
2882     }
2883
2884   bool saved_can_use_partial_vectors_p
2885     = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2886
2887   /* We don't expect to have to roll back to anything other than an empty
2888      set of rgroups.  */
2889   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2890
2891   /* This is the point where we can re-start analysis with SLP forced off.  */
2892 start_over:
2893
2894   /* Apply the suggested unrolling factor, this was determined by the backend
2895      during finish_cost the first time we ran the analyzis for this
2896      vector mode.  */
2897   if (applying_suggested_uf)
2898     LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2899
2900   /* Now the vectorization factor is final.  */
2901   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2902   gcc_assert (known_ne (vectorization_factor, 0U));
2903
2904   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2905     {
2906       dump_printf_loc (MSG_NOTE, vect_location,
2907                        "vectorization_factor = ");
2908       dump_dec (MSG_NOTE, vectorization_factor);
2909       dump_printf (MSG_NOTE, ", niters = %wd\n",
2910                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2911     }
2912
2913   if (max_vf != MAX_VECTORIZATION_FACTOR
2914       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2915     return opt_result::failure_at (vect_location, "bad data dependence.\n");
2916
2917   loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2918
2919   /* Analyze the alignment of the data-refs in the loop.
2920      Fail if a data reference is found that cannot be vectorized.  */
2921
2922   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2923   if (!ok)
2924     {
2925       if (dump_enabled_p ())
2926         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2927                          "bad data alignment.\n");
2928       return ok;
2929     }
2930
2931   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2932      It is important to call pruning after vect_analyze_data_ref_accesses,
2933      since we use grouping information gathered by interleaving analysis.  */
2934   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2935   if (!ok)
2936     return ok;
2937
2938   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2939      vectorization, since we do not want to add extra peeling or
2940      add versioning for alignment.  */
2941   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2942     /* This pass will decide on using loop versioning and/or loop peeling in
2943        order to enhance the alignment of data references in the loop.  */
2944     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2945   if (!ok)
2946     return ok;
2947
2948   if (slp)
2949     {
2950       /* Analyze operations in the SLP instances.  We can't simply
2951          remove unsupported SLP instances as this makes the above
2952          SLP kind detection invalid and might also affect the VF.  */
2953       if (! vect_slp_analyze_operations (loop_vinfo))
2954         {
2955           ok = opt_result::failure_at (vect_location,
2956                                        "unsupported SLP instances\n");
2957           goto again;
2958         }
2959     }
2960
2961   /* Dissolve SLP-only groups.  */
2962   vect_dissolve_slp_only_groups (loop_vinfo);
2963
2964   /* Scan all the remaining operations in the loop that are not subject
2965      to SLP and make sure they are vectorizable.  */
2966   ok = vect_analyze_loop_operations (loop_vinfo);
2967   if (!ok)
2968     {
2969       if (dump_enabled_p ())
2970         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2971                          "bad operation or unsupported loop bound.\n");
2972       return ok;
2973     }
2974
2975   /* For now, we don't expect to mix both masking and length approaches for one
2976      loop, disable it if both are recorded.  */
2977   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2978       && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2979       && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2980     {
2981       if (dump_enabled_p ())
2982         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2983                          "can't vectorize a loop with partial vectors"
2984                          " because we don't expect to mix different"
2985                          " approaches with partial vectors for the"
2986                          " same loop.\n");
2987       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2988     }
2989
2990   /* If we still have the option of using partial vectors,
2991      check whether we can generate the necessary loop controls.  */
2992   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2993     {
2994       if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2995         {
2996           if (!vect_verify_full_masking (loop_vinfo)
2997               && !vect_verify_full_masking_avx512 (loop_vinfo))
2998             LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2999         }
3000       else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3001         if (!vect_verify_loop_lens (loop_vinfo))
3002           LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3003     }
3004
3005   /* If we're vectorizing a loop that uses length "controls" and
3006      can iterate more than once, we apply decrementing IV approach
3007      in loop control.  */
3008   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3009       && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3010       && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3011       && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3012            && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3013                         LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3014     LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3015
3016   /* If a loop uses length controls and has a decrementing loop control IV,
3017      we will normally pass that IV through a MIN_EXPR to calcaluate the
3018      basis for the length controls.  E.g. in a loop that processes one
3019      element per scalar iteration, the number of elements would be
3020      MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3021
3022      This MIN_EXPR approach allows us to use pointer IVs with an invariant
3023      step, since only the final iteration of the vector loop can have
3024      inactive lanes.
3025
3026      However, some targets have a dedicated instruction for calculating the
3027      preferred length, given the total number of elements that still need to
3028      be processed.  This is encapsulated in the SELECT_VL internal function.
3029
3030      If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3031      to determine the basis for the length controls.  However, unlike the
3032      MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3033      lanes inactive in any iteration of the vector loop, not just the last
3034      iteration.  This SELECT_VL approach therefore requires us to use pointer
3035      IVs with variable steps.
3036
3037      Once we've decided how many elements should be processed by one
3038      iteration of the vector loop, we need to populate the rgroup controls.
3039      If a loop has multiple rgroups, we need to make sure that those rgroups
3040      "line up" (that is, they must be consistent about which elements are
3041      active and which aren't).  This is done by vect_adjust_loop_lens_control.
3042
3043      In principle, it would be possible to use vect_adjust_loop_lens_control
3044      on either the result of a MIN_EXPR or the result of a SELECT_VL.
3045      However:
3046
3047      (1) In practice, it only makes sense to use SELECT_VL when a vector
3048          operation will be controlled directly by the result.  It is not
3049          worth using SELECT_VL if it would only be the input to other
3050          calculations.
3051
3052      (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3053          pointer IV will need N updates by a variable amount (N-1 updates
3054          within the iteration and 1 update to move to the next iteration).
3055
3056      Because of this, we prefer to use the MIN_EXPR approach whenever there
3057      is more than one length control.
3058
3059      In addition, SELECT_VL always operates to a granularity of 1 unit.
3060      If we wanted to use it to control an SLP operation on N consecutive
3061      elements, we would need to make the SELECT_VL inputs measure scalar
3062      iterations (rather than elements) and then multiply the SELECT_VL
3063      result by N.  But using SELECT_VL this way is inefficient because
3064      of (1) above.
3065
3066      2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3067         satisfied:
3068
3069      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3070      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3071
3072      Since SELECT_VL (variable step) will make SCEV analysis failed and then
3073      we will fail to gain benefits of following unroll optimizations. We prefer
3074      using the MIN_EXPR approach in this situation.  */
3075   if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3076     {
3077       tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3078       if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3079                                           OPTIMIZE_FOR_SPEED)
3080           && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3081           && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3082           && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3083               || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3084         LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3085     }
3086
3087   /* Decide whether this loop_vinfo should use partial vectors or peeling,
3088      assuming that the loop will be used as a main loop.  We will redo
3089      this analysis later if we instead decide to use the loop as an
3090      epilogue loop.  */
3091   ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3092   if (!ok)
3093     return ok;
3094
3095   /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3096      to be able to handle fewer than VF scalars, or needs to have a lower VF
3097      than the main loop.  */
3098   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3099       && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3100     {
3101       poly_uint64 unscaled_vf
3102         = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3103                      orig_loop_vinfo->suggested_unroll_factor);
3104       if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3105         return opt_result::failure_at (vect_location,
3106                                        "Vectorization factor too high for"
3107                                        " epilogue loop.\n");
3108     }
3109
3110   /* Check the costings of the loop make vectorizing worthwhile.  */
3111   res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3112   if (res < 0)
3113     {
3114       ok = opt_result::failure_at (vect_location,
3115                                    "Loop costings may not be worthwhile.\n");
3116       goto again;
3117     }
3118   if (!res)
3119     return opt_result::failure_at (vect_location,
3120                                    "Loop costings not worthwhile.\n");
3121
3122   /* If an epilogue loop is required make sure we can create one.  */
3123   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3124       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3125       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3126     {
3127       if (dump_enabled_p ())
3128         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3129       if (!vect_can_advance_ivs_p (loop_vinfo)
3130           || !slpeel_can_duplicate_loop_p (loop,
3131                                            LOOP_VINFO_IV_EXIT (loop_vinfo),
3132                                            LOOP_VINFO_IV_EXIT (loop_vinfo)))
3133         {
3134           ok = opt_result::failure_at (vect_location,
3135                                        "not vectorized: can't create required "
3136                                        "epilog loop\n");
3137           goto again;
3138         }
3139     }
3140
3141   /* During peeling, we need to check if number of loop iterations is
3142      enough for both peeled prolog loop and vector loop.  This check
3143      can be merged along with threshold check of loop versioning, so
3144      increase threshold for this case if necessary.
3145
3146      If we are analyzing an epilogue we still want to check what its
3147      versioning threshold would be.  If we decide to vectorize the epilogues we
3148      will want to use the lowest versioning threshold of all epilogues and main
3149      loop.  This will enable us to enter a vectorized epilogue even when
3150      versioning the loop.  We can't simply check whether the epilogue requires
3151      versioning though since we may have skipped some versioning checks when
3152      analyzing the epilogue.  For instance, checks for alias versioning will be
3153      skipped when dealing with epilogues as we assume we already checked them
3154      for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
3155   if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3156     {
3157       poly_uint64 niters_th = 0;
3158       unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3159
3160       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3161         {
3162           /* Niters for peeled prolog loop.  */
3163           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3164             {
3165               dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3166               tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3167               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3168             }
3169           else
3170             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3171         }
3172
3173       /* Niters for at least one iteration of vectorized loop.  */
3174       if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3176       /* One additional iteration because of peeling for gap.  */
3177       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3178         niters_th += 1;
3179
3180       /*  Use the same condition as vect_transform_loop to decide when to use
3181           the cost to determine a versioning threshold.  */
3182       if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3183           && ordered_p (th, niters_th))
3184         niters_th = ordered_max (poly_uint64 (th), niters_th);
3185
3186       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3187     }
3188
3189   gcc_assert (known_eq (vectorization_factor,
3190                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3191
3192   slp_done_for_suggested_uf = slp;
3193
3194   /* Ok to vectorize!  */
3195   LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3196   return opt_result::success ();
3197
3198 again:
3199   /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
3200   gcc_assert (!ok);
3201
3202   /* Try again with SLP forced off but if we didn't do any SLP there is
3203      no point in re-trying.  */
3204   if (!slp)
3205     return ok;
3206
3207   /* If the slp decision is true when suggested unroll factor is worked
3208      out, and we are applying suggested unroll factor, we don't need to
3209      re-try any more.  */
3210   if (applying_suggested_uf && slp_done_for_suggested_uf)
3211     return ok;
3212
3213   /* If there are reduction chains re-trying will fail anyway.  */
3214   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3215     return ok;
3216
3217   /* Likewise if the grouped loads or stores in the SLP cannot be handled
3218      via interleaving or lane instructions.  */
3219   slp_instance instance;
3220   slp_tree node;
3221   unsigned i, j;
3222   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3223     {
3224       stmt_vec_info vinfo;
3225       vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3226       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3227         continue;
3228       vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3229       unsigned int size = DR_GROUP_SIZE (vinfo);
3230       tree vectype = STMT_VINFO_VECTYPE (vinfo);
3231       if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3232          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3233          && ! vect_grouped_store_supported (vectype, size))
3234         return opt_result::failure_at (vinfo->stmt,
3235                                        "unsupported grouped store\n");
3236       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3237         {
3238           vinfo = SLP_TREE_REPRESENTATIVE (node);
3239           if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3240             {
3241               vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3242               bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3243               size = DR_GROUP_SIZE (vinfo);
3244               vectype = STMT_VINFO_VECTYPE (vinfo);
3245               if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3246                   && ! vect_grouped_load_supported (vectype, single_element_p,
3247                                                     size))
3248                 return opt_result::failure_at (vinfo->stmt,
3249                                                "unsupported grouped load\n");
3250             }
3251         }
3252     }
3253
3254   if (dump_enabled_p ())
3255     dump_printf_loc (MSG_NOTE, vect_location,
3256                      "re-trying with SLP disabled\n");
3257
3258   /* Roll back state appropriately.  No SLP this time.  */
3259   slp = false;
3260   /* Restore vectorization factor as it were without SLP.  */
3261   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3262   /* Free the SLP instances.  */
3263   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3264     vect_free_slp_instance (instance);
3265   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3266   /* Reset SLP type to loop_vect on all stmts.  */
3267   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3268     {
3269       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3270       for (gimple_stmt_iterator si = gsi_start_phis (bb);
3271            !gsi_end_p (si); gsi_next (&si))
3272         {
3273           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3274           STMT_SLP_TYPE (stmt_info) = loop_vect;
3275           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3276               || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3277             {
3278               /* vectorizable_reduction adjusts reduction stmt def-types,
3279                  restore them to that of the PHI.  */
3280               STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3281                 = STMT_VINFO_DEF_TYPE (stmt_info);
3282               STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3283                                         (STMT_VINFO_REDUC_DEF (stmt_info)))
3284                 = STMT_VINFO_DEF_TYPE (stmt_info);
3285             }
3286         }
3287       for (gimple_stmt_iterator si = gsi_start_bb (bb);
3288            !gsi_end_p (si); gsi_next (&si))
3289         {
3290           if (is_gimple_debug (gsi_stmt (si)))
3291             continue;
3292           stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3293           STMT_SLP_TYPE (stmt_info) = loop_vect;
3294           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3295             {
3296               stmt_vec_info pattern_stmt_info
3297                 = STMT_VINFO_RELATED_STMT (stmt_info);
3298               if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3299                 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3300
3301               gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3302               STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3303               for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3304                    !gsi_end_p (pi); gsi_next (&pi))
3305                 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3306                   = loop_vect;
3307             }
3308         }
3309     }
3310   /* Free optimized alias test DDRS.  */
3311   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3312   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3313   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3314   /* Reset target cost data.  */
3315   delete loop_vinfo->vector_costs;
3316   loop_vinfo->vector_costs = nullptr;
3317   /* Reset accumulated rgroup information.  */
3318   LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3319   release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3320   release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3321   /* Reset assorted flags.  */
3322   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3323   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3324   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3325   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3326   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3327     = saved_can_use_partial_vectors_p;
3328   LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3329
3330   goto start_over;
3331 }
3332
3333 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3334    to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
3335    OLD_LOOP_VINFO is better unless something specifically indicates
3336    otherwise.
3337
3338    Note that this deliberately isn't a partial order.  */
3339
3340 static bool
3341 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3342                           loop_vec_info old_loop_vinfo)
3343 {
3344   struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3345   gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3346
3347   poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3348   poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3349
3350   /* Always prefer a VF of loop->simdlen over any other VF.  */
3351   if (loop->simdlen)
3352     {
3353       bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3354       bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3355       if (new_simdlen_p != old_simdlen_p)
3356         return new_simdlen_p;
3357     }
3358
3359   const auto *old_costs = old_loop_vinfo->vector_costs;
3360   const auto *new_costs = new_loop_vinfo->vector_costs;
3361   if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3362     return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3363
3364   return new_costs->better_main_loop_than_p (old_costs);
3365 }
3366
3367 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
3368    true if we should.  */
3369
3370 static bool
3371 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3372                         loop_vec_info old_loop_vinfo)
3373 {
3374   if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3375     return false;
3376
3377   if (dump_enabled_p ())
3378     dump_printf_loc (MSG_NOTE, vect_location,
3379                      "***** Preferring vector mode %s to vector mode %s\n",
3380                      GET_MODE_NAME (new_loop_vinfo->vector_mode),
3381                      GET_MODE_NAME (old_loop_vinfo->vector_mode));
3382   return true;
3383 }
3384
3385 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
3386    not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
3387    MODE_I to the next mode useful to analyze.
3388    Return the loop_vinfo on success and wrapped null on failure.  */
3389
3390 static opt_loop_vec_info
3391 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3392                      const vect_loop_form_info *loop_form_info,
3393                      loop_vec_info main_loop_vinfo,
3394                      const vector_modes &vector_modes, unsigned &mode_i,
3395                      machine_mode &autodetected_vector_mode,
3396                      bool &fatal)
3397 {
3398   loop_vec_info loop_vinfo
3399     = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3400
3401   machine_mode vector_mode = vector_modes[mode_i];
3402   loop_vinfo->vector_mode = vector_mode;
3403   unsigned int suggested_unroll_factor = 1;
3404   bool slp_done_for_suggested_uf = false;
3405
3406   /* Run the main analysis.  */
3407   opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3408                                         &suggested_unroll_factor,
3409                                         slp_done_for_suggested_uf);
3410   if (dump_enabled_p ())
3411     dump_printf_loc (MSG_NOTE, vect_location,
3412                      "***** Analysis %s with vector mode %s\n",
3413                      res ? "succeeded" : "failed",
3414                      GET_MODE_NAME (loop_vinfo->vector_mode));
3415
3416   if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3417     {
3418       if (dump_enabled_p ())
3419         dump_printf_loc (MSG_NOTE, vect_location,
3420                          "***** Re-trying analysis for unrolling"
3421                          " with unroll factor %d and slp %s.\n",
3422                          suggested_unroll_factor,
3423                          slp_done_for_suggested_uf ? "on" : "off");
3424       loop_vec_info unroll_vinfo
3425         = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3426       unroll_vinfo->vector_mode = vector_mode;
3427       unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3428       opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3429                                                 slp_done_for_suggested_uf);
3430       if (new_res)
3431         {
3432           delete loop_vinfo;
3433           loop_vinfo = unroll_vinfo;
3434         }
3435       else
3436         delete unroll_vinfo;
3437     }
3438
3439   /* Remember the autodetected vector mode.  */
3440   if (vector_mode == VOIDmode)
3441     autodetected_vector_mode = loop_vinfo->vector_mode;
3442
3443   /* Advance mode_i, first skipping modes that would result in the
3444      same analysis result.  */
3445   while (mode_i + 1 < vector_modes.length ()
3446          && vect_chooses_same_modes_p (loop_vinfo,
3447                                        vector_modes[mode_i + 1]))
3448     {
3449       if (dump_enabled_p ())
3450         dump_printf_loc (MSG_NOTE, vect_location,
3451                          "***** The result for vector mode %s would"
3452                          " be the same\n",
3453                          GET_MODE_NAME (vector_modes[mode_i + 1]));
3454       mode_i += 1;
3455     }
3456   if (mode_i + 1 < vector_modes.length ()
3457       && VECTOR_MODE_P (autodetected_vector_mode)
3458       && (related_vector_mode (vector_modes[mode_i + 1],
3459                                GET_MODE_INNER (autodetected_vector_mode))
3460           == autodetected_vector_mode)
3461       && (related_vector_mode (autodetected_vector_mode,
3462                                GET_MODE_INNER (vector_modes[mode_i + 1]))
3463           == vector_modes[mode_i + 1]))
3464     {
3465       if (dump_enabled_p ())
3466         dump_printf_loc (MSG_NOTE, vect_location,
3467                          "***** Skipping vector mode %s, which would"
3468                          " repeat the analysis for %s\n",
3469                          GET_MODE_NAME (vector_modes[mode_i + 1]),
3470                          GET_MODE_NAME (autodetected_vector_mode));
3471       mode_i += 1;
3472     }
3473   mode_i++;
3474
3475   if (!res)
3476     {
3477       delete loop_vinfo;
3478       if (fatal)
3479         gcc_checking_assert (main_loop_vinfo == NULL);
3480       return opt_loop_vec_info::propagate_failure (res);
3481     }
3482
3483   return opt_loop_vec_info::success (loop_vinfo);
3484 }
3485
3486 /* Function vect_analyze_loop.
3487
3488    Apply a set of analyses on LOOP, and create a loop_vec_info struct
3489    for it.  The different analyses will record information in the
3490    loop_vec_info struct.  */
3491 opt_loop_vec_info
3492 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3493 {
3494   DUMP_VECT_SCOPE ("analyze_loop_nest");
3495
3496   if (loop_outer (loop)
3497       && loop_vec_info_for_loop (loop_outer (loop))
3498       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3499     return opt_loop_vec_info::failure_at (vect_location,
3500                                           "outer-loop already vectorized.\n");
3501
3502   if (!find_loop_nest (loop, &shared->loop_nest))
3503     return opt_loop_vec_info::failure_at
3504       (vect_location,
3505        "not vectorized: loop nest containing two or more consecutive inner"
3506        " loops cannot be vectorized\n");
3507
3508   /* Analyze the loop form.  */
3509   vect_loop_form_info loop_form_info;
3510   opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3511   if (!res)
3512     {
3513       if (dump_enabled_p ())
3514         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3515                          "bad loop form.\n");
3516       return opt_loop_vec_info::propagate_failure (res);
3517     }
3518   if (!integer_onep (loop_form_info.assumptions))
3519     {
3520       /* We consider to vectorize this loop by versioning it under
3521          some assumptions.  In order to do this, we need to clear
3522          existing information computed by scev and niter analyzer.  */
3523       scev_reset_htab ();
3524       free_numbers_of_iterations_estimates (loop);
3525       /* Also set flag for this loop so that following scev and niter
3526          analysis are done under the assumptions.  */
3527       loop_constraint_set (loop, LOOP_C_FINITE);
3528     }
3529   else
3530     /* Clear the existing niter information to make sure the nonwrapping flag
3531        will be calculated and set propriately.  */
3532     free_numbers_of_iterations_estimates (loop);
3533
3534   auto_vector_modes vector_modes;
3535   /* Autodetect first vector size we try.  */
3536   vector_modes.safe_push (VOIDmode);
3537   unsigned int autovec_flags
3538     = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3539                                                     loop->simdlen != 0);
3540   bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3541                              && !unlimited_cost_model (loop));
3542   machine_mode autodetected_vector_mode = VOIDmode;
3543   opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3544   unsigned int mode_i = 0;
3545   unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3546
3547   /* Keep track of the VF for each mode.  Initialize all to 0 which indicates
3548      a mode has not been analyzed.  */
3549   auto_vec<poly_uint64, 8> cached_vf_per_mode;
3550   for (unsigned i = 0; i < vector_modes.length (); ++i)
3551     cached_vf_per_mode.safe_push (0);
3552
3553   /* First determine the main loop vectorization mode, either the first
3554      one that works, starting with auto-detecting the vector mode and then
3555      following the targets order of preference, or the one with the
3556      lowest cost if pick_lowest_cost_p.  */
3557   while (1)
3558     {
3559       bool fatal;
3560       unsigned int last_mode_i = mode_i;
3561       /* Set cached VF to -1 prior to analysis, which indicates a mode has
3562          failed.  */
3563       cached_vf_per_mode[last_mode_i] = -1;
3564       opt_loop_vec_info loop_vinfo
3565         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3566                                NULL, vector_modes, mode_i,
3567                                autodetected_vector_mode, fatal);
3568       if (fatal)
3569         break;
3570
3571       if (loop_vinfo)
3572         {
3573           /*  Analyzis has been successful so update the VF value.  The
3574               VF should always be a multiple of unroll_factor and we want to
3575               capture the original VF here.  */
3576           cached_vf_per_mode[last_mode_i]
3577             = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3578                          loop_vinfo->suggested_unroll_factor);
3579           /* Once we hit the desired simdlen for the first time,
3580              discard any previous attempts.  */
3581           if (simdlen
3582               && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3583             {
3584               delete first_loop_vinfo;
3585               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3586               simdlen = 0;
3587             }
3588           else if (pick_lowest_cost_p
3589                    && first_loop_vinfo
3590                    && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3591             {
3592               /* Pick loop_vinfo over first_loop_vinfo.  */
3593               delete first_loop_vinfo;
3594               first_loop_vinfo = opt_loop_vec_info::success (NULL);
3595             }
3596           if (first_loop_vinfo == NULL)
3597             first_loop_vinfo = loop_vinfo;
3598           else
3599             {
3600               delete loop_vinfo;
3601               loop_vinfo = opt_loop_vec_info::success (NULL);
3602             }
3603
3604           /* Commit to first_loop_vinfo if we have no reason to try
3605              alternatives.  */
3606           if (!simdlen && !pick_lowest_cost_p)
3607             break;
3608         }
3609       if (mode_i == vector_modes.length ()
3610           || autodetected_vector_mode == VOIDmode)
3611         break;
3612
3613       /* Try the next biggest vector size.  */
3614       if (dump_enabled_p ())
3615         dump_printf_loc (MSG_NOTE, vect_location,
3616                          "***** Re-trying analysis with vector mode %s\n",
3617                          GET_MODE_NAME (vector_modes[mode_i]));
3618     }
3619   if (!first_loop_vinfo)
3620     return opt_loop_vec_info::propagate_failure (res);
3621
3622   if (dump_enabled_p ())
3623     dump_printf_loc (MSG_NOTE, vect_location,
3624                      "***** Choosing vector mode %s\n",
3625                      GET_MODE_NAME (first_loop_vinfo->vector_mode));
3626
3627   /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3628      enabled, SIMDUID is not set, it is the innermost loop and we have
3629      either already found the loop's SIMDLEN or there was no SIMDLEN to
3630      begin with.
3631      TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3632   bool vect_epilogues = (!simdlen
3633                          && loop->inner == NULL
3634                          && param_vect_epilogues_nomask
3635                          && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3636                            /* No code motion support for multiple epilogues so for now
3637                               not supported when multiple exits.  */
3638                          && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3639                          && !loop->simduid);
3640   if (!vect_epilogues)
3641     return first_loop_vinfo;
3642
3643   /* Now analyze first_loop_vinfo for epilogue vectorization.  */
3644   poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3645
3646   /* For epilogues start the analysis from the first mode.  The motivation
3647      behind starting from the beginning comes from cases where the VECTOR_MODES
3648      array may contain length-agnostic and length-specific modes.  Their
3649      ordering is not guaranteed, so we could end up picking a mode for the main
3650      loop that is after the epilogue's optimal mode.  */
3651   vector_modes[0] = autodetected_vector_mode;
3652   mode_i = 0;
3653
3654   bool supports_partial_vectors =
3655     partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3656   poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3657
3658   while (1)
3659     {
3660       /* If the target does not support partial vectors we can shorten the
3661          number of modes to analyze for the epilogue as we know we can't pick a
3662          mode that would lead to a VF at least as big as the
3663          FIRST_VINFO_VF.  */
3664       if (!supports_partial_vectors
3665           && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3666         {
3667           mode_i++;
3668           if (mode_i == vector_modes.length ())
3669             break;
3670           continue;
3671         }
3672
3673       if (dump_enabled_p ())
3674         dump_printf_loc (MSG_NOTE, vect_location,
3675                          "***** Re-trying epilogue analysis with vector "
3676                          "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3677
3678       bool fatal;
3679       opt_loop_vec_info loop_vinfo
3680         = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3681                                first_loop_vinfo,
3682                                vector_modes, mode_i,
3683                                autodetected_vector_mode, fatal);
3684       if (fatal)
3685         break;
3686
3687       if (loop_vinfo)
3688         {
3689           if (pick_lowest_cost_p)
3690             {
3691               /* Keep trying to roll back vectorization attempts while the
3692                  loop_vec_infos they produced were worse than this one.  */
3693               vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3694               while (!vinfos.is_empty ()
3695                      && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3696                 {
3697                   gcc_assert (vect_epilogues);
3698                   delete vinfos.pop ();
3699                 }
3700             }
3701           /* For now only allow one epilogue loop.  */
3702           if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3703             {
3704               first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3705               poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3706               gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3707                           || maybe_ne (lowest_th, 0U));
3708               /* Keep track of the known smallest versioning
3709                  threshold.  */
3710               if (ordered_p (lowest_th, th))
3711                 lowest_th = ordered_min (lowest_th, th);
3712             }
3713           else
3714             {
3715               delete loop_vinfo;
3716               loop_vinfo = opt_loop_vec_info::success (NULL);
3717             }
3718
3719           /* For now only allow one epilogue loop, but allow
3720              pick_lowest_cost_p to replace it, so commit to the
3721              first epilogue if we have no reason to try alternatives.  */
3722           if (!pick_lowest_cost_p)
3723             break;
3724         }
3725
3726       if (mode_i == vector_modes.length ())
3727         break;
3728
3729     }
3730
3731   if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3732     {
3733       LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3734       if (dump_enabled_p ())
3735         dump_printf_loc (MSG_NOTE, vect_location,
3736                          "***** Choosing epilogue vector mode %s\n",
3737                          GET_MODE_NAME
3738                            (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3739     }
3740
3741   return first_loop_vinfo;
3742 }
3743
3744 /* Return true if there is an in-order reduction function for CODE, storing
3745    it in *REDUC_FN if so.  */
3746
3747 static bool
3748 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3749 {
3750   /* We support MINUS_EXPR by negating the operand.  This also preserves an
3751      initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3752      (-0.0) = -0.0.  */
3753   if (code == PLUS_EXPR || code == MINUS_EXPR)
3754     {
3755       *reduc_fn = IFN_FOLD_LEFT_PLUS;
3756       return true;
3757     }
3758   return false;
3759 }
3760
3761 /* Function reduction_fn_for_scalar_code
3762
3763    Input:
3764    CODE - tree_code of a reduction operations.
3765
3766    Output:
3767    REDUC_FN - the corresponding internal function to be used to reduce the
3768       vector of partial results into a single scalar result, or IFN_LAST
3769       if the operation is a supported reduction operation, but does not have
3770       such an internal function.
3771
3772    Return FALSE if CODE currently cannot be vectorized as reduction.  */
3773
3774 bool
3775 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3776 {
3777   if (code.is_tree_code ())
3778     switch (tree_code (code))
3779       {
3780       case MAX_EXPR:
3781         *reduc_fn = IFN_REDUC_MAX;
3782         return true;
3783
3784       case MIN_EXPR:
3785         *reduc_fn = IFN_REDUC_MIN;
3786         return true;
3787
3788       case PLUS_EXPR:
3789         *reduc_fn = IFN_REDUC_PLUS;
3790         return true;
3791
3792       case BIT_AND_EXPR:
3793         *reduc_fn = IFN_REDUC_AND;
3794         return true;
3795
3796       case BIT_IOR_EXPR:
3797         *reduc_fn = IFN_REDUC_IOR;
3798         return true;
3799
3800       case BIT_XOR_EXPR:
3801         *reduc_fn = IFN_REDUC_XOR;
3802         return true;
3803
3804       case MULT_EXPR:
3805       case MINUS_EXPR:
3806         *reduc_fn = IFN_LAST;
3807         return true;
3808
3809       default:
3810         return false;
3811       }
3812   else
3813     switch (combined_fn (code))
3814       {
3815       CASE_CFN_FMAX:
3816         *reduc_fn = IFN_REDUC_FMAX;
3817         return true;
3818
3819       CASE_CFN_FMIN:
3820         *reduc_fn = IFN_REDUC_FMIN;
3821         return true;
3822
3823       default:
3824         return false;
3825       }
3826 }
3827
3828 /* If there is a neutral value X such that a reduction would not be affected
3829    by the introduction of additional X elements, return that X, otherwise
3830    return null.  CODE is the code of the reduction and SCALAR_TYPE is type
3831    of the scalar elements.  If the reduction has just a single initial value
3832    then INITIAL_VALUE is that value, otherwise it is null.
3833    If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3834    In that case no signed zero is returned.  */
3835
3836 tree
3837 neutral_op_for_reduction (tree scalar_type, code_helper code,
3838                           tree initial_value, bool as_initial)
3839 {
3840   if (code.is_tree_code ())
3841     switch (tree_code (code))
3842       {
3843       case DOT_PROD_EXPR:
3844       case SAD_EXPR:
3845       case MINUS_EXPR:
3846       case BIT_IOR_EXPR:
3847       case BIT_XOR_EXPR:
3848         return build_zero_cst (scalar_type);
3849       case WIDEN_SUM_EXPR:
3850       case PLUS_EXPR:
3851         if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3852           return build_real (scalar_type, dconstm0);
3853         else
3854           return build_zero_cst (scalar_type);
3855
3856       case MULT_EXPR:
3857         return build_one_cst (scalar_type);
3858
3859       case BIT_AND_EXPR:
3860         return build_all_ones_cst (scalar_type);
3861
3862       case MAX_EXPR:
3863       case MIN_EXPR:
3864         return initial_value;
3865
3866       default:
3867         return NULL_TREE;
3868       }
3869   else
3870     switch (combined_fn (code))
3871       {
3872       CASE_CFN_FMIN:
3873       CASE_CFN_FMAX:
3874         return initial_value;
3875
3876       default:
3877         return NULL_TREE;
3878       }
3879 }
3880
3881 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
3882    STMT is printed with a message MSG. */
3883
3884 static void
3885 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3886 {
3887   dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3888 }
3889
3890 /* Return true if we need an in-order reduction for operation CODE
3891    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3892    overflow must wrap.  */
3893
3894 bool
3895 needs_fold_left_reduction_p (tree type, code_helper code)
3896 {
3897   /* CHECKME: check for !flag_finite_math_only too?  */
3898   if (SCALAR_FLOAT_TYPE_P (type))
3899     {
3900       if (code.is_tree_code ())
3901         switch (tree_code (code))
3902           {
3903           case MIN_EXPR:
3904           case MAX_EXPR:
3905             return false;
3906
3907           default:
3908             return !flag_associative_math;
3909           }
3910       else
3911         switch (combined_fn (code))
3912           {
3913           CASE_CFN_FMIN:
3914           CASE_CFN_FMAX:
3915             return false;
3916
3917           default:
3918             return !flag_associative_math;
3919           }
3920     }
3921
3922   if (INTEGRAL_TYPE_P (type))
3923     return (!code.is_tree_code ()
3924             || !operation_no_trapping_overflow (type, tree_code (code)));
3925
3926   if (SAT_FIXED_POINT_TYPE_P (type))
3927     return true;
3928
3929   return false;
3930 }
3931
3932 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3933    has a handled computation expression.  Store the main reduction
3934    operation in *CODE.  */
3935
3936 static bool
3937 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3938                       tree loop_arg, code_helper *code,
3939                       vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3940 {
3941   auto_bitmap visited;
3942   tree lookfor = PHI_RESULT (phi);
3943   ssa_op_iter curri;
3944   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3945   while (USE_FROM_PTR (curr) != loop_arg)
3946     curr = op_iter_next_use (&curri);
3947   curri.i = curri.numops;
3948   do
3949     {
3950       path.safe_push (std::make_pair (curri, curr));
3951       tree use = USE_FROM_PTR (curr);
3952       if (use == lookfor)
3953         break;
3954       gimple *def = SSA_NAME_DEF_STMT (use);
3955       if (gimple_nop_p (def)
3956           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3957         {
3958 pop:
3959           do
3960             {
3961               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3962               curri = x.first;
3963               curr = x.second;
3964               do
3965                 curr = op_iter_next_use (&curri);
3966               /* Skip already visited or non-SSA operands (from iterating
3967                  over PHI args).  */
3968               while (curr != NULL_USE_OPERAND_P
3969                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3970                          || ! bitmap_set_bit (visited,
3971                                               SSA_NAME_VERSION
3972                                                 (USE_FROM_PTR (curr)))));
3973             }
3974           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3975           if (curr == NULL_USE_OPERAND_P)
3976             break;
3977         }
3978       else
3979         {
3980           if (gimple_code (def) == GIMPLE_PHI)
3981             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3982           else
3983             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3984           while (curr != NULL_USE_OPERAND_P
3985                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3986                      || ! bitmap_set_bit (visited,
3987                                           SSA_NAME_VERSION
3988                                             (USE_FROM_PTR (curr)))))
3989             curr = op_iter_next_use (&curri);
3990           if (curr == NULL_USE_OPERAND_P)
3991             goto pop;
3992         }
3993     }
3994   while (1);
3995   if (dump_file && (dump_flags & TDF_DETAILS))
3996     {
3997       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3998       unsigned i;
3999       std::pair<ssa_op_iter, use_operand_p> *x;
4000       FOR_EACH_VEC_ELT (path, i, x)
4001         dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4002       dump_printf (MSG_NOTE, "\n");
4003     }
4004
4005   /* Check whether the reduction path detected is valid.  */
4006   bool fail = path.length () == 0;
4007   bool neg = false;
4008   int sign = -1;
4009   *code = ERROR_MARK;
4010   for (unsigned i = 1; i < path.length (); ++i)
4011     {
4012       gimple *use_stmt = USE_STMT (path[i].second);
4013       gimple_match_op op;
4014       if (!gimple_extract_op (use_stmt, &op))
4015         {
4016           fail = true;
4017           break;
4018         }
4019       unsigned int opi = op.num_ops;
4020       if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4021         {
4022           /* The following make sure we can compute the operand index
4023              easily plus it mostly disallows chaining via COND_EXPR condition
4024              operands.  */
4025           for (opi = 0; opi < op.num_ops; ++opi)
4026             if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4027               break;
4028         }
4029       else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4030         {
4031           for (opi = 0; opi < op.num_ops; ++opi)
4032             if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4033               break;
4034         }
4035       if (opi == op.num_ops)
4036         {
4037           fail = true;
4038           break;
4039         }
4040       op.code = canonicalize_code (op.code, op.type);
4041       if (op.code == MINUS_EXPR)
4042         {
4043           op.code = PLUS_EXPR;
4044           /* Track whether we negate the reduction value each iteration.  */
4045           if (op.ops[1] == op.ops[opi])
4046             neg = ! neg;
4047         }
4048       else if (op.code == IFN_COND_SUB)
4049         {
4050           op.code = IFN_COND_ADD;
4051           /* Track whether we negate the reduction value each iteration.  */
4052           if (op.ops[2] == op.ops[opi])
4053             neg = ! neg;
4054         }
4055       if (CONVERT_EXPR_CODE_P (op.code)
4056           && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4057         ;
4058       else if (*code == ERROR_MARK)
4059         {
4060           *code = op.code;
4061           sign = TYPE_SIGN (op.type);
4062         }
4063       else if (op.code != *code)
4064         {
4065           fail = true;
4066           break;
4067         }
4068       else if ((op.code == MIN_EXPR
4069                 || op.code == MAX_EXPR)
4070                && sign != TYPE_SIGN (op.type))
4071         {
4072           fail = true;
4073           break;
4074         }
4075       /* Check there's only a single stmt the op is used on.  For the
4076          not value-changing tail and the last stmt allow out-of-loop uses.
4077          ???  We could relax this and handle arbitrary live stmts by
4078          forcing a scalar epilogue for example.  */
4079       imm_use_iterator imm_iter;
4080       use_operand_p use_p;
4081       gimple *op_use_stmt;
4082       unsigned cnt = 0;
4083       bool cond_fn_p = op.code.is_internal_fn ()
4084         && (conditional_internal_fn_code (internal_fn (op.code))
4085             != ERROR_MARK);
4086
4087       FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4088         {
4089           /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4090              have op1 twice (once as definition, once as else) in the same
4091              operation.  Enforce this.  */
4092           if (cond_fn_p && op_use_stmt == use_stmt)
4093             {
4094               gcall *call = as_a<gcall *> (use_stmt);
4095               unsigned else_pos
4096                 = internal_fn_else_index (internal_fn (op.code));
4097               if (gimple_call_arg (call, else_pos) != op.ops[opi])
4098                 {
4099                   fail = true;
4100                   break;
4101                 }
4102               for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4103                 {
4104                   if (j == else_pos)
4105                     continue;
4106                   if (gimple_call_arg (call, j) == op.ops[opi])
4107                     cnt++;
4108                 }
4109             }
4110           else if (!is_gimple_debug (op_use_stmt)
4111                    && (*code != ERROR_MARK
4112                        || flow_bb_inside_loop_p (loop,
4113                                                  gimple_bb (op_use_stmt))))
4114             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4115               cnt++;
4116         }
4117
4118       if (cnt != 1)
4119         {
4120           fail = true;
4121           break;
4122         }
4123     }
4124   return ! fail && ! neg && *code != ERROR_MARK;
4125 }
4126
4127 bool
4128 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4129                       tree loop_arg, enum tree_code code)
4130 {
4131   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4132   code_helper code_;
4133   return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4134           && code_ == code);
4135 }
4136
4137
4138
4139 /* Function vect_is_simple_reduction
4140
4141    (1) Detect a cross-iteration def-use cycle that represents a simple
4142    reduction computation.  We look for the following pattern:
4143
4144    loop_header:
4145      a1 = phi < a0, a2 >
4146      a3 = ...
4147      a2 = operation (a3, a1)
4148
4149    or
4150
4151    a3 = ...
4152    loop_header:
4153      a1 = phi < a0, a2 >
4154      a2 = operation (a3, a1)
4155
4156    such that:
4157    1. operation is commutative and associative and it is safe to
4158       change the order of the computation
4159    2. no uses for a2 in the loop (a2 is used out of the loop)
4160    3. no uses of a1 in the loop besides the reduction operation
4161    4. no uses of a1 outside the loop.
4162
4163    Conditions 1,4 are tested here.
4164    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4165
4166    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4167    nested cycles.
4168
4169    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4170    reductions:
4171
4172      a1 = phi < a0, a2 >
4173      inner loop (def of a3)
4174      a2 = phi < a3 >
4175
4176    (4) Detect condition expressions, ie:
4177      for (int i = 0; i < N; i++)
4178        if (a[i] < val)
4179         ret_val = a[i];
4180
4181 */
4182
4183 static stmt_vec_info
4184 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4185                           bool *double_reduc, bool *reduc_chain_p, bool slp)
4186 {
4187   gphi *phi = as_a <gphi *> (phi_info->stmt);
4188   gimple *phi_use_stmt = NULL;
4189   imm_use_iterator imm_iter;
4190   use_operand_p use_p;
4191
4192   *double_reduc = false;
4193   *reduc_chain_p = false;
4194   STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4195
4196   tree phi_name = PHI_RESULT (phi);
4197   /* ???  If there are no uses of the PHI result the inner loop reduction
4198      won't be detected as possibly double-reduction by vectorizable_reduction
4199      because that tries to walk the PHI arg from the preheader edge which
4200      can be constant.  See PR60382.  */
4201   if (has_zero_uses (phi_name))
4202     return NULL;
4203   class loop *loop = (gimple_bb (phi))->loop_father;
4204   unsigned nphi_def_loop_uses = 0;
4205   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4206     {
4207       gimple *use_stmt = USE_STMT (use_p);
4208       if (is_gimple_debug (use_stmt))
4209         continue;
4210
4211       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4212         {
4213           if (dump_enabled_p ())
4214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4215                              "intermediate value used outside loop.\n");
4216
4217           return NULL;
4218         }
4219
4220       /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4221          op1 twice (once as definition, once as else) in the same operation.
4222          Only count it as one. */
4223       if (use_stmt != phi_use_stmt)
4224         {
4225           nphi_def_loop_uses++;
4226           phi_use_stmt = use_stmt;
4227         }
4228     }
4229
4230   tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4231   if (TREE_CODE (latch_def) != SSA_NAME)
4232     {
4233       if (dump_enabled_p ())
4234         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4235                          "reduction: not ssa_name: %T\n", latch_def);
4236       return NULL;
4237     }
4238
4239   stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4240   if (!def_stmt_info
4241       || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4242     return NULL;
4243
4244   bool nested_in_vect_loop
4245     = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4246   unsigned nlatch_def_loop_uses = 0;
4247   auto_vec<gphi *, 3> lcphis;
4248   bool inner_loop_of_double_reduc = false;
4249   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4250     {
4251       gimple *use_stmt = USE_STMT (use_p);
4252       if (is_gimple_debug (use_stmt))
4253         continue;
4254       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4255         nlatch_def_loop_uses++;
4256       else
4257         {
4258           /* We can have more than one loop-closed PHI.  */
4259           lcphis.safe_push (as_a <gphi *> (use_stmt));
4260           if (nested_in_vect_loop
4261               && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4262                   == vect_double_reduction_def))
4263             inner_loop_of_double_reduc = true;
4264         }
4265     }
4266
4267   /* If we are vectorizing an inner reduction we are executing that
4268      in the original order only in case we are not dealing with a
4269      double reduction.  */
4270   if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4271     {
4272       if (dump_enabled_p ())
4273         report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4274                         "detected nested cycle: ");
4275       return def_stmt_info;
4276     }
4277
4278   /* When the inner loop of a double reduction ends up with more than
4279      one loop-closed PHI we have failed to classify alternate such
4280      PHIs as double reduction, leading to wrong code.  See PR103237.  */
4281   if (inner_loop_of_double_reduc && lcphis.length () != 1)
4282     {
4283       if (dump_enabled_p ())
4284         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4285                          "unhandle double reduction\n");
4286       return NULL;
4287     }
4288
4289   /* If this isn't a nested cycle or if the nested cycle reduction value
4290      is used ouside of the inner loop we cannot handle uses of the reduction
4291      value.  */
4292   if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4293     {
4294       if (dump_enabled_p ())
4295         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4296                          "reduction used in loop.\n");
4297       return NULL;
4298     }
4299
4300   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4301      defined in the inner loop.  */
4302   if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4303     {
4304       tree op1 = PHI_ARG_DEF (def_stmt, 0);
4305       if (gimple_phi_num_args (def_stmt) != 1
4306           || TREE_CODE (op1) != SSA_NAME)
4307         {
4308           if (dump_enabled_p ())
4309             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4310                              "unsupported phi node definition.\n");
4311
4312           return NULL;
4313         }
4314
4315       /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4316          and the latch definition op1.  */
4317       gimple *def1 = SSA_NAME_DEF_STMT (op1);
4318       if (gimple_bb (def1)
4319           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4320           && loop->inner
4321           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4322           && (is_gimple_assign (def1) || is_gimple_call (def1))
4323           && is_a <gphi *> (phi_use_stmt)
4324           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4325           && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4326                                             loop_latch_edge (loop->inner)))
4327           && lcphis.length () == 1)
4328         {
4329           if (dump_enabled_p ())
4330             report_vect_op (MSG_NOTE, def_stmt,
4331                             "detected double reduction: ");
4332
4333           *double_reduc = true;
4334           return def_stmt_info;
4335         }
4336
4337       return NULL;
4338     }
4339
4340   /* Look for the expression computing latch_def from then loop PHI result.  */
4341   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4342   code_helper code;
4343   if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4344                             path))
4345     {
4346       STMT_VINFO_REDUC_CODE (phi_info) = code;
4347       if (code == COND_EXPR && !nested_in_vect_loop)
4348         STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4349
4350       /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4351          reduction chain for which the additional restriction is that
4352          all operations in the chain are the same.  */
4353       auto_vec<stmt_vec_info, 8> reduc_chain;
4354       unsigned i;
4355       bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4356       for (i = path.length () - 1; i >= 1; --i)
4357         {
4358           gimple *stmt = USE_STMT (path[i].second);
4359           stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4360           gimple_match_op op;
4361           if (!gimple_extract_op (stmt, &op))
4362             gcc_unreachable ();
4363           if (gassign *assign = dyn_cast<gassign *> (stmt))
4364             STMT_VINFO_REDUC_IDX (stmt_info)
4365               = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4366           else
4367             {
4368               gcall *call = as_a<gcall *> (stmt);
4369               STMT_VINFO_REDUC_IDX (stmt_info)
4370                 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4371             }
4372           bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4373                                      && (i == 1 || i == path.length () - 1));
4374           if ((op.code != code && !leading_conversion)
4375               /* We can only handle the final value in epilogue
4376                  generation for reduction chains.  */
4377               || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4378             is_slp_reduc = false;
4379           /* For reduction chains we support a trailing/leading
4380              conversions.  We do not store those in the actual chain.  */
4381           if (leading_conversion)
4382             continue;
4383           reduc_chain.safe_push (stmt_info);
4384         }
4385       if (slp && is_slp_reduc && reduc_chain.length () > 1)
4386         {
4387           for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4388             {
4389               REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4390               REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4391             }
4392           REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4393           REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4394
4395           /* Save the chain for further analysis in SLP detection.  */
4396           LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4397           REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4398
4399           *reduc_chain_p = true;
4400           if (dump_enabled_p ())
4401             dump_printf_loc (MSG_NOTE, vect_location,
4402                             "reduction: detected reduction chain\n");
4403         }
4404       else if (dump_enabled_p ())
4405         dump_printf_loc (MSG_NOTE, vect_location,
4406                          "reduction: detected reduction\n");
4407
4408       return def_stmt_info;
4409     }
4410
4411   if (dump_enabled_p ())
4412     dump_printf_loc (MSG_NOTE, vect_location,
4413                      "reduction: unknown pattern\n");
4414
4415   return NULL;
4416 }
4417
4418 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4419    PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4420    or -1 if not known.  */
4421
4422 static int
4423 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4424 {
4425   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4426   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4427     {
4428       if (dump_enabled_p ())
4429         dump_printf_loc (MSG_NOTE, vect_location,
4430                          "cost model: epilogue peel iters set to vf/2 "
4431                          "because loop iterations are unknown .\n");
4432       return assumed_vf / 2;
4433     }
4434   else
4435     {
4436       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4437       peel_iters_prologue = MIN (niters, peel_iters_prologue);
4438       int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4439       /* If we need to peel for gaps, but no peeling is required, we have to
4440          peel VF iterations.  */
4441       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4442         peel_iters_epilogue = assumed_vf;
4443       return peel_iters_epilogue;
4444     }
4445 }
4446
4447 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
4448 int
4449 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4450                              int *peel_iters_epilogue,
4451                              stmt_vector_for_cost *scalar_cost_vec,
4452                              stmt_vector_for_cost *prologue_cost_vec,
4453                              stmt_vector_for_cost *epilogue_cost_vec)
4454 {
4455   int retval = 0;
4456
4457   *peel_iters_epilogue
4458     = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4459
4460   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4461     {
4462       /* If peeled iterations are known but number of scalar loop
4463          iterations are unknown, count a taken branch per peeled loop.  */
4464       if (peel_iters_prologue > 0)
4465         retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4466                                    vect_prologue);
4467       if (*peel_iters_epilogue > 0)
4468         retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4469                                     vect_epilogue);
4470     }
4471
4472   stmt_info_for_cost *si;
4473   int j;
4474   if (peel_iters_prologue)
4475     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4476       retval += record_stmt_cost (prologue_cost_vec,
4477                                   si->count * peel_iters_prologue,
4478                                   si->kind, si->stmt_info, si->misalign,
4479                                   vect_prologue);
4480   if (*peel_iters_epilogue)
4481     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4482       retval += record_stmt_cost (epilogue_cost_vec,
4483                                   si->count * *peel_iters_epilogue,
4484                                   si->kind, si->stmt_info, si->misalign,
4485                                   vect_epilogue);
4486
4487   return retval;
4488 }
4489
4490 /* Function vect_estimate_min_profitable_iters
4491
4492    Return the number of iterations required for the vector version of the
4493    loop to be profitable relative to the cost of the scalar version of the
4494    loop.
4495
4496    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4497    of iterations for vectorization.  -1 value means loop vectorization
4498    is not profitable.  This returned value may be used for dynamic
4499    profitability check.
4500
4501    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4502    for static check against estimated number of iterations.  */
4503
4504 static void
4505 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4506                                     int *ret_min_profitable_niters,
4507                                     int *ret_min_profitable_estimate,
4508                                     unsigned *suggested_unroll_factor)
4509 {
4510   int min_profitable_iters;
4511   int min_profitable_estimate;
4512   int peel_iters_prologue;
4513   int peel_iters_epilogue;
4514   unsigned vec_inside_cost = 0;
4515   int vec_outside_cost = 0;
4516   unsigned vec_prologue_cost = 0;
4517   unsigned vec_epilogue_cost = 0;
4518   int scalar_single_iter_cost = 0;
4519   int scalar_outside_cost = 0;
4520   int assumed_vf = vect_vf_for_cost (loop_vinfo);
4521   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4522   vector_costs *target_cost_data = loop_vinfo->vector_costs;
4523
4524   /* Cost model disabled.  */
4525   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4526     {
4527       if (dump_enabled_p ())
4528         dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4529       *ret_min_profitable_niters = 0;
4530       *ret_min_profitable_estimate = 0;
4531       return;
4532     }
4533
4534   /* Requires loop versioning tests to handle misalignment.  */
4535   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4536     {
4537       /*  FIXME: Make cost depend on complexity of individual check.  */
4538       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4539       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4540       if (dump_enabled_p ())
4541         dump_printf (MSG_NOTE,
4542                      "cost model: Adding cost of checks for loop "
4543                      "versioning to treat misalignment.\n");
4544     }
4545
4546   /* Requires loop versioning with alias checks.  */
4547   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4548     {
4549       /*  FIXME: Make cost depend on complexity of individual check.  */
4550       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4551       (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4552       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4553       if (len)
4554         /* Count LEN - 1 ANDs and LEN comparisons.  */
4555         (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4556                               scalar_stmt, vect_prologue);
4557       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4558       if (len)
4559         {
4560           /* Count LEN - 1 ANDs and LEN comparisons.  */
4561           unsigned int nstmts = len * 2 - 1;
4562           /* +1 for each bias that needs adding.  */
4563           for (unsigned int i = 0; i < len; ++i)
4564             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4565               nstmts += 1;
4566           (void) add_stmt_cost (target_cost_data, nstmts,
4567                                 scalar_stmt, vect_prologue);
4568         }
4569       if (dump_enabled_p ())
4570         dump_printf (MSG_NOTE,
4571                      "cost model: Adding cost of checks for loop "
4572                      "versioning aliasing.\n");
4573     }
4574
4575   /* Requires loop versioning with niter checks.  */
4576   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4577     {
4578       /*  FIXME: Make cost depend on complexity of individual check.  */
4579       (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4580                             NULL, NULL, NULL_TREE, 0, vect_prologue);
4581       if (dump_enabled_p ())
4582         dump_printf (MSG_NOTE,
4583                      "cost model: Adding cost of checks for loop "
4584                      "versioning niters.\n");
4585     }
4586
4587   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4588     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4589                           vect_prologue);
4590
4591   /* Count statements in scalar loop.  Using this as scalar cost for a single
4592      iteration for now.
4593
4594      TODO: Add outer loop support.
4595
4596      TODO: Consider assigning different costs to different scalar
4597      statements.  */
4598
4599   scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4600
4601   /* Add additional cost for the peeled instructions in prologue and epilogue
4602      loop.  (For fully-masked loops there will be no peeling.)
4603
4604      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4605      at compile-time - we assume it's vf/2 (the worst would be vf-1).
4606
4607      TODO: Build an expression that represents peel_iters for prologue and
4608      epilogue to be used in a run-time test.  */
4609
4610   bool prologue_need_br_taken_cost = false;
4611   bool prologue_need_br_not_taken_cost = false;
4612
4613   /* Calculate peel_iters_prologue.  */
4614   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4615     peel_iters_prologue = 0;
4616   else if (npeel < 0)
4617     {
4618       peel_iters_prologue = assumed_vf / 2;
4619       if (dump_enabled_p ())
4620         dump_printf (MSG_NOTE, "cost model: "
4621                      "prologue peel iters set to vf/2.\n");
4622
4623       /* If peeled iterations are unknown, count a taken branch and a not taken
4624          branch per peeled loop.  Even if scalar loop iterations are known,
4625          vector iterations are not known since peeled prologue iterations are
4626          not known.  Hence guards remain the same.  */
4627       prologue_need_br_taken_cost = true;
4628       prologue_need_br_not_taken_cost = true;
4629     }
4630   else
4631     {
4632       peel_iters_prologue = npeel;
4633       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4634         /* If peeled iterations are known but number of scalar loop
4635            iterations are unknown, count a taken branch per peeled loop.  */
4636         prologue_need_br_taken_cost = true;
4637     }
4638
4639   bool epilogue_need_br_taken_cost = false;
4640   bool epilogue_need_br_not_taken_cost = false;
4641
4642   /* Calculate peel_iters_epilogue.  */
4643   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4644     /* We need to peel exactly one iteration for gaps.  */
4645     peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4646   else if (npeel < 0)
4647     {
4648       /* If peeling for alignment is unknown, loop bound of main loop
4649          becomes unknown.  */
4650       peel_iters_epilogue = assumed_vf / 2;
4651       if (dump_enabled_p ())
4652         dump_printf (MSG_NOTE, "cost model: "
4653                      "epilogue peel iters set to vf/2 because "
4654                      "peeling for alignment is unknown.\n");
4655
4656       /* See the same reason above in peel_iters_prologue calculation.  */
4657       epilogue_need_br_taken_cost = true;
4658       epilogue_need_br_not_taken_cost = true;
4659     }
4660   else
4661     {
4662       peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4663       if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4664         /* If peeled iterations are known but number of scalar loop
4665            iterations are unknown, count a taken branch per peeled loop.  */
4666         epilogue_need_br_taken_cost = true;
4667     }
4668
4669   stmt_info_for_cost *si;
4670   int j;
4671   /* Add costs associated with peel_iters_prologue.  */
4672   if (peel_iters_prologue)
4673     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4674       {
4675         (void) add_stmt_cost (target_cost_data,
4676                               si->count * peel_iters_prologue, si->kind,
4677                               si->stmt_info, si->node, si->vectype,
4678                               si->misalign, vect_prologue);
4679       }
4680
4681   /* Add costs associated with peel_iters_epilogue.  */
4682   if (peel_iters_epilogue)
4683     FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4684       {
4685         (void) add_stmt_cost (target_cost_data,
4686                               si->count * peel_iters_epilogue, si->kind,
4687                               si->stmt_info, si->node, si->vectype,
4688                               si->misalign, vect_epilogue);
4689       }
4690
4691   /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */
4692
4693   if (prologue_need_br_taken_cost)
4694     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4695                           vect_prologue);
4696
4697   if (prologue_need_br_not_taken_cost)
4698     (void) add_stmt_cost (target_cost_data, 1,
4699                           cond_branch_not_taken, vect_prologue);
4700
4701   if (epilogue_need_br_taken_cost)
4702     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4703                           vect_epilogue);
4704
4705   if (epilogue_need_br_not_taken_cost)
4706     (void) add_stmt_cost (target_cost_data, 1,
4707                           cond_branch_not_taken, vect_epilogue);
4708
4709   /* Take care of special costs for rgroup controls of partial vectors.  */
4710   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4711       && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4712           == vect_partial_vectors_avx512))
4713     {
4714       /* Calculate how many masks we need to generate.  */
4715       unsigned int num_masks = 0;
4716       bool need_saturation = false;
4717       for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4718         if (rgm.type)
4719           {
4720             unsigned nvectors = rgm.factor;
4721             num_masks += nvectors;
4722             if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4723                 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4724               need_saturation = true;
4725           }
4726
4727       /* ???  The target isn't able to identify the costs below as
4728          producing masks so it cannot penaltize cases where we'd run
4729          out of mask registers for example.  */
4730
4731       /* ???  We are also failing to account for smaller vector masks
4732          we generate by splitting larger masks in vect_get_loop_mask.  */
4733
4734       /* In the worst case, we need to generate each mask in the prologue
4735          and in the loop body.  We need one splat per group and one
4736          compare per mask.
4737
4738          Sometimes the prologue mask will fold to a constant,
4739          so the actual prologue cost might be smaller.  However, it's
4740          simpler and safer to use the worst-case cost; if this ends up
4741          being the tie-breaker between vectorizing or not, then it's
4742          probably better not to vectorize.  */
4743       (void) add_stmt_cost (target_cost_data,
4744                             num_masks
4745                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4746                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4747                             vect_prologue);
4748       (void) add_stmt_cost (target_cost_data,
4749                             num_masks
4750                             + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4751                             vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4752
4753       /* When we need saturation we need it both in the prologue and
4754          the epilogue.  */
4755       if (need_saturation)
4756         {
4757           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4758                                 NULL, NULL, NULL_TREE, 0, vect_prologue);
4759           (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4760                                 NULL, NULL, NULL_TREE, 0, vect_body);
4761         }
4762     }
4763   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4764            && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4765                == vect_partial_vectors_while_ult))
4766     {
4767       /* Calculate how many masks we need to generate.  */
4768       unsigned int num_masks = 0;
4769       rgroup_controls *rgm;
4770       unsigned int num_vectors_m1;
4771       FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4772                         num_vectors_m1, rgm)
4773         if (rgm->type)
4774           num_masks += num_vectors_m1 + 1;
4775       gcc_assert (num_masks > 0);
4776
4777       /* In the worst case, we need to generate each mask in the prologue
4778          and in the loop body.  One of the loop body mask instructions
4779          replaces the comparison in the scalar loop, and since we don't
4780          count the scalar comparison against the scalar body, we shouldn't
4781          count that vector instruction against the vector body either.
4782
4783          Sometimes we can use unpacks instead of generating prologue
4784          masks and sometimes the prologue mask will fold to a constant,
4785          so the actual prologue cost might be smaller.  However, it's
4786          simpler and safer to use the worst-case cost; if this ends up
4787          being the tie-breaker between vectorizing or not, then it's
4788          probably better not to vectorize.  */
4789       (void) add_stmt_cost (target_cost_data, num_masks,
4790                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4791                             vect_prologue);
4792       (void) add_stmt_cost (target_cost_data, num_masks - 1,
4793                             vector_stmt, NULL, NULL, NULL_TREE, 0,
4794                             vect_body);
4795     }
4796   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4797     {
4798       /* Referring to the functions vect_set_loop_condition_partial_vectors
4799          and vect_set_loop_controls_directly, we need to generate each
4800          length in the prologue and in the loop body if required. Although
4801          there are some possible optimizations, we consider the worst case
4802          here.  */
4803
4804       bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4805       signed char partial_load_store_bias
4806         = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4807       bool need_iterate_p
4808         = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4809            && !vect_known_niters_smaller_than_vf (loop_vinfo));
4810
4811       /* Calculate how many statements to be added.  */
4812       unsigned int prologue_stmts = 0;
4813       unsigned int body_stmts = 0;
4814
4815       rgroup_controls *rgc;
4816       unsigned int num_vectors_m1;
4817       FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4818         if (rgc->type)
4819           {
4820             /* May need one SHIFT for nitems_total computation.  */
4821             unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4822             if (nitems != 1 && !niters_known_p)
4823               prologue_stmts += 1;
4824
4825             /* May need one MAX and one MINUS for wrap around.  */
4826             if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4827               prologue_stmts += 2;
4828
4829             /* Need one MAX and one MINUS for each batch limit excepting for
4830                the 1st one.  */
4831             prologue_stmts += num_vectors_m1 * 2;
4832
4833             unsigned int num_vectors = num_vectors_m1 + 1;
4834
4835             /* Need to set up lengths in prologue, only one MIN required
4836                for each since start index is zero.  */
4837             prologue_stmts += num_vectors;
4838
4839             /* If we have a non-zero partial load bias, we need one PLUS
4840                to adjust the load length.  */
4841             if (partial_load_store_bias != 0)
4842               body_stmts += 1;
4843
4844             unsigned int length_update_cost = 0;
4845             if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
4846               /* For decrement IV style, Each only need a single SELECT_VL
4847                  or MIN since beginning to calculate the number of elements
4848                  need to be processed in current iteration.  */
4849               length_update_cost = 1;
4850             else
4851               /* For increment IV stype, Each may need two MINs and one MINUS to
4852                  update lengths in body for next iteration.  */
4853               length_update_cost = 3;
4854
4855             if (need_iterate_p)
4856               body_stmts += length_update_cost * num_vectors;
4857           }
4858
4859       (void) add_stmt_cost (target_cost_data, prologue_stmts,
4860                             scalar_stmt, vect_prologue);
4861       (void) add_stmt_cost (target_cost_data, body_stmts,
4862                             scalar_stmt, vect_body);
4863     }
4864
4865   /* FORNOW: The scalar outside cost is incremented in one of the
4866      following ways:
4867
4868      1. The vectorizer checks for alignment and aliasing and generates
4869      a condition that allows dynamic vectorization.  A cost model
4870      check is ANDED with the versioning condition.  Hence scalar code
4871      path now has the added cost of the versioning check.
4872
4873        if (cost > th & versioning_check)
4874          jmp to vector code
4875
4876      Hence run-time scalar is incremented by not-taken branch cost.
4877
4878      2. The vectorizer then checks if a prologue is required.  If the
4879      cost model check was not done before during versioning, it has to
4880      be done before the prologue check.
4881
4882        if (cost <= th)
4883          prologue = scalar_iters
4884        if (prologue == 0)
4885          jmp to vector code
4886        else
4887          execute prologue
4888        if (prologue == num_iters)
4889          go to exit
4890
4891      Hence the run-time scalar cost is incremented by a taken branch,
4892      plus a not-taken branch, plus a taken branch cost.
4893
4894      3. The vectorizer then checks if an epilogue is required.  If the
4895      cost model check was not done before during prologue check, it
4896      has to be done with the epilogue check.
4897
4898        if (prologue == 0)
4899          jmp to vector code
4900        else
4901          execute prologue
4902        if (prologue == num_iters)
4903          go to exit
4904        vector code:
4905          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4906            jmp to epilogue
4907
4908      Hence the run-time scalar cost should be incremented by 2 taken
4909      branches.
4910
4911      TODO: The back end may reorder the BBS's differently and reverse
4912      conditions/branch directions.  Change the estimates below to
4913      something more reasonable.  */
4914
4915   /* If the number of iterations is known and we do not do versioning, we can
4916      decide whether to vectorize at compile time.  Hence the scalar version
4917      do not carry cost model guard costs.  */
4918   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4919       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4920     {
4921       /* Cost model check occurs at versioning.  */
4922       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4923         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4924       else
4925         {
4926           /* Cost model check occurs at prologue generation.  */
4927           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4928             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4929               + vect_get_stmt_cost (cond_branch_not_taken);
4930           /* Cost model check occurs at epilogue generation.  */
4931           else
4932             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4933         }
4934     }
4935
4936   /* Complete the target-specific cost calculations.  */
4937   finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4938                &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4939                suggested_unroll_factor);
4940
4941   if (suggested_unroll_factor && *suggested_unroll_factor > 1
4942       && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4943       && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4944                     *suggested_unroll_factor,
4945                     LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4946     {
4947       if (dump_enabled_p ())
4948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4949                          "can't unroll as unrolled vectorization factor larger"
4950                          " than maximum vectorization factor: "
4951                          HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4952                          LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4953       *suggested_unroll_factor = 1;
4954     }
4955
4956   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4957
4958   if (dump_enabled_p ())
4959     {
4960       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4961       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
4962                    vec_inside_cost);
4963       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
4964                    vec_prologue_cost);
4965       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
4966                    vec_epilogue_cost);
4967       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
4968                    scalar_single_iter_cost);
4969       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
4970                    scalar_outside_cost);
4971       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
4972                    vec_outside_cost);
4973       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
4974                    peel_iters_prologue);
4975       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4976                    peel_iters_epilogue);
4977     }
4978
4979   /* Calculate number of iterations required to make the vector version
4980      profitable, relative to the loop bodies only.  The following condition
4981      must hold true:
4982      SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4983      where
4984      SIC = scalar iteration cost, VIC = vector iteration cost,
4985      VOC = vector outside cost, VF = vectorization factor,
4986      NPEEL = prologue iterations + epilogue iterations,
4987      SOC = scalar outside cost for run time cost model check.  */
4988
4989   int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4990                           - vec_inside_cost);
4991   if (saving_per_viter <= 0)
4992     {
4993       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4994         warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4995                     "vectorization did not happen for a simd loop");
4996
4997       if (dump_enabled_p ())
4998         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4999                          "cost model: the vector iteration cost = %d "
5000                          "divided by the scalar iteration cost = %d "
5001                          "is greater or equal to the vectorization factor = %d"
5002                          ".\n",
5003                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
5004       *ret_min_profitable_niters = -1;
5005       *ret_min_profitable_estimate = -1;
5006       return;
5007     }
5008
5009   /* ??? The "if" arm is written to handle all cases; see below for what
5010      we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5011   if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5012     {
5013       /* Rewriting the condition above in terms of the number of
5014          vector iterations (vniters) rather than the number of
5015          scalar iterations (niters) gives:
5016
5017          SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5018
5019          <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5020
5021          For integer N, X and Y when X > 0:
5022
5023          N * X > Y <==> N >= (Y /[floor] X) + 1.  */
5024       int outside_overhead = (vec_outside_cost
5025                               - scalar_single_iter_cost * peel_iters_prologue
5026                               - scalar_single_iter_cost * peel_iters_epilogue
5027                               - scalar_outside_cost);
5028       /* We're only interested in cases that require at least one
5029          vector iteration.  */
5030       int min_vec_niters = 1;
5031       if (outside_overhead > 0)
5032         min_vec_niters = outside_overhead / saving_per_viter + 1;
5033
5034       if (dump_enabled_p ())
5035         dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
5036                      min_vec_niters);
5037
5038       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5039         {
5040           /* Now that we know the minimum number of vector iterations,
5041              find the minimum niters for which the scalar cost is larger:
5042
5043              SIC * niters > VIC * vniters + VOC - SOC
5044
5045              We know that the minimum niters is no more than
5046              vniters * VF + NPEEL, but it might be (and often is) less
5047              than that if a partial vector iteration is cheaper than the
5048              equivalent scalar code.  */
5049           int threshold = (vec_inside_cost * min_vec_niters
5050                            + vec_outside_cost
5051                            - scalar_outside_cost);
5052           if (threshold <= 0)
5053             min_profitable_iters = 1;
5054           else
5055             min_profitable_iters = threshold / scalar_single_iter_cost + 1;
5056         }
5057       else
5058         /* Convert the number of vector iterations into a number of
5059            scalar iterations.  */
5060         min_profitable_iters = (min_vec_niters * assumed_vf
5061                                 + peel_iters_prologue
5062                                 + peel_iters_epilogue);
5063     }
5064   else
5065     {
5066       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
5067                               * assumed_vf
5068                               - vec_inside_cost * peel_iters_prologue
5069                               - vec_inside_cost * peel_iters_epilogue);
5070       if (min_profitable_iters <= 0)
5071         min_profitable_iters = 0;
5072       else
5073         {
5074           min_profitable_iters /= saving_per_viter;
5075
5076           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
5077               <= (((int) vec_inside_cost * min_profitable_iters)
5078                   + (((int) vec_outside_cost - scalar_outside_cost)
5079                      * assumed_vf)))
5080             min_profitable_iters++;
5081         }
5082     }
5083
5084   if (dump_enabled_p ())
5085     dump_printf (MSG_NOTE,
5086                  "  Calculated minimum iters for profitability: %d\n",
5087                  min_profitable_iters);
5088
5089   if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
5090       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
5091     /* We want the vectorized loop to execute at least once.  */
5092     min_profitable_iters = assumed_vf + peel_iters_prologue;
5093   else if (min_profitable_iters < peel_iters_prologue)
5094     /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5095        vectorized loop executes at least once.  */
5096     min_profitable_iters = peel_iters_prologue;
5097
5098   if (dump_enabled_p ())
5099     dump_printf_loc (MSG_NOTE, vect_location,
5100                      "  Runtime profitability threshold = %d\n",
5101                      min_profitable_iters);
5102
5103   *ret_min_profitable_niters = min_profitable_iters;
5104
5105   /* Calculate number of iterations required to make the vector version
5106      profitable, relative to the loop bodies only.
5107
5108      Non-vectorized variant is SIC * niters and it must win over vector
5109      variant on the expected loop trip count.  The following condition must hold true:
5110      SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
5111
5112   if (vec_outside_cost <= 0)
5113     min_profitable_estimate = 0;
5114   /* ??? This "else if" arm is written to handle all cases; see below for
5115      what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
5116   else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5117     {
5118       /* This is a repeat of the code above, but with + SOC rather
5119          than - SOC.  */
5120       int outside_overhead = (vec_outside_cost
5121                               - scalar_single_iter_cost * peel_iters_prologue
5122                               - scalar_single_iter_cost * peel_iters_epilogue
5123                               + scalar_outside_cost);
5124       int min_vec_niters = 1;
5125       if (outside_overhead > 0)
5126         min_vec_niters = outside_overhead / saving_per_viter + 1;
5127
5128       if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
5129         {
5130           int threshold = (vec_inside_cost * min_vec_niters
5131                            + vec_outside_cost
5132                            + scalar_outside_cost);
5133           min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
5134         }
5135       else
5136         min_profitable_estimate = (min_vec_niters * assumed_vf
5137                                    + peel_iters_prologue
5138                                    + peel_iters_epilogue);
5139     }
5140   else
5141     {
5142       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5143                                  * assumed_vf
5144                                  - vec_inside_cost * peel_iters_prologue
5145                                  - vec_inside_cost * peel_iters_epilogue)
5146                                  / ((scalar_single_iter_cost * assumed_vf)
5147                                    - vec_inside_cost);
5148     }
5149   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5150   if (dump_enabled_p ())
5151     dump_printf_loc (MSG_NOTE, vect_location,
5152                      "  Static estimate profitability threshold = %d\n",
5153                      min_profitable_estimate);
5154
5155   *ret_min_profitable_estimate = min_profitable_estimate;
5156 }
5157
5158 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5159    vector elements (not bits) for a vector with NELT elements.  */
5160 static void
5161 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5162                               vec_perm_builder *sel)
5163 {
5164   /* The encoding is a single stepped pattern.  Any wrap-around is handled
5165      by vec_perm_indices.  */
5166   sel->new_vector (nelt, 1, 3);
5167   for (unsigned int i = 0; i < 3; i++)
5168     sel->quick_push (i + offset);
5169 }
5170
5171 /* Checks whether the target supports whole-vector shifts for vectors of mode
5172    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
5173    it supports vec_perm_const with masks for all necessary shift amounts.  */
5174 static bool
5175 have_whole_vector_shift (machine_mode mode)
5176 {
5177   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5178     return true;
5179
5180   /* Variable-length vectors should be handled via the optab.  */
5181   unsigned int nelt;
5182   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5183     return false;
5184
5185   vec_perm_builder sel;
5186   vec_perm_indices indices;
5187   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5188     {
5189       calc_vec_perm_mask_for_shift (i, nelt, &sel);
5190       indices.new_vector (sel, 2, nelt);
5191       if (!can_vec_perm_const_p (mode, mode, indices, false))
5192         return false;
5193     }
5194   return true;
5195 }
5196
5197 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5198    multiplication operands have differing signs and (b) we intend
5199    to emulate the operation using a series of signed DOT_PROD_EXPRs.
5200    See vect_emulate_mixed_dot_prod for the actual sequence used.  */
5201
5202 static bool
5203 vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info)
5204 {
5205   gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5206   if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5207     return false;
5208
5209   tree rhs1 = gimple_assign_rhs1 (assign);
5210   tree rhs2 = gimple_assign_rhs2 (assign);
5211   if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5212     return false;
5213
5214   gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info));
5215   return !directly_supported_p (DOT_PROD_EXPR,
5216                                 STMT_VINFO_REDUC_VECTYPE_IN (stmt_info),
5217                                 optab_vector_mixed_sign);
5218 }
5219
5220 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5221    functions. Design better to avoid maintenance issues.  */
5222
5223 /* Function vect_model_reduction_cost.
5224
5225    Models cost for a reduction operation, including the vector ops
5226    generated within the strip-mine loop in some cases, the initial
5227    definition before the loop, and the epilogue code that must be generated.  */
5228
5229 static void
5230 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5231                            stmt_vec_info stmt_info, internal_fn reduc_fn,
5232                            vect_reduction_type reduction_type,
5233                            int ncopies, stmt_vector_for_cost *cost_vec)
5234 {
5235   int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5236   tree vectype;
5237   machine_mode mode;
5238   class loop *loop = NULL;
5239
5240   if (loop_vinfo)
5241     loop = LOOP_VINFO_LOOP (loop_vinfo);
5242
5243   /* Condition reductions generate two reductions in the loop.  */
5244   if (reduction_type == COND_REDUCTION)
5245     ncopies *= 2;
5246
5247   vectype = STMT_VINFO_VECTYPE (stmt_info);
5248   mode = TYPE_MODE (vectype);
5249   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5250
5251   gimple_match_op op;
5252   if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5253     gcc_unreachable ();
5254
5255   if (reduction_type == EXTRACT_LAST_REDUCTION)
5256     /* No extra instructions are needed in the prologue.  The loop body
5257        operations are costed in vectorizable_condition.  */
5258     inside_cost = 0;
5259   else if (reduction_type == FOLD_LEFT_REDUCTION)
5260     {
5261       /* No extra instructions needed in the prologue.  */
5262       prologue_cost = 0;
5263
5264       if (reduc_fn != IFN_LAST)
5265         /* Count one reduction-like operation per vector.  */
5266         inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5267                                         stmt_info, 0, vect_body);
5268       else
5269         {
5270           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
5271           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5272           inside_cost = record_stmt_cost (cost_vec, nelements,
5273                                           vec_to_scalar, stmt_info, 0,
5274                                           vect_body);
5275           inside_cost += record_stmt_cost (cost_vec, nelements,
5276                                            scalar_stmt, stmt_info, 0,
5277                                            vect_body);
5278         }
5279     }
5280   else
5281     {
5282       /* Add in the cost of the initial definitions.  */
5283       int prologue_stmts;
5284       if (reduction_type == COND_REDUCTION)
5285         /* For cond reductions we have four vectors: initial index, step,
5286            initial result of the data reduction, initial value of the index
5287            reduction.  */
5288         prologue_stmts = 4;
5289       else
5290         /* We need the initial reduction value.  */
5291         prologue_stmts = 1;
5292       prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5293                                          scalar_to_vec, stmt_info, 0,
5294                                          vect_prologue);
5295     }
5296
5297   /* Determine cost of epilogue code.
5298
5299      We have a reduction operator that will reduce the vector in one statement.
5300      Also requires scalar extract.  */
5301
5302   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5303     {
5304       if (reduc_fn != IFN_LAST)
5305         {
5306           if (reduction_type == COND_REDUCTION)
5307             {
5308               /* An EQ stmt and an COND_EXPR stmt.  */
5309               epilogue_cost += record_stmt_cost (cost_vec, 2,
5310                                                  vector_stmt, stmt_info, 0,
5311                                                  vect_epilogue);
5312               /* Reduction of the max index and a reduction of the found
5313                  values.  */
5314               epilogue_cost += record_stmt_cost (cost_vec, 2,
5315                                                  vec_to_scalar, stmt_info, 0,
5316                                                  vect_epilogue);
5317               /* A broadcast of the max value.  */
5318               epilogue_cost += record_stmt_cost (cost_vec, 1,
5319                                                  scalar_to_vec, stmt_info, 0,
5320                                                  vect_epilogue);
5321             }
5322           else
5323             {
5324               epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5325                                                  stmt_info, 0, vect_epilogue);
5326               epilogue_cost += record_stmt_cost (cost_vec, 1,
5327                                                  vec_to_scalar, stmt_info, 0,
5328                                                  vect_epilogue);
5329             }
5330         }
5331       else if (reduction_type == COND_REDUCTION)
5332         {
5333           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5334           /* Extraction of scalar elements.  */
5335           epilogue_cost += record_stmt_cost (cost_vec,
5336                                              2 * estimated_nunits,
5337                                              vec_to_scalar, stmt_info, 0,
5338                                              vect_epilogue);
5339           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
5340           epilogue_cost += record_stmt_cost (cost_vec,
5341                                              2 * estimated_nunits - 3,
5342                                              scalar_stmt, stmt_info, 0,
5343                                              vect_epilogue);
5344         }
5345       else if (reduction_type == EXTRACT_LAST_REDUCTION
5346                || reduction_type == FOLD_LEFT_REDUCTION)
5347         /* No extra instructions need in the epilogue.  */
5348         ;
5349       else
5350         {
5351           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5352           tree bitsize = TYPE_SIZE (op.type);
5353           int element_bitsize = tree_to_uhwi (bitsize);
5354           int nelements = vec_size_in_bits / element_bitsize;
5355
5356           if (op.code == COND_EXPR)
5357             op.code = MAX_EXPR;
5358
5359           /* We have a whole vector shift available.  */
5360           if (VECTOR_MODE_P (mode)
5361               && directly_supported_p (op.code, vectype)
5362               && have_whole_vector_shift (mode))
5363             {
5364               /* Final reduction via vector shifts and the reduction operator.
5365                  Also requires scalar extract.  */
5366               epilogue_cost += record_stmt_cost (cost_vec,
5367                                                  exact_log2 (nelements) * 2,
5368                                                  vector_stmt, stmt_info, 0,
5369                                                  vect_epilogue);
5370               epilogue_cost += record_stmt_cost (cost_vec, 1,
5371                                                  vec_to_scalar, stmt_info, 0,
5372                                                  vect_epilogue);
5373             }
5374           else
5375             /* Use extracts and reduction op for final reduction.  For N
5376                elements, we have N extracts and N-1 reduction ops.  */
5377             epilogue_cost += record_stmt_cost (cost_vec,
5378                                                nelements + nelements - 1,
5379                                                vector_stmt, stmt_info, 0,
5380                                                vect_epilogue);
5381         }
5382     }
5383
5384   if (dump_enabled_p ())
5385     dump_printf (MSG_NOTE,
5386                  "vect_model_reduction_cost: inside_cost = %d, "
5387                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5388                  prologue_cost, epilogue_cost);
5389 }
5390
5391 /* SEQ is a sequence of instructions that initialize the reduction
5392    described by REDUC_INFO.  Emit them in the appropriate place.  */
5393
5394 static void
5395 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5396                                 stmt_vec_info reduc_info, gimple *seq)
5397 {
5398   if (reduc_info->reused_accumulator)
5399     {
5400       /* When reusing an accumulator from the main loop, we only need
5401          initialization instructions if the main loop can be skipped.
5402          In that case, emit the initialization instructions at the end
5403          of the guard block that does the skip.  */
5404       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5405       gcc_assert (skip_edge);
5406       gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5407       gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5408     }
5409   else
5410     {
5411       /* The normal case: emit the initialization instructions on the
5412          preheader edge.  */
5413       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5414       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5415     }
5416 }
5417
5418 /* Function get_initial_def_for_reduction
5419
5420    Input:
5421    REDUC_INFO - the info_for_reduction
5422    INIT_VAL - the initial value of the reduction variable
5423    NEUTRAL_OP - a value that has no effect on the reduction, as per
5424                 neutral_op_for_reduction
5425
5426    Output:
5427    Return a vector variable, initialized according to the operation that
5428         STMT_VINFO performs. This vector will be used as the initial value
5429         of the vector of partial results.
5430
5431    The value we need is a vector in which element 0 has value INIT_VAL
5432    and every other element has value NEUTRAL_OP.  */
5433
5434 static tree
5435 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5436                                stmt_vec_info reduc_info,
5437                                tree init_val, tree neutral_op)
5438 {
5439   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5440   tree scalar_type = TREE_TYPE (init_val);
5441   tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5442   tree init_def;
5443   gimple_seq stmts = NULL;
5444
5445   gcc_assert (vectype);
5446
5447   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5448               || SCALAR_FLOAT_TYPE_P (scalar_type));
5449
5450   gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5451               || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5452
5453   if (operand_equal_p (init_val, neutral_op))
5454     {
5455       /* If both elements are equal then the vector described above is
5456          just a splat.  */
5457       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5458       init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5459     }
5460   else
5461     {
5462       neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5463       init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5464       if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5465         {
5466           /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5467              element 0.  */
5468           init_def = gimple_build_vector_from_val (&stmts, vectype,
5469                                                    neutral_op);
5470           init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5471                                    vectype, init_def, init_val);
5472         }
5473       else
5474         {
5475           /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
5476           tree_vector_builder elts (vectype, 1, 2);
5477           elts.quick_push (init_val);
5478           elts.quick_push (neutral_op);
5479           init_def = gimple_build_vector (&stmts, &elts);
5480         }
5481     }
5482
5483   if (stmts)
5484     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5485   return init_def;
5486 }
5487
5488 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5489    which performs a reduction involving GROUP_SIZE scalar statements.
5490    NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
5491    is nonnull, introducing extra elements of that value will not change the
5492    result.  */
5493
5494 static void
5495 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5496                                 stmt_vec_info reduc_info,
5497                                 vec<tree> *vec_oprnds,
5498                                 unsigned int number_of_vectors,
5499                                 unsigned int group_size, tree neutral_op)
5500 {
5501   vec<tree> &initial_values = reduc_info->reduc_initial_values;
5502   unsigned HOST_WIDE_INT nunits;
5503   unsigned j, number_of_places_left_in_vector;
5504   tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5505   unsigned int i;
5506
5507   gcc_assert (group_size == initial_values.length () || neutral_op);
5508
5509   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5510      created vectors. It is greater than 1 if unrolling is performed.
5511
5512      For example, we have two scalar operands, s1 and s2 (e.g., group of
5513      strided accesses of size two), while NUNITS is four (i.e., four scalars
5514      of this type can be packed in a vector).  The output vector will contain
5515      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
5516      will be 2).
5517
5518      If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5519      vectors containing the operands.
5520
5521      For example, NUNITS is four as before, and the group size is 8
5522      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
5523      {s5, s6, s7, s8}.  */
5524
5525   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5526     nunits = group_size;
5527
5528   number_of_places_left_in_vector = nunits;
5529   bool constant_p = true;
5530   tree_vector_builder elts (vector_type, nunits, 1);
5531   elts.quick_grow (nunits);
5532   gimple_seq ctor_seq = NULL;
5533   if (neutral_op
5534       && !useless_type_conversion_p (TREE_TYPE (vector_type),
5535                                      TREE_TYPE (neutral_op)))
5536     neutral_op = gimple_convert (&ctor_seq,
5537                                  TREE_TYPE (vector_type),
5538                                  neutral_op);
5539   for (j = 0; j < nunits * number_of_vectors; ++j)
5540     {
5541       tree op;
5542       i = j % group_size;
5543
5544       /* Get the def before the loop.  In reduction chain we have only
5545          one initial value.  Else we have as many as PHIs in the group.  */
5546       if (i >= initial_values.length () || (j > i && neutral_op))
5547         op = neutral_op;
5548       else
5549         {
5550           if (!useless_type_conversion_p (TREE_TYPE (vector_type),
5551                                           TREE_TYPE (initial_values[i])))
5552             initial_values[i] = gimple_convert (&ctor_seq,
5553                                                 TREE_TYPE (vector_type),
5554                                                 initial_values[i]);
5555           op = initial_values[i];
5556         }
5557
5558       /* Create 'vect_ = {op0,op1,...,opn}'.  */
5559       number_of_places_left_in_vector--;
5560       elts[nunits - number_of_places_left_in_vector - 1] = op;
5561       if (!CONSTANT_CLASS_P (op))
5562         constant_p = false;
5563
5564       if (number_of_places_left_in_vector == 0)
5565         {
5566           tree init;
5567           if (constant_p && !neutral_op
5568               ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5569               : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5570             /* Build the vector directly from ELTS.  */
5571             init = gimple_build_vector (&ctor_seq, &elts);
5572           else if (neutral_op)
5573             {
5574               /* Build a vector of the neutral value and shift the
5575                  other elements into place.  */
5576               init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5577                                                    neutral_op);
5578               int k = nunits;
5579               while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
5580                 k -= 1;
5581               while (k > 0)
5582                 {
5583                   k -= 1;
5584                   init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5585                                        vector_type, init, elts[k]);
5586                 }
5587             }
5588           else
5589             {
5590               /* First time round, duplicate ELTS to fill the
5591                  required number of vectors.  */
5592               duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5593                                         elts, number_of_vectors, *vec_oprnds);
5594               break;
5595             }
5596           vec_oprnds->quick_push (init);
5597
5598           number_of_places_left_in_vector = nunits;
5599           elts.new_vector (vector_type, nunits, 1);
5600           elts.quick_grow (nunits);
5601           constant_p = true;
5602         }
5603     }
5604   if (ctor_seq != NULL)
5605     vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5606 }
5607
5608 /* For a statement STMT_INFO taking part in a reduction operation return
5609    the stmt_vec_info the meta information is stored on.  */
5610
5611 stmt_vec_info
5612 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5613 {
5614   stmt_info = vect_orig_stmt (stmt_info);
5615   gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5616   if (!is_a <gphi *> (stmt_info->stmt)
5617       || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5618     stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5619   gphi *phi = as_a <gphi *> (stmt_info->stmt);
5620   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5621     {
5622       if (gimple_phi_num_args (phi) == 1)
5623         stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5624     }
5625   else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5626     {
5627       stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5628       if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5629         stmt_info = info;
5630     }
5631   return stmt_info;
5632 }
5633
5634 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5635    REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
5636    return false.  */
5637
5638 static bool
5639 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5640                                 stmt_vec_info reduc_info)
5641 {
5642   loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5643   if (!main_loop_vinfo)
5644     return false;
5645
5646   if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5647     return false;
5648
5649   unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5650   auto_vec<tree, 16> main_loop_results (num_phis);
5651   auto_vec<tree, 16> initial_values (num_phis);
5652   if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5653     {
5654       /* The epilogue loop can be entered either from the main loop or
5655          from an earlier guard block.  */
5656       edge skip_edge = loop_vinfo->skip_main_loop_edge;
5657       for (tree incoming_value : reduc_info->reduc_initial_values)
5658         {
5659           /* Look for:
5660
5661                INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5662                                     INITIAL_VALUE(guard block)>.  */
5663           gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5664
5665           gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5666           gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5667
5668           tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5669           tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5670
5671           main_loop_results.quick_push (from_main_loop);
5672           initial_values.quick_push (from_skip);
5673         }
5674     }
5675   else
5676     /* The main loop dominates the epilogue loop.  */
5677     main_loop_results.splice (reduc_info->reduc_initial_values);
5678
5679   /* See if the main loop has the kind of accumulator we need.  */
5680   vect_reusable_accumulator *accumulator
5681     = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5682   if (!accumulator
5683       || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5684       || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5685                       accumulator->reduc_info->reduc_scalar_results.begin ()))
5686     return false;
5687
5688   /* Handle the case where we can reduce wider vectors to narrower ones.  */
5689   tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5690   tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5691   unsigned HOST_WIDE_INT m;
5692   if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5693                             TYPE_VECTOR_SUBPARTS (vectype), &m))
5694     return false;
5695   /* Check the intermediate vector types and operations are available.  */
5696   tree prev_vectype = old_vectype;
5697   poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5698   while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5699     {
5700       intermediate_nunits = exact_div (intermediate_nunits, 2);
5701       tree intermediate_vectype = get_related_vectype_for_scalar_type
5702         (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5703       if (!intermediate_vectype
5704           || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5705                                     intermediate_vectype)
5706           || !can_vec_extract (TYPE_MODE (prev_vectype),
5707                                TYPE_MODE (intermediate_vectype)))
5708         return false;
5709       prev_vectype = intermediate_vectype;
5710     }
5711
5712   /* Non-SLP reductions might apply an adjustment after the reduction
5713      operation, in order to simplify the initialization of the accumulator.
5714      If the epilogue loop carries on from where the main loop left off,
5715      it should apply the same adjustment to the final reduction result.
5716
5717      If the epilogue loop can also be entered directly (rather than via
5718      the main loop), we need to be able to handle that case in the same way,
5719      with the same adjustment.  (In principle we could add a PHI node
5720      to select the correct adjustment, but in practice that shouldn't be
5721      necessary.)  */
5722   tree main_adjustment
5723     = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5724   if (loop_vinfo->main_loop_edge && main_adjustment)
5725     {
5726       gcc_assert (num_phis == 1);
5727       tree initial_value = initial_values[0];
5728       /* Check that we can use INITIAL_VALUE as the adjustment and
5729          initialize the accumulator with a neutral value instead.  */
5730       if (!operand_equal_p (initial_value, main_adjustment))
5731         return false;
5732       code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5733       initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5734                                                     code, initial_value);
5735     }
5736   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5737   reduc_info->reduc_initial_values.truncate (0);
5738   reduc_info->reduc_initial_values.splice (initial_values);
5739   reduc_info->reused_accumulator = accumulator;
5740   return true;
5741 }
5742
5743 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5744    CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */
5745
5746 static tree
5747 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5748                             gimple_seq *seq)
5749 {
5750   unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5751   unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5752   tree stype = TREE_TYPE (vectype);
5753   tree new_temp = vec_def;
5754   while (nunits > nunits1)
5755     {
5756       nunits /= 2;
5757       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5758                                                            stype, nunits);
5759       unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5760
5761       /* The target has to make sure we support lowpart/highpart
5762          extraction, either via direct vector extract or through
5763          an integer mode punning.  */
5764       tree dst1, dst2;
5765       gimple *epilog_stmt;
5766       if (convert_optab_handler (vec_extract_optab,
5767                                  TYPE_MODE (TREE_TYPE (new_temp)),
5768                                  TYPE_MODE (vectype1))
5769           != CODE_FOR_nothing)
5770         {
5771           /* Extract sub-vectors directly once vec_extract becomes
5772              a conversion optab.  */
5773           dst1 = make_ssa_name (vectype1);
5774           epilog_stmt
5775               = gimple_build_assign (dst1, BIT_FIELD_REF,
5776                                      build3 (BIT_FIELD_REF, vectype1,
5777                                              new_temp, TYPE_SIZE (vectype1),
5778                                              bitsize_int (0)));
5779           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5780           dst2 =  make_ssa_name (vectype1);
5781           epilog_stmt
5782               = gimple_build_assign (dst2, BIT_FIELD_REF,
5783                                      build3 (BIT_FIELD_REF, vectype1,
5784                                              new_temp, TYPE_SIZE (vectype1),
5785                                              bitsize_int (bitsize)));
5786           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5787         }
5788       else
5789         {
5790           /* Extract via punning to appropriately sized integer mode
5791              vector.  */
5792           tree eltype = build_nonstandard_integer_type (bitsize, 1);
5793           tree etype = build_vector_type (eltype, 2);
5794           gcc_assert (convert_optab_handler (vec_extract_optab,
5795                                              TYPE_MODE (etype),
5796                                              TYPE_MODE (eltype))
5797                       != CODE_FOR_nothing);
5798           tree tem = make_ssa_name (etype);
5799           epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5800                                              build1 (VIEW_CONVERT_EXPR,
5801                                                      etype, new_temp));
5802           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5803           new_temp = tem;
5804           tem = make_ssa_name (eltype);
5805           epilog_stmt
5806               = gimple_build_assign (tem, BIT_FIELD_REF,
5807                                      build3 (BIT_FIELD_REF, eltype,
5808                                              new_temp, TYPE_SIZE (eltype),
5809                                              bitsize_int (0)));
5810           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5811           dst1 = make_ssa_name (vectype1);
5812           epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5813                                              build1 (VIEW_CONVERT_EXPR,
5814                                                      vectype1, tem));
5815           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5816           tem = make_ssa_name (eltype);
5817           epilog_stmt
5818               = gimple_build_assign (tem, BIT_FIELD_REF,
5819                                      build3 (BIT_FIELD_REF, eltype,
5820                                              new_temp, TYPE_SIZE (eltype),
5821                                              bitsize_int (bitsize)));
5822           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5823           dst2 =  make_ssa_name (vectype1);
5824           epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5825                                              build1 (VIEW_CONVERT_EXPR,
5826                                                      vectype1, tem));
5827           gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5828         }
5829
5830       new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5831     }
5832
5833   return new_temp;
5834 }
5835
5836 /* Function vect_create_epilog_for_reduction
5837
5838    Create code at the loop-epilog to finalize the result of a reduction
5839    computation.
5840
5841    STMT_INFO is the scalar reduction stmt that is being vectorized.
5842    SLP_NODE is an SLP node containing a group of reduction statements. The
5843      first one in this group is STMT_INFO.
5844    SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5845    REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5846      (counting from 0)
5847    LOOP_EXIT is the edge to update in the merge block.  In the case of a single
5848      exit this edge is always the main loop exit.
5849
5850    This function:
5851    1. Completes the reduction def-use cycles.
5852    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5853       by calling the function specified by REDUC_FN if available, or by
5854       other means (whole-vector shifts or a scalar loop).
5855       The function also creates a new phi node at the loop exit to preserve
5856       loop-closed form, as illustrated below.
5857
5858      The flow at the entry to this function:
5859
5860         loop:
5861           vec_def = phi <vec_init, null>        # REDUCTION_PHI
5862           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5863           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5864         loop_exit:
5865           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5866           use <s_out0>
5867           use <s_out0>
5868
5869      The above is transformed by this function into:
5870
5871         loop:
5872           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
5873           VECT_DEF = vector_stmt                # vectorized form of STMT_INFO
5874           s_loop = scalar_stmt                  # (scalar) STMT_INFO
5875         loop_exit:
5876           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5877           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5878           v_out2 = reduce <v_out1>
5879           s_out3 = extract_field <v_out2, 0>
5880           s_out4 = adjust_result <s_out3>
5881           use <s_out4>
5882           use <s_out4>
5883 */
5884
5885 static void
5886 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5887                                   stmt_vec_info stmt_info,
5888                                   slp_tree slp_node,
5889                                   slp_instance slp_node_instance,
5890                                   edge loop_exit)
5891 {
5892   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5893   gcc_assert (reduc_info->is_reduc_info);
5894   /* For double reductions we need to get at the inner loop reduction
5895      stmt which has the meta info attached.  Our stmt_info is that of the
5896      loop-closed PHI of the inner loop which we remember as
5897      def for the reduction PHI generation.  */
5898   bool double_reduc = false;
5899   stmt_vec_info rdef_info = stmt_info;
5900   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5901     {
5902       double_reduc = true;
5903       stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5904                                             (stmt_info->stmt, 0));
5905       stmt_info = vect_stmt_to_vectorize (stmt_info);
5906     }
5907   code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5908   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5909   tree vectype;
5910   machine_mode mode;
5911   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5912   basic_block exit_bb;
5913   tree scalar_dest;
5914   tree scalar_type;
5915   gimple *new_phi = NULL, *phi = NULL;
5916   gimple_stmt_iterator exit_gsi;
5917   tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5918   gimple *epilog_stmt = NULL;
5919   gimple *exit_phi;
5920   tree bitsize;
5921   tree def;
5922   tree orig_name, scalar_result;
5923   imm_use_iterator imm_iter, phi_imm_iter;
5924   use_operand_p use_p, phi_use_p;
5925   gimple *use_stmt;
5926   auto_vec<tree> reduc_inputs;
5927   int j, i;
5928   vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5929   unsigned int group_size = 1, k;
5930   /* SLP reduction without reduction chain, e.g.,
5931      # a1 = phi <a2, a0>
5932      # b1 = phi <b2, b0>
5933      a2 = operation (a1)
5934      b2 = operation (b1)  */
5935   bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5936   bool direct_slp_reduc;
5937   tree induction_index = NULL_TREE;
5938
5939   if (slp_node)
5940     group_size = SLP_TREE_LANES (slp_node);
5941
5942   if (nested_in_vect_loop_p (loop, stmt_info))
5943     {
5944       outer_loop = loop;
5945       loop = loop->inner;
5946       gcc_assert (double_reduc);
5947     }
5948
5949   vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5950   gcc_assert (vectype);
5951   mode = TYPE_MODE (vectype);
5952
5953   tree induc_val = NULL_TREE;
5954   tree adjustment_def = NULL;
5955   /* Optimize: for induction condition reduction, if we can't use zero
5956      for induc_val, use initial_def.  */
5957   if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5958     induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5959   else if (double_reduc)
5960     ;
5961   else
5962     adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5963
5964   stmt_vec_info single_live_out_stmt[] = { stmt_info };
5965   array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5966   if (slp_reduc)
5967     /* All statements produce live-out values.  */
5968     live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5969
5970   unsigned vec_num;
5971   int ncopies;
5972   if (slp_node)
5973     {
5974       vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5975       ncopies = 1;
5976     }
5977   else
5978     {
5979       vec_num = 1;
5980       ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5981     }
5982
5983   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5984      which is updated with the current index of the loop for every match of
5985      the original loop's cond_expr (VEC_STMT).  This results in a vector
5986      containing the last time the condition passed for that vector lane.
5987      The first match will be a 1 to allow 0 to be used for non-matching
5988      indexes.  If there are no matches at all then the vector will be all
5989      zeroes.
5990
5991      PR92772: This algorithm is broken for architectures that support
5992      masked vectors, but do not provide fold_extract_last.  */
5993   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5994     {
5995       auto_vec<std::pair<tree, bool>, 2> ccompares;
5996       if (slp_node)
5997         {
5998           slp_tree cond_node = slp_node_instance->root;
5999           while (cond_node != slp_node_instance->reduc_phis)
6000             {
6001               stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
6002               int slp_reduc_idx;
6003               if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6004                 {
6005                   gimple *vec_stmt
6006                     = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
6007                   gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6008                   ccompares.safe_push
6009                     (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6010                                      STMT_VINFO_REDUC_IDX (cond_info) == 2));
6011                   /* ???  We probably want to have REDUC_IDX on the SLP node?
6012                      We have both three and four children COND_EXPR nodes
6013                      dependent on whether the comparison is still embedded
6014                      as GENERIC.  So work backwards.  */
6015                   slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
6016                                    + STMT_VINFO_REDUC_IDX (cond_info));
6017                 }
6018               else
6019                 slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
6020               cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
6021             }
6022         }
6023       else
6024         {
6025           stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6026           cond_info = vect_stmt_to_vectorize (cond_info);
6027           while (cond_info != reduc_info)
6028             {
6029               if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6030                 {
6031                   gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6032                   gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6033                   ccompares.safe_push
6034                     (std::make_pair (gimple_assign_rhs1 (vec_stmt),
6035                                      STMT_VINFO_REDUC_IDX (cond_info) == 2));
6036                 }
6037               cond_info
6038                 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6039                                                      1 + STMT_VINFO_REDUC_IDX
6040                                                      (cond_info)));
6041               cond_info = vect_stmt_to_vectorize (cond_info);
6042             }
6043         }
6044       gcc_assert (ccompares.length () != 0);
6045
6046       tree indx_before_incr, indx_after_incr;
6047       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6048       int scalar_precision
6049         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6050       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6051       tree cr_index_vector_type = get_related_vectype_for_scalar_type
6052         (TYPE_MODE (vectype), cr_index_scalar_type,
6053          TYPE_VECTOR_SUBPARTS (vectype));
6054
6055       /* First we create a simple vector induction variable which starts
6056          with the values {1,2,3,...} (SERIES_VECT) and increments by the
6057          vector size (STEP).  */
6058
6059       /* Create a {1,2,3,...} vector.  */
6060       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6061
6062       /* Create a vector of the step value.  */
6063       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6064       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6065
6066       /* Create an induction variable.  */
6067       gimple_stmt_iterator incr_gsi;
6068       bool insert_after;
6069       vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6070       create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6071                  insert_after, &indx_before_incr, &indx_after_incr);
6072
6073       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6074          filled with zeros (VEC_ZERO).  */
6075
6076       /* Create a vector of 0s.  */
6077       tree zero = build_zero_cst (cr_index_scalar_type);
6078       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6079
6080       /* Create a vector phi node.  */
6081       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6082       new_phi = create_phi_node (new_phi_tree, loop->header);
6083       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6084                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
6085
6086       /* Now take the condition from the loops original cond_exprs
6087          and produce a new cond_exprs (INDEX_COND_EXPR) which for
6088          every match uses values from the induction variable
6089          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6090          (NEW_PHI_TREE).
6091          Finally, we update the phi (NEW_PHI_TREE) to take the value of
6092          the new cond_expr (INDEX_COND_EXPR).  */
6093       gimple_seq stmts = NULL;
6094       for (int i = ccompares.length () - 1; i != -1; --i)
6095         {
6096           tree ccompare = ccompares[i].first;
6097           if (ccompares[i].second)
6098             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6099                                          cr_index_vector_type,
6100                                          ccompare,
6101                                          indx_before_incr, new_phi_tree);
6102           else
6103             new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6104                                          cr_index_vector_type,
6105                                          ccompare,
6106                                          new_phi_tree, indx_before_incr);
6107         }
6108       gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6109
6110       /* Update the phi with the vec cond.  */
6111       induction_index = new_phi_tree;
6112       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6113                    loop_latch_edge (loop), UNKNOWN_LOCATION);
6114     }
6115
6116   /* 2. Create epilog code.
6117         The reduction epilog code operates across the elements of the vector
6118         of partial results computed by the vectorized loop.
6119         The reduction epilog code consists of:
6120
6121         step 1: compute the scalar result in a vector (v_out2)
6122         step 2: extract the scalar result (s_out3) from the vector (v_out2)
6123         step 3: adjust the scalar result (s_out3) if needed.
6124
6125         Step 1 can be accomplished using one the following three schemes:
6126           (scheme 1) using reduc_fn, if available.
6127           (scheme 2) using whole-vector shifts, if available.
6128           (scheme 3) using a scalar loop. In this case steps 1+2 above are
6129                      combined.
6130
6131           The overall epilog code looks like this:
6132
6133           s_out0 = phi <s_loop>         # original EXIT_PHI
6134           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
6135           v_out2 = reduce <v_out1>              # step 1
6136           s_out3 = extract_field <v_out2, 0>    # step 2
6137           s_out4 = adjust_result <s_out3>       # step 3
6138
6139           (step 3 is optional, and steps 1 and 2 may be combined).
6140           Lastly, the uses of s_out0 are replaced by s_out4.  */
6141
6142
6143   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6144          v_out1 = phi <VECT_DEF>
6145          Store them in NEW_PHIS.  */
6146   if (double_reduc)
6147     loop = outer_loop;
6148   /* We need to reduce values in all exits.  */
6149   exit_bb = loop_exit->dest;
6150   exit_gsi = gsi_after_labels (exit_bb);
6151   reduc_inputs.create (slp_node ? vec_num : ncopies);
6152   for (unsigned i = 0; i < vec_num; i++)
6153     {
6154       gimple_seq stmts = NULL;
6155       if (slp_node)
6156         def = vect_get_slp_vect_def (slp_node, i);
6157       else
6158         def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6159       for (j = 0; j < ncopies; j++)
6160         {
6161           tree new_def = copy_ssa_name (def);
6162           phi = create_phi_node (new_def, exit_bb);
6163           if (j)
6164             def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6165           if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6166             SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6167           else
6168             {
6169               for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6170                 SET_PHI_ARG_DEF (phi, k, def);
6171             }
6172           new_def = gimple_convert (&stmts, vectype, new_def);
6173           reduc_inputs.quick_push (new_def);
6174         }
6175       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6176     }
6177
6178   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6179          (i.e. when reduc_fn is not available) and in the final adjustment
6180          code (if needed).  Also get the original scalar reduction variable as
6181          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
6182          represents a reduction pattern), the tree-code and scalar-def are
6183          taken from the original stmt that the pattern-stmt (STMT) replaces.
6184          Otherwise (it is a regular reduction) - the tree-code and scalar-def
6185          are taken from STMT.  */
6186
6187   stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6188   if (orig_stmt_info != stmt_info)
6189     {
6190       /* Reduction pattern  */
6191       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6192       gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6193     }
6194
6195   scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6196   scalar_type = TREE_TYPE (scalar_dest);
6197   scalar_results.truncate (0);
6198   scalar_results.reserve_exact (group_size);
6199   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6200   bitsize = TYPE_SIZE (scalar_type);
6201
6202   /* True if we should implement SLP_REDUC using native reduction operations
6203      instead of scalar operations.  */
6204   direct_slp_reduc = (reduc_fn != IFN_LAST
6205                       && slp_reduc
6206                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6207
6208   /* In case of reduction chain, e.g.,
6209      # a1 = phi <a3, a0>
6210      a2 = operation (a1)
6211      a3 = operation (a2),
6212
6213      we may end up with more than one vector result.  Here we reduce them
6214      to one vector.
6215
6216      The same is true for a SLP reduction, e.g.,
6217      # a1 = phi <a2, a0>
6218      # b1 = phi <b2, b0>
6219      a2 = operation (a1)
6220      b2 = operation (a2),
6221
6222      where we can end up with more than one vector as well.  We can
6223      easily accumulate vectors when the number of vector elements is
6224      a multiple of the SLP group size.
6225
6226      The same is true if we couldn't use a single defuse cycle.  */
6227   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6228       || direct_slp_reduc
6229       || (slp_reduc
6230           && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6231       || ncopies > 1)
6232     {
6233       gimple_seq stmts = NULL;
6234       tree single_input = reduc_inputs[0];
6235       for (k = 1; k < reduc_inputs.length (); k++)
6236         single_input = gimple_build (&stmts, code, vectype,
6237                                      single_input, reduc_inputs[k]);
6238       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6239
6240       reduc_inputs.truncate (0);
6241       reduc_inputs.safe_push (single_input);
6242     }
6243
6244   tree orig_reduc_input = reduc_inputs[0];
6245
6246   /* If this loop is an epilogue loop that can be skipped after the
6247      main loop, we can only share a reduction operation between the
6248      main loop and the epilogue if we put it at the target of the
6249      skip edge.
6250
6251      We can still reuse accumulators if this check fails.  Doing so has
6252      the minor(?) benefit of making the epilogue loop's scalar result
6253      independent of the main loop's scalar result.  */
6254   bool unify_with_main_loop_p = false;
6255   if (reduc_info->reused_accumulator
6256       && loop_vinfo->skip_this_loop_edge
6257       && single_succ_p (exit_bb)
6258       && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6259     {
6260       unify_with_main_loop_p = true;
6261
6262       basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6263       reduc_inputs[0] = make_ssa_name (vectype);
6264       gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6265       add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6266                    UNKNOWN_LOCATION);
6267       add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6268                    loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6269       exit_gsi = gsi_after_labels (reduc_block);
6270     }
6271
6272   /* Shouldn't be used beyond this point.  */
6273   exit_bb = nullptr;
6274
6275   if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6276       && reduc_fn != IFN_LAST)
6277     {
6278       /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6279          various data values where the condition matched and another vector
6280          (INDUCTION_INDEX) containing all the indexes of those matches.  We
6281          need to extract the last matching index (which will be the index with
6282          highest value) and use this to index into the data vector.
6283          For the case where there were no matches, the data vector will contain
6284          all default values and the index vector will be all zeros.  */
6285
6286       /* Get various versions of the type of the vector of indexes.  */
6287       tree index_vec_type = TREE_TYPE (induction_index);
6288       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6289       tree index_scalar_type = TREE_TYPE (index_vec_type);
6290       tree index_vec_cmp_type = truth_type_for (index_vec_type);
6291
6292       /* Get an unsigned integer version of the type of the data vector.  */
6293       int scalar_precision
6294         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6295       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6296       tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6297                                                 vectype);
6298
6299       /* First we need to create a vector (ZERO_VEC) of zeros and another
6300          vector (MAX_INDEX_VEC) filled with the last matching index, which we
6301          can create using a MAX reduction and then expanding.
6302          In the case where the loop never made any matches, the max index will
6303          be zero.  */
6304
6305       /* Vector of {0, 0, 0,...}.  */
6306       tree zero_vec = build_zero_cst (vectype);
6307
6308       /* Find maximum value from the vector of found indexes.  */
6309       tree max_index = make_ssa_name (index_scalar_type);
6310       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6311                                                           1, induction_index);
6312       gimple_call_set_lhs (max_index_stmt, max_index);
6313       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6314
6315       /* Vector of {max_index, max_index, max_index,...}.  */
6316       tree max_index_vec = make_ssa_name (index_vec_type);
6317       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6318                                                       max_index);
6319       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6320                                                         max_index_vec_rhs);
6321       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6322
6323       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6324          with the vector (INDUCTION_INDEX) of found indexes, choosing values
6325          from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6326          otherwise.  Only one value should match, resulting in a vector
6327          (VEC_COND) with one data value and the rest zeros.
6328          In the case where the loop never made any matches, every index will
6329          match, resulting in a vector with all data values (which will all be
6330          the default value).  */
6331
6332       /* Compare the max index vector to the vector of found indexes to find
6333          the position of the max value.  */
6334       tree vec_compare = make_ssa_name (index_vec_cmp_type);
6335       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6336                                                       induction_index,
6337                                                       max_index_vec);
6338       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6339
6340       /* Use the compare to choose either values from the data vector or
6341          zero.  */
6342       tree vec_cond = make_ssa_name (vectype);
6343       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6344                                                    vec_compare,
6345                                                    reduc_inputs[0],
6346                                                    zero_vec);
6347       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6348
6349       /* Finally we need to extract the data value from the vector (VEC_COND)
6350          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
6351          reduction, but because this doesn't exist, we can use a MAX reduction
6352          instead.  The data value might be signed or a float so we need to cast
6353          it first.
6354          In the case where the loop never made any matches, the data values are
6355          all identical, and so will reduce down correctly.  */
6356
6357       /* Make the matched data values unsigned.  */
6358       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6359       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6360                                        vec_cond);
6361       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6362                                                         VIEW_CONVERT_EXPR,
6363                                                         vec_cond_cast_rhs);
6364       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6365
6366       /* Reduce down to a scalar value.  */
6367       tree data_reduc = make_ssa_name (scalar_type_unsigned);
6368       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6369                                                            1, vec_cond_cast);
6370       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6371       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6372
6373       /* Convert the reduced value back to the result type and set as the
6374          result.  */
6375       gimple_seq stmts = NULL;
6376       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6377                                data_reduc);
6378       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6379       scalar_results.safe_push (new_temp);
6380     }
6381   else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6382            && reduc_fn == IFN_LAST)
6383     {
6384       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
6385          idx = 0;
6386          idx_val = induction_index[0];
6387          val = data_reduc[0];
6388          for (idx = 0, val = init, i = 0; i < nelts; ++i)
6389            if (induction_index[i] > idx_val)
6390              val = data_reduc[i], idx_val = induction_index[i];
6391          return val;  */
6392
6393       tree data_eltype = TREE_TYPE (vectype);
6394       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6395       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6396       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6397       /* Enforced by vectorizable_reduction, which ensures we have target
6398          support before allowing a conditional reduction on variable-length
6399          vectors.  */
6400       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6401       tree idx_val = NULL_TREE, val = NULL_TREE;
6402       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6403         {
6404           tree old_idx_val = idx_val;
6405           tree old_val = val;
6406           idx_val = make_ssa_name (idx_eltype);
6407           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6408                                              build3 (BIT_FIELD_REF, idx_eltype,
6409                                                      induction_index,
6410                                                      bitsize_int (el_size),
6411                                                      bitsize_int (off)));
6412           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6413           val = make_ssa_name (data_eltype);
6414           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6415                                              build3 (BIT_FIELD_REF,
6416                                                      data_eltype,
6417                                                      reduc_inputs[0],
6418                                                      bitsize_int (el_size),
6419                                                      bitsize_int (off)));
6420           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6421           if (off != 0)
6422             {
6423               tree new_idx_val = idx_val;
6424               if (off != v_size - el_size)
6425                 {
6426                   new_idx_val = make_ssa_name (idx_eltype);
6427                   epilog_stmt = gimple_build_assign (new_idx_val,
6428                                                      MAX_EXPR, idx_val,
6429                                                      old_idx_val);
6430                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6431                 }
6432               tree cond = make_ssa_name (boolean_type_node);
6433               epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6434                                                  idx_val, old_idx_val);
6435               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6436               tree new_val = make_ssa_name (data_eltype);
6437               epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6438                                                  cond, val, old_val);
6439               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6440               idx_val = new_idx_val;
6441               val = new_val;
6442             }
6443         }
6444       /* Convert the reduced value back to the result type and set as the
6445          result.  */
6446       gimple_seq stmts = NULL;
6447       val = gimple_convert (&stmts, scalar_type, val);
6448       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6449       scalar_results.safe_push (val);
6450     }
6451
6452   /* 2.3 Create the reduction code, using one of the three schemes described
6453          above. In SLP we simply need to extract all the elements from the
6454          vector (without reducing them), so we use scalar shifts.  */
6455   else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
6456     {
6457       tree tmp;
6458       tree vec_elem_type;
6459
6460       /* Case 1:  Create:
6461          v_out2 = reduc_expr <v_out1>  */
6462
6463       if (dump_enabled_p ())
6464         dump_printf_loc (MSG_NOTE, vect_location,
6465                          "Reduce using direct vector reduction.\n");
6466
6467       gimple_seq stmts = NULL;
6468       vec_elem_type = TREE_TYPE (vectype);
6469       new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6470                                vec_elem_type, reduc_inputs[0]);
6471       new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6472       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6473
6474       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6475           && induc_val)
6476         {
6477           /* Earlier we set the initial value to be a vector if induc_val
6478              values.  Check the result and if it is induc_val then replace
6479              with the original initial value, unless induc_val is
6480              the same as initial_def already.  */
6481           tree zcompare = make_ssa_name (boolean_type_node);
6482           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6483                                              new_temp, induc_val);
6484           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6485           tree initial_def = reduc_info->reduc_initial_values[0];
6486           tmp = make_ssa_name (new_scalar_dest);
6487           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6488                                              initial_def, new_temp);
6489           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6490           new_temp = tmp;
6491         }
6492
6493       scalar_results.safe_push (new_temp);
6494     }
6495   else if (direct_slp_reduc)
6496     {
6497       /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6498          with the elements for other SLP statements replaced with the
6499          neutral value.  We can then do a normal reduction on each vector.  */
6500
6501       /* Enforced by vectorizable_reduction.  */
6502       gcc_assert (reduc_inputs.length () == 1);
6503       gcc_assert (pow2p_hwi (group_size));
6504
6505       gimple_seq seq = NULL;
6506
6507       /* Build a vector {0, 1, 2, ...}, with the same number of elements
6508          and the same element size as VECTYPE.  */
6509       tree index = build_index_vector (vectype, 0, 1);
6510       tree index_type = TREE_TYPE (index);
6511       tree index_elt_type = TREE_TYPE (index_type);
6512       tree mask_type = truth_type_for (index_type);
6513
6514       /* Create a vector that, for each element, identifies which of
6515          the REDUC_GROUP_SIZE results should use it.  */
6516       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6517       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6518                             build_vector_from_val (index_type, index_mask));
6519
6520       /* Get a neutral vector value.  This is simply a splat of the neutral
6521          scalar value if we have one, otherwise the initial scalar value
6522          is itself a neutral value.  */
6523       tree vector_identity = NULL_TREE;
6524       tree neutral_op = NULL_TREE;
6525       if (slp_node)
6526         {
6527           tree initial_value = NULL_TREE;
6528           if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6529             initial_value = reduc_info->reduc_initial_values[0];
6530           neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6531                                                  initial_value, false);
6532         }
6533       if (neutral_op)
6534         vector_identity = gimple_build_vector_from_val (&seq, vectype,
6535                                                         neutral_op);
6536       for (unsigned int i = 0; i < group_size; ++i)
6537         {
6538           /* If there's no univeral neutral value, we can use the
6539              initial scalar value from the original PHI.  This is used
6540              for MIN and MAX reduction, for example.  */
6541           if (!neutral_op)
6542             {
6543               tree scalar_value = reduc_info->reduc_initial_values[i];
6544               scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6545                                              scalar_value);
6546               vector_identity = gimple_build_vector_from_val (&seq, vectype,
6547                                                               scalar_value);
6548             }
6549
6550           /* Calculate the equivalent of:
6551
6552              sel[j] = (index[j] == i);
6553
6554              which selects the elements of REDUC_INPUTS[0] that should
6555              be included in the result.  */
6556           tree compare_val = build_int_cst (index_elt_type, i);
6557           compare_val = build_vector_from_val (index_type, compare_val);
6558           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6559                                    index, compare_val);
6560
6561           /* Calculate the equivalent of:
6562
6563              vec = seq ? reduc_inputs[0] : vector_identity;
6564
6565              VEC is now suitable for a full vector reduction.  */
6566           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6567                                    sel, reduc_inputs[0], vector_identity);
6568
6569           /* Do the reduction and convert it to the appropriate type.  */
6570           tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6571                                       TREE_TYPE (vectype), vec);
6572           scalar = gimple_convert (&seq, scalar_type, scalar);
6573           scalar_results.safe_push (scalar);
6574         }
6575       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6576     }
6577   else
6578     {
6579       bool reduce_with_shift;
6580       tree vec_temp;
6581
6582       gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6583
6584       /* See if the target wants to do the final (shift) reduction
6585          in a vector mode of smaller size and first reduce upper/lower
6586          halves against each other.  */
6587       enum machine_mode mode1 = mode;
6588       tree stype = TREE_TYPE (vectype);
6589       unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6590       unsigned nunits1 = nunits;
6591       if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6592           && reduc_inputs.length () == 1)
6593         {
6594           nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6595           /* For SLP reductions we have to make sure lanes match up, but
6596              since we're doing individual element final reduction reducing
6597              vector width here is even more important.
6598              ???  We can also separate lanes with permutes, for the common
6599              case of power-of-two group-size odd/even extracts would work.  */
6600           if (slp_reduc && nunits != nunits1)
6601             {
6602               nunits1 = least_common_multiple (nunits1, group_size);
6603               gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6604             }
6605         }
6606       if (!slp_reduc
6607           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6608         nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6609
6610       tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6611                                                            stype, nunits1);
6612       reduce_with_shift = have_whole_vector_shift (mode1);
6613       if (!VECTOR_MODE_P (mode1)
6614           || !directly_supported_p (code, vectype1))
6615         reduce_with_shift = false;
6616
6617       /* First reduce the vector to the desired vector size we should
6618          do shift reduction on by combining upper and lower halves.  */
6619       gimple_seq stmts = NULL;
6620       new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6621                                              code, &stmts);
6622       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6623       reduc_inputs[0] = new_temp;
6624
6625       if (reduce_with_shift && (!slp_reduc || group_size == 1))
6626         {
6627           int element_bitsize = tree_to_uhwi (bitsize);
6628           /* Enforced by vectorizable_reduction, which disallows SLP reductions
6629              for variable-length vectors and also requires direct target support
6630              for loop reductions.  */
6631           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6632           int nelements = vec_size_in_bits / element_bitsize;
6633           vec_perm_builder sel;
6634           vec_perm_indices indices;
6635
6636           int elt_offset;
6637
6638           tree zero_vec = build_zero_cst (vectype1);
6639           /* Case 2: Create:
6640              for (offset = nelements/2; offset >= 1; offset/=2)
6641                 {
6642                   Create:  va' = vec_shift <va, offset>
6643                   Create:  va = vop <va, va'>
6644                 }  */
6645
6646           tree rhs;
6647
6648           if (dump_enabled_p ())
6649             dump_printf_loc (MSG_NOTE, vect_location,
6650                              "Reduce using vector shifts\n");
6651
6652           gimple_seq stmts = NULL;
6653           new_temp = gimple_convert (&stmts, vectype1, new_temp);
6654           for (elt_offset = nelements / 2;
6655                elt_offset >= 1;
6656                elt_offset /= 2)
6657             {
6658               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6659               indices.new_vector (sel, 2, nelements);
6660               tree mask = vect_gen_perm_mask_any (vectype1, indices);
6661               new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6662                                        new_temp, zero_vec, mask);
6663               new_temp = gimple_build (&stmts, code,
6664                                        vectype1, new_name, new_temp);
6665             }
6666           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6667
6668           /* 2.4  Extract the final scalar result.  Create:
6669              s_out3 = extract_field <v_out2, bitpos>  */
6670
6671           if (dump_enabled_p ())
6672             dump_printf_loc (MSG_NOTE, vect_location,
6673                              "extract scalar result\n");
6674
6675           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6676                         bitsize, bitsize_zero_node);
6677           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6678           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6679           gimple_assign_set_lhs (epilog_stmt, new_temp);
6680           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6681           scalar_results.safe_push (new_temp);
6682         }
6683       else
6684         {
6685           /* Case 3: Create:
6686              s = extract_field <v_out2, 0>
6687              for (offset = element_size;
6688                   offset < vector_size;
6689                   offset += element_size;)
6690                {
6691                  Create:  s' = extract_field <v_out2, offset>
6692                  Create:  s = op <s, s'>  // For non SLP cases
6693                }  */
6694
6695           if (dump_enabled_p ())
6696             dump_printf_loc (MSG_NOTE, vect_location,
6697                              "Reduce using scalar code.\n");
6698
6699           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6700           int element_bitsize = tree_to_uhwi (bitsize);
6701           tree compute_type = TREE_TYPE (vectype);
6702           gimple_seq stmts = NULL;
6703           FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6704             {
6705               int bit_offset;
6706               new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6707                                        vec_temp, bitsize, bitsize_zero_node);
6708
6709               /* In SLP we don't need to apply reduction operation, so we just
6710                  collect s' values in SCALAR_RESULTS.  */
6711               if (slp_reduc)
6712                 scalar_results.safe_push (new_temp);
6713
6714               for (bit_offset = element_bitsize;
6715                    bit_offset < vec_size_in_bits;
6716                    bit_offset += element_bitsize)
6717                 {
6718                   tree bitpos = bitsize_int (bit_offset);
6719                   new_name = gimple_build (&stmts, BIT_FIELD_REF,
6720                                            compute_type, vec_temp,
6721                                            bitsize, bitpos);
6722                   if (slp_reduc)
6723                     {
6724                       /* In SLP we don't need to apply reduction operation, so
6725                          we just collect s' values in SCALAR_RESULTS.  */
6726                       new_temp = new_name;
6727                       scalar_results.safe_push (new_name);
6728                     }
6729                   else
6730                     new_temp = gimple_build (&stmts, code, compute_type,
6731                                              new_name, new_temp);
6732                 }
6733             }
6734
6735           /* The only case where we need to reduce scalar results in SLP, is
6736              unrolling.  If the size of SCALAR_RESULTS is greater than
6737              REDUC_GROUP_SIZE, we reduce them combining elements modulo
6738              REDUC_GROUP_SIZE.  */
6739           if (slp_reduc)
6740             {
6741               tree res, first_res, new_res;
6742
6743               /* Reduce multiple scalar results in case of SLP unrolling.  */
6744               for (j = group_size; scalar_results.iterate (j, &res);
6745                    j++)
6746                 {
6747                   first_res = scalar_results[j % group_size];
6748                   new_res = gimple_build (&stmts, code, compute_type,
6749                                           first_res, res);
6750                   scalar_results[j % group_size] = new_res;
6751                 }
6752               scalar_results.truncate (group_size);
6753               for (k = 0; k < group_size; k++)
6754                 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6755                                                     scalar_results[k]);
6756             }
6757           else
6758             {
6759               /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
6760               new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6761               scalar_results.safe_push (new_temp);
6762             }
6763
6764           gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6765         }
6766
6767       if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6768           && induc_val)
6769         {
6770           /* Earlier we set the initial value to be a vector if induc_val
6771              values.  Check the result and if it is induc_val then replace
6772              with the original initial value, unless induc_val is
6773              the same as initial_def already.  */
6774           tree zcompare = make_ssa_name (boolean_type_node);
6775           epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6776                                              scalar_results[0], induc_val);
6777           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6778           tree initial_def = reduc_info->reduc_initial_values[0];
6779           tree tmp = make_ssa_name (new_scalar_dest);
6780           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6781                                              initial_def, scalar_results[0]);
6782           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6783           scalar_results[0] = tmp;
6784         }
6785     }
6786
6787   /* 2.5 Adjust the final result by the initial value of the reduction
6788          variable. (When such adjustment is not needed, then
6789          'adjustment_def' is zero).  For example, if code is PLUS we create:
6790          new_temp = loop_exit_def + adjustment_def  */
6791
6792   if (adjustment_def)
6793     {
6794       gcc_assert (!slp_reduc || group_size == 1);
6795       gimple_seq stmts = NULL;
6796       if (double_reduc)
6797         {
6798           gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6799           adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6800           new_temp = gimple_build (&stmts, code, vectype,
6801                                    reduc_inputs[0], adjustment_def);
6802         }
6803       else
6804         {
6805           new_temp = scalar_results[0];
6806           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6807           adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6808                                            adjustment_def);
6809           new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6810           new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6811                                    new_temp, adjustment_def);
6812           new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6813         }
6814
6815       epilog_stmt = gimple_seq_last_stmt (stmts);
6816       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6817       scalar_results[0] = new_temp;
6818     }
6819
6820   /* Record this operation if it could be reused by the epilogue loop.  */
6821   if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6822       && reduc_inputs.length () == 1)
6823     loop_vinfo->reusable_accumulators.put (scalar_results[0],
6824                                            { orig_reduc_input, reduc_info });
6825
6826   if (double_reduc)
6827     loop = outer_loop;
6828
6829   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
6830           phis with new adjusted scalar results, i.e., replace use <s_out0>
6831           with use <s_out4>.
6832
6833      Transform:
6834         loop_exit:
6835           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6836           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6837           v_out2 = reduce <v_out1>
6838           s_out3 = extract_field <v_out2, 0>
6839           s_out4 = adjust_result <s_out3>
6840           use <s_out0>
6841           use <s_out0>
6842
6843      into:
6844
6845         loop_exit:
6846           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
6847           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
6848           v_out2 = reduce <v_out1>
6849           s_out3 = extract_field <v_out2, 0>
6850           s_out4 = adjust_result <s_out3>
6851           use <s_out4>
6852           use <s_out4> */
6853
6854   gcc_assert (live_out_stmts.size () == scalar_results.length ());
6855   auto_vec<gimple *> phis;
6856   for (k = 0; k < live_out_stmts.size (); k++)
6857     {
6858       stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6859       scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6860
6861       /* Find the loop-closed-use at the loop exit of the original scalar
6862          result.  (The reduction result is expected to have two immediate uses,
6863          one at the latch block, and one at the loop exit).  For double
6864          reductions we are looking for exit phis of the outer loop.  */
6865       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6866         {
6867           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6868             {
6869               if (!is_gimple_debug (USE_STMT (use_p))
6870                   && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6871                 phis.safe_push (USE_STMT (use_p));
6872             }
6873           else
6874             {
6875               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6876                 {
6877                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6878
6879                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6880                     {
6881                       if (!flow_bb_inside_loop_p (loop,
6882                                              gimple_bb (USE_STMT (phi_use_p)))
6883                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6884                         phis.safe_push (USE_STMT (phi_use_p));
6885                     }
6886                 }
6887             }
6888         }
6889
6890       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6891         {
6892           /* Replace the uses:  */
6893           orig_name = PHI_RESULT (exit_phi);
6894
6895           /* Look for a single use at the target of the skip edge.  */
6896           if (unify_with_main_loop_p)
6897             {
6898               use_operand_p use_p;
6899               gimple *user;
6900               if (!single_imm_use (orig_name, &use_p, &user))
6901                 gcc_unreachable ();
6902               orig_name = gimple_get_lhs (user);
6903             }
6904
6905           scalar_result = scalar_results[k];
6906           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6907             {
6908               FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6909                 SET_USE (use_p, scalar_result);
6910               update_stmt (use_stmt);
6911             }
6912         }
6913
6914       phis.truncate (0);
6915     }
6916 }
6917
6918 /* Return a vector of type VECTYPE that is equal to the vector select
6919    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6920    before GSI.  */
6921
6922 static tree
6923 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6924                      tree vec, tree identity)
6925 {
6926   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6927   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6928                                           mask, vec, identity);
6929   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6930   return cond;
6931 }
6932
6933 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6934    order, starting with LHS.  Insert the extraction statements before GSI and
6935    associate the new scalar SSA names with variable SCALAR_DEST.
6936    If MASK is nonzero mask the input and then operate on it unconditionally.
6937    Return the SSA name for the result.  */
6938
6939 static tree
6940 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6941                        tree_code code, tree lhs, tree vector_rhs,
6942                        tree mask)
6943 {
6944   tree vectype = TREE_TYPE (vector_rhs);
6945   tree scalar_type = TREE_TYPE (vectype);
6946   tree bitsize = TYPE_SIZE (scalar_type);
6947   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6948   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6949
6950   /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
6951      to perform an unconditional element-wise reduction of it.  */
6952   if (mask)
6953     {
6954       tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
6955                                                    "masked_vector_rhs");
6956       tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
6957                                                   false);
6958       tree vector_identity = build_vector_from_val (vectype, neutral_op);
6959       gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
6960                                              mask, vector_rhs, vector_identity);
6961       gsi_insert_before (gsi, select, GSI_SAME_STMT);
6962       vector_rhs = masked_vector_rhs;
6963     }
6964
6965   for (unsigned HOST_WIDE_INT bit_offset = 0;
6966        bit_offset < vec_size_in_bits;
6967        bit_offset += element_bitsize)
6968     {
6969       tree bitpos = bitsize_int (bit_offset);
6970       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6971                          bitsize, bitpos);
6972
6973       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6974       rhs = make_ssa_name (scalar_dest, stmt);
6975       gimple_assign_set_lhs (stmt, rhs);
6976       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6977
6978       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6979       tree new_name = make_ssa_name (scalar_dest, stmt);
6980       gimple_assign_set_lhs (stmt, new_name);
6981       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6982       lhs = new_name;
6983     }
6984   return lhs;
6985 }
6986
6987 /* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
6988    type of the vector input.  */
6989
6990 static internal_fn
6991 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6992 {
6993   internal_fn mask_reduc_fn;
6994   internal_fn mask_len_reduc_fn;
6995
6996   switch (reduc_fn)
6997     {
6998     case IFN_FOLD_LEFT_PLUS:
6999       mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7000       mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7001       break;
7002
7003     default:
7004       return IFN_LAST;
7005     }
7006
7007   if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7008                                       OPTIMIZE_FOR_SPEED))
7009     return mask_reduc_fn;
7010   if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7011                                       OPTIMIZE_FOR_SPEED))
7012     return mask_len_reduc_fn;
7013   return IFN_LAST;
7014 }
7015
7016 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT_INFO is the
7017    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
7018    statement.  CODE is the operation performed by STMT_INFO and OPS are
7019    its scalar operands.  REDUC_INDEX is the index of the operand in
7020    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
7021    implements in-order reduction, or IFN_LAST if we should open-code it.
7022    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
7023    that should be used to control the operation in a fully-masked loop.  */
7024
7025 static bool
7026 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7027                                stmt_vec_info stmt_info,
7028                                gimple_stmt_iterator *gsi,
7029                                gimple **vec_stmt, slp_tree slp_node,
7030                                gimple *reduc_def_stmt,
7031                                code_helper code, internal_fn reduc_fn,
7032                                tree *ops, int num_ops, tree vectype_in,
7033                                int reduc_index, vec_loop_masks *masks,
7034                                vec_loop_lens *lens)
7035 {
7036   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7037   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7038   internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7039
7040   int ncopies;
7041   if (slp_node)
7042     ncopies = 1;
7043   else
7044     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7045
7046   gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7047   gcc_assert (ncopies == 1);
7048
7049   bool is_cond_op = false;
7050   if (!code.is_tree_code ())
7051     {
7052       code = conditional_internal_fn_code (internal_fn (code));
7053       gcc_assert (code != ERROR_MARK);
7054       is_cond_op = true;
7055     }
7056
7057   gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7058
7059   if (slp_node)
7060     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7061                           TYPE_VECTOR_SUBPARTS (vectype_in)));
7062
7063   /* The operands either come from a binary operation or an IFN_COND operation.
7064      The former is a gimple assign with binary rhs and the latter is a
7065      gimple call with four arguments.  */
7066   gcc_assert (num_ops == 2 || num_ops == 4);
7067
7068   int group_size = 1;
7069   stmt_vec_info scalar_dest_def_info;
7070   auto_vec<tree> vec_oprnds0, vec_opmask;
7071   if (slp_node)
7072     {
7073       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
7074                                                       + (1 - reduc_index)],
7075                                                       &vec_oprnds0);
7076       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7077       scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7078       /* For an IFN_COND_OP we also need the vector mask operand.  */
7079       if (is_cond_op)
7080         vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
7081     }
7082   else
7083     {
7084       tree op0, opmask;
7085       if (!is_cond_op)
7086         op0 = ops[1 - reduc_index];
7087       else
7088         {
7089           op0 = ops[2 + (1 - reduc_index)];
7090           opmask = ops[0];
7091         }
7092       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7093                                      op0, &vec_oprnds0);
7094       scalar_dest_def_info = stmt_info;
7095
7096       /* For an IFN_COND_OP we also need the vector mask operand.  */
7097       if (is_cond_op)
7098         vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7099                                        opmask, &vec_opmask);
7100     }
7101
7102   gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7103   tree scalar_dest = gimple_get_lhs (sdef);
7104   tree scalar_type = TREE_TYPE (scalar_dest);
7105   tree reduc_var = gimple_phi_result (reduc_def_stmt);
7106
7107   int vec_num = vec_oprnds0.length ();
7108   gcc_assert (vec_num == 1 || slp_node);
7109   tree vec_elem_type = TREE_TYPE (vectype_out);
7110   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7111
7112   tree vector_identity = NULL_TREE;
7113   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7114     {
7115       vector_identity = build_zero_cst (vectype_out);
7116       if (!HONOR_SIGNED_ZEROS (vectype_out))
7117         ;
7118       else
7119         {
7120           gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7121           vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7122                                         vector_identity);
7123         }
7124     }
7125
7126   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7127   int i;
7128   tree def0;
7129   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7130     {
7131       gimple *new_stmt;
7132       tree mask = NULL_TREE;
7133       tree len = NULL_TREE;
7134       tree bias = NULL_TREE;
7135       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7136         {
7137           tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
7138                                                vec_num, vectype_in, i);
7139           if (is_cond_op)
7140             mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
7141                                      loop_mask, vec_opmask[i], gsi);
7142           else
7143             mask = loop_mask;
7144         }
7145       else if (is_cond_op)
7146         mask = vec_opmask[i];
7147       if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7148         {
7149           len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7150                                    i, 1);
7151           signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7152           bias = build_int_cst (intQI_type_node, biasval);
7153           if (!is_cond_op)
7154             mask = build_minus_one_cst (truth_type_for (vectype_in));
7155         }
7156
7157       /* Handle MINUS by adding the negative.  */
7158       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7159         {
7160           tree negated = make_ssa_name (vectype_out);
7161           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7162           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7163           def0 = negated;
7164         }
7165
7166       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7167           && mask && mask_reduc_fn == IFN_LAST)
7168         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7169                                     vector_identity);
7170
7171       /* On the first iteration the input is simply the scalar phi
7172          result, and for subsequent iterations it is the output of
7173          the preceding operation.  */
7174       if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7175         {
7176           if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7177             new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7178                                                    def0, mask, len, bias);
7179           else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7180             new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7181                                                    def0, mask);
7182           else
7183             new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7184                                                    def0);
7185           /* For chained SLP reductions the output of the previous reduction
7186              operation serves as the input of the next. For the final statement
7187              the output cannot be a temporary - we reuse the original
7188              scalar destination of the last statement.  */
7189           if (i != vec_num - 1)
7190             {
7191               gimple_set_lhs (new_stmt, scalar_dest_var);
7192               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7193               gimple_set_lhs (new_stmt, reduc_var);
7194             }
7195         }
7196       else
7197         {
7198           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7199                                              tree_code (code), reduc_var, def0,
7200                                              mask);
7201           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7202           /* Remove the statement, so that we can use the same code paths
7203              as for statements that we've just created.  */
7204           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7205           gsi_remove (&tmp_gsi, true);
7206         }
7207
7208       if (i == vec_num - 1)
7209         {
7210           gimple_set_lhs (new_stmt, scalar_dest);
7211           vect_finish_replace_stmt (loop_vinfo,
7212                                     scalar_dest_def_info,
7213                                     new_stmt);
7214         }
7215       else
7216         vect_finish_stmt_generation (loop_vinfo,
7217                                      scalar_dest_def_info,
7218                                      new_stmt, gsi);
7219
7220       if (slp_node)
7221         slp_node->push_vec_def (new_stmt);
7222       else
7223         {
7224           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7225           *vec_stmt = new_stmt;
7226         }
7227     }
7228
7229   return true;
7230 }
7231
7232 /* Function is_nonwrapping_integer_induction.
7233
7234    Check if STMT_VINO (which is part of loop LOOP) both increments and
7235    does not cause overflow.  */
7236
7237 static bool
7238 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7239 {
7240   gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7241   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7242   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7243   tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7244   widest_int ni, max_loop_value, lhs_max;
7245   wi::overflow_type overflow = wi::OVF_NONE;
7246
7247   /* Make sure the loop is integer based.  */
7248   if (TREE_CODE (base) != INTEGER_CST
7249       || TREE_CODE (step) != INTEGER_CST)
7250     return false;
7251
7252   /* Check that the max size of the loop will not wrap.  */
7253
7254   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7255     return true;
7256
7257   if (! max_stmt_executions (loop, &ni))
7258     return false;
7259
7260   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7261                             &overflow);
7262   if (overflow)
7263     return false;
7264
7265   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7266                             TYPE_SIGN (lhs_type), &overflow);
7267   if (overflow)
7268     return false;
7269
7270   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7271           <= TYPE_PRECISION (lhs_type));
7272 }
7273
7274 /* Check if masking can be supported by inserting a conditional expression.
7275    CODE is the code for the operation.  COND_FN is the conditional internal
7276    function, if it exists.  VECTYPE_IN is the type of the vector input.  */
7277 static bool
7278 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7279                          tree vectype_in)
7280 {
7281   if (cond_fn != IFN_LAST
7282       && direct_internal_fn_supported_p (cond_fn, vectype_in,
7283                                          OPTIMIZE_FOR_SPEED))
7284     return false;
7285
7286   if (code.is_tree_code ())
7287     switch (tree_code (code))
7288       {
7289       case DOT_PROD_EXPR:
7290       case SAD_EXPR:
7291         return true;
7292
7293       default:
7294         break;
7295       }
7296   return false;
7297 }
7298
7299 /* Insert a conditional expression to enable masked vectorization.  CODE is the
7300    code for the operation.  VOP is the array of operands.  MASK is the loop
7301    mask.  GSI is a statement iterator used to place the new conditional
7302    expression.  */
7303 static void
7304 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7305                       gimple_stmt_iterator *gsi)
7306 {
7307   switch (tree_code (code))
7308     {
7309     case DOT_PROD_EXPR:
7310       {
7311         tree vectype = TREE_TYPE (vop[1]);
7312         tree zero = build_zero_cst (vectype);
7313         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7314         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7315                                                mask, vop[1], zero);
7316         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7317         vop[1] = masked_op1;
7318         break;
7319       }
7320
7321     case SAD_EXPR:
7322       {
7323         tree vectype = TREE_TYPE (vop[1]);
7324         tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7325         gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7326                                                mask, vop[1], vop[0]);
7327         gsi_insert_before (gsi, select, GSI_SAME_STMT);
7328         vop[1] = masked_op1;
7329         break;
7330       }
7331
7332     default:
7333       gcc_unreachable ();
7334     }
7335 }
7336
7337 /* Given an operation with CODE in loop reduction path whose reduction PHI is
7338    specified by REDUC_INFO, the operation has TYPE of scalar result, and its
7339    input vectype is represented by VECTYPE_IN. The vectype of vectorized result
7340    may be different from VECTYPE_IN, either in base type or vectype lanes,
7341    lane-reducing operation is the case.  This function check if it is possible,
7342    and how to perform partial vectorization on the operation in the context
7343    of LOOP_VINFO.  */
7344
7345 static void
7346 vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
7347                                             stmt_vec_info reduc_info,
7348                                             slp_tree slp_node,
7349                                             code_helper code, tree type,
7350                                             tree vectype_in)
7351 {
7352   enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7353   internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7354   internal_fn cond_fn = get_conditional_internal_fn (code, type);
7355
7356   if (reduc_type != FOLD_LEFT_REDUCTION
7357       && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7358       && (cond_fn == IFN_LAST
7359           || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7360                                               OPTIMIZE_FOR_SPEED)))
7361     {
7362       if (dump_enabled_p ())
7363         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7364                          "can't operate on partial vectors because"
7365                          " no conditional operation is available.\n");
7366       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7367     }
7368   else if (reduc_type == FOLD_LEFT_REDUCTION
7369            && reduc_fn == IFN_LAST
7370            && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
7371                                        SSA_NAME))
7372     {
7373       if (dump_enabled_p ())
7374         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7375                         "can't operate on partial vectors because"
7376                         " no conditional operation is available.\n");
7377       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7378     }
7379   else if (reduc_type == FOLD_LEFT_REDUCTION
7380            && internal_fn_mask_index (reduc_fn) == -1
7381            && FLOAT_TYPE_P (vectype_in)
7382            && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
7383     {
7384       if (dump_enabled_p ())
7385         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7386                          "can't operate on partial vectors because"
7387                          " signed zeros cannot be preserved.\n");
7388       LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7389     }
7390   else
7391     {
7392       internal_fn mask_reduc_fn
7393                         = get_masked_reduction_fn (reduc_fn, vectype_in);
7394       vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7395       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
7396       unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node,
7397                                                vectype_in);
7398
7399       if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7400         vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
7401       else
7402         vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
7403     }
7404 }
7405
7406 /* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
7407    the context of LOOP_VINFO, and vector cost will be recorded in COST_VEC,
7408    and the analysis is for slp if SLP_NODE is not NULL.
7409
7410    For a lane-reducing operation, the loop reduction path that it lies in,
7411    may contain normal operation, or other lane-reducing operation of different
7412    input type size, an example as:
7413
7414      int sum = 0;
7415      for (i)
7416        {
7417          ...
7418          sum += d0[i] * d1[i];       // dot-prod <vector(16) char>
7419          sum += w[i];                // widen-sum <vector(16) char>
7420          sum += abs(s0[i] - s1[i]);  // sad <vector(8) short>
7421          sum += n[i];                // normal <vector(4) int>
7422          ...
7423        }
7424
7425    Vectorization factor is essentially determined by operation whose input
7426    vectype has the most lanes ("vector(16) char" in the example), while we
7427    need to choose input vectype with the least lanes ("vector(4) int" in the
7428    example) to determine effective number of vector reduction PHIs.  */
7429
7430 bool
7431 vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7432                             slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7433 {
7434   gimple *stmt = stmt_info->stmt;
7435
7436   if (!lane_reducing_stmt_p (stmt))
7437     return false;
7438
7439   tree type = TREE_TYPE (gimple_assign_lhs (stmt));
7440
7441   if (!INTEGRAL_TYPE_P (type))
7442     return false;
7443
7444   /* Do not try to vectorize bit-precision reductions.  */
7445   if (!type_has_mode_precision_p (type))
7446     return false;
7447
7448   stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7449
7450   /* TODO: Support lane-reducing operation that does not directly participate
7451      in loop reduction.  */
7452   if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
7453     return false;
7454
7455   /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
7456      recoginized.  */
7457   gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
7458   gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);
7459
7460   for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
7461     {
7462       stmt_vec_info def_stmt_info;
7463       slp_tree slp_op;
7464       tree op;
7465       tree vectype;
7466       enum vect_def_type dt;
7467
7468       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
7469                                &slp_op, &dt, &vectype, &def_stmt_info))
7470         {
7471           if (dump_enabled_p ())
7472             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7473                              "use not simple.\n");
7474           return false;
7475         }
7476
7477       if (!vectype)
7478         {
7479           vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
7480                                                  slp_op);
7481           if (!vectype)
7482             return false;
7483         }
7484
7485       if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
7486         {
7487           if (dump_enabled_p ())
7488             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7489                              "incompatible vector types for invariants\n");
7490           return false;
7491         }
7492
7493       if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7494         continue;
7495
7496       /* There should be at most one cycle def in the stmt.  */
7497       if (VECTORIZABLE_CYCLE_DEF (dt))
7498         return false;
7499     }
7500
7501   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
7502
7503   gcc_assert (vectype_in);
7504
7505   /* Compute number of effective vector statements for costing.  */
7506   unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
7507                                                        vectype_in);
7508   gcc_assert (ncopies_for_cost >= 1);
7509
7510   if (vect_is_emulated_mixed_dot_prod (stmt_info))
7511     {
7512       /* We need extra two invariants: one that contains the minimum signed
7513          value and one that contains half of its negative.  */
7514       int prologue_stmts = 2;
7515       unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
7516                                         scalar_to_vec, stmt_info, 0,
7517                                         vect_prologue);
7518       if (dump_enabled_p ())
7519         dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
7520                      "extra prologue_cost = %d .\n", cost);
7521
7522       /* Three dot-products and a subtraction.  */
7523       ncopies_for_cost *= 4;
7524     }
7525
7526   record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
7527                     0, vect_body);
7528
7529   if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7530     {
7531       enum tree_code code = gimple_assign_rhs_code (stmt);
7532       vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
7533                                                   slp_node, code, type,
7534                                                   vectype_in);
7535     }
7536
7537   /* Transform via vect_transform_reduction.  */
7538   STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7539   return true;
7540 }
7541
7542 /* Function vectorizable_reduction.
7543
7544    Check if STMT_INFO performs a reduction operation that can be vectorized.
7545    If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7546    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7547    Return true if STMT_INFO is vectorizable in this way.
7548
7549    This function also handles reduction idioms (patterns) that have been
7550    recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
7551    may be of this form:
7552      X = pattern_expr (arg0, arg1, ..., X)
7553    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7554    sequence that had been detected and replaced by the pattern-stmt
7555    (STMT_INFO).
7556
7557    This function also handles reduction of condition expressions, for example:
7558      for (int i = 0; i < N; i++)
7559        if (a[i] < value)
7560          last = a[i];
7561    This is handled by vectorising the loop and creating an additional vector
7562    containing the loop indexes for which "a[i] < value" was true.  In the
7563    function epilogue this is reduced to a single max value and then used to
7564    index into the vector of results.
7565
7566    In some cases of reduction patterns, the type of the reduction variable X is
7567    different than the type of the other arguments of STMT_INFO.
7568    In such cases, the vectype that is used when transforming STMT_INFO into
7569    a vector stmt is different than the vectype that is used to determine the
7570    vectorization factor, because it consists of a different number of elements
7571    than the actual number of elements that are being operated upon in parallel.
7572
7573    For example, consider an accumulation of shorts into an int accumulator.
7574    On some targets it's possible to vectorize this pattern operating on 8
7575    shorts at a time (hence, the vectype for purposes of determining the
7576    vectorization factor should be V8HI); on the other hand, the vectype that
7577    is used to create the vector form is actually V4SI (the type of the result).
7578
7579    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7580    indicates what is the actual level of parallelism (V8HI in the example), so
7581    that the right vectorization factor would be derived.  This vectype
7582    corresponds to the type of arguments to the reduction stmt, and should *NOT*
7583    be used to create the vectorized stmt.  The right vectype for the vectorized
7584    stmt is obtained from the type of the result X:
7585       get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7586
7587    This means that, contrary to "regular" reductions (or "regular" stmts in
7588    general), the following equation:
7589       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7590    does *NOT* necessarily hold for reduction patterns.  */
7591
7592 bool
7593 vectorizable_reduction (loop_vec_info loop_vinfo,
7594                         stmt_vec_info stmt_info, slp_tree slp_node,
7595                         slp_instance slp_node_instance,
7596                         stmt_vector_for_cost *cost_vec)
7597 {
7598   tree vectype_in = NULL_TREE;
7599   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7600   enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7601   stmt_vec_info cond_stmt_vinfo = NULL;
7602   int i;
7603   int ncopies;
7604   bool single_defuse_cycle = false;
7605   bool nested_cycle = false;
7606   bool double_reduc = false;
7607   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7608   tree cond_reduc_val = NULL_TREE;
7609
7610   /* Make sure it was already recognized as a reduction computation.  */
7611   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7612       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7613       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7614     return false;
7615
7616   /* The stmt we store reduction analysis meta on.  */
7617   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7618   reduc_info->is_reduc_info = true;
7619
7620   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7621     {
7622       if (is_a <gphi *> (stmt_info->stmt))
7623         {
7624           if (slp_node)
7625             {
7626               /* We eventually need to set a vector type on invariant
7627                  arguments.  */
7628               unsigned j;
7629               slp_tree child;
7630               FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7631                 if (!vect_maybe_update_slp_op_vectype
7632                        (child, SLP_TREE_VECTYPE (slp_node)))
7633                   {
7634                     if (dump_enabled_p ())
7635                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7636                                        "incompatible vector types for "
7637                                        "invariants\n");
7638                     return false;
7639                   }
7640             }
7641           /* Analysis for double-reduction is done on the outer
7642              loop PHI, nested cycles have no further restrictions.  */
7643           STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7644         }
7645       else
7646         STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7647       return true;
7648     }
7649
7650   stmt_vec_info orig_stmt_of_analysis = stmt_info;
7651   stmt_vec_info phi_info = stmt_info;
7652   if (!is_a <gphi *> (stmt_info->stmt))
7653     {
7654       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7655       return true;
7656     }
7657   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7658     {
7659       if (gimple_bb (stmt_info->stmt) != loop->header)
7660         {
7661           /* For SLP we arrive here for both the inner loop LC PHI and
7662              the outer loop PHI.  The latter is what we want to analyze
7663              the reduction with.  */
7664           gcc_assert (slp_node);
7665           return true;
7666         }
7667       use_operand_p use_p;
7668       gimple *use_stmt;
7669       bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7670                                  &use_p, &use_stmt);
7671       gcc_assert (res);
7672       phi_info = loop_vinfo->lookup_stmt (use_stmt);
7673     }
7674
7675   if (slp_node)
7676     {
7677       slp_node_instance->reduc_phis = slp_node;
7678       /* ???  We're leaving slp_node to point to the PHIs, we only
7679          need it to get at the number of vector stmts which wasn't
7680          yet initialized for the instance root.  */
7681     }
7682
7683   /* PHIs should not participate in patterns.  */
7684   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7685   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7686
7687   /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7688      and compute the reduction chain length.  Discover the real
7689      reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
7690   tree reduc_def
7691     = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7692                              loop_latch_edge
7693                                (gimple_bb (reduc_def_phi)->loop_father));
7694   unsigned reduc_chain_length = 0;
7695   bool only_slp_reduc_chain = true;
7696   stmt_info = NULL;
7697   slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7698   /* For double-reductions we start SLP analysis at the inner loop LC PHI
7699      which is the def of the outer loop live stmt.  */
7700   if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
7701       && slp_node)
7702     slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7703   while (reduc_def != PHI_RESULT (reduc_def_phi))
7704     {
7705       stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7706       stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7707       int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);
7708
7709       if (reduc_idx == -1)
7710         {
7711           if (dump_enabled_p ())
7712             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7713                              "reduction chain broken by patterns.\n");
7714           return false;
7715         }
7716       if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7717         only_slp_reduc_chain = false;
7718       /* For epilogue generation live members of the chain need
7719          to point back to the PHI via their original stmt for
7720          info_for_reduction to work.  For SLP we need to look at
7721          all lanes here - even though we only will vectorize from
7722          the SLP node with live lane zero the other live lanes also
7723          need to be identified as part of a reduction to be able
7724          to skip code generation for them.  */
7725       if (slp_for_stmt_info)
7726         {
7727           for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7728             if (STMT_VINFO_LIVE_P (s))
7729               STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7730         }
7731       else if (STMT_VINFO_LIVE_P (vdef))
7732         STMT_VINFO_REDUC_DEF (def) = phi_info;
7733       gimple_match_op op;
7734       if (!gimple_extract_op (vdef->stmt, &op))
7735         {
7736           if (dump_enabled_p ())
7737             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7738                              "reduction chain includes unsupported"
7739                              " statement type.\n");
7740           return false;
7741         }
7742       if (CONVERT_EXPR_CODE_P (op.code))
7743         {
7744           if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7745             {
7746               if (dump_enabled_p ())
7747                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7748                                  "conversion in the reduction chain.\n");
7749               return false;
7750             }
7751         }
7752       else
7753         {
7754           /* First non-conversion stmt.  */
7755           if (!stmt_info)
7756             stmt_info = vdef;
7757
7758           if (lane_reducing_op_p (op.code))
7759             {
7760               enum vect_def_type dt;
7761               tree vectype_op;
7762
7763               /* The last operand of lane-reducing operation is for
7764                  reduction.  */
7765               gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);
7766
7767               if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
7768                 return false;
7769
7770               tree type_op = TREE_TYPE (op.ops[0]);
7771
7772               if (!vectype_op)
7773                 {
7774                   vectype_op = get_vectype_for_scalar_type (loop_vinfo,
7775                                                             type_op);
7776                   if (!vectype_op)
7777                     return false;
7778                 }
7779
7780               /* For lane-reducing operation vectorizable analysis needs the
7781                  reduction PHI information.  */
7782               STMT_VINFO_REDUC_DEF (def) = phi_info;
7783
7784               /* Each lane-reducing operation has its own input vectype, while
7785                  reduction PHI will record the input vectype with the least
7786                  lanes.  */
7787               STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;
7788
7789               /* To accommodate lane-reducing operations of mixed input
7790                  vectypes, choose input vectype with the least lanes for the
7791                  reduction PHI statement, which would result in the most
7792                  ncopies for vectorized reduction results.  */
7793               if (!vectype_in
7794                   || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7795                        < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
7796                 vectype_in = vectype_op;
7797             }
7798           else
7799             vectype_in = STMT_VINFO_VECTYPE (phi_info);
7800         }
7801
7802       reduc_def = op.ops[reduc_idx];
7803       reduc_chain_length++;
7804       if (!stmt_info && slp_node)
7805         slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7806     }
7807   /* PHIs should not participate in patterns.  */
7808   gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7809
7810   if (nested_in_vect_loop_p (loop, stmt_info))
7811     {
7812       loop = loop->inner;
7813       nested_cycle = true;
7814     }
7815
7816   /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7817      element.  */
7818   if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7819     {
7820       gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7821       stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7822     }
7823   if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7824     gcc_assert (slp_node
7825                 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7826
7827   /* 1. Is vectorizable reduction?  */
7828   /* Not supportable if the reduction variable is used in the loop, unless
7829      it's a reduction chain.  */
7830   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7831       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7832     return false;
7833
7834   /* Reductions that are not used even in an enclosing outer-loop,
7835      are expected to be "live" (used out of the loop).  */
7836   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7837       && !STMT_VINFO_LIVE_P (stmt_info))
7838     return false;
7839
7840   /* 2. Has this been recognized as a reduction pattern?
7841
7842      Check if STMT represents a pattern that has been recognized
7843      in earlier analysis stages.  For stmts that represent a pattern,
7844      the STMT_VINFO_RELATED_STMT field records the last stmt in
7845      the original sequence that constitutes the pattern.  */
7846
7847   stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7848   if (orig_stmt_info)
7849     {
7850       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7851       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7852     }
7853
7854   /* 3. Check the operands of the operation.  The first operands are defined
7855         inside the loop body. The last operand is the reduction variable,
7856         which is defined by the loop-header-phi.  */
7857
7858   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7859   STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7860   STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7861
7862   gimple_match_op op;
7863   if (!gimple_extract_op (stmt_info->stmt, &op))
7864     gcc_unreachable ();
7865   bool lane_reducing = lane_reducing_op_p (op.code);
7866
7867   if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7868       && !SCALAR_FLOAT_TYPE_P (op.type))
7869     return false;
7870
7871   /* Do not try to vectorize bit-precision reductions.  */
7872   if (!type_has_mode_precision_p (op.type))
7873     return false;
7874
7875   /* Lane-reducing ops also never can be used in a SLP reduction group
7876      since we'll mix lanes belonging to different reductions.  But it's
7877      OK to use them in a reduction chain or when the reduction group
7878      has just one element.  */
7879   if (lane_reducing
7880       && slp_node
7881       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7882       && SLP_TREE_LANES (slp_node) > 1)
7883     {
7884       if (dump_enabled_p ())
7885         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886                          "lane-reducing reduction in reduction group.\n");
7887       return false;
7888     }
7889
7890   /* All uses but the last are expected to be defined in the loop.
7891      The last use is the reduction variable.  In case of nested cycle this
7892      assumption is not true: we use reduc_index to record the index of the
7893      reduction variable.  */
7894   slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7895   tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7896   /* We need to skip an extra operand for COND_EXPRs with embedded
7897      comparison.  */
7898   unsigned opno_adjust = 0;
7899   if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7900     opno_adjust = 1;
7901   for (i = 0; i < (int) op.num_ops; i++)
7902     {
7903       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
7904       if (i == 0 && op.code == COND_EXPR)
7905         continue;
7906
7907       stmt_vec_info def_stmt_info;
7908       enum vect_def_type dt;
7909       if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7910                                i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7911                                &vectype_op[i], &def_stmt_info))
7912         {
7913           if (dump_enabled_p ())
7914             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7915                              "use not simple.\n");
7916           return false;
7917         }
7918
7919       /* Skip reduction operands, and for an IFN_COND_OP we might hit the
7920          reduction operand twice (once as definition, once as else).  */
7921       if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7922         continue;
7923
7924       /* There should be only one cycle def in the stmt, the one
7925          leading to reduc_def.  */
7926       if (VECTORIZABLE_CYCLE_DEF (dt))
7927         return false;
7928
7929       if (!vectype_op[i])
7930         vectype_op[i]
7931           = get_vectype_for_scalar_type (loop_vinfo,
7932                                          TREE_TYPE (op.ops[i]), slp_op[i]);
7933
7934       /* Record how the non-reduction-def value of COND_EXPR is defined.
7935          ???  For a chain of multiple CONDs we'd have to match them up all.  */
7936       if (op.code == COND_EXPR && reduc_chain_length == 1)
7937         {
7938           if (dt == vect_constant_def)
7939             {
7940               cond_reduc_dt = dt;
7941               cond_reduc_val = op.ops[i];
7942             }
7943           else if (dt == vect_induction_def
7944                    && def_stmt_info
7945                    && is_nonwrapping_integer_induction (def_stmt_info, loop))
7946             {
7947               cond_reduc_dt = dt;
7948               cond_stmt_vinfo = def_stmt_info;
7949             }
7950         }
7951     }
7952
7953   enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
7954   STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
7955   /* If we have a condition reduction, see if we can simplify it further.  */
7956   if (reduction_type == COND_REDUCTION)
7957     {
7958       if (slp_node && SLP_TREE_LANES (slp_node) != 1)
7959         return false;
7960
7961       /* When the condition uses the reduction value in the condition, fail.  */
7962       if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7963         {
7964           if (dump_enabled_p ())
7965             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966                              "condition depends on previous iteration\n");
7967           return false;
7968         }
7969
7970       if (reduc_chain_length == 1
7971           && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7972                                               OPTIMIZE_FOR_SPEED)
7973               || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7974                                                  vectype_in,
7975                                                  OPTIMIZE_FOR_SPEED)))
7976         {
7977           if (dump_enabled_p ())
7978             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979                              "optimizing condition reduction with"
7980                              " FOLD_EXTRACT_LAST.\n");
7981           STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7982         }
7983       else if (cond_reduc_dt == vect_induction_def)
7984         {
7985           tree base
7986             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7987           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7988
7989           gcc_assert (TREE_CODE (base) == INTEGER_CST
7990                       && TREE_CODE (step) == INTEGER_CST);
7991           cond_reduc_val = NULL_TREE;
7992           enum tree_code cond_reduc_op_code = ERROR_MARK;
7993           tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7994           if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7995             ;
7996           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7997              above base; punt if base is the minimum value of the type for
7998              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
7999           else if (tree_int_cst_sgn (step) == -1)
8000             {
8001               cond_reduc_op_code = MIN_EXPR;
8002               if (tree_int_cst_sgn (base) == -1)
8003                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
8004               else if (tree_int_cst_lt (base,
8005                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
8006                 cond_reduc_val
8007                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
8008             }
8009           else
8010             {
8011               cond_reduc_op_code = MAX_EXPR;
8012               if (tree_int_cst_sgn (base) == 1)
8013                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
8014               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
8015                                         base))
8016                 cond_reduc_val
8017                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
8018             }
8019           if (cond_reduc_val)
8020             {
8021               if (dump_enabled_p ())
8022                 dump_printf_loc (MSG_NOTE, vect_location,
8023                                  "condition expression based on "
8024                                  "integer induction.\n");
8025               STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
8026               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
8027                 = cond_reduc_val;
8028               STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
8029             }
8030         }
8031       else if (cond_reduc_dt == vect_constant_def)
8032         {
8033           enum vect_def_type cond_initial_dt;
8034           tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
8035           vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
8036           if (cond_initial_dt == vect_constant_def
8037               && types_compatible_p (TREE_TYPE (cond_initial_val),
8038                                      TREE_TYPE (cond_reduc_val)))
8039             {
8040               tree e = fold_binary (LE_EXPR, boolean_type_node,
8041                                     cond_initial_val, cond_reduc_val);
8042               if (e && (integer_onep (e) || integer_zerop (e)))
8043                 {
8044                   if (dump_enabled_p ())
8045                     dump_printf_loc (MSG_NOTE, vect_location,
8046                                      "condition expression based on "
8047                                      "compile time constant.\n");
8048                   /* Record reduction code at analysis stage.  */
8049                   STMT_VINFO_REDUC_CODE (reduc_info)
8050                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
8051                   STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
8052                 }
8053             }
8054         }
8055     }
8056
8057   if (STMT_VINFO_LIVE_P (phi_info))
8058     return false;
8059
8060   if (slp_node)
8061     ncopies = 1;
8062   else
8063     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8064
8065   gcc_assert (ncopies >= 1);
8066
8067   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
8068
8069   if (nested_cycle)
8070     {
8071       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
8072                   == vect_double_reduction_def);
8073       double_reduc = true;
8074     }
8075
8076   /* 4.2. Check support for the epilog operation.
8077
8078           If STMT represents a reduction pattern, then the type of the
8079           reduction variable may be different than the type of the rest
8080           of the arguments.  For example, consider the case of accumulation
8081           of shorts into an int accumulator; The original code:
8082                         S1: int_a = (int) short_a;
8083           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
8084
8085           was replaced with:
8086                         STMT: int_acc = widen_sum <short_a, int_acc>
8087
8088           This means that:
8089           1. The tree-code that is used to create the vector operation in the
8090              epilog code (that reduces the partial results) is not the
8091              tree-code of STMT, but is rather the tree-code of the original
8092              stmt from the pattern that STMT is replacing.  I.e, in the example
8093              above we want to use 'widen_sum' in the loop, but 'plus' in the
8094              epilog.
8095           2. The type (mode) we use to check available target support
8096              for the vector operation to be created in the *epilog*, is
8097              determined by the type of the reduction variable (in the example
8098              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
8099              However the type (mode) we use to check available target support
8100              for the vector operation to be created *inside the loop*, is
8101              determined by the type of the other arguments to STMT (in the
8102              example we'd check this: optab_handler (widen_sum_optab,
8103              vect_short_mode)).
8104
8105           This is contrary to "regular" reductions, in which the types of all
8106           the arguments are the same as the type of the reduction variable.
8107           For "regular" reductions we can therefore use the same vector type
8108           (and also the same tree-code) when generating the epilog code and
8109           when generating the code inside the loop.  */
8110
8111   code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
8112
8113   /* If conversion might have created a conditional operation like
8114      IFN_COND_ADD already.  Use the internal code for the following checks.  */
8115   if (orig_code.is_internal_fn ())
8116     {
8117       tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
8118       orig_code = new_code != ERROR_MARK ? new_code : orig_code;
8119     }
8120
8121   STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
8122
8123   reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8124   if (reduction_type == TREE_CODE_REDUCTION)
8125     {
8126       /* Check whether it's ok to change the order of the computation.
8127          Generally, when vectorizing a reduction we change the order of the
8128          computation.  This may change the behavior of the program in some
8129          cases, so we need to check that this is ok.  One exception is when
8130          vectorizing an outer-loop: the inner-loop is executed sequentially,
8131          and therefore vectorizing reductions in the inner-loop during
8132          outer-loop vectorization is safe.  Likewise when we are vectorizing
8133          a series of reductions using SLP and the VF is one the reductions
8134          are performed in scalar order.  */
8135       if (slp_node
8136           && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8137           && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
8138         ;
8139       else if (needs_fold_left_reduction_p (op.type, orig_code))
8140         {
8141           /* When vectorizing a reduction chain w/o SLP the reduction PHI
8142              is not directy used in stmt.  */
8143           if (!only_slp_reduc_chain
8144               && reduc_chain_length != 1)
8145             {
8146               if (dump_enabled_p ())
8147                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8148                                  "in-order reduction chain without SLP.\n");
8149               return false;
8150             }
8151           STMT_VINFO_REDUC_TYPE (reduc_info)
8152             = reduction_type = FOLD_LEFT_REDUCTION;
8153         }
8154       else if (!commutative_binary_op_p (orig_code, op.type)
8155                || !associative_binary_op_p (orig_code, op.type))
8156         {
8157           if (dump_enabled_p ())
8158             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8159                             "reduction: not commutative/associative\n");
8160           return false;
8161         }
8162     }
8163
8164   if ((reduction_type == COND_REDUCTION
8165        || reduction_type == INTEGER_INDUC_COND_REDUCTION
8166        || reduction_type == CONST_COND_REDUCTION
8167        || reduction_type == EXTRACT_LAST_REDUCTION)
8168       && slp_node
8169       && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
8170     {
8171       if (dump_enabled_p ())
8172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8173                          "multiple types in condition reduction.\n");
8174       return false;
8175     }
8176
8177   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
8178       && ncopies > 1)
8179     {
8180       if (dump_enabled_p ())
8181         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8182                          "multiple types in double reduction or condition "
8183                          "reduction or fold-left reduction.\n");
8184       return false;
8185     }
8186
8187   internal_fn reduc_fn = IFN_LAST;
8188   if (reduction_type == TREE_CODE_REDUCTION
8189       || reduction_type == FOLD_LEFT_REDUCTION
8190       || reduction_type == INTEGER_INDUC_COND_REDUCTION
8191       || reduction_type == CONST_COND_REDUCTION)
8192     {
8193       if (reduction_type == FOLD_LEFT_REDUCTION
8194           ? fold_left_reduction_fn (orig_code, &reduc_fn)
8195           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
8196         {
8197           if (reduc_fn != IFN_LAST
8198               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
8199                                                   OPTIMIZE_FOR_SPEED))
8200             {
8201               if (dump_enabled_p ())
8202                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8203                                  "reduc op not supported by target.\n");
8204
8205               reduc_fn = IFN_LAST;
8206             }
8207         }
8208       else
8209         {
8210           if (!nested_cycle || double_reduc)
8211             {
8212               if (dump_enabled_p ())
8213                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214                                  "no reduc code for scalar code.\n");
8215
8216               return false;
8217             }
8218         }
8219     }
8220   else if (reduction_type == COND_REDUCTION)
8221     {
8222       int scalar_precision
8223         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8224       cr_index_scalar_type = make_unsigned_type (scalar_precision);
8225       cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8226                                                 vectype_out);
8227
8228       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8229                                           OPTIMIZE_FOR_SPEED))
8230         reduc_fn = IFN_REDUC_MAX;
8231     }
8232   STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8233
8234   if (reduction_type != EXTRACT_LAST_REDUCTION
8235       && (!nested_cycle || double_reduc)
8236       && reduc_fn == IFN_LAST
8237       && !nunits_out.is_constant ())
8238     {
8239       if (dump_enabled_p ())
8240         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8241                          "missing target support for reduction on"
8242                          " variable-length vectors.\n");
8243       return false;
8244     }
8245
8246   /* For SLP reductions, see if there is a neutral value we can use.  */
8247   tree neutral_op = NULL_TREE;
8248   if (slp_node)
8249     {
8250       tree initial_value = NULL_TREE;
8251       if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8252         initial_value = vect_phi_initial_value (reduc_def_phi);
8253       neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8254                                              orig_code, initial_value);
8255     }
8256
8257   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8258     {
8259       /* We can't support in-order reductions of code such as this:
8260
8261            for (int i = 0; i < n1; ++i)
8262              for (int j = 0; j < n2; ++j)
8263                l += a[j];
8264
8265          since GCC effectively transforms the loop when vectorizing:
8266
8267            for (int i = 0; i < n1 / VF; ++i)
8268              for (int j = 0; j < n2; ++j)
8269                for (int k = 0; k < VF; ++k)
8270                  l += a[j];
8271
8272          which is a reassociation of the original operation.  */
8273       if (dump_enabled_p ())
8274         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8275                          "in-order double reduction not supported.\n");
8276
8277       return false;
8278     }
8279
8280   if (reduction_type == FOLD_LEFT_REDUCTION
8281       && (slp_node && SLP_TREE_LANES (slp_node) > 1)
8282       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8283     {
8284       /* We cannot use in-order reductions in this case because there is
8285          an implicit reassociation of the operations involved.  */
8286       if (dump_enabled_p ())
8287         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8288                          "in-order unchained SLP reductions not supported.\n");
8289       return false;
8290     }
8291
8292   /* For double reductions, and for SLP reductions with a neutral value,
8293      we construct a variable-length initial vector by loading a vector
8294      full of the neutral value and then shift-and-inserting the start
8295      values into the low-numbered elements.  */
8296   if ((double_reduc || neutral_op)
8297       && !nunits_out.is_constant ()
8298       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8299                                           vectype_out, OPTIMIZE_FOR_SPEED))
8300     {
8301       if (dump_enabled_p ())
8302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8303                          "reduction on variable-length vectors requires"
8304                          " target support for a vector-shift-and-insert"
8305                          " operation.\n");
8306       return false;
8307     }
8308
8309   /* Check extra constraints for variable-length unchained SLP reductions.  */
8310   if (slp_node
8311       && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8312       && !nunits_out.is_constant ())
8313     {
8314       /* We checked above that we could build the initial vector when
8315          there's a neutral element value.  Check here for the case in
8316          which each SLP statement has its own initial value and in which
8317          that value needs to be repeated for every instance of the
8318          statement within the initial vector.  */
8319       unsigned int group_size = SLP_TREE_LANES (slp_node);
8320       if (!neutral_op
8321           && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8322                                               TREE_TYPE (vectype_out)))
8323         {
8324           if (dump_enabled_p ())
8325             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8326                              "unsupported form of SLP reduction for"
8327                              " variable-length vectors: cannot build"
8328                              " initial vector.\n");
8329           return false;
8330         }
8331       /* The epilogue code relies on the number of elements being a multiple
8332          of the group size.  The duplicate-and-interleave approach to setting
8333          up the initial vector does too.  */
8334       if (!multiple_p (nunits_out, group_size))
8335         {
8336           if (dump_enabled_p ())
8337             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8338                              "unsupported form of SLP reduction for"
8339                              " variable-length vectors: the vector size"
8340                              " is not a multiple of the number of results.\n");
8341           return false;
8342         }
8343     }
8344
8345   if (reduction_type == COND_REDUCTION)
8346     {
8347       widest_int ni;
8348
8349       if (! max_loop_iterations (loop, &ni))
8350         {
8351           if (dump_enabled_p ())
8352             dump_printf_loc (MSG_NOTE, vect_location,
8353                              "loop count not known, cannot create cond "
8354                              "reduction.\n");
8355           return false;
8356         }
8357       /* Convert backedges to iterations.  */
8358       ni += 1;
8359
8360       /* The additional index will be the same type as the condition.  Check
8361          that the loop can fit into this less one (because we'll use up the
8362          zero slot for when there are no matches).  */
8363       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8364       if (wi::geu_p (ni, wi::to_widest (max_index)))
8365         {
8366           if (dump_enabled_p ())
8367             dump_printf_loc (MSG_NOTE, vect_location,
8368                              "loop size is greater than data size.\n");
8369           return false;
8370         }
8371     }
8372
8373   /* In case the vectorization factor (VF) is bigger than the number
8374      of elements that we can fit in a vectype (nunits), we have to generate
8375      more than one vector stmt - i.e - we need to "unroll" the
8376      vector stmt by a factor VF/nunits.  For more details see documentation
8377      in vectorizable_operation.  */
8378
8379   /* If the reduction is used in an outer loop we need to generate
8380      VF intermediate results, like so (e.g. for ncopies=2):
8381         r0 = phi (init, r0)
8382         r1 = phi (init, r1)
8383         r0 = x0 + r0;
8384         r1 = x1 + r1;
8385     (i.e. we generate VF results in 2 registers).
8386     In this case we have a separate def-use cycle for each copy, and therefore
8387     for each copy we get the vector def for the reduction variable from the
8388     respective phi node created for this copy.
8389
8390     Otherwise (the reduction is unused in the loop nest), we can combine
8391     together intermediate results, like so (e.g. for ncopies=2):
8392         r = phi (init, r)
8393         r = x0 + r;
8394         r = x1 + r;
8395    (i.e. we generate VF/2 results in a single register).
8396    In this case for each copy we get the vector def for the reduction variable
8397    from the vectorized reduction operation generated in the previous iteration.
8398
8399    This only works when we see both the reduction PHI and its only consumer
8400    in vectorizable_reduction and there are no intermediate stmts
8401    participating.  When unrolling we want each unrolled iteration to have its
8402    own reduction accumulator since one of the main goals of unrolling a
8403    reduction is to reduce the aggregate loop-carried latency.  */
8404   if ((ncopies > 1
8405        || (slp_node
8406            && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8407            && SLP_TREE_LANES (slp_node) == 1
8408            && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
8409       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8410       && reduc_chain_length == 1
8411       && loop_vinfo->suggested_unroll_factor == 1)
8412     single_defuse_cycle = true;
8413
8414   if (single_defuse_cycle && !lane_reducing)
8415     {
8416       gcc_assert (op.code != COND_EXPR);
8417
8418       /* 4. check support for the operation in the loop
8419
8420          This isn't necessary for the lane reduction codes, since they
8421          can only be produced by pattern matching, and it's up to the
8422          pattern matcher to test for support.  The main reason for
8423          specifically skipping this step is to avoid rechecking whether
8424          mixed-sign dot-products can be implemented using signed
8425          dot-products.  */
8426       machine_mode vec_mode = TYPE_MODE (vectype_in);
8427       if (!directly_supported_p (op.code, vectype_in, optab_vector))
8428         {
8429           if (dump_enabled_p ())
8430             dump_printf (MSG_NOTE, "op not supported by target.\n");
8431           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8432               || !vect_can_vectorize_without_simd_p (op.code))
8433             single_defuse_cycle = false;
8434           else
8435             if (dump_enabled_p ())
8436               dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8437         }
8438
8439       if (vect_emulated_vector_p (vectype_in)
8440           && !vect_can_vectorize_without_simd_p (op.code))
8441         {
8442           if (dump_enabled_p ())
8443             dump_printf (MSG_NOTE, "using word mode not possible.\n");
8444           return false;
8445         }
8446     }
8447   if (dump_enabled_p () && single_defuse_cycle)
8448     dump_printf_loc (MSG_NOTE, vect_location,
8449                      "using single def-use cycle for reduction by reducing "
8450                      "multiple vectors to one in the loop body\n");
8451   STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8452
8453   /* For lane-reducing operation, the below processing related to single
8454      defuse-cycle will be done in its own vectorizable function.  One more
8455      thing to note is that the operation must not be involved in fold-left
8456      reduction.  */
8457   single_defuse_cycle &= !lane_reducing;
8458
8459   if (slp_node
8460       && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
8461     for (i = 0; i < (int) op.num_ops; i++)
8462       if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8463         {
8464           if (dump_enabled_p ())
8465             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8466                              "incompatible vector types for invariants\n");
8467           return false;
8468         }
8469
8470   vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8471                              reduction_type, ncopies, cost_vec);
8472   /* Cost the reduction op inside the loop if transformed via
8473      vect_transform_reduction for non-lane-reducing operation.  Otherwise
8474      this is costed by the separate vectorizable_* routines.  */
8475   if (single_defuse_cycle)
8476     record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
8477
8478   if (dump_enabled_p ()
8479       && reduction_type == FOLD_LEFT_REDUCTION)
8480     dump_printf_loc (MSG_NOTE, vect_location,
8481                      "using an in-order (fold-left) reduction.\n");
8482   STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8483
8484   /* All but single defuse-cycle optimized and fold-left reductions go
8485      through their own vectorizable_* routines.  */
8486   if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
8487     {
8488       stmt_vec_info tem
8489         = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8490       if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8491         {
8492           gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8493           tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8494         }
8495       STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8496       STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8497     }
8498   else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8499     vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
8500                                                 slp_node, op.code, op.type,
8501                                                 vectype_in);
8502   return true;
8503 }
8504
8505 /* STMT_INFO is a dot-product reduction whose multiplication operands
8506    have different signs.  Emit a sequence to emulate the operation
8507    using a series of signed DOT_PROD_EXPRs and return the last
8508    statement generated.  VEC_DEST is the result of the vector operation
8509    and VOP lists its inputs.  */
8510
8511 static gassign *
8512 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8513                              gimple_stmt_iterator *gsi, tree vec_dest,
8514                              tree vop[3])
8515 {
8516   tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8517   tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8518   tree narrow_elttype = TREE_TYPE (narrow_vectype);
8519   gimple *new_stmt;
8520
8521   /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
8522   if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8523     std::swap (vop[0], vop[1]);
8524
8525   /* Convert all inputs to signed types.  */
8526   for (int i = 0; i < 3; ++i)
8527     if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8528       {
8529         tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8530         new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8531         vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8532         vop[i] = tmp;
8533       }
8534
8535   /* In the comments below we assume 8-bit inputs for simplicity,
8536      but the approach works for any full integer type.  */
8537
8538   /* Create a vector of -128.  */
8539   tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8540   tree min_narrow = build_vector_from_val (narrow_vectype,
8541                                            min_narrow_elttype);
8542
8543   /* Create a vector of 64.  */
8544   auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8545   tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8546   half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8547
8548   /* Emit: SUB_RES = VOP[0] - 128.  */
8549   tree sub_res = make_ssa_name (narrow_vectype);
8550   new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8551   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8552
8553   /* Emit:
8554
8555        STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8556        STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8557        STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8558
8559      on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8560      Doing the two 64 * y steps first allows more time to compute x.  */
8561   tree stage1 = make_ssa_name (wide_vectype);
8562   new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8563                                   vop[1], half_narrow, vop[2]);
8564   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8565
8566   tree stage2 = make_ssa_name (wide_vectype);
8567   new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8568                                   vop[1], half_narrow, stage1);
8569   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8570
8571   tree stage3 = make_ssa_name (wide_vectype);
8572   new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8573                                   sub_res, vop[1], stage2);
8574   vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8575
8576   /* Convert STAGE3 to the reduction type.  */
8577   return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8578 }
8579
8580 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8581    value.  */
8582
8583 bool
8584 vect_transform_reduction (loop_vec_info loop_vinfo,
8585                           stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8586                           gimple **vec_stmt, slp_tree slp_node)
8587 {
8588   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8589   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8590   unsigned ncopies;
8591   unsigned vec_num;
8592
8593   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8594   gcc_assert (reduc_info->is_reduc_info);
8595
8596   if (nested_in_vect_loop_p (loop, stmt_info))
8597     {
8598       loop = loop->inner;
8599       gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8600     }
8601
8602   gimple_match_op op;
8603   if (!gimple_extract_op (stmt_info->stmt, &op))
8604     gcc_unreachable ();
8605
8606   /* All uses but the last are expected to be defined in the loop.
8607      The last use is the reduction variable.  In case of nested cycle this
8608      assumption is not true: we use reduc_index to record the index of the
8609      reduction variable.  */
8610   stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8611   gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8612   int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8613   tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);
8614
8615   if (!vectype_in)
8616     vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8617
8618   if (slp_node)
8619     {
8620       ncopies = 1;
8621       vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
8622     }
8623   else
8624     {
8625       ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8626       vec_num = 1;
8627     }
8628
8629   code_helper code = canonicalize_code (op.code, op.type);
8630   internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8631
8632   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8633   vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8634   bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8635
8636   /* Transform.  */
8637   tree new_temp = NULL_TREE;
8638   auto_vec<tree> vec_oprnds[3];
8639
8640   if (dump_enabled_p ())
8641     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8642
8643   /* FORNOW: Multiple types are not supported for condition.  */
8644   if (code == COND_EXPR)
8645     gcc_assert (ncopies == 1);
8646
8647   /* A binary COND_OP reduction must have the same definition and else
8648      value. */
8649   bool cond_fn_p = code.is_internal_fn ()
8650     && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
8651   if (cond_fn_p)
8652     {
8653       gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
8654                   || code == IFN_COND_MUL || code == IFN_COND_AND
8655                   || code == IFN_COND_IOR || code == IFN_COND_XOR
8656                   || code == IFN_COND_MIN || code == IFN_COND_MAX);
8657       gcc_assert (op.num_ops == 4
8658                   && (op.ops[reduc_index]
8659                       == op.ops[internal_fn_else_index ((internal_fn) code)]));
8660     }
8661
8662   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8663
8664   vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8665   if (reduction_type == FOLD_LEFT_REDUCTION)
8666     {
8667       internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8668       gcc_assert (code.is_tree_code () || cond_fn_p);
8669       return vectorize_fold_left_reduction
8670           (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8671            code, reduc_fn, op.ops, op.num_ops, vectype_in,
8672            reduc_index, masks, lens);
8673     }
8674
8675   bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8676   bool lane_reducing = lane_reducing_op_p (code);
8677   gcc_assert (single_defuse_cycle || lane_reducing);
8678
8679   if (lane_reducing)
8680     {
8681       /* The last operand of lane-reducing op is for reduction.  */
8682       gcc_assert (reduc_index == (int) op.num_ops - 1);
8683     }
8684
8685   /* Create the destination vector  */
8686   tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8687   tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8688
8689   if (lane_reducing && !slp_node && !single_defuse_cycle)
8690     {
8691       /* Note: there are still vectorizable cases that can not be handled by
8692          single-lane slp.  Probably it would take some time to evolve the
8693          feature to a mature state.  So we have to keep the below non-slp code
8694          path as failsafe for lane-reducing support.  */
8695       gcc_assert (op.num_ops <= 3);
8696       for (unsigned i = 0; i < op.num_ops; i++)
8697         {
8698           unsigned oprnd_ncopies = ncopies;
8699
8700           if ((int) i == reduc_index)
8701             {
8702               tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8703               oprnd_ncopies = vect_get_num_copies (loop_vinfo, vectype);
8704             }
8705
8706           vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, oprnd_ncopies,
8707                                          op.ops[i], &vec_oprnds[i]);
8708         }
8709     }
8710   /* Get NCOPIES vector definitions for all operands except the reduction
8711      definition.  */
8712   else if (!cond_fn_p)
8713     {
8714       gcc_assert (reduc_index >= 0 && reduc_index <= 2);
8715       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8716                          single_defuse_cycle && reduc_index == 0
8717                          ? NULL_TREE : op.ops[0], &vec_oprnds[0],
8718                          single_defuse_cycle && reduc_index == 1
8719                          ? NULL_TREE : op.ops[1], &vec_oprnds[1],
8720                          op.num_ops == 3
8721                          && !(single_defuse_cycle && reduc_index == 2)
8722                          ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
8723     }
8724   else
8725     {
8726       /* For a conditional operation pass the truth type as mask
8727          vectype.  */
8728       gcc_assert (single_defuse_cycle
8729                   && (reduc_index == 1 || reduc_index == 2));
8730       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0],
8731                          truth_type_for (vectype_in), &vec_oprnds[0],
8732                          reduc_index == 1 ? NULL_TREE : op.ops[1],
8733                          NULL_TREE, &vec_oprnds[1],
8734                          reduc_index == 2 ? NULL_TREE : op.ops[2],
8735                          NULL_TREE, &vec_oprnds[2]);
8736     }
8737
8738   /* For single def-use cycles get one copy of the vectorized reduction
8739      definition.  */
8740   if (single_defuse_cycle)
8741     {
8742       vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
8743                          reduc_index == 0 ? op.ops[0] : NULL_TREE,
8744                          &vec_oprnds[0],
8745                          reduc_index == 1 ? op.ops[1] : NULL_TREE,
8746                          &vec_oprnds[1],
8747                          reduc_index == 2 ? op.ops[2] : NULL_TREE,
8748                          &vec_oprnds[2]);
8749     }
8750   else if (lane_reducing)
8751     {
8752       /* For normal reduction, consistency between vectorized def/use is
8753          naturally ensured when mapping from scalar statement.  But if lane-
8754          reducing op is involved in reduction, thing would become somewhat
8755          complicated in that the op's result and operand for accumulation are
8756          limited to less lanes than other operands, which certainly causes
8757          def/use mismatch on adjacent statements around the op if do not have
8758          any kind of specific adjustment.  One approach is to refit lane-
8759          reducing op in the way of introducing new trivial pass-through copies
8760          to fix possible def/use gap, so as to make it behave like a normal op.
8761          And vector reduction PHIs are always generated to the full extent, no
8762          matter lane-reducing op exists or not.  If some copies or PHIs are
8763          actually superfluous, they would be cleaned up by passes after
8764          vectorization.  An example for single-lane slp, lane-reducing ops
8765          with mixed input vectypes in a reduction chain, is given as below.
8766          Similarly, this handling is applicable for multiple-lane slp as well.
8767
8768            int sum = 1;
8769            for (i)
8770              {
8771                sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
8772                sum += w[i];               // widen-sum <vector(16) char>
8773                sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
8774                sum += n[i];               // normal <vector(4) int>
8775              }
8776
8777          The vector size is 128-bit，vectorization factor is 16.  Reduction
8778          statements would be transformed as:
8779
8780            vector<4> int sum_v0 = { 0, 0, 0, 1 };
8781            vector<4> int sum_v1 = { 0, 0, 0, 0 };
8782            vector<4> int sum_v2 = { 0, 0, 0, 0 };
8783            vector<4> int sum_v3 = { 0, 0, 0, 0 };
8784
8785            for (i / 16)
8786              {
8787                sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
8788                sum_v1 = sum_v1;  // copy
8789                sum_v2 = sum_v2;  // copy
8790                sum_v3 = sum_v3;  // copy
8791
8792                sum_v0 = sum_v0;  // copy
8793                sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
8794                sum_v2 = sum_v2;  // copy
8795                sum_v3 = sum_v3;  // copy
8796
8797                sum_v0 = sum_v0;  // copy
8798                sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
8799                sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
8800                sum_v3 = sum_v3;  // copy
8801
8802                sum_v0 += n_v0[i: 0  ~ 3 ];
8803                sum_v1 += n_v1[i: 4  ~ 7 ];
8804                sum_v2 += n_v2[i: 8  ~ 11];
8805                sum_v3 += n_v3[i: 12 ~ 15];
8806              }
8807
8808          Moreover, for a higher instruction parallelism in final vectorized
8809          loop, it is considered to make those effective vector lane-reducing
8810          ops be distributed evenly among all def-use cycles.  In the above
8811          example, DOT_PROD, WIDEN_SUM and SADs are generated into disparate
8812          cycles, instruction dependency among them could be eliminated.  */
8813       unsigned effec_ncopies = vec_oprnds[0].length ();
8814       unsigned total_ncopies = vec_oprnds[reduc_index].length ();
8815
8816       gcc_assert (effec_ncopies <= total_ncopies);
8817
8818       if (effec_ncopies < total_ncopies)
8819         {
8820           for (unsigned i = 0; i < op.num_ops - 1; i++)
8821             {
8822               gcc_assert (vec_oprnds[i].length () == effec_ncopies);
8823               vec_oprnds[i].safe_grow_cleared (total_ncopies);
8824             }
8825         }
8826
8827       tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8828       gcc_assert (reduc_vectype_in);
8829
8830       unsigned effec_reduc_ncopies
8831         = vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);
8832
8833       gcc_assert (effec_ncopies <= effec_reduc_ncopies);
8834
8835       if (effec_ncopies < effec_reduc_ncopies)
8836         {
8837           /* Find suitable def-use cycles to generate vectorized statements
8838              into, and reorder operands based on the selection.  */
8839           unsigned curr_pos = reduc_info->reduc_result_pos;
8840           unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;
8841
8842           gcc_assert (curr_pos < effec_reduc_ncopies);
8843           reduc_info->reduc_result_pos = next_pos;
8844
8845           if (curr_pos)
8846             {
8847               unsigned count = effec_reduc_ncopies - effec_ncopies;
8848               unsigned start = curr_pos - count;
8849
8850               if ((int) start < 0)
8851                 {
8852                   count = curr_pos;
8853                   start = 0;
8854                 }
8855
8856               for (unsigned i = 0; i < op.num_ops - 1; i++)
8857                 {
8858                   for (unsigned j = effec_ncopies; j > start; j--)
8859                     {
8860                       unsigned k = j - 1;
8861                       std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
8862                       gcc_assert (!vec_oprnds[i][k]);
8863                     }
8864                 }
8865             }
8866         }
8867     }
8868
8869   bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
8870   unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
8871
8872   for (unsigned i = 0; i < num; ++i)
8873     {
8874       gimple *new_stmt;
8875       tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
8876       if (!vop[0] || !vop[1])
8877         {
8878           tree reduc_vop = vec_oprnds[reduc_index][i];
8879
8880           /* If could not generate an effective vector statement for current
8881              portion of reduction operand, insert a trivial copy to simply
8882              handle over the operand to other dependent statements.  */
8883           gcc_assert (reduc_vop);
8884
8885           if (slp_node && TREE_CODE (reduc_vop) == SSA_NAME
8886               && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
8887             new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
8888           else
8889             {
8890               new_temp = make_ssa_name (vec_dest);
8891               new_stmt = gimple_build_assign (new_temp, reduc_vop);
8892               vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
8893                                            gsi);
8894             }
8895         }
8896       else if (masked_loop_p && !mask_by_cond_expr)
8897         {
8898           /* No conditional ifns have been defined for lane-reducing op
8899              yet.  */
8900           gcc_assert (!lane_reducing);
8901
8902           /* Make sure that the reduction accumulator is vop[0].  */
8903           if (reduc_index == 1)
8904             {
8905               gcc_assert (commutative_binary_op_p (code, op.type));
8906               std::swap (vop[0], vop[1]);
8907             }
8908           tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8909                                           vec_num * ncopies, vectype_in, i);
8910           gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8911                                                     vop[0], vop[1], vop[0]);
8912           new_temp = make_ssa_name (vec_dest, call);
8913           gimple_call_set_lhs (call, new_temp);
8914           gimple_call_set_nothrow (call, true);
8915           vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8916           new_stmt = call;
8917         }
8918       else
8919         {
8920           if (op.num_ops >= 3)
8921             vop[2] = vec_oprnds[2][i];
8922
8923           if (masked_loop_p && mask_by_cond_expr)
8924             {
8925               tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8926                                               vec_num * ncopies, vectype_in, i);
8927               build_vect_cond_expr (code, vop, mask, gsi);
8928             }
8929
8930           if (emulated_mixed_dot_prod)
8931             new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8932                                                     vec_dest, vop);
8933
8934           else if (code.is_internal_fn () && !cond_fn_p)
8935             new_stmt = gimple_build_call_internal (internal_fn (code),
8936                                                    op.num_ops,
8937                                                    vop[0], vop[1], vop[2]);
8938           else if (code.is_internal_fn () && cond_fn_p)
8939             new_stmt = gimple_build_call_internal (internal_fn (code),
8940                                                    op.num_ops,
8941                                                    vop[0], vop[1], vop[2],
8942                                                    vop[1]);
8943           else
8944             new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8945                                             vop[0], vop[1], vop[2]);
8946           new_temp = make_ssa_name (vec_dest, new_stmt);
8947           gimple_set_lhs (new_stmt, new_temp);
8948           vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8949         }
8950
8951       if (single_defuse_cycle && i < num - 1)
8952         vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
8953       else if (slp_node)
8954         slp_node->push_vec_def (new_stmt);
8955       else
8956         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8957     }
8958
8959   if (!slp_node)
8960     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8961
8962   return true;
8963 }
8964
8965 /* Transform phase of a cycle PHI.  */
8966
8967 bool
8968 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8969                           stmt_vec_info stmt_info, gimple **vec_stmt,
8970                           slp_tree slp_node, slp_instance slp_node_instance)
8971 {
8972   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8973   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8974   int i;
8975   int ncopies;
8976   int j;
8977   bool nested_cycle = false;
8978   int vec_num;
8979
8980   if (nested_in_vect_loop_p (loop, stmt_info))
8981     {
8982       loop = loop->inner;
8983       nested_cycle = true;
8984     }
8985
8986   stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8987   reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8988   stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8989   gcc_assert (reduc_info->is_reduc_info);
8990
8991   if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8992       || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8993     /* Leave the scalar phi in place.  */
8994     return true;
8995
8996   if (slp_node)
8997     {
8998       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8999       ncopies = 1;
9000     }
9001   else
9002     {
9003       vec_num = 1;
9004       ncopies = vect_get_num_copies (loop_vinfo,
9005                                      STMT_VINFO_VECTYPE (stmt_info));
9006     }
9007
9008   /* Check whether we should use a single PHI node and accumulate
9009      vectors to one before the backedge.  */
9010   if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
9011     {
9012       ncopies = 1;
9013       vec_num = 1;
9014     }
9015
9016   /* Create the destination vector  */
9017   gphi *phi = as_a <gphi *> (stmt_info->stmt);
9018   tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
9019                                                vectype_out);
9020
9021   /* Get the loop-entry arguments.  */
9022   tree vec_initial_def = NULL_TREE;
9023   auto_vec<tree> vec_initial_defs;
9024   if (slp_node)
9025     {
9026       vec_initial_defs.reserve (vec_num);
9027       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9028          and we can't use zero for induc_val, use initial_def.  Similarly
9029          for REDUC_MIN and initial_def larger than the base.  */
9030       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
9031         {
9032           gcc_assert (SLP_TREE_LANES (slp_node) == 1);
9033           tree initial_def = vect_phi_initial_value (phi);
9034           reduc_info->reduc_initial_values.safe_push (initial_def);
9035           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
9036           if (TREE_CODE (initial_def) == INTEGER_CST
9037               && !integer_zerop (induc_val)
9038               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
9039                    && tree_int_cst_lt (initial_def, induc_val))
9040                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
9041                       && tree_int_cst_lt (induc_val, initial_def))))
9042             {
9043               induc_val = initial_def;
9044               /* Communicate we used the initial_def to epilouge
9045                  generation.  */
9046               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
9047             }
9048           vec_initial_defs.quick_push
9049             (build_vector_from_val (vectype_out, induc_val));
9050         }
9051       else if (nested_cycle)
9052         {
9053           unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
9054           vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
9055                              &vec_initial_defs);
9056         }
9057       else
9058         {
9059           gcc_assert (slp_node == slp_node_instance->reduc_phis);
9060           vec<tree> &initial_values = reduc_info->reduc_initial_values;
9061           vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
9062
9063           unsigned int num_phis = stmts.length ();
9064           if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
9065             num_phis = 1;
9066           initial_values.reserve (num_phis);
9067           for (unsigned int i = 0; i < num_phis; ++i)
9068             {
9069               gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
9070               initial_values.quick_push (vect_phi_initial_value (this_phi));
9071             }
9072           if (vec_num == 1)
9073             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
9074           if (!initial_values.is_empty ())
9075             {
9076               tree initial_value
9077                 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
9078               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
9079               tree neutral_op
9080                 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
9081                                             code, initial_value);
9082               get_initial_defs_for_reduction (loop_vinfo, reduc_info,
9083                                               &vec_initial_defs, vec_num,
9084                                               stmts.length (), neutral_op);
9085             }
9086         }
9087     }
9088   else
9089     {
9090       /* Get at the scalar def before the loop, that defines the initial
9091          value of the reduction variable.  */
9092       tree initial_def = vect_phi_initial_value (phi);
9093       reduc_info->reduc_initial_values.safe_push (initial_def);
9094       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9095          and we can't use zero for induc_val, use initial_def.  Similarly
9096          for REDUC_MIN and initial_def larger than the base.  */
9097       if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
9098         {
9099           tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
9100           if (TREE_CODE (initial_def) == INTEGER_CST
9101               && !integer_zerop (induc_val)
9102               && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
9103                    && tree_int_cst_lt (initial_def, induc_val))
9104                   || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
9105                       && tree_int_cst_lt (induc_val, initial_def))))
9106             {
9107               induc_val = initial_def;
9108               /* Communicate we used the initial_def to epilouge
9109                  generation.  */
9110               STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
9111             }
9112           vec_initial_def = build_vector_from_val (vectype_out, induc_val);
9113         }
9114       else if (nested_cycle)
9115         {
9116           /* Do not use an adjustment def as that case is not supported
9117              correctly if ncopies is not one.  */
9118           vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
9119                                          ncopies, initial_def,
9120                                          &vec_initial_defs);
9121         }
9122       else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
9123                || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
9124         /* Fill the initial vector with the initial scalar value.  */
9125         vec_initial_def
9126           = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
9127                                            initial_def, initial_def);
9128       else
9129         {
9130           if (ncopies == 1)
9131             vect_find_reusable_accumulator (loop_vinfo, reduc_info);
9132           if (!reduc_info->reduc_initial_values.is_empty ())
9133             {
9134               initial_def = reduc_info->reduc_initial_values[0];
9135               code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
9136               tree neutral_op
9137                 = neutral_op_for_reduction (TREE_TYPE (initial_def),
9138                                             code, initial_def);
9139               gcc_assert (neutral_op);
9140               /* Try to simplify the vector initialization by applying an
9141                  adjustment after the reduction has been performed.  */
9142               if (!reduc_info->reused_accumulator
9143                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9144                   && !operand_equal_p (neutral_op, initial_def))
9145                 {
9146                   STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
9147                     = initial_def;
9148                   initial_def = neutral_op;
9149                 }
9150               vec_initial_def
9151                 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
9152                                                  initial_def, neutral_op);
9153             }
9154         }
9155     }
9156
9157   if (vec_initial_def)
9158     {
9159       vec_initial_defs.create (ncopies);
9160       for (i = 0; i < ncopies; ++i)
9161         vec_initial_defs.quick_push (vec_initial_def);
9162     }
9163
9164   if (auto *accumulator = reduc_info->reused_accumulator)
9165     {
9166       tree def = accumulator->reduc_input;
9167       if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
9168         {
9169           unsigned int nreduc;
9170           bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
9171                                             (TREE_TYPE (def)),
9172                                           TYPE_VECTOR_SUBPARTS (vectype_out),
9173                                           &nreduc);
9174           gcc_assert (res);
9175           gimple_seq stmts = NULL;
9176           /* Reduce the single vector to a smaller one.  */
9177           if (nreduc != 1)
9178             {
9179               /* Perform the reduction in the appropriate type.  */
9180               tree rvectype = vectype_out;
9181               if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
9182                                               TREE_TYPE (TREE_TYPE (def))))
9183                 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
9184                                               TYPE_VECTOR_SUBPARTS
9185                                                 (vectype_out));
9186               def = vect_create_partial_epilog (def, rvectype,
9187                                                 STMT_VINFO_REDUC_CODE
9188                                                   (reduc_info),
9189                                                 &stmts);
9190             }
9191           /* The epilogue loop might use a different vector mode, like
9192              VNx2DI vs. V2DI.  */
9193           if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
9194             {
9195               tree reduc_type = build_vector_type_for_mode
9196                 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
9197               def = gimple_convert (&stmts, reduc_type, def);
9198             }
9199           /* Adjust the input so we pick up the partially reduced value
9200              for the skip edge in vect_create_epilog_for_reduction.  */
9201           accumulator->reduc_input = def;
9202           /* And the reduction could be carried out using a different sign.  */
9203           if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
9204             def = gimple_convert (&stmts, vectype_out, def);
9205           edge e;
9206           if ((e = loop_vinfo->main_loop_edge)
9207               || (e = loop_vinfo->skip_this_loop_edge))
9208             {
9209               /* While we'd like to insert on the edge this will split
9210                  blocks and disturb bookkeeping, we also will eventually
9211                  need this on the skip edge.  Rely on sinking to
9212                  fixup optimal placement and insert in the pred.  */
9213               gimple_stmt_iterator gsi = gsi_last_bb (e->src);
9214               /* Insert before a cond that eventually skips the
9215                  epilogue.  */
9216               if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
9217                 gsi_prev (&gsi);
9218               gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
9219             }
9220           else
9221             gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
9222                                               stmts);
9223         }
9224       if (loop_vinfo->main_loop_edge)
9225         vec_initial_defs[0]
9226           = vect_get_main_loop_result (loop_vinfo, def,
9227                                        vec_initial_defs[0]);
9228       else
9229         vec_initial_defs.safe_push (def);
9230     }
9231
9232   /* Generate the reduction PHIs upfront.  */
9233   for (i = 0; i < vec_num; i++)
9234     {
9235       tree vec_init_def = vec_initial_defs[i];
9236       for (j = 0; j < ncopies; j++)
9237         {
9238           /* Create the reduction-phi that defines the reduction
9239              operand.  */
9240           gphi *new_phi = create_phi_node (vec_dest, loop->header);
9241
9242           /* Set the loop-entry arg of the reduction-phi.  */
9243           if (j != 0 && nested_cycle)
9244             vec_init_def = vec_initial_defs[j];
9245           add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
9246                        UNKNOWN_LOCATION);
9247
9248           /* The loop-latch arg is set in epilogue processing.  */
9249
9250           if (slp_node)
9251             slp_node->push_vec_def (new_phi);
9252           else
9253             {
9254               if (j == 0)
9255                 *vec_stmt = new_phi;
9256               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9257             }
9258         }
9259     }
9260
9261   return true;
9262 }
9263
9264 /* Vectorizes LC PHIs.  */
9265
9266 bool
9267 vectorizable_lc_phi (loop_vec_info loop_vinfo,
9268                      stmt_vec_info stmt_info, gimple **vec_stmt,
9269                      slp_tree slp_node)
9270 {
9271   if (!loop_vinfo
9272       || !is_a <gphi *> (stmt_info->stmt)
9273       || gimple_phi_num_args (stmt_info->stmt) != 1)
9274     return false;
9275
9276   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
9277       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
9278     return false;
9279
9280   if (!vec_stmt) /* transformation not required.  */
9281     {
9282       /* Deal with copies from externs or constants that disguise as
9283          loop-closed PHI nodes (PR97886).  */
9284       if (slp_node
9285           && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
9286                                                 SLP_TREE_VECTYPE (slp_node)))
9287         {
9288           if (dump_enabled_p ())
9289             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9290                              "incompatible vector types for invariants\n");
9291           return false;
9292         }
9293       STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
9294       return true;
9295     }
9296
9297   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9298   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9299   basic_block bb = gimple_bb (stmt_info->stmt);
9300   edge e = single_pred_edge (bb);
9301   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9302   auto_vec<tree> vec_oprnds;
9303   vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
9304                      !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
9305                      gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
9306   for (unsigned i = 0; i < vec_oprnds.length (); i++)
9307     {
9308       /* Create the vectorized LC PHI node.  */
9309       gphi *new_phi = create_phi_node (vec_dest, bb);
9310       add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
9311       if (slp_node)
9312         slp_node->push_vec_def (new_phi);
9313       else
9314         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
9315     }
9316   if (!slp_node)
9317     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9318
9319   return true;
9320 }
9321
9322 /* Vectorizes PHIs.  */
9323
9324 bool
9325 vectorizable_phi (vec_info *,
9326                   stmt_vec_info stmt_info, gimple **vec_stmt,
9327                   slp_tree slp_node, stmt_vector_for_cost *cost_vec)
9328 {
9329   if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
9330     return false;
9331
9332   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
9333     return false;
9334
9335   tree vectype = SLP_TREE_VECTYPE (slp_node);
9336
9337   if (!vec_stmt) /* transformation not required.  */
9338     {
9339       slp_tree child;
9340       unsigned i;
9341       FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
9342         if (!child)
9343           {
9344             if (dump_enabled_p ())
9345               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9346                                "PHI node with unvectorized backedge def\n");
9347             return false;
9348           }
9349         else if (!vect_maybe_update_slp_op_vectype (child, vectype))
9350           {
9351             if (dump_enabled_p ())
9352               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9353                                "incompatible vector types for invariants\n");
9354             return false;
9355           }
9356         else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9357                  && !useless_type_conversion_p (vectype,
9358                                                 SLP_TREE_VECTYPE (child)))
9359           {
9360             /* With bools we can have mask and non-mask precision vectors
9361                or different non-mask precisions.  while pattern recog is
9362                supposed to guarantee consistency here bugs in it can cause
9363                mismatches (PR103489 and PR103800 for example).
9364                Deal with them here instead of ICEing later.  */
9365             if (dump_enabled_p ())
9366               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9367                                "incompatible vector type setup from "
9368                                "bool pattern detection\n");
9369             return false;
9370           }
9371
9372       /* For single-argument PHIs assume coalescing which means zero cost
9373          for the scalar and the vector PHIs.  This avoids artificially
9374          favoring the vector path (but may pessimize it in some cases).  */
9375       if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
9376         record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9377                           vector_stmt, stmt_info, vectype, 0, vect_body);
9378       STMT_VINFO_TYPE (stmt_info) = phi_info_type;
9379       return true;
9380     }
9381
9382   tree scalar_dest = gimple_phi_result (stmt_info->stmt);
9383   basic_block bb = gimple_bb (stmt_info->stmt);
9384   tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
9385   auto_vec<gphi *> new_phis;
9386   for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
9387     {
9388       slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9389
9390       /* Skip not yet vectorized defs.  */
9391       if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
9392           && SLP_TREE_VEC_DEFS (child).is_empty ())
9393         continue;
9394
9395       auto_vec<tree> vec_oprnds;
9396       vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
9397       if (!new_phis.exists ())
9398         {
9399           new_phis.create (vec_oprnds.length ());
9400           for (unsigned j = 0; j < vec_oprnds.length (); j++)
9401             {
9402               /* Create the vectorized LC PHI node.  */
9403               new_phis.quick_push (create_phi_node (vec_dest, bb));
9404               slp_node->push_vec_def (new_phis[j]);
9405             }
9406         }
9407       edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
9408       for (unsigned j = 0; j < vec_oprnds.length (); j++)
9409         add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
9410     }
9411   /* We should have at least one already vectorized child.  */
9412   gcc_assert (new_phis.exists ());
9413
9414   return true;
9415 }
9416
9417 /* Vectorizes first order recurrences.  An overview of the transformation
9418    is described below. Suppose we have the following loop.
9419
9420      int t = 0;
9421      for (int i = 0; i < n; ++i)
9422        {
9423          b[i] = a[i] - t;
9424          t = a[i];
9425        }
9426
9427    There is a first-order recurrence on 'a'. For this loop, the scalar IR
9428    looks (simplified) like:
9429
9430     scalar.preheader:
9431       init = 0;
9432
9433     scalar.body:
9434       i = PHI <0(scalar.preheader), i+1(scalar.body)>
9435       _2 = PHI <(init(scalar.preheader), <_1(scalar.body)>
9436       _1 = a[i]
9437       b[i] = _1 - _2
9438       if (i < n) goto scalar.body
9439
9440    In this example, _2 is a recurrence because it's value depends on the
9441    previous iteration.  We vectorize this as (VF = 4)
9442
9443     vector.preheader:
9444       vect_init = vect_cst(..., ..., ..., 0)
9445
9446     vector.body
9447       i = PHI <0(vector.preheader), i+4(vector.body)>
9448       vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
9449       vect_2 = a[i, i+1, i+2, i+3];
9450       vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
9451       b[i, i+1, i+2, i+3] = vect_2 - vect_3
9452       if (..) goto vector.body
9453
9454    In this function, vectorizable_recurr, we code generate both the
9455    vector PHI node and the permute since those together compute the
9456    vectorized value of the scalar PHI.  We do not yet have the
9457    backedge value to fill in there nor into the vec_perm.  Those
9458    are filled in maybe_set_vectorized_backedge_value and
9459    vect_schedule_scc.
9460
9461    TODO:  Since the scalar loop does not have a use of the recurrence
9462    outside of the loop the natural way to implement peeling via
9463    vectorizing the live value doesn't work.  For now peeling of loops
9464    with a recurrence is not implemented.  For SLP the supported cases
9465    are restricted to those requiring a single vector recurrence PHI.  */
9466
9467 bool
9468 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9469                      gimple **vec_stmt, slp_tree slp_node,
9470                      stmt_vector_for_cost *cost_vec)
9471 {
9472   if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
9473     return false;
9474
9475   gphi *phi = as_a<gphi *> (stmt_info->stmt);
9476
9477   /* So far we only support first-order recurrence auto-vectorization.  */
9478   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
9479     return false;
9480
9481   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9482   unsigned ncopies;
9483   if (slp_node)
9484     ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9485   else
9486     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9487   poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9488   unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
9489   /* We need to be able to make progress with a single vector.  */
9490   if (maybe_gt (dist * 2, nunits))
9491     {
9492       if (dump_enabled_p ())
9493         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9494                          "first order recurrence exceeds half of "
9495                          "a vector\n");
9496       return false;
9497     }
9498
9499   /* First-order recurrence autovectorization needs to handle permutation
9500      with indices = [nunits-1, nunits, nunits+1, ...].  */
9501   vec_perm_builder sel (nunits, 1, 3);
9502   for (int i = 0; i < 3; ++i)
9503     sel.quick_push (nunits - dist + i);
9504   vec_perm_indices indices (sel, 2, nunits);
9505
9506   if (!vec_stmt) /* transformation not required.  */
9507     {
9508       if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
9509                                  indices))
9510         return false;
9511
9512       if (slp_node)
9513         {
9514           /* We eventually need to set a vector type on invariant
9515              arguments.  */
9516           unsigned j;
9517           slp_tree child;
9518           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9519             if (!vect_maybe_update_slp_op_vectype
9520                   (child, SLP_TREE_VECTYPE (slp_node)))
9521               {
9522                 if (dump_enabled_p ())
9523                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9524                                    "incompatible vector types for "
9525                                    "invariants\n");
9526                 return false;
9527               }
9528         }
9529
9530       /* Verify we have set up compatible types.  */
9531       edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9532       tree latch_vectype = NULL_TREE;
9533       if (slp_node)
9534         {
9535           slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
9536           latch_vectype = SLP_TREE_VECTYPE (latch_def);
9537         }
9538       else
9539         {
9540           tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
9541           if (TREE_CODE (latch_def) == SSA_NAME)
9542             {
9543               stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
9544               latch_def_info = vect_stmt_to_vectorize (latch_def_info);
9545               latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
9546             }
9547         }
9548       if (!types_compatible_p (latch_vectype, vectype))
9549         return false;
9550
9551       /* The recurrence costs the initialization vector and one permute
9552          for each copy.  */
9553       unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
9554                                                  stmt_info, 0, vect_prologue);
9555       unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9556                                                stmt_info, 0, vect_body);
9557       if (dump_enabled_p ())
9558         dump_printf_loc (MSG_NOTE, vect_location,
9559                          "vectorizable_recurr: inside_cost = %d, "
9560                          "prologue_cost = %d .\n", inside_cost,
9561                          prologue_cost);
9562
9563       STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
9564       return true;
9565     }
9566
9567   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
9568   basic_block bb = gimple_bb (phi);
9569   tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
9570   if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
9571     {
9572       gimple_seq stmts = NULL;
9573       preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
9574       gsi_insert_seq_on_edge_immediate (pe, stmts);
9575     }
9576   tree vec_init = build_vector_from_val (vectype, preheader);
9577   vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
9578
9579   /* Create the vectorized first-order PHI node.  */
9580   tree vec_dest = vect_get_new_vect_var (vectype,
9581                                          vect_simple_var, "vec_recur_");
9582   gphi *new_phi = create_phi_node (vec_dest, bb);
9583   add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
9584
9585   /* Insert shuffles the first-order recurrence autovectorization.
9586        result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
9587   tree perm = vect_gen_perm_mask_checked (vectype, indices);
9588
9589   /* Insert the required permute after the latch definition.  The
9590      second and later operands are tentative and will be updated when we have
9591      vectorized the latch definition.  */
9592   edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
9593   gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
9594   gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
9595   gsi_next (&gsi2);
9596
9597   for (unsigned i = 0; i < ncopies; ++i)
9598     {
9599       vec_dest = make_ssa_name (vectype);
9600       gassign *vperm
9601           = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
9602                                  i == 0 ? gimple_phi_result (new_phi) : NULL,
9603                                  NULL, perm);
9604       vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
9605
9606       if (slp_node)
9607         slp_node->push_vec_def (vperm);
9608       else
9609         STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
9610     }
9611
9612   if (!slp_node)
9613     *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
9614   return true;
9615 }
9616
9617 /* Return true if VECTYPE represents a vector that requires lowering
9618    by the vector lowering pass.  */
9619
9620 bool
9621 vect_emulated_vector_p (tree vectype)
9622 {
9623   return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9624           && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9625               || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9626 }
9627
9628 /* Return true if we can emulate CODE on an integer mode representation
9629    of a vector.  */
9630
9631 bool
9632 vect_can_vectorize_without_simd_p (tree_code code)
9633 {
9634   switch (code)
9635     {
9636     case PLUS_EXPR:
9637     case MINUS_EXPR:
9638     case NEGATE_EXPR:
9639     case BIT_AND_EXPR:
9640     case BIT_IOR_EXPR:
9641     case BIT_XOR_EXPR:
9642     case BIT_NOT_EXPR:
9643       return true;
9644
9645     default:
9646       return false;
9647     }
9648 }
9649
9650 /* Likewise, but taking a code_helper.  */
9651
9652 bool
9653 vect_can_vectorize_without_simd_p (code_helper code)
9654 {
9655   return (code.is_tree_code ()
9656           && vect_can_vectorize_without_simd_p (tree_code (code)));
9657 }
9658
9659 /* Create vector init for vectorized iv.  */
9660 static tree
9661 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9662                                tree step_expr, poly_uint64 nunits,
9663                                tree vectype,
9664                                enum vect_induction_op_type induction_type)
9665 {
9666   unsigned HOST_WIDE_INT const_nunits;
9667   tree vec_shift, vec_init, new_name;
9668   unsigned i;
9669   tree itype = TREE_TYPE (vectype);
9670
9671   /* iv_loop is the loop to be vectorized. Create:
9672      vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
9673   new_name = gimple_convert (stmts, itype, init_expr);
9674   switch (induction_type)
9675     {
9676     case vect_step_op_shr:
9677     case vect_step_op_shl:
9678       /* Build the Initial value from shift_expr.  */
9679       vec_init = gimple_build_vector_from_val (stmts,
9680                                                vectype,
9681                                                new_name);
9682       vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
9683                                 build_zero_cst (itype), step_expr);
9684       vec_init = gimple_build (stmts,
9685                                (induction_type == vect_step_op_shr
9686                                 ? RSHIFT_EXPR : LSHIFT_EXPR),
9687                                vectype, vec_init, vec_shift);
9688       break;
9689
9690     case vect_step_op_neg:
9691       {
9692         vec_init = gimple_build_vector_from_val (stmts,
9693                                                  vectype,
9694                                                  new_name);
9695         tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9696                                      vectype, vec_init);
9697         /* The encoding has 2 interleaved stepped patterns.  */
9698         vec_perm_builder sel (nunits, 2, 3);
9699         sel.quick_grow (6);
9700         for (i = 0; i < 3; i++)
9701           {
9702             sel[2 * i] = i;
9703             sel[2 * i + 1] = i + nunits;
9704           }
9705         vec_perm_indices indices (sel, 2, nunits);
9706         /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9707            fail when vec_init is const vector. In that situation vec_perm is not
9708            really needed.  */
9709         tree perm_mask_even
9710           = vect_gen_perm_mask_any (vectype, indices);
9711         vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9712                                  vectype,
9713                                  vec_init, vec_neg,
9714                                  perm_mask_even);
9715       }
9716       break;
9717
9718     case vect_step_op_mul:
9719       {
9720         /* Use unsigned mult to avoid UD integer overflow.  */
9721         gcc_assert (nunits.is_constant (&const_nunits));
9722         tree utype = unsigned_type_for (itype);
9723         tree uvectype = build_vector_type (utype,
9724                                            TYPE_VECTOR_SUBPARTS (vectype));
9725         new_name = gimple_convert (stmts, utype, new_name);
9726         vec_init = gimple_build_vector_from_val (stmts,
9727                                                  uvectype,
9728                                                  new_name);
9729         tree_vector_builder elts (uvectype, const_nunits, 1);
9730         tree elt_step = build_one_cst (utype);
9731
9732         elts.quick_push (elt_step);
9733         for (i = 1; i < const_nunits; i++)
9734           {
9735             /* Create: new_name_i = new_name + step_expr.  */
9736             elt_step = gimple_build (stmts, MULT_EXPR,
9737                                      utype, elt_step, step_expr);
9738             elts.quick_push (elt_step);
9739           }
9740         /* Create a vector from [new_name_0, new_name_1, ...,
9741            new_name_nunits-1].  */
9742         tree vec_mul = gimple_build_vector (stmts, &elts);
9743         vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9744                                  vec_init, vec_mul);
9745         vec_init = gimple_convert (stmts, vectype, vec_init);
9746       }
9747       break;
9748
9749     default:
9750       gcc_unreachable ();
9751     }
9752
9753   return vec_init;
9754 }
9755
9756 /* Peel init_expr by skip_niter for induction_type.  */
9757 tree
9758 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9759                              tree skip_niters, tree step_expr,
9760                              enum vect_induction_op_type induction_type)
9761 {
9762   gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9763   tree type = TREE_TYPE (init_expr);
9764   unsigned prec = TYPE_PRECISION (type);
9765   switch (induction_type)
9766     {
9767     case vect_step_op_neg:
9768       if (TREE_INT_CST_LOW (skip_niters) % 2)
9769         init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9770       /* else no change.  */
9771       break;
9772
9773     case vect_step_op_shr:
9774     case vect_step_op_shl:
9775       skip_niters = gimple_convert (stmts, type, skip_niters);
9776       step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9777       /* When shift mount >= precision, need to avoid UD.
9778          In the original loop, there's no UD, and according to semantic,
9779          init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
9780       if (!tree_fits_uhwi_p (step_expr)
9781           || tree_to_uhwi (step_expr) >= prec)
9782         {
9783           if (induction_type == vect_step_op_shl
9784               || TYPE_UNSIGNED (type))
9785             init_expr = build_zero_cst (type);
9786           else
9787             init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9788                                       init_expr,
9789                                       wide_int_to_tree (type, prec - 1));
9790         }
9791       else
9792         init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9793                                           ? RSHIFT_EXPR : LSHIFT_EXPR),
9794                                   type, init_expr, step_expr);
9795       break;
9796
9797     case vect_step_op_mul:
9798       {
9799         tree utype = unsigned_type_for (type);
9800         init_expr = gimple_convert (stmts, utype, init_expr);
9801         wide_int skipn = wi::to_wide (skip_niters);
9802         wide_int begin = wi::to_wide (step_expr);
9803         auto_mpz base, exp, mod, res;
9804         wi::to_mpz (begin, base, TYPE_SIGN (type));
9805         wi::to_mpz (skipn, exp, UNSIGNED);
9806         mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
9807         mpz_powm (res, base, exp, mod);
9808         begin = wi::from_mpz (utype, res, true);
9809         tree mult_expr = wide_int_to_tree (utype, begin);
9810         init_expr = gimple_build (stmts, MULT_EXPR, utype,
9811                                   init_expr, mult_expr);
9812         init_expr = gimple_convert (stmts, type, init_expr);
9813       }
9814       break;
9815
9816     default:
9817       gcc_unreachable ();
9818     }
9819
9820   return init_expr;
9821 }
9822
9823 /* Create vector step for vectorized iv.  */
9824 static tree
9825 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9826                                poly_uint64 vf,
9827                                enum vect_induction_op_type induction_type)
9828 {
9829   tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9830   tree new_name = NULL;
9831   /* Step should be pow (step, vf) for mult induction.  */
9832   if (induction_type == vect_step_op_mul)
9833     {
9834       gcc_assert (vf.is_constant ());
9835       wide_int begin = wi::to_wide (step_expr);
9836
9837       for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9838         begin = wi::mul (begin, wi::to_wide (step_expr));
9839
9840       new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9841     }
9842   else if (induction_type == vect_step_op_neg)
9843     /* Do nothing.  */
9844     ;
9845   else
9846     new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9847                              expr, step_expr);
9848   return new_name;
9849 }
9850
9851 static tree
9852 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9853                                    stmt_vec_info stmt_info,
9854                                    tree new_name, tree vectype,
9855                                    enum vect_induction_op_type induction_type)
9856 {
9857   /* No step is needed for neg induction.  */
9858   if (induction_type == vect_step_op_neg)
9859     return NULL;
9860
9861   tree t = unshare_expr (new_name);
9862   gcc_assert (CONSTANT_CLASS_P (new_name)
9863               || TREE_CODE (new_name) == SSA_NAME);
9864   tree new_vec = build_vector_from_val (vectype, t);
9865   tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9866                                     new_vec, vectype, NULL);
9867   return vec_step;
9868 }
9869
9870 /* Update vectorized iv with vect_step, induc_def is init.  */
9871 static tree
9872 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9873                           tree induc_def, tree vec_step,
9874                           enum vect_induction_op_type induction_type)
9875 {
9876   tree vec_def = induc_def;
9877   switch (induction_type)
9878     {
9879     case vect_step_op_mul:
9880       {
9881         /* Use unsigned mult to avoid UD integer overflow.  */
9882         tree uvectype
9883           = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9884                                TYPE_VECTOR_SUBPARTS (vectype));
9885         vec_def = gimple_convert (stmts, uvectype, vec_def);
9886         vec_step = gimple_convert (stmts, uvectype, vec_step);
9887         vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9888                                 vec_def, vec_step);
9889         vec_def = gimple_convert (stmts, vectype, vec_def);
9890       }
9891       break;
9892
9893     case vect_step_op_shr:
9894       vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9895                               vec_def, vec_step);
9896       break;
9897
9898     case vect_step_op_shl:
9899       vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9900                               vec_def, vec_step);
9901       break;
9902     case vect_step_op_neg:
9903       vec_def = induc_def;
9904       /* Do nothing.  */
9905       break;
9906     default:
9907       gcc_unreachable ();
9908     }
9909
9910   return vec_def;
9911
9912 }
9913
9914 /* Function vectorizable_induction
9915
9916    Check if STMT_INFO performs an nonlinear induction computation that can be
9917    vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9918    a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9919    basic block.
9920    Return true if STMT_INFO is vectorizable in this way.  */
9921
9922 static bool
9923 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9924                                   stmt_vec_info stmt_info,
9925                                   gimple **vec_stmt, slp_tree slp_node,
9926                                   stmt_vector_for_cost *cost_vec)
9927 {
9928   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9929   unsigned ncopies;
9930   bool nested_in_vect_loop = false;
9931   class loop *iv_loop;
9932   tree vec_def;
9933   edge pe = loop_preheader_edge (loop);
9934   basic_block new_bb;
9935   tree vec_init, vec_step;
9936   tree new_name;
9937   gimple *new_stmt;
9938   gphi *induction_phi;
9939   tree induc_def, vec_dest;
9940   tree init_expr, step_expr;
9941   tree niters_skip;
9942   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9943   unsigned i;
9944   gimple_stmt_iterator si;
9945
9946   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9947
9948   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9949   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9950   enum vect_induction_op_type induction_type
9951     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9952
9953   gcc_assert (induction_type > vect_step_op_add);
9954
9955   if (slp_node)
9956     ncopies = 1;
9957   else
9958     ncopies = vect_get_num_copies (loop_vinfo, vectype);
9959   gcc_assert (ncopies >= 1);
9960
9961   /* FORNOW. Only handle nonlinear induction in the same loop.  */
9962   if (nested_in_vect_loop_p (loop, stmt_info))
9963     {
9964       if (dump_enabled_p ())
9965         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9966                          "nonlinear induction in nested loop.\n");
9967       return false;
9968     }
9969
9970   iv_loop = loop;
9971   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9972
9973   /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9974      update for each iv and a permutation to generate wanted vector iv.  */
9975   if (slp_node)
9976     {
9977       if (dump_enabled_p ())
9978         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9979                          "SLP induction not supported for nonlinear"
9980                          " induction.\n");
9981       return false;
9982     }
9983
9984   if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9985     {
9986       if (dump_enabled_p ())
9987         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9988                          "floating point nonlinear induction vectorization"
9989                          " not supported.\n");
9990       return false;
9991     }
9992
9993   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9994   init_expr = vect_phi_initial_value (phi);
9995   gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9996               && TREE_CODE (step_expr) == INTEGER_CST);
9997   /* step_expr should be aligned with init_expr,
9998      .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
9999   step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
10000
10001   if (TREE_CODE (init_expr) == INTEGER_CST)
10002     init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
10003   else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
10004     {
10005       /* INIT_EXPR could be a bit_field, bail out for such case.  */
10006       if (dump_enabled_p ())
10007         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10008                          "nonlinear induction vectorization failed:"
10009                          " component type of vectype is not a nop conversion"
10010                          " from type of init_expr.\n");
10011       return false;
10012     }
10013
10014   switch (induction_type)
10015     {
10016     case vect_step_op_neg:
10017       if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
10018         return false;
10019       if (TREE_CODE (init_expr) != INTEGER_CST
10020           && TREE_CODE (init_expr) != REAL_CST)
10021         {
10022           /* Check for backend support of NEGATE_EXPR and vec_perm.  */
10023           if (!directly_supported_p (NEGATE_EXPR, vectype))
10024             return false;
10025
10026           /* The encoding has 2 interleaved stepped patterns.  */
10027           vec_perm_builder sel (nunits, 2, 3);
10028           machine_mode mode = TYPE_MODE (vectype);
10029           sel.quick_grow (6);
10030           for (i = 0; i < 3; i++)
10031             {
10032               sel[i * 2] = i;
10033               sel[i * 2 + 1] = i + nunits;
10034             }
10035           vec_perm_indices indices (sel, 2, nunits);
10036           if (!can_vec_perm_const_p (mode, mode, indices))
10037             return false;
10038         }
10039       break;
10040
10041     case vect_step_op_mul:
10042       {
10043         /* Check for backend support of MULT_EXPR.  */
10044         if (!directly_supported_p (MULT_EXPR, vectype))
10045           return false;
10046
10047         /* ?? How to construct vector step for variable number vector.
10048            [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
10049         if (!vf.is_constant ())
10050           return false;
10051       }
10052       break;
10053
10054     case vect_step_op_shr:
10055       /* Check for backend support of RSHIFT_EXPR.  */
10056       if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
10057         return false;
10058
10059       /* Don't shift more than type precision to avoid UD.  */
10060       if (!tree_fits_uhwi_p (step_expr)
10061           || maybe_ge (nunits * tree_to_uhwi (step_expr),
10062                        TYPE_PRECISION (TREE_TYPE (init_expr))))
10063         return false;
10064       break;
10065
10066     case vect_step_op_shl:
10067       /* Check for backend support of RSHIFT_EXPR.  */
10068       if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
10069         return false;
10070
10071       /* Don't shift more than type precision to avoid UD.  */
10072       if (!tree_fits_uhwi_p (step_expr)
10073           || maybe_ge (nunits * tree_to_uhwi (step_expr),
10074                        TYPE_PRECISION (TREE_TYPE (init_expr))))
10075         return false;
10076
10077       break;
10078
10079     default:
10080       gcc_unreachable ();
10081     }
10082
10083   if (!vec_stmt) /* transformation not required.  */
10084     {
10085       unsigned inside_cost = 0, prologue_cost = 0;
10086       /* loop cost for vec_loop. Neg induction doesn't have any
10087          inside_cost.  */
10088       inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10089                                       stmt_info, 0, vect_body);
10090
10091       /* loop cost for vec_loop. Neg induction doesn't have any
10092          inside_cost.  */
10093       if (induction_type == vect_step_op_neg)
10094         inside_cost = 0;
10095
10096       /* prologue cost for vec_init and vec_step.  */
10097       prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10098                                         stmt_info, 0, vect_prologue);
10099
10100       if (dump_enabled_p ())
10101         dump_printf_loc (MSG_NOTE, vect_location,
10102                          "vect_model_induction_cost: inside_cost = %d, "
10103                          "prologue_cost = %d. \n", inside_cost,
10104                          prologue_cost);
10105
10106       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10107       DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
10108       return true;
10109     }
10110
10111   /* Transform.  */
10112
10113   /* Compute a vector variable, initialized with the first VF values of
10114      the induction variable.  E.g., for an iv with IV_PHI='X' and
10115      evolution S, for a vector of 4 units, we want to compute:
10116      [X, X + S, X + 2*S, X + 3*S].  */
10117
10118   if (dump_enabled_p ())
10119     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10120
10121   pe = loop_preheader_edge (iv_loop);
10122   /* Find the first insertion point in the BB.  */
10123   basic_block bb = gimple_bb (phi);
10124   si = gsi_after_labels (bb);
10125
10126   gimple_seq stmts = NULL;
10127
10128   niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10129   /* If we are using the loop mask to "peel" for alignment then we need
10130      to adjust the start value here.  */
10131   if (niters_skip != NULL_TREE)
10132     init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
10133                                              step_expr, induction_type);
10134
10135   vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
10136                                             step_expr, nunits, vectype,
10137                                             induction_type);
10138   if (stmts)
10139     {
10140       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10141       gcc_assert (!new_bb);
10142     }
10143
10144   stmts = NULL;
10145   new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
10146                                             vf, induction_type);
10147   if (stmts)
10148     {
10149       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10150       gcc_assert (!new_bb);
10151     }
10152
10153   vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
10154                                                 new_name, vectype,
10155                                                 induction_type);
10156   /* Create the following def-use cycle:
10157      loop prolog:
10158      vec_init = ...
10159      vec_step = ...
10160      loop:
10161      vec_iv = PHI <vec_init, vec_loop>
10162      ...
10163      STMT
10164      ...
10165      vec_loop = vec_iv + vec_step;  */
10166
10167   /* Create the induction-phi that defines the induction-operand.  */
10168   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10169   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10170   induc_def = PHI_RESULT (induction_phi);
10171
10172   /* Create the iv update inside the loop.  */
10173   stmts = NULL;
10174   vec_def = vect_update_nonlinear_iv (&stmts, vectype,
10175                                       induc_def, vec_step,
10176                                       induction_type);
10177
10178   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10179   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10180
10181   /* Set the arguments of the phi node:  */
10182   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10183   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10184                UNKNOWN_LOCATION);
10185
10186   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10187   *vec_stmt = induction_phi;
10188
10189   /* In case that vectorization factor (VF) is bigger than the number
10190      of elements that we can fit in a vectype (nunits), we have to generate
10191      more than one vector stmt - i.e - we need to "unroll" the
10192      vector stmt by a factor VF/nunits.  For more details see documentation
10193      in vectorizable_operation.  */
10194
10195   if (ncopies > 1)
10196     {
10197       stmts = NULL;
10198       /* FORNOW. This restriction should be relaxed.  */
10199       gcc_assert (!nested_in_vect_loop);
10200
10201       new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
10202                                                 nunits, induction_type);
10203
10204       vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
10205                                                     new_name, vectype,
10206                                                     induction_type);
10207       vec_def = induc_def;
10208       for (i = 1; i < ncopies; i++)
10209         {
10210           /* vec_i = vec_prev + vec_step.  */
10211           stmts = NULL;
10212           vec_def = vect_update_nonlinear_iv (&stmts, vectype,
10213                                               vec_def, vec_step,
10214                                               induction_type);
10215           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10216           new_stmt = SSA_NAME_DEF_STMT (vec_def);
10217           STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10218         }
10219     }
10220
10221   if (dump_enabled_p ())
10222     dump_printf_loc (MSG_NOTE, vect_location,
10223                      "transform induction: created def-use cycle: %G%G",
10224                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10225
10226   return true;
10227 }
10228
10229 /* Function vectorizable_induction
10230
10231    Check if STMT_INFO performs an induction computation that can be vectorized.
10232    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
10233    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
10234    Return true if STMT_INFO is vectorizable in this way.  */
10235
10236 bool
10237 vectorizable_induction (loop_vec_info loop_vinfo,
10238                         stmt_vec_info stmt_info,
10239                         gimple **vec_stmt, slp_tree slp_node,
10240                         stmt_vector_for_cost *cost_vec)
10241 {
10242   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10243   unsigned ncopies;
10244   bool nested_in_vect_loop = false;
10245   class loop *iv_loop;
10246   tree vec_def;
10247   edge pe = loop_preheader_edge (loop);
10248   basic_block new_bb;
10249   tree new_vec, vec_init, vec_step, t;
10250   tree new_name;
10251   gimple *new_stmt;
10252   gphi *induction_phi;
10253   tree induc_def, vec_dest;
10254   tree init_expr, step_expr;
10255   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10256   unsigned i;
10257   tree expr;
10258   gimple_stmt_iterator si;
10259   enum vect_induction_op_type induction_type
10260     = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
10261
10262   gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
10263   if (!phi)
10264     return false;
10265
10266   if (!STMT_VINFO_RELEVANT_P (stmt_info))
10267     return false;
10268
10269   /* Make sure it was recognized as induction computation.  */
10270   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
10271     return false;
10272
10273   /* Handle nonlinear induction in a separate place.  */
10274   if (induction_type != vect_step_op_add)
10275     return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
10276                                              vec_stmt, slp_node, cost_vec);
10277
10278   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
10279   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10280
10281   if (slp_node)
10282     ncopies = 1;
10283   else
10284     ncopies = vect_get_num_copies (loop_vinfo, vectype);
10285   gcc_assert (ncopies >= 1);
10286
10287   /* FORNOW. These restrictions should be relaxed.  */
10288   if (nested_in_vect_loop_p (loop, stmt_info))
10289     {
10290       imm_use_iterator imm_iter;
10291       use_operand_p use_p;
10292       gimple *exit_phi;
10293       edge latch_e;
10294       tree loop_arg;
10295
10296       if (ncopies > 1)
10297         {
10298           if (dump_enabled_p ())
10299             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10300                              "multiple types in nested loop.\n");
10301           return false;
10302         }
10303
10304       exit_phi = NULL;
10305       latch_e = loop_latch_edge (loop->inner);
10306       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
10307       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
10308         {
10309           gimple *use_stmt = USE_STMT (use_p);
10310           if (is_gimple_debug (use_stmt))
10311             continue;
10312
10313           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
10314             {
10315               exit_phi = use_stmt;
10316               break;
10317             }
10318         }
10319       if (exit_phi)
10320         {
10321           stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
10322           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
10323                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
10324             {
10325               if (dump_enabled_p ())
10326                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10327                                  "inner-loop induction only used outside "
10328                                  "of the outer vectorized loop.\n");
10329               return false;
10330             }
10331         }
10332
10333       nested_in_vect_loop = true;
10334       iv_loop = loop->inner;
10335     }
10336   else
10337     iv_loop = loop;
10338   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
10339
10340   if (slp_node && !nunits.is_constant ())
10341     {
10342       /* The current SLP code creates the step value element-by-element.  */
10343       if (dump_enabled_p ())
10344         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10345                          "SLP induction not supported for variable-length"
10346                          " vectors.\n");
10347       return false;
10348     }
10349
10350   if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
10351     {
10352       if (dump_enabled_p ())
10353         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10354                          "floating point induction vectorization disabled\n");
10355       return false;
10356     }
10357
10358   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
10359   gcc_assert (step_expr != NULL_TREE);
10360   if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
10361       && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
10362     {
10363       if (dump_enabled_p ())
10364         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10365                          "bit-precision induction vectorization not "
10366                          "supported.\n");
10367       return false;
10368     }
10369   tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
10370
10371   /* Check for backend support of PLUS/MINUS_EXPR. */
10372   if (!directly_supported_p (PLUS_EXPR, step_vectype)
10373       || !directly_supported_p (MINUS_EXPR, step_vectype))
10374     return false;
10375
10376   if (!vec_stmt) /* transformation not required.  */
10377     {
10378       unsigned inside_cost = 0, prologue_cost = 0;
10379       if (slp_node)
10380         {
10381           /* We eventually need to set a vector type on invariant
10382              arguments.  */
10383           unsigned j;
10384           slp_tree child;
10385           FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
10386             if (!vect_maybe_update_slp_op_vectype
10387                 (child, SLP_TREE_VECTYPE (slp_node)))
10388               {
10389                 if (dump_enabled_p ())
10390                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10391                                    "incompatible vector types for "
10392                                    "invariants\n");
10393                 return false;
10394               }
10395           /* loop cost for vec_loop.  */
10396           inside_cost
10397             = record_stmt_cost (cost_vec,
10398                                 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
10399                                 vector_stmt, stmt_info, 0, vect_body);
10400           /* prologue cost for vec_init (if not nested) and step.  */
10401           prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
10402                                             scalar_to_vec,
10403                                             stmt_info, 0, vect_prologue);
10404         }
10405       else /* if (!slp_node) */
10406         {
10407           /* loop cost for vec_loop.  */
10408           inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
10409                                           stmt_info, 0, vect_body);
10410           /* prologue cost for vec_init and vec_step.  */
10411           prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
10412                                             stmt_info, 0, vect_prologue);
10413         }
10414       if (dump_enabled_p ())
10415         dump_printf_loc (MSG_NOTE, vect_location,
10416                          "vect_model_induction_cost: inside_cost = %d, "
10417                          "prologue_cost = %d .\n", inside_cost,
10418                          prologue_cost);
10419
10420       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
10421       DUMP_VECT_SCOPE ("vectorizable_induction");
10422       return true;
10423     }
10424
10425   /* Transform.  */
10426
10427   /* Compute a vector variable, initialized with the first VF values of
10428      the induction variable.  E.g., for an iv with IV_PHI='X' and
10429      evolution S, for a vector of 4 units, we want to compute:
10430      [X, X + S, X + 2*S, X + 3*S].  */
10431
10432   if (dump_enabled_p ())
10433     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
10434
10435   pe = loop_preheader_edge (iv_loop);
10436   /* Find the first insertion point in the BB.  */
10437   basic_block bb = gimple_bb (phi);
10438   si = gsi_after_labels (bb);
10439
10440   /* For SLP induction we have to generate several IVs as for example
10441      with group size 3 we need
10442        [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10443        [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
10444   if (slp_node)
10445     {
10446       /* Enforced above.  */
10447       unsigned int const_nunits = nunits.to_constant ();
10448
10449       /* The initial values are vectorized, but any lanes > group_size
10450          need adjustment.  */
10451       slp_tree init_node
10452         = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
10453
10454       /* Gather steps.  Since we do not vectorize inductions as
10455          cycles we have to reconstruct the step from SCEV data.  */
10456       unsigned group_size = SLP_TREE_LANES (slp_node);
10457       tree *steps = XALLOCAVEC (tree, group_size);
10458       tree *inits = XALLOCAVEC (tree, group_size);
10459       stmt_vec_info phi_info;
10460       FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
10461         {
10462           steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
10463           if (!init_node)
10464             inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
10465                                            pe->dest_idx);
10466         }
10467
10468       /* Now generate the IVs.  */
10469       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10470       gcc_assert ((const_nunits * nvects) % group_size == 0);
10471       unsigned nivs;
10472       if (nested_in_vect_loop)
10473         nivs = nvects;
10474       else
10475         {
10476           /* Compute the number of distinct IVs we need.  First reduce
10477              group_size if it is a multiple of const_nunits so we get
10478              one IV for a group_size of 4 but const_nunits 2.  */
10479           unsigned group_sizep = group_size;
10480           if (group_sizep % const_nunits == 0)
10481             group_sizep = group_sizep / const_nunits;
10482           nivs = least_common_multiple (group_sizep,
10483                                         const_nunits) / const_nunits;
10484         }
10485       tree stept = TREE_TYPE (step_vectype);
10486       tree lupdate_mul = NULL_TREE;
10487       if (!nested_in_vect_loop)
10488         {
10489           /* The number of iterations covered in one vector iteration.  */
10490           unsigned lup_mul = (nvects * const_nunits) / group_size;
10491           lupdate_mul
10492             = build_vector_from_val (step_vectype,
10493                                      SCALAR_FLOAT_TYPE_P (stept)
10494                                      ? build_real_from_wide (stept, lup_mul,
10495                                                              UNSIGNED)
10496                                      : build_int_cstu (stept, lup_mul));
10497         }
10498       tree peel_mul = NULL_TREE;
10499       gimple_seq init_stmts = NULL;
10500       if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
10501         {
10502           if (SCALAR_FLOAT_TYPE_P (stept))
10503             peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
10504                                      LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10505           else
10506             peel_mul = gimple_convert (&init_stmts, stept,
10507                                        LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
10508           peel_mul = gimple_build_vector_from_val (&init_stmts,
10509                                                    step_vectype, peel_mul);
10510         }
10511       unsigned ivn;
10512       auto_vec<tree> vec_steps;
10513       for (ivn = 0; ivn < nivs; ++ivn)
10514         {
10515           tree_vector_builder step_elts (step_vectype, const_nunits, 1);
10516           tree_vector_builder init_elts (vectype, const_nunits, 1);
10517           tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
10518           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
10519             {
10520               /* The scalar steps of the IVs.  */
10521               tree elt = steps[(ivn*const_nunits + eltn) % group_size];
10522               elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
10523               step_elts.quick_push (elt);
10524               if (!init_node)
10525                 {
10526                   /* The scalar inits of the IVs if not vectorized.  */
10527                   elt = inits[(ivn*const_nunits + eltn) % group_size];
10528                   if (!useless_type_conversion_p (TREE_TYPE (vectype),
10529                                                   TREE_TYPE (elt)))
10530                     elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
10531                                         TREE_TYPE (vectype), elt);
10532                   init_elts.quick_push (elt);
10533                 }
10534               /* The number of steps to add to the initial values.  */
10535               unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
10536               mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
10537                                    ? build_real_from_wide (stept,
10538                                                            mul_elt, UNSIGNED)
10539                                    : build_int_cstu (stept, mul_elt));
10540             }
10541           vec_step = gimple_build_vector (&init_stmts, &step_elts);
10542           vec_steps.safe_push (vec_step);
10543           tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
10544           if (peel_mul)
10545             step_mul = gimple_build (&init_stmts, MINUS_EXPR, step_vectype,
10546                                      step_mul, peel_mul);
10547           if (!init_node)
10548             vec_init = gimple_build_vector (&init_stmts, &init_elts);
10549
10550           /* Create the induction-phi that defines the induction-operand.  */
10551           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
10552                                             "vec_iv_");
10553           induction_phi = create_phi_node (vec_dest, iv_loop->header);
10554           induc_def = PHI_RESULT (induction_phi);
10555
10556           /* Create the iv update inside the loop  */
10557           tree up = vec_step;
10558           if (lupdate_mul)
10559             up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10560                                vec_step, lupdate_mul);
10561           gimple_seq stmts = NULL;
10562           vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10563           vec_def = gimple_build (&stmts,
10564                                   PLUS_EXPR, step_vectype, vec_def, up);
10565           vec_def = gimple_convert (&stmts, vectype, vec_def);
10566           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10567           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10568                        UNKNOWN_LOCATION);
10569
10570           if (init_node)
10571             vec_init = vect_get_slp_vect_def (init_node, ivn);
10572           if (!nested_in_vect_loop
10573               && !integer_zerop (step_mul))
10574             {
10575               vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
10576               up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10577                                  vec_step, step_mul);
10578               vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
10579                                       vec_def, up);
10580               vec_init = gimple_convert (&init_stmts, vectype, vec_def);
10581             }
10582
10583           /* Set the arguments of the phi node:  */
10584           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10585
10586           slp_node->push_vec_def (induction_phi);
10587         }
10588       if (!nested_in_vect_loop)
10589         {
10590           /* Fill up to the number of vectors we need for the whole group.  */
10591           nivs = least_common_multiple (group_size,
10592                                         const_nunits) / const_nunits;
10593           vec_steps.reserve (nivs-ivn);
10594           for (; ivn < nivs; ++ivn)
10595             {
10596               slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
10597               vec_steps.quick_push (vec_steps[0]);
10598             }
10599         }
10600
10601       /* Re-use IVs when we can.  We are generating further vector
10602          stmts by adding VF' * stride to the IVs generated above.  */
10603       if (ivn < nvects)
10604         {
10605           unsigned vfp
10606             = least_common_multiple (group_size, const_nunits) / group_size;
10607           tree lupdate_mul
10608             = build_vector_from_val (step_vectype,
10609                                      SCALAR_FLOAT_TYPE_P (stept)
10610                                      ? build_real_from_wide (stept,
10611                                                              vfp, UNSIGNED)
10612                                      : build_int_cstu (stept, vfp));
10613           for (; ivn < nvects; ++ivn)
10614             {
10615               gimple *iv
10616                 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
10617               tree def = gimple_get_lhs (iv);
10618               if (ivn < 2*nivs)
10619                 vec_steps[ivn - nivs]
10620                   = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
10621                                   vec_steps[ivn - nivs], lupdate_mul);
10622               gimple_seq stmts = NULL;
10623               def = gimple_convert (&stmts, step_vectype, def);
10624               def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10625                                   def, vec_steps[ivn % nivs]);
10626               def = gimple_convert (&stmts, vectype, def);
10627               if (gimple_code (iv) == GIMPLE_PHI)
10628                 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10629               else
10630                 {
10631                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
10632                   gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
10633                 }
10634               slp_node->push_vec_def (def);
10635             }
10636         }
10637
10638       new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
10639       gcc_assert (!new_bb);
10640
10641       return true;
10642     }
10643
10644   init_expr = vect_phi_initial_value (phi);
10645
10646   gimple_seq stmts = NULL;
10647   if (!nested_in_vect_loop)
10648     {
10649       /* Convert the initial value to the IV update type.  */
10650       tree new_type = TREE_TYPE (step_expr);
10651       init_expr = gimple_convert (&stmts, new_type, init_expr);
10652
10653       /* If we are using the loop mask to "peel" for alignment then we need
10654          to adjust the start value here.  */
10655       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10656       if (skip_niters != NULL_TREE)
10657         {
10658           if (FLOAT_TYPE_P (vectype))
10659             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
10660                                         skip_niters);
10661           else
10662             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
10663           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
10664                                          skip_niters, step_expr);
10665           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
10666                                     init_expr, skip_step);
10667         }
10668     }
10669
10670   if (stmts)
10671     {
10672       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10673       gcc_assert (!new_bb);
10674     }
10675
10676   /* Create the vector that holds the initial_value of the induction.  */
10677   if (nested_in_vect_loop)
10678     {
10679       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
10680          been created during vectorization of previous stmts.  We obtain it
10681          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
10682       auto_vec<tree> vec_inits;
10683       vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
10684                                      init_expr, &vec_inits);
10685       vec_init = vec_inits[0];
10686       /* If the initial value is not of proper type, convert it.  */
10687       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
10688         {
10689           new_stmt
10690             = gimple_build_assign (vect_get_new_ssa_name (vectype,
10691                                                           vect_simple_var,
10692                                                           "vec_iv_"),
10693                                    VIEW_CONVERT_EXPR,
10694                                    build1 (VIEW_CONVERT_EXPR, vectype,
10695                                            vec_init));
10696           vec_init = gimple_assign_lhs (new_stmt);
10697           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
10698                                                  new_stmt);
10699           gcc_assert (!new_bb);
10700         }
10701     }
10702   else
10703     {
10704       /* iv_loop is the loop to be vectorized. Create:
10705          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
10706       stmts = NULL;
10707       new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
10708
10709       unsigned HOST_WIDE_INT const_nunits;
10710       if (nunits.is_constant (&const_nunits))
10711         {
10712           tree_vector_builder elts (step_vectype, const_nunits, 1);
10713           elts.quick_push (new_name);
10714           for (i = 1; i < const_nunits; i++)
10715             {
10716               /* Create: new_name_i = new_name + step_expr  */
10717               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10718                                        new_name, step_expr);
10719               elts.quick_push (new_name);
10720             }
10721           /* Create a vector from [new_name_0, new_name_1, ...,
10722              new_name_nunits-1]  */
10723           vec_init = gimple_build_vector (&stmts, &elts);
10724         }
10725       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10726         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
10727         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10728                                  new_name, step_expr);
10729       else
10730         {
10731           /* Build:
10732                 [base, base, base, ...]
10733                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
10734           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10735           gcc_assert (flag_associative_math);
10736           tree index = build_index_vector (step_vectype, 0, 1);
10737           tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10738                                                         new_name);
10739           tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10740                                                         step_expr);
10741           vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10742           vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10743                                    vec_init, step_vec);
10744           vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10745                                    vec_init, base_vec);
10746         }
10747       vec_init = gimple_convert (&stmts, vectype, vec_init);
10748
10749       if (stmts)
10750         {
10751           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10752           gcc_assert (!new_bb);
10753         }
10754     }
10755
10756
10757   /* Create the vector that holds the step of the induction.  */
10758   gimple_stmt_iterator *step_iv_si = NULL;
10759   if (nested_in_vect_loop)
10760     /* iv_loop is nested in the loop to be vectorized. Generate:
10761        vec_step = [S, S, S, S]  */
10762     new_name = step_expr;
10763   else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
10764     {
10765       /* When we're using loop_len produced by SELEC_VL, the non-final
10766          iterations are not always processing VF elements.  So vectorize
10767          induction variable instead of
10768
10769            _21 = vect_vec_iv_.6_22 + { VF, ... };
10770
10771          We should generate:
10772
10773            _35 = .SELECT_VL (ivtmp_33, VF);
10774            vect_cst__22 = [vec_duplicate_expr] _35;
10775            _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
10776       gcc_assert (!slp_node);
10777       gimple_seq seq = NULL;
10778       vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
10779       tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
10780       expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
10781                                                  unshare_expr (len)),
10782                                    &seq, true, NULL_TREE);
10783       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
10784                                step_expr);
10785       gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
10786       step_iv_si = &si;
10787     }
10788   else
10789     {
10790       /* iv_loop is the loop to be vectorized. Generate:
10791           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
10792       gimple_seq seq = NULL;
10793       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10794         {
10795           expr = build_int_cst (integer_type_node, vf);
10796           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10797         }
10798       else
10799         expr = build_int_cst (TREE_TYPE (step_expr), vf);
10800       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10801                                expr, step_expr);
10802       if (seq)
10803         {
10804           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10805           gcc_assert (!new_bb);
10806         }
10807     }
10808
10809   t = unshare_expr (new_name);
10810   gcc_assert (CONSTANT_CLASS_P (new_name)
10811               || TREE_CODE (new_name) == SSA_NAME);
10812   new_vec = build_vector_from_val (step_vectype, t);
10813   vec_step = vect_init_vector (loop_vinfo, stmt_info,
10814                                new_vec, step_vectype, step_iv_si);
10815
10816
10817   /* Create the following def-use cycle:
10818      loop prolog:
10819          vec_init = ...
10820          vec_step = ...
10821      loop:
10822          vec_iv = PHI <vec_init, vec_loop>
10823          ...
10824          STMT
10825          ...
10826          vec_loop = vec_iv + vec_step;  */
10827
10828   /* Create the induction-phi that defines the induction-operand.  */
10829   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10830   induction_phi = create_phi_node (vec_dest, iv_loop->header);
10831   induc_def = PHI_RESULT (induction_phi);
10832
10833   /* Create the iv update inside the loop  */
10834   stmts = NULL;
10835   vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10836   vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10837   vec_def = gimple_convert (&stmts, vectype, vec_def);
10838   gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10839   new_stmt = SSA_NAME_DEF_STMT (vec_def);
10840
10841   /* Set the arguments of the phi node:  */
10842   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10843   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10844                UNKNOWN_LOCATION);
10845
10846   STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10847   *vec_stmt = induction_phi;
10848
10849   /* In case that vectorization factor (VF) is bigger than the number
10850      of elements that we can fit in a vectype (nunits), we have to generate
10851      more than one vector stmt - i.e - we need to "unroll" the
10852      vector stmt by a factor VF/nunits.  For more details see documentation
10853      in vectorizable_operation.  */
10854
10855   if (ncopies > 1)
10856     {
10857       gimple_seq seq = NULL;
10858       /* FORNOW. This restriction should be relaxed.  */
10859       gcc_assert (!nested_in_vect_loop);
10860       /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
10861       gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));
10862
10863       /* Create the vector that holds the step of the induction.  */
10864       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10865         {
10866           expr = build_int_cst (integer_type_node, nunits);
10867           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10868         }
10869       else
10870         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10871       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10872                                expr, step_expr);
10873       if (seq)
10874         {
10875           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10876           gcc_assert (!new_bb);
10877         }
10878
10879       t = unshare_expr (new_name);
10880       gcc_assert (CONSTANT_CLASS_P (new_name)
10881                   || TREE_CODE (new_name) == SSA_NAME);
10882       new_vec = build_vector_from_val (step_vectype, t);
10883       vec_step = vect_init_vector (loop_vinfo, stmt_info,
10884                                    new_vec, step_vectype, NULL);
10885
10886       vec_def = induc_def;
10887       for (i = 1; i < ncopies + 1; i++)
10888         {
10889           /* vec_i = vec_prev + vec_step  */
10890           gimple_seq stmts = NULL;
10891           vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10892           vec_def = gimple_build (&stmts,
10893                                   PLUS_EXPR, step_vectype, vec_def, vec_step);
10894           vec_def = gimple_convert (&stmts, vectype, vec_def);
10895
10896           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10897           if (i < ncopies)
10898             {
10899               new_stmt = SSA_NAME_DEF_STMT (vec_def);
10900               STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10901             }
10902           else
10903             {
10904               /* vec_1 = vec_iv + (VF/n * S)
10905                  vec_2 = vec_1 + (VF/n * S)
10906                  ...
10907                  vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10908
10909                  vec_n is used as vec_loop to save the large step register and
10910                  related operations.  */
10911               add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10912                            UNKNOWN_LOCATION);
10913             }
10914         }
10915     }
10916
10917   if (dump_enabled_p ())
10918     dump_printf_loc (MSG_NOTE, vect_location,
10919                      "transform induction: created def-use cycle: %G%G",
10920                      (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10921
10922   return true;
10923 }
10924
10925 /* Function vectorizable_live_operation_1.
10926
10927    helper function for vectorizable_live_operation.  */
10928
10929 static tree
10930 vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
10931                                stmt_vec_info stmt_info, basic_block exit_bb,
10932                                tree vectype, int ncopies, slp_tree slp_node,
10933                                tree bitsize, tree bitstart, tree vec_lhs,
10934                                tree lhs_type, gimple_stmt_iterator *exit_gsi)
10935 {
10936   gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10937
10938   tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10939   gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10940   for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
10941     SET_PHI_ARG_DEF (phi, i, vec_lhs);
10942
10943   gimple_seq stmts = NULL;
10944   tree new_tree;
10945
10946   /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
10947   if (integer_zerop (bitstart))
10948     {
10949       tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
10950                                       vec_lhs_phi, bitsize, bitstart);
10951
10952       /* Convert the extracted vector element to the scalar type.  */
10953       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10954     }
10955   else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10956     {
10957       /* Emit:
10958
10959          SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
10960
10961          where VEC_LHS is the vectorized live-out result and MASK is
10962          the loop mask for the final iteration.  */
10963       gcc_assert (ncopies == 1
10964                   && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
10965       gimple_seq tem = NULL;
10966       gimple_stmt_iterator gsi = gsi_last (tem);
10967       tree len = vect_get_loop_len (loop_vinfo, &gsi,
10968                                     &LOOP_VINFO_LENS (loop_vinfo),
10969                                     1, vectype, 0, 0);
10970
10971       /* BIAS - 1.  */
10972       signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
10973       tree bias_minus_one
10974         = int_const_binop (MINUS_EXPR,
10975                            build_int_cst (TREE_TYPE (len), biasval),
10976                            build_one_cst (TREE_TYPE (len)));
10977
10978       /* LAST_INDEX = LEN + (BIAS - 1).  */
10979       tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
10980                                      len, bias_minus_one);
10981
10982       /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
10983       tree scalar_res
10984         = gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
10985                         vec_lhs_phi, last_index);
10986
10987       /* Convert the extracted vector element to the scalar type.  */
10988       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10989     }
10990   else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10991     {
10992       /* Emit:
10993
10994          SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10995
10996          where VEC_LHS is the vectorized live-out result and MASK is
10997          the loop mask for the final iteration.  */
10998       gcc_assert (!slp_node || SLP_TREE_LANES (slp_node) == 1);
10999       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
11000       gimple_seq tem = NULL;
11001       gimple_stmt_iterator gsi = gsi_last (tem);
11002       tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
11003                                       &LOOP_VINFO_MASKS (loop_vinfo),
11004                                       1, vectype, 0);
11005       tree scalar_res;
11006       gimple_seq_add_seq (&stmts, tem);
11007
11008       scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
11009                                  mask, vec_lhs_phi);
11010
11011       /* Convert the extracted vector element to the scalar type.  */
11012       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
11013     }
11014   else
11015     {
11016       tree bftype = TREE_TYPE (vectype);
11017       if (VECTOR_BOOLEAN_TYPE_P (vectype))
11018         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11019       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
11020       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11021                                        &stmts, true, NULL_TREE);
11022     }
11023
11024   *exit_gsi = gsi_after_labels (exit_bb);
11025   if (stmts)
11026     gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);
11027
11028   return new_tree;
11029 }
11030
11031 /* Function vectorizable_live_operation.
11032
11033    STMT_INFO computes a value that is used outside the loop.  Check if
11034    it can be supported.  */
11035
11036 bool
11037 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
11038                              slp_tree slp_node, slp_instance slp_node_instance,
11039                              int slp_index, bool vec_stmt_p,
11040                              stmt_vector_for_cost *cost_vec)
11041 {
11042   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
11043   imm_use_iterator imm_iter;
11044   tree lhs, lhs_type, bitsize;
11045   tree vectype = (slp_node
11046                   ? SLP_TREE_VECTYPE (slp_node)
11047                   : STMT_VINFO_VECTYPE (stmt_info));
11048   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
11049   int ncopies;
11050   gimple *use_stmt;
11051   use_operand_p use_p;
11052   auto_vec<tree> vec_oprnds;
11053   int vec_entry = 0;
11054   poly_uint64 vec_index = 0;
11055
11056   gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
11057               || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
11058
11059   /* If a stmt of a reduction is live, vectorize it via
11060      vect_create_epilog_for_reduction.  vectorizable_reduction assessed
11061      validity so just trigger the transform here.  */
11062   if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
11063     {
11064       if (!vec_stmt_p)
11065         return true;
11066       /* For SLP reductions we vectorize the epilogue for all involved stmts
11067          together.  */
11068       if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
11069         return true;
11070       stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
11071       gcc_assert (reduc_info->is_reduc_info);
11072       if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
11073           || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
11074         return true;
11075
11076       if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
11077           || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
11078         vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
11079                                           slp_node_instance,
11080                                           LOOP_VINFO_IV_EXIT (loop_vinfo));
11081
11082       /* If early break we only have to materialize the reduction on the merge
11083          block, but we have to find an alternate exit first.  */
11084       if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
11085         {
11086           slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
11087           for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
11088             if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
11089               {
11090                 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
11091                                                   phis_node, slp_node_instance,
11092                                                   exit);
11093                 break;
11094               }
11095           if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
11096             vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
11097                                               phis_node, slp_node_instance,
11098                                               LOOP_VINFO_IV_EXIT (loop_vinfo));
11099         }
11100
11101       return true;
11102     }
11103
11104   /* If STMT is not relevant and it is a simple assignment and its inputs are
11105      invariant then it can remain in place, unvectorized.  The original last
11106      scalar value that it computes will be used.  */
11107   if (!STMT_VINFO_RELEVANT_P (stmt_info))
11108     {
11109       gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
11110       if (dump_enabled_p ())
11111         dump_printf_loc (MSG_NOTE, vect_location,
11112                          "statement is simple and uses invariant.  Leaving in "
11113                          "place.\n");
11114       return true;
11115     }
11116
11117   if (slp_node)
11118     ncopies = 1;
11119   else
11120     ncopies = vect_get_num_copies (loop_vinfo, vectype);
11121
11122   if (slp_node)
11123     {
11124       gcc_assert (slp_index >= 0);
11125
11126       /* Get the last occurrence of the scalar index from the concatenation of
11127          all the slp vectors. Calculate which slp vector it is and the index
11128          within.  */
11129       int num_scalar = SLP_TREE_LANES (slp_node);
11130       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
11131       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
11132
11133       /* Calculate which vector contains the result, and which lane of
11134          that vector we need.  */
11135       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
11136         {
11137           if (dump_enabled_p ())
11138             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11139                              "Cannot determine which vector holds the"
11140                              " final result.\n");
11141           return false;
11142         }
11143     }
11144
11145   if (!vec_stmt_p)
11146     {
11147       /* No transformation required.  */
11148       if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
11149         {
11150           if (slp_node && SLP_TREE_LANES (slp_node) != 1)
11151             {
11152               if (dump_enabled_p ())
11153                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11154                                  "can't operate on partial vectors "
11155                                  "because an SLP statement is live after "
11156                                  "the loop.\n");
11157               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
11158             }
11159           else if (ncopies > 1
11160                    || (slp_node && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1))
11161             {
11162               if (dump_enabled_p ())
11163                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11164                                  "can't operate on partial vectors "
11165                                  "because ncopies is greater than 1.\n");
11166               LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
11167             }
11168           else
11169             {
11170               gcc_assert (ncopies == 1
11171                           && (!slp_node || SLP_TREE_LANES (slp_node) == 1));
11172               if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
11173                                                   OPTIMIZE_FOR_SPEED))
11174                 vect_record_loop_mask (loop_vinfo,
11175                                        &LOOP_VINFO_MASKS (loop_vinfo),
11176                                        1, vectype, NULL);
11177               else if (can_vec_extract_var_idx_p (
11178                          TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
11179                 vect_record_loop_len (loop_vinfo,
11180                                       &LOOP_VINFO_LENS (loop_vinfo),
11181                                       1, vectype, 1);
11182               else
11183                 {
11184                   if (dump_enabled_p ())
11185                     dump_printf_loc (
11186                       MSG_MISSED_OPTIMIZATION, vect_location,
11187                       "can't operate on partial vectors "
11188                       "because the target doesn't support extract "
11189                       "last reduction.\n");
11190                   LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
11191                 }
11192             }
11193         }
11194       /* ???  Enable for loop costing as well.  */
11195       if (!loop_vinfo)
11196         record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
11197                           0, vect_epilogue);
11198       return true;
11199     }
11200
11201   /* Use the lhs of the original scalar statement.  */
11202   gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
11203   if (dump_enabled_p ())
11204     dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
11205                      "stmt %G", stmt);
11206
11207   lhs = gimple_get_lhs (stmt);
11208   lhs_type = TREE_TYPE (lhs);
11209
11210   bitsize = vector_element_bits_tree (vectype);
11211
11212   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
11213   tree vec_lhs, vec_lhs0, bitstart;
11214   gimple *vec_stmt, *vec_stmt0;
11215   if (slp_node)
11216     {
11217       gcc_assert (!loop_vinfo
11218                   || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11219                        && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11220                       || SLP_TREE_LANES (slp_node) == 1));
11221
11222       /* Get the correct slp vectorized stmt.  */
11223       vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
11224       vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
11225
11226       /* In case we need to early break vectorize also get the first stmt.  */
11227       vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
11228       vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
11229
11230       /* Get entry to use.  */
11231       bitstart = bitsize_int (vec_index);
11232       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
11233     }
11234   else
11235     {
11236       /* For multiple copies, get the last copy.  */
11237       vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
11238       vec_lhs = gimple_get_lhs (vec_stmt);
11239
11240       /* In case we need to early break vectorize also get the first stmt.  */
11241       vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
11242       vec_lhs0 = gimple_get_lhs (vec_stmt0);
11243
11244       /* Get the last lane in the vector.  */
11245       bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
11246     }
11247
11248   if (loop_vinfo)
11249     {
11250       /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
11251          requirement, insert one phi node for it.  It looks like:
11252            loop;
11253          BB:
11254            # lhs' = PHI <lhs>
11255          ==>
11256            loop;
11257          BB:
11258            # vec_lhs' = PHI <vec_lhs>
11259            new_tree = lane_extract <vec_lhs', ...>;
11260            lhs' = new_tree;  */
11261
11262       class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11263       /* Check if we have a loop where the chosen exit is not the main exit,
11264          in these cases for an early break we restart the iteration the vector code
11265          did.  For the live values we want the value at the start of the iteration
11266          rather than at the end.  */
11267       edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
11268       bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
11269       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11270         if (!is_gimple_debug (use_stmt)
11271             && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
11272           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11273             {
11274               edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
11275                                            phi_arg_index_from_use (use_p));
11276               gcc_assert (loop_exit_edge_p (loop, e));
11277               bool main_exit_edge = e == main_e;
11278               tree tmp_vec_lhs = vec_lhs;
11279               tree tmp_bitstart = bitstart;
11280
11281               /* For early exit where the exit is not in the BB that leads
11282                  to the latch then we're restarting the iteration in the
11283                  scalar loop.  So get the first live value.  */
11284               if ((all_exits_as_early_p || !main_exit_edge)
11285                   && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
11286                 {
11287                   tmp_vec_lhs = vec_lhs0;
11288                   tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
11289                 }
11290
11291               gimple_stmt_iterator exit_gsi;
11292               tree new_tree
11293                 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
11294                                                  e->dest, vectype, ncopies,
11295                                                  slp_node, bitsize,
11296                                                  tmp_bitstart, tmp_vec_lhs,
11297                                                  lhs_type, &exit_gsi);
11298
11299               auto gsi = gsi_for_stmt (use_stmt);
11300               tree lhs_phi = gimple_phi_result (use_stmt);
11301               remove_phi_node (&gsi, false);
11302               gimple *copy = gimple_build_assign (lhs_phi, new_tree);
11303               gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
11304               break;
11305             }
11306
11307       /* There a no further out-of-loop uses of lhs by LC-SSA construction.  */
11308       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11309         gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
11310     }
11311   else
11312     {
11313       /* For basic-block vectorization simply insert the lane-extraction.  */
11314       tree bftype = TREE_TYPE (vectype);
11315       if (VECTOR_BOOLEAN_TYPE_P (vectype))
11316         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
11317       tree new_tree = build3 (BIT_FIELD_REF, bftype,
11318                               vec_lhs, bitsize, bitstart);
11319       gimple_seq stmts = NULL;
11320       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
11321                                        &stmts, true, NULL_TREE);
11322       if (TREE_CODE (new_tree) == SSA_NAME
11323           && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11324         SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11325       if (is_a <gphi *> (vec_stmt))
11326         {
11327           gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11328           gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11329         }
11330       else
11331         {
11332           gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11333           gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11334         }
11335
11336       /* Replace use of lhs with newly computed result.  If the use stmt is a
11337          single arg PHI, just replace all uses of PHI result.  It's necessary
11338          because lcssa PHI defining lhs may be before newly inserted stmt.  */
11339       use_operand_p use_p;
11340       stmt_vec_info use_stmt_info;
11341       FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11342         if (!is_gimple_debug (use_stmt)
11343             && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11344                 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11345           {
11346             /* ???  This can happen when the live lane ends up being
11347                rooted in a vector construction code-generated by an
11348                external SLP node (and code-generation for that already
11349                happened).  See gcc.dg/vect/bb-slp-47.c.
11350                Doing this is what would happen if that vector CTOR
11351                were not code-generated yet so it is not too bad.
11352                ???  In fact we'd likely want to avoid this situation
11353                in the first place.  */
11354             if (TREE_CODE (new_tree) == SSA_NAME
11355                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11356                 && gimple_code (use_stmt) != GIMPLE_PHI
11357                 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11358                                                 use_stmt))
11359               {
11360                 if (dump_enabled_p ())
11361                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11362                                    "Using original scalar computation for "
11363                                    "live lane because use preceeds vector "
11364                                    "def\n");
11365                 continue;
11366               }
11367             /* ???  It can also happen that we end up pulling a def into
11368                a loop where replacing out-of-loop uses would require
11369                a new LC SSA PHI node.  Retain the original scalar in
11370                those cases as well.  PR98064.  */
11371             if (TREE_CODE (new_tree) == SSA_NAME
11372                 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11373                 && (gimple_bb (use_stmt)->loop_father
11374                     != gimple_bb (vec_stmt)->loop_father)
11375                 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11376                                         gimple_bb (use_stmt)->loop_father))
11377               {
11378                 if (dump_enabled_p ())
11379                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11380                                    "Using original scalar computation for "
11381                                    "live lane because there is an out-of-loop "
11382                                    "definition for it\n");
11383                 continue;
11384               }
11385             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11386               SET_USE (use_p, new_tree);
11387             update_stmt (use_stmt);
11388           }
11389     }
11390
11391   return true;
11392 }
11393
11394 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO.  */
11395
11396 static void
11397 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11398 {
11399   ssa_op_iter op_iter;
11400   imm_use_iterator imm_iter;
11401   def_operand_p def_p;
11402   gimple *ustmt;
11403
11404   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11405     {
11406       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11407         {
11408           basic_block bb;
11409
11410           if (!is_gimple_debug (ustmt))
11411             continue;
11412
11413           bb = gimple_bb (ustmt);
11414
11415           if (!flow_bb_inside_loop_p (loop, bb))
11416             {
11417               if (gimple_debug_bind_p (ustmt))
11418                 {
11419                   if (dump_enabled_p ())
11420                     dump_printf_loc (MSG_NOTE, vect_location,
11421                                      "killing debug use\n");
11422
11423                   gimple_debug_bind_reset_value (ustmt);
11424                   update_stmt (ustmt);
11425                 }
11426               else
11427                 gcc_unreachable ();
11428             }
11429         }
11430     }
11431 }
11432
11433 /* Given loop represented by LOOP_VINFO, return true if computation of
11434    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11435    otherwise.  */
11436
11437 static bool
11438 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11439 {
11440   /* Constant case.  */
11441   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11442     {
11443       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11444       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11445
11446       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11447       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11448       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11449         return true;
11450     }
11451
11452   widest_int max;
11453   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11454   /* Check the upper bound of loop niters.  */
11455   if (get_max_loop_iterations (loop, &max))
11456     {
11457       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11458       signop sgn = TYPE_SIGN (type);
11459       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11460       if (max < type_max)
11461         return true;
11462     }
11463   return false;
11464 }
11465
11466 /* Return a mask type with half the number of elements as OLD_TYPE,
11467    given that it should have mode NEW_MODE.  */
11468
11469 tree
11470 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11471 {
11472   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11473   return build_truth_vector_type_for_mode (nunits, new_mode);
11474 }
11475
11476 /* Return a mask type with twice as many elements as OLD_TYPE,
11477    given that it should have mode NEW_MODE.  */
11478
11479 tree
11480 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11481 {
11482   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11483   return build_truth_vector_type_for_mode (nunits, new_mode);
11484 }
11485
11486 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11487    contain a sequence of NVECTORS masks that each control a vector of type
11488    VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
11489    these vector masks with the vector version of SCALAR_MASK.  */
11490
11491 void
11492 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11493                        unsigned int nvectors, tree vectype, tree scalar_mask)
11494 {
11495   gcc_assert (nvectors != 0);
11496
11497   if (scalar_mask)
11498     {
11499       scalar_cond_masked_key cond (scalar_mask, nvectors);
11500       loop_vinfo->scalar_cond_masked_set.add (cond);
11501     }
11502
11503   masks->mask_set.add (std::make_pair (vectype, nvectors));
11504 }
11505
11506 /* Given a complete set of masks MASKS, extract mask number INDEX
11507    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11508    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
11509
11510    See the comment above vec_loop_masks for more details about the mask
11511    arrangement.  */
11512
11513 tree
11514 vect_get_loop_mask (loop_vec_info loop_vinfo,
11515                     gimple_stmt_iterator *gsi, vec_loop_masks *masks,
11516                     unsigned int nvectors, tree vectype, unsigned int index)
11517 {
11518   if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11519       == vect_partial_vectors_while_ult)
11520     {
11521       rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
11522       tree mask_type = rgm->type;
11523
11524       /* Populate the rgroup's mask array, if this is the first time we've
11525          used it.  */
11526       if (rgm->controls.is_empty ())
11527         {
11528           rgm->controls.safe_grow_cleared (nvectors, true);
11529           for (unsigned int i = 0; i < nvectors; ++i)
11530             {
11531               tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
11532               /* Provide a dummy definition until the real one is available.  */
11533               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11534               rgm->controls[i] = mask;
11535             }
11536         }
11537
11538       tree mask = rgm->controls[index];
11539       if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
11540                     TYPE_VECTOR_SUBPARTS (vectype)))
11541         {
11542           /* A loop mask for data type X can be reused for data type Y
11543              if X has N times more elements than Y and if Y's elements
11544              are N times bigger than X's.  In this case each sequence
11545              of N elements in the loop mask will be all-zero or all-one.
11546              We can then view-convert the mask so that each sequence of
11547              N elements is replaced by a single element.  */
11548           gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
11549                                   TYPE_VECTOR_SUBPARTS (vectype)));
11550           gimple_seq seq = NULL;
11551           mask_type = truth_type_for (vectype);
11552           mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
11553           if (seq)
11554             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11555         }
11556       return mask;
11557     }
11558   else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
11559            == vect_partial_vectors_avx512)
11560     {
11561       /* The number of scalars per iteration and the number of vectors are
11562          both compile-time constants.  */
11563       unsigned int nscalars_per_iter
11564         = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11565                      LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11566
11567       rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
11568
11569       /* The stored nV is dependent on the mask type produced.  */
11570       gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11571                              TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
11572                   == rgm->factor);
11573       nvectors = rgm->factor;
11574
11575       /* Populate the rgroup's mask array, if this is the first time we've
11576          used it.  */
11577       if (rgm->controls.is_empty ())
11578         {
11579           rgm->controls.safe_grow_cleared (nvectors, true);
11580           for (unsigned int i = 0; i < nvectors; ++i)
11581             {
11582               tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
11583               /* Provide a dummy definition until the real one is available.  */
11584               SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
11585               rgm->controls[i] = mask;
11586             }
11587         }
11588       if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
11589                     TYPE_VECTOR_SUBPARTS (vectype)))
11590         return rgm->controls[index];
11591
11592       /* Split the vector if needed.  Since we are dealing with integer mode
11593          masks with AVX512 we can operate on the integer representation
11594          performing the whole vector shifting.  */
11595       unsigned HOST_WIDE_INT factor;
11596       bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
11597                                      TYPE_VECTOR_SUBPARTS (vectype), &factor);
11598       gcc_assert (ok);
11599       gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
11600       tree mask_type = truth_type_for (vectype);
11601       gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
11602       unsigned vi = index / factor;
11603       unsigned vpart = index % factor;
11604       tree vec = rgm->controls[vi];
11605       gimple_seq seq = NULL;
11606       vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
11607                           lang_hooks.types.type_for_mode
11608                                 (TYPE_MODE (rgm->type), 1), vec);
11609       /* For integer mode masks simply shift the right bits into position.  */
11610       if (vpart != 0)
11611         vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
11612                             build_int_cst (integer_type_node,
11613                                            (TYPE_VECTOR_SUBPARTS (vectype)
11614                                             * vpart)));
11615       vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
11616                                     (TYPE_MODE (mask_type), 1), vec);
11617       vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
11618       if (seq)
11619         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11620       return vec;
11621     }
11622   else
11623     gcc_unreachable ();
11624 }
11625
11626 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
11627    lengths for controlling an operation on VECTYPE.  The operation splits
11628    each element of VECTYPE into FACTOR separate subelements, measuring the
11629    length as a number of these subelements.  */
11630
11631 void
11632 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
11633                       unsigned int nvectors, tree vectype, unsigned int factor)
11634 {
11635   gcc_assert (nvectors != 0);
11636   if (lens->length () < nvectors)
11637     lens->safe_grow_cleared (nvectors, true);
11638   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11639
11640   /* The number of scalars per iteration, scalar occupied bytes and
11641      the number of vectors are both compile-time constants.  */
11642   unsigned int nscalars_per_iter
11643     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
11644                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
11645
11646   if (rgl->max_nscalars_per_iter < nscalars_per_iter)
11647     {
11648       /* For now, we only support cases in which all loads and stores fall back
11649          to VnQI or none do.  */
11650       gcc_assert (!rgl->max_nscalars_per_iter
11651                   || (rgl->factor == 1 && factor == 1)
11652                   || (rgl->max_nscalars_per_iter * rgl->factor
11653                       == nscalars_per_iter * factor));
11654       rgl->max_nscalars_per_iter = nscalars_per_iter;
11655       rgl->type = vectype;
11656       rgl->factor = factor;
11657     }
11658 }
11659
11660 /* Given a complete set of lengths LENS, extract length number INDEX
11661    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11662    where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
11663    multipled by the number of elements that should be processed.
11664    Insert any set-up statements before GSI.  */
11665
11666 tree
11667 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11668                    vec_loop_lens *lens, unsigned int nvectors, tree vectype,
11669                    unsigned int index, unsigned int factor)
11670 {
11671   rgroup_controls *rgl = &(*lens)[nvectors - 1];
11672   bool use_bias_adjusted_len =
11673     LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
11674
11675   /* Populate the rgroup's len array, if this is the first time we've
11676      used it.  */
11677   if (rgl->controls.is_empty ())
11678     {
11679       rgl->controls.safe_grow_cleared (nvectors, true);
11680       for (unsigned int i = 0; i < nvectors; ++i)
11681         {
11682           tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11683           gcc_assert (len_type != NULL_TREE);
11684
11685           tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
11686
11687           /* Provide a dummy definition until the real one is available.  */
11688           SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
11689           rgl->controls[i] = len;
11690
11691           if (use_bias_adjusted_len)
11692             {
11693               gcc_assert (i == 0);
11694               tree adjusted_len =
11695                 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
11696               SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
11697               rgl->bias_adjusted_ctrl = adjusted_len;
11698             }
11699         }
11700     }
11701
11702   if (use_bias_adjusted_len)
11703     return rgl->bias_adjusted_ctrl;
11704
11705   tree loop_len = rgl->controls[index];
11706   if (rgl->factor == 1 && factor == 1)
11707     {
11708       poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
11709       poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
11710       if (maybe_ne (nunits1, nunits2))
11711         {
11712           /* A loop len for data type X can be reused for data type Y
11713              if X has N times more elements than Y and if Y's elements
11714              are N times bigger than X's.  */
11715           gcc_assert (multiple_p (nunits1, nunits2));
11716           factor = exact_div (nunits1, nunits2).to_constant ();
11717           tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
11718           gimple_seq seq = NULL;
11719           loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
11720                                    build_int_cst (iv_type, factor));
11721           if (seq)
11722             gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
11723         }
11724     }
11725   return loop_len;
11726 }
11727
11728 /* Generate the tree for the loop len mask and return it.  Given the lens,
11729    nvectors, vectype, index and factor to gen the len mask as below.
11730
11731    tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)
11732 */
11733 tree
11734 vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
11735                         gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
11736                         unsigned int nvectors, tree vectype, tree stmt,
11737                         unsigned int index, unsigned int factor)
11738 {
11739   tree all_one_mask = build_all_ones_cst (vectype);
11740   tree all_zero_mask = build_zero_cst (vectype);
11741   tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype, index,
11742                                 factor);
11743   tree bias = build_int_cst (intQI_type_node,
11744                              LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
11745   tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
11746   gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
11747                                             all_one_mask, all_zero_mask, len,
11748                                             bias);
11749   gimple_call_set_lhs (call, len_mask);
11750   gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);
11751
11752   return len_mask;
11753 }
11754
11755 /* Scale profiling counters by estimation for LOOP which is vectorized
11756    by factor VF.
11757    If FLAT is true, the loop we started with had unrealistically flat
11758    profile.  */
11759
11760 static void
11761 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11762 {
11763   /* For flat profiles do not scale down proportionally by VF and only
11764      cap by known iteration count bounds.  */
11765   if (flat)
11766     {
11767       if (dump_file && (dump_flags & TDF_DETAILS))
11768         fprintf (dump_file,
11769                  "Vectorized loop profile seems flat; not scaling iteration "
11770                  "count down by the vectorization factor %i\n", vf);
11771       scale_loop_profile (loop, profile_probability::always (),
11772                           get_likely_max_loop_iterations_int (loop));
11773       return;
11774     }
11775   /* Loop body executes VF fewer times and exit increases VF times.  */
11776   profile_count entry_count = loop_preheader_edge (loop)->count ();
11777
11778   /* If we have unreliable loop profile avoid dropping entry
11779      count bellow header count.  This can happen since loops
11780      has unrealistically low trip counts.  */
11781   while (vf > 1
11782          && loop->header->count > entry_count
11783          && loop->header->count < entry_count * vf)
11784     {
11785       if (dump_file && (dump_flags & TDF_DETAILS))
11786         fprintf (dump_file,
11787                  "Vectorization factor %i seems too large for profile "
11788                  "prevoiusly believed to be consistent; reducing.\n", vf);
11789       vf /= 2;
11790     }
11791
11792   if (entry_count.nonzero_p ())
11793     set_edge_probability_and_rescale_others
11794             (exit_e,
11795              entry_count.probability_in (loop->header->count / vf));
11796   /* Avoid producing very large exit probability when we do not have
11797      sensible profile.  */
11798   else if (exit_e->probability < profile_probability::always () / (vf * 2))
11799     set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11800   loop->latch->count = single_pred_edge (loop->latch)->count ();
11801
11802   scale_loop_profile (loop, profile_probability::always () / vf,
11803                       get_likely_max_loop_iterations_int (loop));
11804 }
11805
11806 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11807    latch edge values originally defined by it.  */
11808
11809 static void
11810 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
11811                                      stmt_vec_info def_stmt_info)
11812 {
11813   tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
11814   if (!def || TREE_CODE (def) != SSA_NAME)
11815     return;
11816   stmt_vec_info phi_info;
11817   imm_use_iterator iter;
11818   use_operand_p use_p;
11819   FOR_EACH_IMM_USE_FAST (use_p, iter, def)
11820     {
11821       gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
11822       if (!phi)
11823         continue;
11824       if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
11825             && (phi_info = loop_vinfo->lookup_stmt (phi))
11826             && STMT_VINFO_RELEVANT_P (phi_info)))
11827         continue;
11828       loop_p loop = gimple_bb (phi)->loop_father;
11829       edge e = loop_latch_edge (loop);
11830       if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
11831         continue;
11832
11833       if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
11834           && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
11835           && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
11836         {
11837           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11838           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11839           gcc_assert (phi_defs.length () == latch_defs.length ());
11840           for (unsigned i = 0; i < phi_defs.length (); ++i)
11841             add_phi_arg (as_a <gphi *> (phi_defs[i]),
11842                          gimple_get_lhs (latch_defs[i]), e,
11843                          gimple_phi_arg_location (phi, e->dest_idx));
11844         }
11845       else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
11846         {
11847           /* For first order recurrences we have to update both uses of
11848              the latch definition, the one in the PHI node and the one
11849              in the generated VEC_PERM_EXPR.  */
11850           vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
11851           vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
11852           gcc_assert (phi_defs.length () == latch_defs.length ());
11853           tree phidef = gimple_assign_rhs1 (phi_defs[0]);
11854           gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11855           for (unsigned i = 0; i < phi_defs.length (); ++i)
11856             {
11857               gassign *perm = as_a <gassign *> (phi_defs[i]);
11858               if (i > 0)
11859                 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
11860               gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
11861               update_stmt (perm);
11862             }
11863           add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
11864                        gimple_phi_arg_location (phi, e->dest_idx));
11865         }
11866     }
11867 }
11868
11869 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11870    When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11871    stmt_vec_info.  */
11872
11873 static bool
11874 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11875                           gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11876 {
11877   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11878   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11879
11880   if (dump_enabled_p ())
11881     dump_printf_loc (MSG_NOTE, vect_location,
11882                      "------>vectorizing statement: %G", stmt_info->stmt);
11883
11884   if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11885     vect_loop_kill_debug_uses (loop, stmt_info);
11886
11887   if (!STMT_VINFO_RELEVANT_P (stmt_info)
11888       && !STMT_VINFO_LIVE_P (stmt_info))
11889     {
11890       if (is_gimple_call (stmt_info->stmt)
11891           && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11892         {
11893           gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11894           *seen_store = stmt_info;
11895           return false;
11896         }
11897       return false;
11898     }
11899
11900   if (STMT_VINFO_VECTYPE (stmt_info))
11901     {
11902       poly_uint64 nunits
11903         = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11904       if (!STMT_SLP_TYPE (stmt_info)
11905           && maybe_ne (nunits, vf)
11906           && dump_enabled_p ())
11907         /* For SLP VF is set according to unrolling factor, and not
11908            to vector size, hence for SLP this print is not valid.  */
11909         dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11910     }
11911
11912   /* Pure SLP statements have already been vectorized.  We still need
11913      to apply loop vectorization to hybrid SLP statements.  */
11914   if (PURE_SLP_STMT (stmt_info))
11915     return false;
11916
11917   if (dump_enabled_p ())
11918     dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11919
11920   if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11921     *seen_store = stmt_info;
11922
11923   return true;
11924 }
11925
11926 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11927    in the hash_map with its corresponding values.  */
11928
11929 static tree
11930 find_in_mapping (tree t, void *context)
11931 {
11932   hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11933
11934   tree *value = mapping->get (t);
11935   return value ? *value : t;
11936 }
11937
11938 /* Update EPILOGUE's loop_vec_info.  EPILOGUE was constructed as a copy of the
11939    original loop that has now been vectorized.
11940
11941    The inits of the data_references need to be advanced with the number of
11942    iterations of the main loop.  This has been computed in vect_do_peeling and
11943    is stored in parameter ADVANCE.  We first restore the data_references
11944    initial offset with the values recored in ORIG_DRS_INIT.
11945
11946    Since the loop_vec_info of this EPILOGUE was constructed for the original
11947    loop, its stmt_vec_infos all point to the original statements.  These need
11948    to be updated to point to their corresponding copies as well as the SSA_NAMES
11949    in their PATTERN_DEF_SEQs and RELATED_STMTs.
11950
11951    The data_reference's connections also need to be updated.  Their
11952    corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11953    stmt_vec_infos, their statements need to point to their corresponding copy,
11954    if they are gather loads or scatter stores then their reference needs to be
11955    updated to point to its corresponding copy.  */
11956
11957 static void
11958 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11959 {
11960   loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11961   auto_vec<gimple *> stmt_worklist;
11962   hash_map<tree,tree> mapping;
11963   gimple *orig_stmt, *new_stmt;
11964   gimple_stmt_iterator epilogue_gsi;
11965   gphi_iterator epilogue_phi_gsi;
11966   stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11967   basic_block *epilogue_bbs = get_loop_body (epilogue);
11968   unsigned i;
11969
11970   free (LOOP_VINFO_BBS (epilogue_vinfo));
11971   LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11972   LOOP_VINFO_NBBS (epilogue_vinfo) = epilogue->num_nodes;
11973
11974   /* Advance data_reference's with the number of iterations of the previous
11975      loop and its prologue.  */
11976   vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11977
11978
11979   /* The EPILOGUE loop is a copy of the original loop so they share the same
11980      gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
11981      point to the copied statements.  We also create a mapping of all LHS' in
11982      the original loop and all the LHS' in the EPILOGUE and create worklists to
11983      update teh STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
11984   for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11985     {
11986       for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11987            !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11988         {
11989           new_stmt = epilogue_phi_gsi.phi ();
11990
11991           gcc_assert (gimple_uid (new_stmt) > 0);
11992           stmt_vinfo
11993             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11994
11995           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11996           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11997
11998           mapping.put (gimple_phi_result (orig_stmt),
11999                        gimple_phi_result (new_stmt));
12000           /* PHI nodes can not have patterns or related statements.  */
12001           gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
12002                       && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
12003         }
12004
12005       for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
12006            !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
12007         {
12008           new_stmt = gsi_stmt (epilogue_gsi);
12009           if (is_gimple_debug (new_stmt))
12010             continue;
12011
12012           gcc_assert (gimple_uid (new_stmt) > 0);
12013           stmt_vinfo
12014             = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
12015
12016           orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
12017           STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
12018
12019           if (tree old_lhs = gimple_get_lhs (orig_stmt))
12020             mapping.put (old_lhs, gimple_get_lhs (new_stmt));
12021
12022           if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
12023             {
12024               gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
12025               for (gimple_stmt_iterator gsi = gsi_start (seq);
12026                    !gsi_end_p (gsi); gsi_next (&gsi))
12027                 stmt_worklist.safe_push (gsi_stmt (gsi));
12028             }
12029
12030           related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
12031           if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
12032             {
12033               gimple *stmt = STMT_VINFO_STMT (related_vinfo);
12034               stmt_worklist.safe_push (stmt);
12035               /* Set BB such that the assert in
12036                 'get_initial_def_for_reduction' is able to determine that
12037                 the BB of the related stmt is inside this loop.  */
12038               gimple_set_bb (stmt,
12039                              gimple_bb (new_stmt));
12040               related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
12041               gcc_assert (related_vinfo == NULL
12042                           || related_vinfo == stmt_vinfo);
12043             }
12044         }
12045     }
12046
12047   /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
12048      using the original main loop and thus need to be updated to refer to the
12049      cloned variables used in the epilogue.  */
12050   for (unsigned i = 0; i < stmt_worklist.length (); ++i)
12051     {
12052       gimple *stmt = stmt_worklist[i];
12053       tree *new_op;
12054
12055       for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
12056         {
12057           tree op = gimple_op (stmt, j);
12058           if ((new_op = mapping.get(op)))
12059             gimple_set_op (stmt, j, *new_op);
12060           else
12061             {
12062               /* PR92429: The last argument of simplify_replace_tree disables
12063                  folding when replacing arguments.  This is required as
12064                  otherwise you might end up with different statements than the
12065                  ones analyzed in vect_loop_analyze, leading to different
12066                  vectorization.  */
12067               op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
12068                                           &find_in_mapping, &mapping, false);
12069               gimple_set_op (stmt, j, op);
12070             }
12071         }
12072     }
12073
12074   struct data_reference *dr;
12075   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
12076   FOR_EACH_VEC_ELT (datarefs, i, dr)
12077     {
12078       orig_stmt = DR_STMT (dr);
12079       gcc_assert (gimple_uid (orig_stmt) > 0);
12080       stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
12081       /* Data references for gather loads and scatter stores do not use the
12082          updated offset we set using ADVANCE.  Instead we have to make sure the
12083          reference in the data references point to the corresponding copy of
12084          the original in the epilogue.  Make sure to update both
12085          gather/scatters recognized by dataref analysis and also other
12086          refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
12087       auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
12088       if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
12089           || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
12090         {
12091           DR_REF (dr)
12092             = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
12093                                      &find_in_mapping, &mapping);
12094           DR_BASE_ADDRESS (dr)
12095             = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
12096                                      &find_in_mapping, &mapping);
12097         }
12098       DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
12099       stmt_vinfo->dr_aux.stmt = stmt_vinfo;
12100     }
12101
12102   epilogue_vinfo->shared->datarefs_copy.release ();
12103   epilogue_vinfo->shared->save_datarefs ();
12104 }
12105
12106 /*  When vectorizing early break statements instructions that happen before
12107     the early break in the current BB need to be moved to after the early
12108     break.  This function deals with that and assumes that any validity
12109     checks has already been performed.
12110
12111     While moving the instructions if it encounters a VUSE or VDEF it then
12112     corrects the VUSES as it moves the statements along.  GDEST is the location
12113     in which to insert the new statements.  */
12114
12115 static void
12116 move_early_exit_stmts (loop_vec_info loop_vinfo)
12117 {
12118   DUMP_VECT_SCOPE ("move_early_exit_stmts");
12119
12120   if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
12121     return;
12122
12123   /* Move all stmts that need moving.  */
12124   basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
12125   gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);
12126
12127   tree last_seen_vuse = NULL_TREE;
12128   for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
12129     {
12130       /* We have to update crossed degenerate virtual PHIs.  Simply
12131          elide them.  */
12132       if (gphi *vphi = dyn_cast <gphi *> (stmt))
12133         {
12134           tree vdef = gimple_phi_result (vphi);
12135           tree vuse = gimple_phi_arg_def (vphi, 0);
12136           imm_use_iterator iter;
12137           use_operand_p use_p;
12138           gimple *use_stmt;
12139           FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
12140             {
12141               FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
12142                 SET_USE (use_p, vuse);
12143             }
12144           auto gsi = gsi_for_stmt (stmt);
12145           remove_phi_node (&gsi, true);
12146           last_seen_vuse = vuse;
12147           continue;
12148         }
12149
12150       /* Check to see if statement is still required for vect or has been
12151          elided.  */
12152       auto stmt_info = loop_vinfo->lookup_stmt (stmt);
12153       if (!stmt_info)
12154         continue;
12155
12156       if (dump_enabled_p ())
12157         dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);
12158
12159       gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
12160       gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
12161       last_seen_vuse = gimple_vuse (stmt);
12162     }
12163
12164   /* Update all the stmts with their new reaching VUSES.  */
12165   for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
12166     {
12167       if (dump_enabled_p ())
12168           dump_printf_loc (MSG_NOTE, vect_location,
12169                            "updating vuse to %T for load %G",
12170                            last_seen_vuse, p);
12171       gimple_set_vuse (p, last_seen_vuse);
12172       update_stmt (p);
12173     }
12174
12175   /* And update the LC PHIs on exits.  */
12176   for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP  (loop_vinfo)))
12177     if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
12178       if (gphi *phi = get_virtual_phi (e->dest))
12179         SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
12180 }
12181
12182 /* Function vect_transform_loop.
12183
12184    The analysis phase has determined that the loop is vectorizable.
12185    Vectorize the loop - created vectorized stmts to replace the scalar
12186    stmts in the loop, and update the loop exit condition.
12187    Returns scalar epilogue loop if any.  */
12188
12189 class loop *
12190 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
12191 {
12192   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12193   class loop *epilogue = NULL;
12194   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
12195   int nbbs = loop->num_nodes;
12196   int i;
12197   tree niters_vector = NULL_TREE;
12198   tree step_vector = NULL_TREE;
12199   tree niters_vector_mult_vf = NULL_TREE;
12200   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12201   unsigned int lowest_vf = constant_lower_bound (vf);
12202   gimple *stmt;
12203   bool check_profitability = false;
12204   unsigned int th;
12205   bool flat = maybe_flat_loop_profile (loop);
12206
12207   DUMP_VECT_SCOPE ("vec_transform_loop");
12208
12209   loop_vinfo->shared->check_datarefs ();
12210
12211   /* Use the more conservative vectorization threshold.  If the number
12212      of iterations is constant assume the cost check has been performed
12213      by our caller.  If the threshold makes all loops profitable that
12214      run at least the (estimated) vectorization factor number of times
12215      checking is pointless, too.  */
12216   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
12217   if (vect_apply_runtime_profitability_check_p (loop_vinfo))
12218     {
12219       if (dump_enabled_p ())
12220         dump_printf_loc (MSG_NOTE, vect_location,
12221                          "Profitability threshold is %d loop iterations.\n",
12222                          th);
12223       check_profitability = true;
12224     }
12225
12226   /* Make sure there exists a single-predecessor exit bb.  Do this before
12227      versioning.   */
12228   edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
12229   if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12230     {
12231       split_loop_exit_edge (e, true);
12232       if (dump_enabled_p ())
12233         dump_printf (MSG_NOTE, "split exit edge\n");
12234     }
12235
12236   /* Version the loop first, if required, so the profitability check
12237      comes first.  */
12238
12239   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
12240     {
12241       class loop *sloop
12242         = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
12243       sloop->force_vectorize = false;
12244       check_profitability = false;
12245     }
12246
12247   /* Make sure there exists a single-predecessor exit bb also on the
12248      scalar loop copy.  Do this after versioning but before peeling
12249      so CFG structure is fine for both scalar and if-converted loop
12250      to make slpeel_duplicate_current_defs_from_edges face matched
12251      loop closed PHI nodes on the exit.  */
12252   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
12253     {
12254       e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
12255       if (! single_pred_p (e->dest))
12256         {
12257           split_loop_exit_edge (e, true);
12258           if (dump_enabled_p ())
12259             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
12260         }
12261     }
12262
12263   tree niters = vect_build_loop_niters (loop_vinfo);
12264   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
12265   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
12266   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
12267   tree advance;
12268   drs_init_vec orig_drs_init;
12269
12270   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
12271                               &step_vector, &niters_vector_mult_vf, th,
12272                               check_profitability, niters_no_overflow,
12273                               &advance);
12274   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
12275       && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
12276     {
12277       /* Ifcvt duplicates loop preheader, loop body and produces an basic
12278          block after loop exit.  We need to scale all that.  */
12279       basic_block preheader
12280         = loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
12281       preheader->count
12282         = preheader->count.apply_probability
12283               (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
12284       scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
12285                               LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
12286       LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
12287     }
12288
12289   if (niters_vector == NULL_TREE)
12290     {
12291       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
12292           && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12293           && known_eq (lowest_vf, vf))
12294         {
12295           niters_vector
12296             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
12297                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
12298           step_vector = build_one_cst (TREE_TYPE (niters));
12299         }
12300       else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
12301         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
12302                                      &step_vector, niters_no_overflow);
12303       else
12304         /* vect_do_peeling subtracted the number of peeled prologue
12305            iterations from LOOP_VINFO_NITERS.  */
12306         vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
12307                                      &niters_vector, &step_vector,
12308                                      niters_no_overflow);
12309     }
12310
12311   /* 1) Make sure the loop header has exactly two entries
12312      2) Make sure we have a preheader basic block.  */
12313
12314   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
12315
12316   split_edge (loop_preheader_edge (loop));
12317
12318   if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
12319     /* This will deal with any possible peeling.  */
12320     vect_prepare_for_masked_peels (loop_vinfo);
12321
12322   /* Handle any code motion that we need to for early-break vectorization after
12323      we've done peeling but just before we start vectorizing.  */
12324   if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12325     move_early_exit_stmts (loop_vinfo);
12326
12327   /* Schedule the SLP instances first, then handle loop vectorization
12328      below.  */
12329   if (!loop_vinfo->slp_instances.is_empty ())
12330     {
12331       DUMP_VECT_SCOPE ("scheduling SLP instances");
12332       vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
12333     }
12334
12335   /* FORNOW: the vectorizer supports only loops which body consist
12336      of one basic block (header + empty latch). When the vectorizer will
12337      support more involved loop forms, the order by which the BBs are
12338      traversed need to be reconsidered.  */
12339
12340   for (i = 0; i < nbbs; i++)
12341     {
12342       basic_block bb = bbs[i];
12343       stmt_vec_info stmt_info;
12344
12345       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12346            gsi_next (&si))
12347         {
12348           gphi *phi = si.phi ();
12349           if (dump_enabled_p ())
12350             dump_printf_loc (MSG_NOTE, vect_location,
12351                              "------>vectorizing phi: %G", (gimple *) phi);
12352           stmt_info = loop_vinfo->lookup_stmt (phi);
12353           if (!stmt_info)
12354             continue;
12355
12356           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
12357             vect_loop_kill_debug_uses (loop, stmt_info);
12358
12359           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12360               && !STMT_VINFO_LIVE_P (stmt_info))
12361             continue;
12362
12363           if (STMT_VINFO_VECTYPE (stmt_info)
12364               && (maybe_ne
12365                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
12366               && dump_enabled_p ())
12367             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
12368
12369           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12370                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12371                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12372                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12373                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
12374                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
12375               && ! PURE_SLP_STMT (stmt_info))
12376             {
12377               if (dump_enabled_p ())
12378                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
12379               vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
12380             }
12381         }
12382
12383       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
12384            gsi_next (&si))
12385         {
12386           gphi *phi = si.phi ();
12387           stmt_info = loop_vinfo->lookup_stmt (phi);
12388           if (!stmt_info)
12389             continue;
12390
12391           if (!STMT_VINFO_RELEVANT_P (stmt_info)
12392               && !STMT_VINFO_LIVE_P (stmt_info))
12393             continue;
12394
12395           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
12396                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
12397                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
12398                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
12399                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
12400                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
12401               && ! PURE_SLP_STMT (stmt_info))
12402             maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
12403         }
12404
12405       for (gimple_stmt_iterator si = gsi_start_bb (bb);
12406            !gsi_end_p (si);)
12407         {
12408           stmt = gsi_stmt (si);
12409           /* During vectorization remove existing clobber stmts and
12410              prefetches.  */
12411           if (gimple_clobber_p (stmt)
12412               || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
12413             {
12414               unlink_stmt_vdef (stmt);
12415               gsi_remove (&si, true);
12416               release_defs (stmt);
12417             }
12418           else
12419             {
12420               /* Ignore vector stmts created in the outer loop.  */
12421               stmt_info = loop_vinfo->lookup_stmt (stmt);
12422
12423               /* vector stmts created in the outer-loop during vectorization of
12424                  stmts in an inner-loop may not have a stmt_info, and do not
12425                  need to be vectorized.  */
12426               stmt_vec_info seen_store = NULL;
12427               if (stmt_info)
12428                 {
12429                   if (STMT_VINFO_IN_PATTERN_P (stmt_info))
12430                     {
12431                       gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
12432                       for (gimple_stmt_iterator subsi = gsi_start (def_seq);
12433                            !gsi_end_p (subsi); gsi_next (&subsi))
12434                         {
12435                           stmt_vec_info pat_stmt_info
12436                             = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
12437                           vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12438                                                     &si, &seen_store);
12439                         }
12440                       stmt_vec_info pat_stmt_info
12441                         = STMT_VINFO_RELATED_STMT (stmt_info);
12442                       if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
12443                                                     &si, &seen_store))
12444                         maybe_set_vectorized_backedge_value (loop_vinfo,
12445                                                              pat_stmt_info);
12446                     }
12447                   else
12448                     {
12449                       if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
12450                                                     &seen_store))
12451                         maybe_set_vectorized_backedge_value (loop_vinfo,
12452                                                              stmt_info);
12453                     }
12454                 }
12455               gsi_next (&si);
12456               if (seen_store)
12457                 {
12458                   if (STMT_VINFO_GROUPED_ACCESS (seen_store))
12459                     /* Interleaving.  If IS_STORE is TRUE, the
12460                        vectorization of the interleaving chain was
12461                        completed - free all the stores in the chain.  */
12462                     vect_remove_stores (loop_vinfo,
12463                                         DR_GROUP_FIRST_ELEMENT (seen_store));
12464                   else
12465                     /* Free the attached stmt_vec_info and remove the stmt.  */
12466                     loop_vinfo->remove_stmt (stmt_info);
12467                 }
12468             }
12469         }
12470
12471       /* Stub out scalar statements that must not survive vectorization.
12472          Doing this here helps with grouped statements, or statements that
12473          are involved in patterns.  */
12474       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
12475            !gsi_end_p (gsi); gsi_next (&gsi))
12476         {
12477           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
12478           if (!call || !gimple_call_internal_p (call))
12479             continue;
12480           internal_fn ifn = gimple_call_internal_fn (call);
12481           if (ifn == IFN_MASK_LOAD)
12482             {
12483               tree lhs = gimple_get_lhs (call);
12484               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12485                 {
12486                   tree zero = build_zero_cst (TREE_TYPE (lhs));
12487                   gimple *new_stmt = gimple_build_assign (lhs, zero);
12488                   gsi_replace (&gsi, new_stmt, true);
12489                 }
12490             }
12491           else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
12492             {
12493               tree lhs = gimple_get_lhs (call);
12494               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12495                 {
12496                   tree else_arg
12497                     = gimple_call_arg (call, gimple_call_num_args (call) - 1);
12498                   gimple *new_stmt = gimple_build_assign (lhs, else_arg);
12499                   gsi_replace (&gsi, new_stmt, true);
12500                 }
12501             }
12502         }
12503     }                           /* BBs in loop */
12504
12505   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
12506      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
12507   if (integer_onep (step_vector))
12508     niters_no_overflow = true;
12509   vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
12510                            niters_vector, step_vector, niters_vector_mult_vf,
12511                            !niters_no_overflow);
12512
12513   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
12514
12515   /* True if the final iteration might not handle a full vector's
12516      worth of scalar iterations.  */
12517   bool final_iter_may_be_partial
12518     = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
12519       || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);
12520
12521   /* +1 to convert latch counts to loop iteration counts.  */
12522   int bias_for_lowest = 1;
12523
12524   /* When we are peeling for gaps then we take away one scalar iteration
12525      from the vector loop.  Thus we can adjust the upper bound by one
12526      scalar iteration.  But only when we know the bound applies to the
12527      IV exit test which might not be true when we have multiple exits.  */
12528   if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
12529     bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
12530
12531   int bias_for_assumed = bias_for_lowest;
12532   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
12533   if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
12534     {
12535       /* When the amount of peeling is known at compile time, the first
12536          iteration will have exactly alignment_npeels active elements.
12537          In the worst case it will have at least one.  */
12538       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
12539       bias_for_lowest += lowest_vf - min_first_active;
12540       bias_for_assumed += assumed_vf - min_first_active;
12541     }
12542   /* In these calculations the "- 1" converts loop iteration counts
12543      back to latch counts.  */
12544   if (loop->any_upper_bound)
12545     {
12546       loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
12547       loop->nb_iterations_upper_bound
12548         = (final_iter_may_be_partial
12549            ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
12550                             lowest_vf) - 1
12551            : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
12552                              lowest_vf) - 1);
12553       if (main_vinfo
12554           /* Both peeling for alignment and peeling for gaps can end up
12555              with the scalar epilogue running for more than VF-1 iterations.  */
12556           && !main_vinfo->peeling_for_alignment
12557           && !main_vinfo->peeling_for_gaps)
12558         {
12559           unsigned int bound;
12560           poly_uint64 main_iters
12561             = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
12562                            LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
12563           main_iters
12564             = upper_bound (main_iters,
12565                            LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
12566           if (can_div_away_from_zero_p (main_iters,
12567                                         LOOP_VINFO_VECT_FACTOR (loop_vinfo),
12568                                         &bound))
12569             loop->nb_iterations_upper_bound
12570               = wi::umin ((bound_wide_int) (bound - 1),
12571                           loop->nb_iterations_upper_bound);
12572       }
12573   }
12574   if (loop->any_likely_upper_bound)
12575     loop->nb_iterations_likely_upper_bound
12576       = (final_iter_may_be_partial
12577          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
12578                           + bias_for_lowest, lowest_vf) - 1
12579          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
12580                            + bias_for_lowest, lowest_vf) - 1);
12581   if (loop->any_estimate)
12582     loop->nb_iterations_estimate
12583       = (final_iter_may_be_partial
12584          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
12585                           assumed_vf) - 1
12586          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
12587                            assumed_vf) - 1);
12588   scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
12589                                assumed_vf, flat);
12590
12591   if (dump_enabled_p ())
12592     {
12593       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
12594         {
12595           dump_printf_loc (MSG_NOTE, vect_location,
12596                            "LOOP VECTORIZED\n");
12597           if (loop->inner)
12598             dump_printf_loc (MSG_NOTE, vect_location,
12599                              "OUTER LOOP VECTORIZED\n");
12600           dump_printf (MSG_NOTE, "\n");
12601         }
12602       else
12603         dump_printf_loc (MSG_NOTE, vect_location,
12604                          "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
12605                          GET_MODE_NAME (loop_vinfo->vector_mode));
12606     }
12607
12608   /* Loops vectorized with a variable factor won't benefit from
12609      unrolling/peeling.  */
12610   if (!vf.is_constant ())
12611     {
12612       loop->unroll = 1;
12613       if (dump_enabled_p ())
12614         dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
12615                          " variable-length vectorization factor\n");
12616     }
12617   /* Free SLP instances here because otherwise stmt reference counting
12618      won't work.  */
12619   slp_instance instance;
12620   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
12621     vect_free_slp_instance (instance);
12622   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
12623   /* Clear-up safelen field since its value is invalid after vectorization
12624      since vectorized loop can have loop-carried dependencies.  */
12625   loop->safelen = 0;
12626
12627   if (epilogue)
12628     {
12629       update_epilogue_loop_vinfo (epilogue, advance);
12630
12631       epilogue->simduid = loop->simduid;
12632       epilogue->force_vectorize = loop->force_vectorize;
12633       epilogue->dont_vectorize = false;
12634     }
12635
12636   return epilogue;
12637 }
12638
12639 /* The code below is trying to perform simple optimization - revert
12640    if-conversion for masked stores, i.e. if the mask of a store is zero
12641    do not perform it and all stored value producers also if possible.
12642    For example,
12643      for (i=0; i<n; i++)
12644        if (c[i])
12645         {
12646           p1[i] += 1;
12647           p2[i] = p3[i] +2;
12648         }
12649    this transformation will produce the following semi-hammock:
12650
12651    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12652      {
12653        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12654        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12655        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12656        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12657        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12658        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12659      }
12660 */
12661
12662 void
12663 optimize_mask_stores (class loop *loop)
12664 {
12665   basic_block *bbs = get_loop_body (loop);
12666   unsigned nbbs = loop->num_nodes;
12667   unsigned i;
12668   basic_block bb;
12669   class loop *bb_loop;
12670   gimple_stmt_iterator gsi;
12671   gimple *stmt;
12672   auto_vec<gimple *> worklist;
12673   auto_purge_vect_location sentinel;
12674
12675   vect_location = find_loop_location (loop);
12676   /* Pick up all masked stores in loop if any.  */
12677   for (i = 0; i < nbbs; i++)
12678     {
12679       bb = bbs[i];
12680       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
12681            gsi_next (&gsi))
12682         {
12683           stmt = gsi_stmt (gsi);
12684           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
12685             worklist.safe_push (stmt);
12686         }
12687     }
12688
12689   free (bbs);
12690   if (worklist.is_empty ())
12691     return;
12692
12693   /* Loop has masked stores.  */
12694   while (!worklist.is_empty ())
12695     {
12696       gimple *last, *last_store;
12697       edge e, efalse;
12698       tree mask;
12699       basic_block store_bb, join_bb;
12700       gimple_stmt_iterator gsi_to;
12701       tree vdef, new_vdef;
12702       gphi *phi;
12703       tree vectype;
12704       tree zero;
12705
12706       last = worklist.pop ();
12707       mask = gimple_call_arg (last, 2);
12708       bb = gimple_bb (last);
12709       /* Create then_bb and if-then structure in CFG, then_bb belongs to
12710          the same loop as if_bb.  It could be different to LOOP when two
12711          level loop-nest is vectorized and mask_store belongs to the inner
12712          one.  */
12713       e = split_block (bb, last);
12714       bb_loop = bb->loop_father;
12715       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
12716       join_bb = e->dest;
12717       store_bb = create_empty_bb (bb);
12718       add_bb_to_loop (store_bb, bb_loop);
12719       e->flags = EDGE_TRUE_VALUE;
12720       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
12721       /* Put STORE_BB to likely part.  */
12722       efalse->probability = profile_probability::likely ();
12723       e->probability = efalse->probability.invert ();
12724       store_bb->count = efalse->count ();
12725       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
12726       if (dom_info_available_p (CDI_DOMINATORS))
12727         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
12728       if (dump_enabled_p ())
12729         dump_printf_loc (MSG_NOTE, vect_location,
12730                          "Create new block %d to sink mask stores.",
12731                          store_bb->index);
12732       /* Create vector comparison with boolean result.  */
12733       vectype = TREE_TYPE (mask);
12734       zero = build_zero_cst (vectype);
12735       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
12736       gsi = gsi_last_bb (bb);
12737       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
12738       /* Create new PHI node for vdef of the last masked store:
12739          .MEM_2 = VDEF <.MEM_1>
12740          will be converted to
12741          .MEM.3 = VDEF <.MEM_1>
12742          and new PHI node will be created in join bb
12743          .MEM_2 = PHI <.MEM_1, .MEM_3>
12744       */
12745       vdef = gimple_vdef (last);
12746       new_vdef = make_ssa_name (gimple_vop (cfun), last);
12747       gimple_set_vdef (last, new_vdef);
12748       phi = create_phi_node (vdef, join_bb);
12749       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
12750
12751       /* Put all masked stores with the same mask to STORE_BB if possible.  */
12752       while (true)
12753         {
12754           gimple_stmt_iterator gsi_from;
12755           gimple *stmt1 = NULL;
12756
12757           /* Move masked store to STORE_BB.  */
12758           last_store = last;
12759           gsi = gsi_for_stmt (last);
12760           gsi_from = gsi;
12761           /* Shift GSI to the previous stmt for further traversal.  */
12762           gsi_prev (&gsi);
12763           gsi_to = gsi_start_bb (store_bb);
12764           gsi_move_before (&gsi_from, &gsi_to);
12765           /* Setup GSI_TO to the non-empty block start.  */
12766           gsi_to = gsi_start_bb (store_bb);
12767           if (dump_enabled_p ())
12768             dump_printf_loc (MSG_NOTE, vect_location,
12769                              "Move stmt to created bb\n%G", last);
12770           /* Move all stored value producers if possible.  */
12771           while (!gsi_end_p (gsi))
12772             {
12773               tree lhs;
12774               imm_use_iterator imm_iter;
12775               use_operand_p use_p;
12776               bool res;
12777
12778               /* Skip debug statements.  */
12779               if (is_gimple_debug (gsi_stmt (gsi)))
12780                 {
12781                   gsi_prev (&gsi);
12782                   continue;
12783                 }
12784               stmt1 = gsi_stmt (gsi);
12785               /* Do not consider statements writing to memory or having
12786                  volatile operand.  */
12787               if (gimple_vdef (stmt1)
12788                   || gimple_has_volatile_ops (stmt1))
12789                 break;
12790               gsi_from = gsi;
12791               gsi_prev (&gsi);
12792               lhs = gimple_get_lhs (stmt1);
12793               if (!lhs)
12794                 break;
12795
12796               /* LHS of vectorized stmt must be SSA_NAME.  */
12797               if (TREE_CODE (lhs) != SSA_NAME)
12798                 break;
12799
12800               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
12801                 {
12802                   /* Remove dead scalar statement.  */
12803                   if (has_zero_uses (lhs))
12804                     {
12805                       gsi_remove (&gsi_from, true);
12806                       continue;
12807                     }
12808                 }
12809
12810               /* Check that LHS does not have uses outside of STORE_BB.  */
12811               res = true;
12812               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
12813                 {
12814                   gimple *use_stmt;
12815                   use_stmt = USE_STMT (use_p);
12816                   if (is_gimple_debug (use_stmt))
12817                     continue;
12818                   if (gimple_bb (use_stmt) != store_bb)
12819                     {
12820                       res = false;
12821                       break;
12822                     }
12823                 }
12824               if (!res)
12825                 break;
12826
12827               if (gimple_vuse (stmt1)
12828                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
12829                 break;
12830
12831               /* Can move STMT1 to STORE_BB.  */
12832               if (dump_enabled_p ())
12833                 dump_printf_loc (MSG_NOTE, vect_location,
12834                                  "Move stmt to created bb\n%G", stmt1);
12835               gsi_move_before (&gsi_from, &gsi_to);
12836               /* Shift GSI_TO for further insertion.  */
12837               gsi_prev (&gsi_to);
12838             }
12839           /* Put other masked stores with the same mask to STORE_BB.  */
12840           if (worklist.is_empty ()
12841               || gimple_call_arg (worklist.last (), 2) != mask
12842               || worklist.last () != stmt1)
12843             break;
12844           last = worklist.pop ();
12845         }
12846       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
12847     }
12848 }
12849
12850 /* Decide whether it is possible to use a zero-based induction variable
12851    when vectorizing LOOP_VINFO with partial vectors.  If it is, return
12852    the value that the induction variable must be able to hold in order
12853    to ensure that the rgroups eventually have no active vector elements.
12854    Return -1 otherwise.  */
12855
12856 widest_int
12857 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12858 {
12859   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12860   class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12861   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12862
12863   /* Calculate the value that the induction variable must be able
12864      to hit in order to ensure that we end the loop with an all-false mask.
12865      This involves adding the maximum number of inactive trailing scalar
12866      iterations.  */
12867   widest_int iv_limit = -1;
12868   if (max_loop_iterations (loop, &iv_limit))
12869     {
12870       if (niters_skip)
12871         {
12872           /* Add the maximum number of skipped iterations to the
12873              maximum iteration count.  */
12874           if (TREE_CODE (niters_skip) == INTEGER_CST)
12875             iv_limit += wi::to_widest (niters_skip);
12876           else
12877             iv_limit += max_vf - 1;
12878         }
12879       else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12880         /* Make a conservatively-correct assumption.  */
12881         iv_limit += max_vf - 1;
12882
12883       /* IV_LIMIT is the maximum number of latch iterations, which is also
12884          the maximum in-range IV value.  Round this value down to the previous
12885          vector alignment boundary and then add an extra full iteration.  */
12886       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12887       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12888     }
12889   return iv_limit;
12890 }
12891
12892 /* For the given rgroup_controls RGC, check whether an induction variable
12893    would ever hit a value that produces a set of all-false masks or zero
12894    lengths before wrapping around.  Return true if it's possible to wrap
12895    around before hitting the desirable value, otherwise return false.  */
12896
12897 bool
12898 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12899 {
12900   widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12901
12902   if (iv_limit == -1)
12903     return true;
12904
12905   tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12906   unsigned int compare_precision = TYPE_PRECISION (compare_type);
12907   unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12908
12909   if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12910     return true;
12911
12912   return false;
12913 }