2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #define INCLUDE_ALGORITHM
23 #define INCLUDE_MEMORY
26 #include "coretypes.h"
33 #include "tree-pass.h"
35 #include "optabs-tree.h"
38 #include "diagnostic-core.h"
39 #include "fold-const.h"
40 #include "stor-layout.h"
43 #include "gimple-iterator.h"
44 #include "gimplify-me.h"
45 #include "tree-ssa-loop-ivopts.h"
46 #include "tree-ssa-loop-manip.h"
47 #include "tree-ssa-loop-niter.h"
48 #include "tree-ssa-loop.h"
50 #include "tree-scalar-evolution.h"
51 #include "tree-vectorizer.h"
52 #include "gimple-fold.h"
55 #include "tree-if-conv.h"
56 #include "internal-fn.h"
57 #include "tree-vector-builder.h"
58 #include "vec-perm-indices.h"
60 #include "case-cfn-macros.h"
61 #include "langhooks.h"
63 /* Loop Vectorization Pass.
65 This pass tries to vectorize loops.
67 For example, the vectorizer transforms the following simple loop:
69 short a[N]; short b[N]; short c[N]; int i;
75 as if it was manually vectorized by rewriting the source code into:
77 typedef int __attribute__((mode(V8HI))) v8hi;
78 short a[N]; short b[N]; short c[N]; int i;
79 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
82 for (i=0; i<N/8; i++){
89 The main entry to this pass is vectorize_loops(), in which
90 the vectorizer applies a set of analyses on a given set of loops,
91 followed by the actual vectorization transformation for the loops that
92 had successfully passed the analysis phase.
93 Throughout this pass we make a distinction between two types of
94 data: scalars (which are represented by SSA_NAMES), and memory references
95 ("data-refs"). These two types of data require different handling both
96 during analysis and transformation. The types of data-refs that the
97 vectorizer currently supports are ARRAY_REFS which base is an array DECL
98 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
99 accesses are required to have a simple (consecutive) access pattern.
103 The driver for the analysis phase is vect_analyze_loop().
104 It applies a set of analyses, some of which rely on the scalar evolution
105 analyzer (scev) developed by Sebastian Pop.
107 During the analysis phase the vectorizer records some information
108 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
109 loop, as well as general information about the loop as a whole, which is
110 recorded in a "loop_vec_info" struct attached to each loop.
112 Transformation phase:
113 =====================
114 The loop transformation phase scans all the stmts in the loop, and
115 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
116 the loop that needs to be vectorized. It inserts the vector code sequence
117 just before the scalar stmt S, and records a pointer to the vector code
118 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
119 attached to S). This pointer will be used for the vectorization of following
120 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
121 otherwise, we rely on dead code elimination for removing it.
123 For example, say stmt S1 was vectorized into stmt VS1:
126 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
129 To vectorize stmt S2, the vectorizer first finds the stmt that defines
130 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
131 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
132 resulting sequence would be:
135 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
137 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
139 Operands that are not SSA_NAMEs, are data-refs that appear in
140 load/store operations (like 'x[i]' in S1), and are handled differently.
144 Currently the only target specific information that is used is the
145 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
146 Targets that can support different sizes of vectors, for now will need
147 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
148 flexibility will be added in the future.
150 Since we only vectorize operations which vector form can be
151 expressed using existing tree codes, to verify that an operation is
152 supported, the vectorizer checks the relevant optab at the relevant
153 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
154 the value found is CODE_FOR_nothing, then there's no target support, and
155 we can't vectorize the stmt.
157 For additional information on this project see:
158 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
161 static void vect_estimate_min_profitable_iters (loop_vec_info
, int *, int *,
163 static stmt_vec_info
vect_is_simple_reduction (loop_vec_info
, stmt_vec_info
,
164 bool *, bool *, bool);
166 /* Subroutine of vect_determine_vf_for_stmt that handles only one
167 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
168 may already be set for general statements (not just data refs). */
171 vect_determine_vf_for_stmt_1 (vec_info
*vinfo
, stmt_vec_info stmt_info
,
172 bool vectype_maybe_set_p
,
175 gimple
*stmt
= stmt_info
->stmt
;
177 if ((!STMT_VINFO_RELEVANT_P (stmt_info
)
178 && !STMT_VINFO_LIVE_P (stmt_info
))
179 || gimple_clobber_p (stmt
))
181 if (dump_enabled_p ())
182 dump_printf_loc (MSG_NOTE
, vect_location
, "skip.\n");
183 return opt_result::success ();
186 tree stmt_vectype
, nunits_vectype
;
187 opt_result res
= vect_get_vector_types_for_stmt (vinfo
, stmt_info
,
195 if (STMT_VINFO_VECTYPE (stmt_info
))
196 /* The only case when a vectype had been already set is for stmts
197 that contain a data ref, or for "pattern-stmts" (stmts generated
198 by the vectorizer to represent/replace a certain idiom). */
199 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info
)
200 || vectype_maybe_set_p
)
201 && STMT_VINFO_VECTYPE (stmt_info
) == stmt_vectype
);
203 STMT_VINFO_VECTYPE (stmt_info
) = stmt_vectype
;
207 vect_update_max_nunits (vf
, nunits_vectype
);
209 return opt_result::success ();
212 /* Subroutine of vect_determine_vectorization_factor. Set the vector
213 types of STMT_INFO and all attached pattern statements and update
214 the vectorization factor VF accordingly. Return true on success
215 or false if something prevented vectorization. */
218 vect_determine_vf_for_stmt (vec_info
*vinfo
,
219 stmt_vec_info stmt_info
, poly_uint64
*vf
)
221 if (dump_enabled_p ())
222 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining statement: %G",
224 opt_result res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, false, vf
);
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info
)
229 && STMT_VINFO_RELATED_STMT (stmt_info
))
231 gimple
*pattern_def_seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_info
);
232 stmt_info
= STMT_VINFO_RELATED_STMT (stmt_info
);
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si
= gsi_start (pattern_def_seq
);
236 !gsi_end_p (si
); gsi_next (&si
))
238 stmt_vec_info def_stmt_info
= vinfo
->lookup_stmt (gsi_stmt (si
));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE
, vect_location
,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info
->stmt
);
243 res
= vect_determine_vf_for_stmt_1 (vinfo
, def_stmt_info
, true, vf
);
248 if (dump_enabled_p ())
249 dump_printf_loc (MSG_NOTE
, vect_location
,
250 "==> examining pattern statement: %G",
252 res
= vect_determine_vf_for_stmt_1 (vinfo
, stmt_info
, true, vf
);
257 return opt_result::success ();
260 /* Function vect_determine_vectorization_factor
262 Determine the vectorization factor (VF). VF is the number of data elements
263 that are operated upon in parallel in a single iteration of the vectorized
264 loop. For example, when vectorizing a loop that operates on 4byte elements,
265 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
266 elements can fit in a single vector register.
268 We currently support vectorization of loops in which all types operated upon
269 are of the same size. Therefore this function currently sets VF according to
270 the size of the types operated upon, and fails if there are multiple sizes
273 VF is also the factor by which the loop iterations are strip-mined, e.g.:
280 for (i=0; i<N; i+=VF){
281 a[i:VF] = b[i:VF] + c[i:VF];
286 vect_determine_vectorization_factor (loop_vec_info loop_vinfo
)
288 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
289 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
290 unsigned nbbs
= loop
->num_nodes
;
291 poly_uint64 vectorization_factor
= 1;
292 tree scalar_type
= NULL_TREE
;
295 stmt_vec_info stmt_info
;
298 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
300 for (i
= 0; i
< nbbs
; i
++)
302 basic_block bb
= bbs
[i
];
304 for (gphi_iterator si
= gsi_start_phis (bb
); !gsi_end_p (si
);
308 stmt_info
= loop_vinfo
->lookup_stmt (phi
);
309 if (dump_enabled_p ())
310 dump_printf_loc (MSG_NOTE
, vect_location
, "==> examining phi: %G",
313 gcc_assert (stmt_info
);
315 if (STMT_VINFO_RELEVANT_P (stmt_info
)
316 || STMT_VINFO_LIVE_P (stmt_info
))
318 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info
));
319 scalar_type
= TREE_TYPE (PHI_RESULT (phi
));
321 if (dump_enabled_p ())
322 dump_printf_loc (MSG_NOTE
, vect_location
,
323 "get vectype for scalar type: %T\n",
326 vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
328 return opt_result::failure_at (phi
,
329 "not vectorized: unsupported "
332 STMT_VINFO_VECTYPE (stmt_info
) = vectype
;
334 if (dump_enabled_p ())
335 dump_printf_loc (MSG_NOTE
, vect_location
, "vectype: %T\n",
338 if (dump_enabled_p ())
340 dump_printf_loc (MSG_NOTE
, vect_location
, "nunits = ");
341 dump_dec (MSG_NOTE
, TYPE_VECTOR_SUBPARTS (vectype
));
342 dump_printf (MSG_NOTE
, "\n");
345 vect_update_max_nunits (&vectorization_factor
, vectype
);
349 for (gimple_stmt_iterator si
= gsi_start_bb (bb
); !gsi_end_p (si
);
352 if (is_gimple_debug (gsi_stmt (si
)))
354 stmt_info
= loop_vinfo
->lookup_stmt (gsi_stmt (si
));
356 = vect_determine_vf_for_stmt (loop_vinfo
,
357 stmt_info
, &vectorization_factor
);
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
366 dump_printf_loc (MSG_NOTE
, vect_location
, "vectorization factor = ");
367 dump_dec (MSG_NOTE
, vectorization_factor
);
368 dump_printf (MSG_NOTE
, "\n");
371 if (known_le (vectorization_factor
, 1U))
372 return opt_result::failure_at (vect_location
,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo
) = vectorization_factor
;
375 return opt_result::success ();
379 /* Function vect_is_simple_iv_evolution.
381 FORNOW: A simple evolution of an induction variables in the loop is
382 considered a polynomial evolution. */
385 vect_is_simple_iv_evolution (unsigned loop_nb
, tree access_fn
, tree
* init
,
390 tree evolution_part
= evolution_part_in_loop_num (access_fn
, loop_nb
);
393 /* When there is no evolution in this loop, the evolution function
395 if (evolution_part
== NULL_TREE
)
398 /* When the evolution is a polynomial of degree >= 2
399 the evolution function is not "simple". */
400 if (tree_is_chrec (evolution_part
))
403 step_expr
= evolution_part
;
404 init_expr
= unshare_expr (initial_condition_in_loop_num (access_fn
, loop_nb
));
406 if (dump_enabled_p ())
407 dump_printf_loc (MSG_NOTE
, vect_location
, "step: %T, init: %T\n",
408 step_expr
, init_expr
);
413 if (TREE_CODE (step_expr
) != INTEGER_CST
414 && (TREE_CODE (step_expr
) != SSA_NAME
415 || ((bb
= gimple_bb (SSA_NAME_DEF_STMT (step_expr
)))
416 && flow_bb_inside_loop_p (get_loop (cfun
, loop_nb
), bb
))
417 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
418 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
))
419 || !flag_associative_math
)))
420 && (TREE_CODE (step_expr
) != REAL_CST
421 || !flag_associative_math
))
423 if (dump_enabled_p ())
424 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
432 /* Function vect_is_nonlinear_iv_evolution
434 Only support nonlinear induction for integer type
437 3. lshift/rshift by constant.
439 For neg induction, return a fake step as integer -1. */
441 vect_is_nonlinear_iv_evolution (class loop
* loop
, stmt_vec_info stmt_info
,
442 gphi
* loop_phi_node
, tree
*init
, tree
*step
)
444 tree init_expr
, ev_expr
, result
, op1
, op2
;
447 if (gimple_phi_num_args (loop_phi_node
) != 2)
450 init_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_preheader_edge (loop
));
451 ev_expr
= PHI_ARG_DEF_FROM_EDGE (loop_phi_node
, loop_latch_edge (loop
));
453 /* Support nonlinear induction only for integer type. */
454 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr
)))
458 result
= PHI_RESULT (loop_phi_node
);
460 if (TREE_CODE (ev_expr
) != SSA_NAME
461 || ((def
= SSA_NAME_DEF_STMT (ev_expr
)), false)
462 || !is_gimple_assign (def
))
465 enum tree_code t_code
= gimple_assign_rhs_code (def
);
469 if (gimple_assign_rhs1 (def
) != result
)
471 *step
= build_int_cst (TREE_TYPE (init_expr
), -1);
472 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_neg
;
478 op1
= gimple_assign_rhs1 (def
);
479 op2
= gimple_assign_rhs2 (def
);
480 if (TREE_CODE (op2
) != INTEGER_CST
484 if (t_code
== LSHIFT_EXPR
)
485 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shl
;
486 else if (t_code
== RSHIFT_EXPR
)
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_shr
;
488 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
490 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
) = vect_step_op_mul
;
497 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info
) = *init
;
498 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
) = *step
;
503 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
504 what we are assuming is a double reduction. For example, given
505 a structure like this:
508 x_1 = PHI <x_4(outer2), ...>;
512 x_2 = PHI <x_1(outer1), ...>;
518 x_4 = PHI <x_3(inner)>;
521 outer loop analysis would treat x_1 as a double reduction phi and
522 this function would then return true for x_2. */
525 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo
, gphi
*phi
)
529 FOR_EACH_PHI_ARG (use_p
, phi
, op_iter
, SSA_OP_USE
)
530 if (stmt_vec_info def_info
= loop_vinfo
->lookup_def (USE_FROM_PTR (use_p
)))
531 if (STMT_VINFO_DEF_TYPE (def_info
) == vect_double_reduction_def
)
536 /* Returns true if Phi is a first-order recurrence. A first-order
537 recurrence is a non-reduction recurrence relation in which the value of
538 the recurrence in the current loop iteration equals a value defined in
539 the previous iteration. */
542 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo
, class loop
*loop
,
545 /* A nested cycle isn't vectorizable as first order recurrence. */
546 if (LOOP_VINFO_LOOP (loop_vinfo
) != loop
)
549 /* Ensure the loop latch definition is from within the loop. */
550 edge latch
= loop_latch_edge (loop
);
551 tree ldef
= PHI_ARG_DEF_FROM_EDGE (phi
, latch
);
552 if (TREE_CODE (ldef
) != SSA_NAME
553 || SSA_NAME_IS_DEFAULT_DEF (ldef
)
554 || is_a
<gphi
*> (SSA_NAME_DEF_STMT (ldef
))
555 || !flow_bb_inside_loop_p (loop
, gimple_bb (SSA_NAME_DEF_STMT (ldef
))))
558 tree def
= gimple_phi_result (phi
);
560 /* Ensure every use_stmt of the phi node is dominated by the latch
562 imm_use_iterator imm_iter
;
564 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, def
)
565 if (!is_gimple_debug (USE_STMT (use_p
))
566 && (SSA_NAME_DEF_STMT (ldef
) == USE_STMT (use_p
)
567 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef
),
571 /* First-order recurrence autovectorization needs shuffle vector. */
572 tree scalar_type
= TREE_TYPE (def
);
573 tree vectype
= get_vectype_for_scalar_type (loop_vinfo
, scalar_type
);
580 /* Function vect_analyze_scalar_cycles_1.
582 Examine the cross iteration def-use cycles of scalar variables
583 in LOOP. LOOP_VINFO represents the loop that is now being
584 considered for vectorization (can be LOOP, or an outer-loop
585 enclosing LOOP). SLP indicates there will be some subsequent
586 slp analyses or not. */
589 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo
, class loop
*loop
,
592 basic_block bb
= loop
->header
;
594 auto_vec
<stmt_vec_info
, 64> worklist
;
596 bool double_reduc
, reduc_chain
;
598 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
600 /* First - identify all inductions. Reduction detection assumes that all the
601 inductions have been identified, therefore, this order must not be
603 for (gsi
= gsi_start_phis (bb
); !gsi_end_p (gsi
); gsi_next (&gsi
))
605 gphi
*phi
= gsi
.phi ();
606 tree access_fn
= NULL
;
607 tree def
= PHI_RESULT (phi
);
608 stmt_vec_info stmt_vinfo
= loop_vinfo
->lookup_stmt (phi
);
610 if (dump_enabled_p ())
611 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
614 /* Skip virtual phi's. The data dependences that are associated with
615 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
616 if (virtual_operand_p (def
))
619 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_unknown_def_type
;
621 /* Analyze the evolution function. */
622 access_fn
= analyze_scalar_evolution (loop
, def
);
625 STRIP_NOPS (access_fn
);
626 if (dump_enabled_p ())
627 dump_printf_loc (MSG_NOTE
, vect_location
,
628 "Access function of PHI: %T\n", access_fn
);
629 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
630 = initial_condition_in_loop_num (access_fn
, loop
->num
);
631 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
)
632 = evolution_part_in_loop_num (access_fn
, loop
->num
);
636 || vect_inner_phi_in_double_reduction_p (loop_vinfo
, phi
)
637 || !vect_is_simple_iv_evolution (loop
->num
, access_fn
,
639 || (LOOP_VINFO_LOOP (loop_vinfo
) != loop
640 && TREE_CODE (step
) != INTEGER_CST
))
641 /* Only handle nonlinear iv for same loop. */
642 && (LOOP_VINFO_LOOP (loop_vinfo
) != loop
643 || !vect_is_nonlinear_iv_evolution (loop
, stmt_vinfo
,
646 worklist
.safe_push (stmt_vinfo
);
650 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo
)
652 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo
) != NULL_TREE
);
654 if (dump_enabled_p ())
655 dump_printf_loc (MSG_NOTE
, vect_location
, "Detected induction.\n");
656 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_induction_def
;
660 /* Second - identify all reductions and nested cycles. */
661 while (worklist
.length () > 0)
663 stmt_vec_info stmt_vinfo
= worklist
.pop ();
664 gphi
*phi
= as_a
<gphi
*> (stmt_vinfo
->stmt
);
665 tree def
= PHI_RESULT (phi
);
667 if (dump_enabled_p ())
668 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyze phi: %G",
671 gcc_assert (!virtual_operand_p (def
)
672 && STMT_VINFO_DEF_TYPE (stmt_vinfo
) == vect_unknown_def_type
);
674 stmt_vec_info reduc_stmt_info
675 = vect_is_simple_reduction (loop_vinfo
, stmt_vinfo
, &double_reduc
,
679 STMT_VINFO_REDUC_DEF (stmt_vinfo
) = reduc_stmt_info
;
680 STMT_VINFO_REDUC_DEF (reduc_stmt_info
) = stmt_vinfo
;
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_NOTE
, vect_location
,
685 "Detected double reduction.\n");
687 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_double_reduction_def
;
688 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_double_reduction_def
;
689 /* Make it accessible for SLP vectorization. */
690 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push (reduc_stmt_info
);
694 if (loop
!= LOOP_VINFO_LOOP (loop_vinfo
))
696 if (dump_enabled_p ())
697 dump_printf_loc (MSG_NOTE
, vect_location
,
698 "Detected vectorizable nested cycle.\n");
700 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_nested_cycle
;
704 if (dump_enabled_p ())
705 dump_printf_loc (MSG_NOTE
, vect_location
,
706 "Detected reduction.\n");
708 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_reduction_def
;
709 STMT_VINFO_DEF_TYPE (reduc_stmt_info
) = vect_reduction_def
;
710 /* Store the reduction cycles for possible vectorization in
711 loop-aware SLP if it was not detected as reduction
714 LOOP_VINFO_REDUCTIONS (loop_vinfo
).safe_push
719 else if (vect_phi_first_order_recurrence_p (loop_vinfo
, loop
, phi
))
720 STMT_VINFO_DEF_TYPE (stmt_vinfo
) = vect_first_order_recurrence
;
722 if (dump_enabled_p ())
723 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
724 "Unknown def-use cycle pattern.\n");
729 /* Function vect_analyze_scalar_cycles.
731 Examine the cross iteration def-use cycles of scalar variables, by
732 analyzing the loop-header PHIs of scalar variables. Classify each
733 cycle as one of the following: invariant, induction, reduction, unknown.
734 We do that for the loop represented by LOOP_VINFO, and also to its
735 inner-loop, if exists.
736 Examples for scalar cycles:
751 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo
, bool slp
)
753 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
755 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
, slp
);
757 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
758 Reductions in such inner-loop therefore have different properties than
759 the reductions in the nest that gets vectorized:
760 1. When vectorized, they are executed in the same order as in the original
761 scalar loop, so we can't change the order of computation when
763 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
764 current checks are too strict. */
767 vect_analyze_scalar_cycles_1 (loop_vinfo
, loop
->inner
, slp
);
770 /* Transfer group and reduction information from STMT_INFO to its
774 vect_fixup_reduc_chain (stmt_vec_info stmt_info
)
776 stmt_vec_info firstp
= STMT_VINFO_RELATED_STMT (stmt_info
);
778 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp
)
779 && REDUC_GROUP_FIRST_ELEMENT (stmt_info
));
780 REDUC_GROUP_SIZE (firstp
) = REDUC_GROUP_SIZE (stmt_info
);
783 stmtp
= STMT_VINFO_RELATED_STMT (stmt_info
);
784 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp
)
785 == STMT_VINFO_DEF_TYPE (stmt_info
));
786 REDUC_GROUP_FIRST_ELEMENT (stmtp
) = firstp
;
787 stmt_info
= REDUC_GROUP_NEXT_ELEMENT (stmt_info
);
789 REDUC_GROUP_NEXT_ELEMENT (stmtp
)
790 = STMT_VINFO_RELATED_STMT (stmt_info
);
795 /* Fixup scalar cycles that now have their stmts detected as patterns. */
798 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo
)
803 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
), i
, first
)
805 stmt_vec_info next
= REDUC_GROUP_NEXT_ELEMENT (first
);
808 if ((STMT_VINFO_IN_PATTERN_P (next
)
809 != STMT_VINFO_IN_PATTERN_P (first
))
810 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next
)) == -1)
812 next
= REDUC_GROUP_NEXT_ELEMENT (next
);
814 /* If all reduction chain members are well-formed patterns adjust
815 the group to group the pattern stmts instead. */
817 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first
)) != -1)
819 if (STMT_VINFO_IN_PATTERN_P (first
))
821 vect_fixup_reduc_chain (first
);
822 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
)[i
]
823 = STMT_VINFO_RELATED_STMT (first
);
826 /* If not all stmt in the chain are patterns or if we failed
827 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
828 it as regular reduction instead. */
831 stmt_vec_info vinfo
= first
;
832 stmt_vec_info last
= NULL
;
835 next
= REDUC_GROUP_NEXT_ELEMENT (vinfo
);
836 REDUC_GROUP_FIRST_ELEMENT (vinfo
) = NULL
;
837 REDUC_GROUP_NEXT_ELEMENT (vinfo
) = NULL
;
841 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first
))
843 loop_vinfo
->reductions
.safe_push (vect_stmt_to_vectorize (last
));
844 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo
).unordered_remove (i
);
850 /* Function vect_get_loop_niters.
852 Determine how many iterations the loop is executed and place it
853 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
854 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
855 niter information holds in ASSUMPTIONS.
857 Return the loop exit conditions. */
861 vect_get_loop_niters (class loop
*loop
, const_edge main_exit
, tree
*assumptions
,
862 tree
*number_of_iterations
, tree
*number_of_iterationsm1
)
864 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
866 conds
.create (exits
.length ());
867 class tree_niter_desc niter_desc
;
868 tree niter_assumptions
, niter
, may_be_zero
;
870 *assumptions
= boolean_true_node
;
871 *number_of_iterationsm1
= chrec_dont_know
;
872 *number_of_iterations
= chrec_dont_know
;
874 DUMP_VECT_SCOPE ("get_loop_niters");
876 if (exits
.is_empty ())
879 if (dump_enabled_p ())
880 dump_printf_loc (MSG_NOTE
, vect_location
, "Loop has %d exits.\n",
885 FOR_EACH_VEC_ELT (exits
, i
, exit
)
887 gcond
*cond
= get_loop_exit_condition (exit
);
889 conds
.safe_push (cond
);
891 if (dump_enabled_p ())
892 dump_printf_loc (MSG_NOTE
, vect_location
, "Analyzing exit %d...\n", i
);
894 if (exit
!= main_exit
)
897 may_be_zero
= NULL_TREE
;
898 if (!number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
899 || chrec_contains_undetermined (niter_desc
.niter
))
902 niter_assumptions
= niter_desc
.assumptions
;
903 may_be_zero
= niter_desc
.may_be_zero
;
904 niter
= niter_desc
.niter
;
906 if (may_be_zero
&& integer_zerop (may_be_zero
))
907 may_be_zero
= NULL_TREE
;
911 if (COMPARISON_CLASS_P (may_be_zero
))
913 /* Try to combine may_be_zero with assumptions, this can simplify
914 computation of niter expression. */
915 if (niter_assumptions
&& !integer_nonzerop (niter_assumptions
))
916 niter_assumptions
= fold_build2 (TRUTH_AND_EXPR
, boolean_type_node
,
918 fold_build1 (TRUTH_NOT_EXPR
,
922 niter
= fold_build3 (COND_EXPR
, TREE_TYPE (niter
), may_be_zero
,
923 build_int_cst (TREE_TYPE (niter
), 0),
924 rewrite_to_non_trapping_overflow (niter
));
926 may_be_zero
= NULL_TREE
;
928 else if (integer_nonzerop (may_be_zero
))
930 *number_of_iterationsm1
= build_int_cst (TREE_TYPE (niter
), 0);
931 *number_of_iterations
= build_int_cst (TREE_TYPE (niter
), 1);
938 /* Loop assumptions are based off the normal exit. */
939 *assumptions
= niter_assumptions
;
940 *number_of_iterationsm1
= niter
;
942 /* We want the number of loop header executions which is the number
943 of latch executions plus one.
944 ??? For UINT_MAX latch executions this number overflows to zero
945 for loops like do { n++; } while (n != 0); */
946 if (niter
&& !chrec_contains_undetermined (niter
))
948 niter
= fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
),
949 unshare_expr (niter
),
950 build_int_cst (TREE_TYPE (niter
), 1));
951 if (TREE_CODE (niter
) == INTEGER_CST
952 && TREE_CODE (*number_of_iterationsm1
) != INTEGER_CST
)
954 /* If we manage to fold niter + 1 into INTEGER_CST even when
955 niter is some complex expression, ensure back
956 *number_of_iterationsm1 is an INTEGER_CST as well. See
958 *number_of_iterationsm1
959 = fold_build2 (PLUS_EXPR
, TREE_TYPE (niter
), niter
,
960 build_minus_one_cst (TREE_TYPE (niter
)));
963 *number_of_iterations
= niter
;
966 if (dump_enabled_p ())
967 dump_printf_loc (MSG_NOTE
, vect_location
, "All loop exits successfully analyzed.\n");
972 /* Determine the main loop exit for the vectorizer. */
975 vec_init_loop_exit_info (class loop
*loop
)
977 /* Before we begin we must first determine which exit is the main one and
978 which are auxilary exits. */
979 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
980 if (exits
.length () == 1)
983 /* If we have multiple exits we only support counting IV at the moment.
984 Analyze all exits and return the last one we can analyze. */
985 class tree_niter_desc niter_desc
;
986 edge candidate
= NULL
;
987 for (edge exit
: exits
)
989 if (!get_loop_exit_condition (exit
))
992 if (number_of_iterations_exit_assumptions (loop
, exit
, &niter_desc
, NULL
)
993 && !chrec_contains_undetermined (niter_desc
.niter
))
995 tree may_be_zero
= niter_desc
.may_be_zero
;
996 if ((integer_zerop (may_be_zero
)
997 /* As we are handling may_be_zero that's not false by
998 rewriting niter to may_be_zero ? 0 : niter we require
1000 || (single_pred_p (loop
->latch
)
1001 && exit
->src
== single_pred (loop
->latch
)
1002 && (integer_nonzerop (may_be_zero
)
1003 || COMPARISON_CLASS_P (may_be_zero
))))
1005 || dominated_by_p (CDI_DOMINATORS
, exit
->src
,
1014 /* Function bb_in_loop_p
1016 Used as predicate for dfs order traversal of the loop bbs. */
1019 bb_in_loop_p (const_basic_block bb
, const void *data
)
1021 const class loop
*const loop
= (const class loop
*)data
;
1022 if (flow_bb_inside_loop_p (loop
, bb
))
1028 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1029 stmt_vec_info structs for all the stmts in LOOP_IN. */
1031 _loop_vec_info::_loop_vec_info (class loop
*loop_in
, vec_info_shared
*shared
)
1032 : vec_info (vec_info::loop
, shared
),
1034 num_itersm1 (NULL_TREE
),
1035 num_iters (NULL_TREE
),
1036 num_iters_unchanged (NULL_TREE
),
1037 num_iters_assumptions (NULL_TREE
),
1038 vector_costs (nullptr),
1039 scalar_costs (nullptr),
1041 versioning_threshold (0),
1042 vectorization_factor (0),
1043 main_loop_edge (nullptr),
1044 skip_main_loop_edge (nullptr),
1045 skip_this_loop_edge (nullptr),
1046 reusable_accumulators (),
1047 suggested_unroll_factor (1),
1048 max_vectorization_factor (0),
1049 mask_skip_niters (NULL_TREE
),
1050 rgroup_compare_type (NULL_TREE
),
1051 simd_if_cond (NULL_TREE
),
1052 partial_vector_style (vect_partial_vectors_none
),
1053 unaligned_dr (NULL
),
1054 peeling_for_alignment (0),
1058 slp_unrolling_factor (1),
1059 inner_loop_cost_factor (param_vect_inner_loop_cost_factor
),
1060 vectorizable (false),
1061 can_use_partial_vectors_p (param_vect_partial_vector_usage
!= 0),
1062 using_partial_vectors_p (false),
1063 using_decrementing_iv_p (false),
1064 using_select_vl_p (false),
1065 epil_using_partial_vectors_p (false),
1066 partial_load_store_bias (0),
1067 peeling_for_gaps (false),
1068 peeling_for_niter (false),
1069 early_breaks (false),
1070 no_data_dependencies (false),
1071 has_mask_store (false),
1072 scalar_loop_scaling (profile_probability::uninitialized ()),
1074 orig_loop_info (NULL
),
1075 vec_loop_iv_exit (NULL
),
1076 vec_epilogue_loop_iv_exit (NULL
),
1077 scalar_loop_iv_exit (NULL
)
1079 /* CHECKME: We want to visit all BBs before their successors (except for
1080 latch blocks, for which this assertion wouldn't hold). In the simple
1081 case of the loop forms we allow, a dfs order of the BBs would the same
1082 as reversed postorder traversal, so we are safe. */
1084 bbs
= XCNEWVEC (basic_block
, loop
->num_nodes
);
1085 nbbs
= dfs_enumerate_from (loop
->header
, 0, bb_in_loop_p
, bbs
,
1086 loop
->num_nodes
, loop
);
1087 gcc_assert (nbbs
== loop
->num_nodes
);
1089 for (unsigned int i
= 0; i
< nbbs
; i
++)
1091 basic_block bb
= bbs
[i
];
1092 gimple_stmt_iterator si
;
1094 for (si
= gsi_start_phis (bb
); !gsi_end_p (si
); gsi_next (&si
))
1096 gimple
*phi
= gsi_stmt (si
);
1097 gimple_set_uid (phi
, 0);
1101 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1103 gimple
*stmt
= gsi_stmt (si
);
1104 gimple_set_uid (stmt
, 0);
1105 if (is_gimple_debug (stmt
))
1108 /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
1109 third argument is the #pragma omp simd if (x) condition, when 0,
1110 loop shouldn't be vectorized, when non-zero constant, it should
1111 be vectorized normally, otherwise versioned with vectorized loop
1112 done if the condition is non-zero at runtime. */
1113 if (loop_in
->simduid
1114 && is_gimple_call (stmt
)
1115 && gimple_call_internal_p (stmt
)
1116 && gimple_call_internal_fn (stmt
) == IFN_GOMP_SIMD_LANE
1117 && gimple_call_num_args (stmt
) >= 3
1118 && TREE_CODE (gimple_call_arg (stmt
, 0)) == SSA_NAME
1119 && (loop_in
->simduid
1120 == SSA_NAME_VAR (gimple_call_arg (stmt
, 0))))
1122 tree arg
= gimple_call_arg (stmt
, 2);
1123 if (integer_zerop (arg
) || TREE_CODE (arg
) == SSA_NAME
)
1126 gcc_assert (integer_nonzerop (arg
));
1131 epilogue_vinfos
.create (6);
1134 /* Free all levels of rgroup CONTROLS. */
1137 release_vec_loop_controls (vec
<rgroup_controls
> *controls
)
1139 rgroup_controls
*rgc
;
1141 FOR_EACH_VEC_ELT (*controls
, i
, rgc
)
1142 rgc
->controls
.release ();
1143 controls
->release ();
1146 /* Free all memory used by the _loop_vec_info, as well as all the
1147 stmt_vec_info structs of all the stmts in the loop. */
1149 _loop_vec_info::~_loop_vec_info ()
1153 release_vec_loop_controls (&masks
.rgc_vec
);
1154 release_vec_loop_controls (&lens
);
1157 epilogue_vinfos
.release ();
1158 delete scalar_costs
;
1159 delete vector_costs
;
1161 /* When we release an epiloge vinfo that we do not intend to use
1162 avoid clearing AUX of the main loop which should continue to
1163 point to the main loop vinfo since otherwise we'll leak that. */
1164 if (loop
->aux
== this)
1168 /* Return an invariant or register for EXPR and emit necessary
1169 computations in the LOOP_VINFO loop preheader. */
1172 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo
, tree expr
)
1174 if (is_gimple_reg (expr
)
1175 || is_gimple_min_invariant (expr
))
1178 if (! loop_vinfo
->ivexpr_map
)
1179 loop_vinfo
->ivexpr_map
= new hash_map
<tree_operand_hash
, tree
>;
1180 tree
&cached
= loop_vinfo
->ivexpr_map
->get_or_insert (expr
);
1183 gimple_seq stmts
= NULL
;
1184 cached
= force_gimple_operand (unshare_expr (expr
),
1185 &stmts
, true, NULL_TREE
);
1188 edge e
= loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo
));
1189 gsi_insert_seq_on_edge_immediate (e
, stmts
);
1195 /* Return true if we can use CMP_TYPE as the comparison type to produce
1196 all masks required to mask LOOP_VINFO. */
1199 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo
, tree cmp_type
)
1201 rgroup_controls
*rgm
;
1203 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
, i
, rgm
)
1204 if (rgm
->type
!= NULL_TREE
1205 && !direct_internal_fn_supported_p (IFN_WHILE_ULT
,
1206 cmp_type
, rgm
->type
,
1207 OPTIMIZE_FOR_SPEED
))
1212 /* Calculate the maximum number of scalars per iteration for every
1213 rgroup in LOOP_VINFO. */
1216 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo
)
1218 unsigned int res
= 1;
1220 rgroup_controls
*rgm
;
1221 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
, i
, rgm
)
1222 res
= MAX (res
, rgm
->max_nscalars_per_iter
);
1226 /* Calculate the minimum precision necessary to represent:
1230 as an unsigned integer, where MAX_NITERS is the maximum number of
1231 loop header iterations for the original scalar form of LOOP_VINFO. */
1234 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo
, unsigned int factor
)
1236 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1238 /* Get the maximum number of iterations that is representable
1239 in the counter type. */
1240 tree ni_type
= TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo
));
1241 widest_int max_ni
= wi::to_widest (TYPE_MAX_VALUE (ni_type
)) + 1;
1243 /* Get a more refined estimate for the number of iterations. */
1244 widest_int max_back_edges
;
1245 if (max_loop_iterations (loop
, &max_back_edges
))
1246 max_ni
= wi::smin (max_ni
, max_back_edges
+ 1);
1248 /* Work out how many bits we need to represent the limit. */
1249 return wi::min_precision (max_ni
* factor
, UNSIGNED
);
1252 /* True if the loop needs peeling or partial vectors when vectorized. */
1255 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo
)
1257 unsigned HOST_WIDE_INT const_vf
;
1258 HOST_WIDE_INT max_niter
1259 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo
));
1261 unsigned th
= LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo
);
1262 if (!th
&& LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo
))
1263 th
= LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
1267 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) >= 0)
1269 /* Work out the (constant) number of iterations that need to be
1270 peeled for reasons other than niters. */
1271 unsigned int peel_niter
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
1272 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
))
1274 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo
) - peel_niter
,
1275 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)))
1278 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
)
1279 /* ??? When peeling for gaps but not alignment, we could
1280 try to check whether the (variable) niters is known to be
1281 VF * N + 1. That's something of a niche case though. */
1282 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
)
1283 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo
).is_constant (&const_vf
)
1284 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo
))
1285 < (unsigned) exact_log2 (const_vf
))
1286 /* In case of versioning, check if the maximum number of
1287 iterations is greater than th. If they are identical,
1288 the epilogue is unnecessary. */
1289 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
1290 || ((unsigned HOST_WIDE_INT
) max_niter
1291 /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
1292 but that's only computed later based on our result.
1293 The following is the most conservative approximation. */
1294 > (std::max ((unsigned HOST_WIDE_INT
) th
,
1295 const_vf
) / const_vf
) * const_vf
))))
1301 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1302 whether we can actually generate the masks required. Return true if so,
1303 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1306 vect_verify_full_masking (loop_vec_info loop_vinfo
)
1308 unsigned int min_ni_width
;
1310 /* Use a normal loop if there are no statements that need masking.
1311 This only happens in rare degenerate cases: it means that the loop
1312 has no loads, no stores, and no live-out values. */
1313 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1316 /* Produce the rgroup controls. */
1317 for (auto mask
: LOOP_VINFO_MASKS (loop_vinfo
).mask_set
)
1319 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
1320 tree vectype
= mask
.first
;
1321 unsigned nvectors
= mask
.second
;
1323 if (masks
->rgc_vec
.length () < nvectors
)
1324 masks
->rgc_vec
.safe_grow_cleared (nvectors
, true);
1325 rgroup_controls
*rgm
= &(*masks
).rgc_vec
[nvectors
- 1];
1326 /* The number of scalars per iteration and the number of vectors are
1327 both compile-time constants. */
1328 unsigned int nscalars_per_iter
1329 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
1330 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
1332 if (rgm
->max_nscalars_per_iter
< nscalars_per_iter
)
1334 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
1335 rgm
->type
= truth_type_for (vectype
);
1340 unsigned int max_nscalars_per_iter
1341 = vect_get_max_nscalars_per_iter (loop_vinfo
);
1343 /* Work out how many bits we need to represent the limit. */
1345 = vect_min_prec_for_max_niters (loop_vinfo
, max_nscalars_per_iter
);
1347 /* Find a scalar mode for which WHILE_ULT is supported. */
1348 opt_scalar_int_mode cmp_mode_iter
;
1349 tree cmp_type
= NULL_TREE
;
1350 tree iv_type
= NULL_TREE
;
1351 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1352 unsigned int iv_precision
= UINT_MAX
;
1355 iv_precision
= wi::min_precision (iv_limit
* max_nscalars_per_iter
,
1358 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1360 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1361 if (cmp_bits
>= min_ni_width
1362 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1364 tree this_type
= build_nonstandard_integer_type (cmp_bits
, true);
1366 && can_produce_all_loop_masks_p (loop_vinfo
, this_type
))
1368 /* Although we could stop as soon as we find a valid mode,
1369 there are at least two reasons why that's not always the
1372 - An IV that's Pmode or wider is more likely to be reusable
1373 in address calculations than an IV that's narrower than
1376 - Doing the comparison in IV_PRECISION or wider allows
1377 a natural 0-based IV, whereas using a narrower comparison
1378 type requires mitigations against wrap-around.
1380 Conversely, if the IV limit is variable, doing the comparison
1381 in a wider type than the original type can introduce
1382 unnecessary extensions, so picking the widest valid mode
1383 is not always a good choice either.
1385 Here we prefer the first IV type that's Pmode or wider,
1386 and the first comparison type that's IV_PRECISION or wider.
1387 (The comparison type must be no wider than the IV type,
1388 to avoid extensions in the vector loop.)
1390 ??? We might want to try continuing beyond Pmode for ILP32
1391 targets if CMP_BITS < IV_PRECISION. */
1392 iv_type
= this_type
;
1393 if (!cmp_type
|| iv_precision
> TYPE_PRECISION (cmp_type
))
1394 cmp_type
= this_type
;
1395 if (cmp_bits
>= GET_MODE_BITSIZE (Pmode
))
1403 LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.release ();
1407 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = cmp_type
;
1408 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1409 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_while_ult
;
1413 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1414 whether we can actually generate AVX512 style masks. Return true if so,
1415 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1418 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo
)
1420 /* Produce differently organized rgc_vec and differently check
1421 we can produce masks. */
1423 /* Use a normal loop if there are no statements that need masking.
1424 This only happens in rare degenerate cases: it means that the loop
1425 has no loads, no stores, and no live-out values. */
1426 if (LOOP_VINFO_MASKS (loop_vinfo
).is_empty ())
1429 /* For the decrementing IV we need to represent all values in
1430 [0, niter + niter_skip] where niter_skip is the elements we
1431 skip in the first iteration for prologue peeling. */
1432 tree iv_type
= NULL_TREE
;
1433 widest_int iv_limit
= vect_iv_limit_for_partial_vectors (loop_vinfo
);
1434 unsigned int iv_precision
= UINT_MAX
;
1436 iv_precision
= wi::min_precision (iv_limit
, UNSIGNED
);
1438 /* First compute the type for the IV we use to track the remaining
1439 scalar iterations. */
1440 opt_scalar_int_mode cmp_mode_iter
;
1441 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1443 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1444 if (cmp_bits
>= iv_precision
1445 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1447 iv_type
= build_nonstandard_integer_type (cmp_bits
, true);
1455 /* Produce the rgroup controls. */
1456 for (auto const &mask
: LOOP_VINFO_MASKS (loop_vinfo
).mask_set
)
1458 vec_loop_masks
*masks
= &LOOP_VINFO_MASKS (loop_vinfo
);
1459 tree vectype
= mask
.first
;
1460 unsigned nvectors
= mask
.second
;
1462 /* The number of scalars per iteration and the number of vectors are
1463 both compile-time constants. */
1464 unsigned int nscalars_per_iter
1465 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
1466 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
1468 /* We index the rgroup_controls vector with nscalars_per_iter
1469 which we keep constant and instead have a varying nvectors,
1470 remembering the vector mask with the fewest nV. */
1471 if (masks
->rgc_vec
.length () < nscalars_per_iter
)
1472 masks
->rgc_vec
.safe_grow_cleared (nscalars_per_iter
, true);
1473 rgroup_controls
*rgm
= &(*masks
).rgc_vec
[nscalars_per_iter
- 1];
1475 if (!rgm
->type
|| rgm
->factor
> nvectors
)
1477 rgm
->type
= truth_type_for (vectype
);
1478 rgm
->compare_type
= NULL_TREE
;
1479 rgm
->max_nscalars_per_iter
= nscalars_per_iter
;
1480 rgm
->factor
= nvectors
;
1481 rgm
->bias_adjusted_ctrl
= NULL_TREE
;
1485 /* There is no fixed compare type we are going to use but we have to
1486 be able to get at one for each mask group. */
1487 unsigned int min_ni_width
1488 = wi::min_precision (vect_max_vf (loop_vinfo
), UNSIGNED
);
1491 for (auto &rgc
: LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
)
1493 tree mask_type
= rgc
.type
;
1497 /* For now vect_get_loop_mask only supports integer mode masks
1498 when we need to split it. */
1499 if (GET_MODE_CLASS (TYPE_MODE (mask_type
)) != MODE_INT
1500 || TYPE_PRECISION (TREE_TYPE (mask_type
)) != 1)
1506 /* If iv_type is usable as compare type use that - we can elide the
1507 saturation in that case. */
1508 if (TYPE_PRECISION (iv_type
) >= min_ni_width
)
1511 = build_vector_type (iv_type
, TYPE_VECTOR_SUBPARTS (mask_type
));
1512 if (expand_vec_cmp_expr_p (cmp_vectype
, mask_type
, LT_EXPR
))
1513 rgc
.compare_type
= cmp_vectype
;
1515 if (!rgc
.compare_type
)
1516 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter
, MODE_INT
)
1518 unsigned int cmp_bits
= GET_MODE_BITSIZE (cmp_mode_iter
.require ());
1519 if (cmp_bits
>= min_ni_width
1520 && targetm
.scalar_mode_supported_p (cmp_mode_iter
.require ()))
1522 tree cmp_type
= build_nonstandard_integer_type (cmp_bits
, true);
1526 /* Check whether we can produce the mask with cmp_type. */
1528 = build_vector_type (cmp_type
, TYPE_VECTOR_SUBPARTS (mask_type
));
1529 if (expand_vec_cmp_expr_p (cmp_vectype
, mask_type
, LT_EXPR
))
1531 rgc
.compare_type
= cmp_vectype
;
1536 if (!rgc
.compare_type
)
1544 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
);
1548 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = error_mark_node
;
1549 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1550 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_avx512
;
1554 /* Check whether we can use vector access with length based on precison
1555 comparison. So far, to keep it simple, we only allow the case that the
1556 precision of the target supported length is larger than the precision
1557 required by loop niters. */
1560 vect_verify_loop_lens (loop_vec_info loop_vinfo
)
1562 if (LOOP_VINFO_LENS (loop_vinfo
).is_empty ())
1565 machine_mode len_load_mode
, len_store_mode
;
1566 if (!get_len_load_store_mode (loop_vinfo
->vector_mode
, true)
1567 .exists (&len_load_mode
))
1569 if (!get_len_load_store_mode (loop_vinfo
->vector_mode
, false)
1570 .exists (&len_store_mode
))
1573 signed char partial_load_bias
= internal_len_load_store_bias
1574 (IFN_LEN_LOAD
, len_load_mode
);
1576 signed char partial_store_bias
= internal_len_load_store_bias
1577 (IFN_LEN_STORE
, len_store_mode
);
1579 gcc_assert (partial_load_bias
== partial_store_bias
);
1581 if (partial_load_bias
== VECT_PARTIAL_BIAS_UNSUPPORTED
)
1584 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1585 len_loads with a length of zero. In order to avoid that we prohibit
1586 more than one loop length here. */
1587 if (partial_load_bias
== -1
1588 && LOOP_VINFO_LENS (loop_vinfo
).length () > 1)
1591 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) = partial_load_bias
;
1593 unsigned int max_nitems_per_iter
= 1;
1595 rgroup_controls
*rgl
;
1596 /* Find the maximum number of items per iteration for every rgroup. */
1597 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), i
, rgl
)
1599 unsigned nitems_per_iter
= rgl
->max_nscalars_per_iter
* rgl
->factor
;
1600 max_nitems_per_iter
= MAX (max_nitems_per_iter
, nitems_per_iter
);
1603 /* Work out how many bits we need to represent the length limit. */
1604 unsigned int min_ni_prec
1605 = vect_min_prec_for_max_niters (loop_vinfo
, max_nitems_per_iter
);
1607 /* Now use the maximum of below precisions for one suitable IV type:
1608 - the IV's natural precision
1609 - the precision needed to hold: the maximum number of scalar
1610 iterations multiplied by the scale factor (min_ni_prec above)
1611 - the Pmode precision
1613 If min_ni_prec is less than the precision of the current niters,
1614 we perfer to still use the niters type. Prefer to use Pmode and
1615 wider IV to avoid narrow conversions. */
1617 unsigned int ni_prec
1618 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo
)));
1619 min_ni_prec
= MAX (min_ni_prec
, ni_prec
);
1620 min_ni_prec
= MAX (min_ni_prec
, GET_MODE_BITSIZE (Pmode
));
1622 tree iv_type
= NULL_TREE
;
1623 opt_scalar_int_mode tmode_iter
;
1624 FOR_EACH_MODE_IN_CLASS (tmode_iter
, MODE_INT
)
1626 scalar_mode tmode
= tmode_iter
.require ();
1627 unsigned int tbits
= GET_MODE_BITSIZE (tmode
);
1629 /* ??? Do we really want to construct one IV whose precision exceeds
1631 if (tbits
> BITS_PER_WORD
)
1634 /* Find the first available standard integral type. */
1635 if (tbits
>= min_ni_prec
&& targetm
.scalar_mode_supported_p (tmode
))
1637 iv_type
= build_nonstandard_integer_type (tbits
, true);
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1646 "can't vectorize with length-based partial vectors"
1647 " because there is no suitable iv type.\n");
1651 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
) = iv_type
;
1652 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
) = iv_type
;
1653 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
) = vect_partial_vectors_len
;
1658 /* Calculate the cost of one scalar iteration of the loop. */
1660 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo
)
1662 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1663 basic_block
*bbs
= LOOP_VINFO_BBS (loop_vinfo
);
1664 int nbbs
= loop
->num_nodes
, factor
;
1665 int innerloop_iters
, i
;
1667 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1669 /* Gather costs for statements in the scalar loop. */
1672 innerloop_iters
= 1;
1674 innerloop_iters
= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo
);
1676 for (i
= 0; i
< nbbs
; i
++)
1678 gimple_stmt_iterator si
;
1679 basic_block bb
= bbs
[i
];
1681 if (bb
->loop_father
== loop
->inner
)
1682 factor
= innerloop_iters
;
1686 for (si
= gsi_start_bb (bb
); !gsi_end_p (si
); gsi_next (&si
))
1688 gimple
*stmt
= gsi_stmt (si
);
1689 stmt_vec_info stmt_info
= loop_vinfo
->lookup_stmt (stmt
);
1691 if (!is_gimple_assign (stmt
) && !is_gimple_call (stmt
))
1694 /* Skip stmts that are not vectorized inside the loop. */
1695 stmt_vec_info vstmt_info
= vect_stmt_to_vectorize (stmt_info
);
1696 if (!STMT_VINFO_RELEVANT_P (vstmt_info
)
1697 && (!STMT_VINFO_LIVE_P (vstmt_info
)
1698 || !VECTORIZABLE_CYCLE_DEF
1699 (STMT_VINFO_DEF_TYPE (vstmt_info
))))
1702 vect_cost_for_stmt kind
;
1703 if (STMT_VINFO_DATA_REF (stmt_info
))
1705 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
1708 kind
= scalar_store
;
1710 else if (vect_nop_conversion_p (stmt_info
))
1715 /* We are using vect_prologue here to avoid scaling twice
1716 by the inner loop factor. */
1717 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
),
1718 factor
, kind
, stmt_info
, 0, vect_prologue
);
1722 /* Now accumulate cost. */
1723 loop_vinfo
->scalar_costs
= init_cost (loop_vinfo
, true);
1724 add_stmt_costs (loop_vinfo
->scalar_costs
,
1725 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
));
1726 loop_vinfo
->scalar_costs
->finish_cost (nullptr);
1729 /* Function vect_analyze_loop_form.
1731 Verify that certain CFG restrictions hold, including:
1732 - the loop has a pre-header
1733 - the loop has a single entry
1734 - nested loops can have only a single exit.
1735 - the loop exit condition is simple enough
1736 - the number of iterations can be analyzed, i.e, a countable loop. The
1737 niter could be analyzed under some assumptions. */
1740 vect_analyze_loop_form (class loop
*loop
, gimple
*loop_vectorized_call
,
1741 vect_loop_form_info
*info
)
1743 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1745 edge exit_e
= vec_init_loop_exit_info (loop
);
1747 return opt_result::failure_at (vect_location
,
1749 " could not determine main exit from"
1750 " loop with multiple exits.\n");
1751 if (loop_vectorized_call
)
1753 tree arg
= gimple_call_arg (loop_vectorized_call
, 1);
1754 class loop
*scalar_loop
= get_loop (cfun
, tree_to_shwi (arg
));
1755 edge scalar_exit_e
= vec_init_loop_exit_info (scalar_loop
);
1757 return opt_result::failure_at (vect_location
,
1759 " could not determine main exit from"
1760 " loop with multiple exits.\n");
1763 info
->loop_exit
= exit_e
;
1764 if (dump_enabled_p ())
1765 dump_printf_loc (MSG_NOTE
, vect_location
,
1766 "using as main loop exit: %d -> %d [AUX: %p]\n",
1767 exit_e
->src
->index
, exit_e
->dest
->index
, exit_e
->aux
);
1769 /* Check if we have any control flow that doesn't leave the loop. */
1770 basic_block
*bbs
= get_loop_body (loop
);
1771 for (unsigned i
= 0; i
< loop
->num_nodes
; i
++)
1772 if (EDGE_COUNT (bbs
[i
]->succs
) != 1
1773 && (EDGE_COUNT (bbs
[i
]->succs
) != 2
1774 || !loop_exits_from_bb_p (bbs
[i
]->loop_father
, bbs
[i
])))
1777 return opt_result::failure_at (vect_location
,
1779 " unsupported control flow in loop.\n");
1783 /* Different restrictions apply when we are considering an inner-most loop,
1784 vs. an outer (nested) loop.
1785 (FORNOW. May want to relax some of these restrictions in the future). */
1787 info
->inner_loop_cond
= NULL
;
1790 /* Inner-most loop. */
1792 if (empty_block_p (loop
->header
))
1793 return opt_result::failure_at (vect_location
,
1794 "not vectorized: empty loop.\n");
1798 class loop
*innerloop
= loop
->inner
;
1801 /* Nested loop. We currently require that the loop is doubly-nested,
1802 contains a single inner loop with a single exit to the block
1803 with the single exit condition in the outer loop.
1804 Vectorizable outer-loops look like this:
1816 The inner-loop also has the properties expected of inner-most loops
1817 as described above. */
1819 if ((loop
->inner
)->inner
|| (loop
->inner
)->next
)
1820 return opt_result::failure_at (vect_location
,
1822 " multiple nested loops.\n");
1824 entryedge
= loop_preheader_edge (innerloop
);
1825 if (entryedge
->src
!= loop
->header
1826 || !single_exit (innerloop
)
1827 || single_exit (innerloop
)->dest
!= EDGE_PRED (loop
->latch
, 0)->src
)
1828 return opt_result::failure_at (vect_location
,
1830 " unsupported outerloop form.\n");
1832 /* Analyze the inner-loop. */
1833 vect_loop_form_info inner
;
1834 opt_result res
= vect_analyze_loop_form (loop
->inner
, NULL
, &inner
);
1837 if (dump_enabled_p ())
1838 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
1839 "not vectorized: Bad inner loop.\n");
1843 /* Don't support analyzing niter under assumptions for inner
1845 if (!integer_onep (inner
.assumptions
))
1846 return opt_result::failure_at (vect_location
,
1847 "not vectorized: Bad inner loop.\n");
1849 if (!expr_invariant_in_loop_p (loop
, inner
.number_of_iterations
))
1850 return opt_result::failure_at (vect_location
,
1851 "not vectorized: inner-loop count not"
1854 if (dump_enabled_p ())
1855 dump_printf_loc (MSG_NOTE
, vect_location
,
1856 "Considering outer-loop vectorization.\n");
1857 info
->inner_loop_cond
= inner
.conds
[0];
1860 if (EDGE_COUNT (loop
->header
->preds
) != 2)
1861 return opt_result::failure_at (vect_location
,
1863 " too many incoming edges.\n");
1865 /* We assume that the latch is empty. */
1866 basic_block latch
= loop
->latch
;
1869 if (!empty_block_p (latch
)
1870 || !gimple_seq_empty_p (phi_nodes (latch
)))
1871 return opt_result::failure_at (vect_location
,
1872 "not vectorized: latch block not "
1874 latch
= single_pred (latch
);
1876 while (single_succ_p (latch
));
1878 /* Make sure there is no abnormal exit. */
1879 auto_vec
<edge
> exits
= get_loop_exit_edges (loop
);
1880 for (edge e
: exits
)
1882 if (e
->flags
& EDGE_ABNORMAL
)
1883 return opt_result::failure_at (vect_location
,
1885 " abnormal loop exit edge.\n");
1889 = vect_get_loop_niters (loop
, exit_e
, &info
->assumptions
,
1890 &info
->number_of_iterations
,
1891 &info
->number_of_iterationsm1
);
1892 if (info
->conds
.is_empty ())
1893 return opt_result::failure_at
1895 "not vectorized: complicated exit condition.\n");

  /* Determine what the primary and alternate exit conds are.  */
  for (unsigned i = 0; i < info->conds.length (); i++)
    {
      gcond *cond = info->conds[i];
      if (exit_e->src == gimple_bb (cond))
        std::swap (info->conds[0], info->conds[i]);
    }

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations = 0.\n");

  if (!(tree_fits_shwi_p (info->number_of_iterations)
        && tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  return opt_result::success ();
}

/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result.  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
                        const vect_loop_form_info *info,
                        loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  for (gcond *cond : info->conds)
    {
      stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
      STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* Mark the statement as a condition.  */
      STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
    }

  for (unsigned i = 1; i < info->conds.length (); i++)
    LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
  LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];

  LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;

  /* Check to see if we're vectorizing multiple exits.  */
  LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
    = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();

  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
        = loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
         loop use that to limit the scale for costing, otherwise use
         --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
        LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
          = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
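
/* As an illustrative example (numbers made up for this comment): with the
   default --param vect-inner-loop-cost-factor=50, an inner loop whose
   estimated statement-execution count is 8 makes the clamp above use
   min (8, 50) = 8, so statements in the outer-loop body are costed as if
   the inner loop ran 8 times rather than 50.  */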

/* Scan the loop stmts and, depending on whether there are any (non-)SLP
   statements, update the vectorization factor.  */

static void
vect_update_vf_for_slp (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor;
  int i;

  DUMP_VECT_SCOPE ("vect_update_vf_for_slp");

  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  /* If all the stmts in the loop can be SLPed, we perform only SLP, and
     the vectorization factor of the loop is the unrolling factor required
     by the SLP instances.  If that unrolling factor is 1, we say that we
     perform pure SLP on the loop - cross-iteration parallelism is not
     exploited.  */
  bool only_slp_in_loop = true;
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
          if (!stmt_info)
            continue;
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          stmt_info = vect_stmt_to_vectorize (stmt_info);
          if ((STMT_VINFO_RELEVANT_P (stmt_info)
               || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
              && !PURE_SLP_STMT (stmt_info))
            /* STMT needs both SLP and loop-based vectorization.  */
            only_slp_in_loop = false;
        }
    }

  if (only_slp_in_loop)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains only SLP stmts\n");
      vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
    }
  else
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Loop contains SLP and non-SLP stmts\n");
      /* Both the vectorization factor and unroll factor have the form
         GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
         so they must have a common multiple.  */
      vectorization_factor
        = force_common_multiple (vectorization_factor,
                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
    }

  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Updating vectorization factor to ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ".\n");
    }
}

/* Return true if STMT_INFO describes a double reduction phi and if
   the other phi in the reduction is also relevant for vectorization.
   This rejects cases such as:

      outer1:
        x_1 = PHI <x_3(outer2), ...>;
        ...

      inner:
        x_2 = ...;
        ...

      outer2:
        x_3 = PHI <x_2(inner)>;

   if nothing in x_2 or elsewhere makes x_1 relevant.  */

static bool
vect_active_double_reduction_p (stmt_vec_info stmt_info)
{
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
}

/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gphi *phi = si.phi ();
          ok = true;

          stmt_info = loop_vinfo->lookup_stmt (phi);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
                             (gimple *) phi);
          if (virtual_operand_p (gimple_phi_result (phi)))
            continue;

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires us to actually do something here.  */
              if (STMT_VINFO_LIVE_P (stmt_info)
                  && !vect_active_double_reduction_p (stmt_info))
                return opt_result::failure_at (phi,
                                               "Unsupported loop-closed phi"
                                               " in outer-loop.\n");

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;

                  if (gimple_phi_num_args (phi) != 1)
                    return opt_result::failure_at (phi, "unsupported phi");

                  phi_op = PHI_ARG_DEF (phi, 0);
                  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
                  if (!op_def_info)
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
                      && (STMT_VINFO_RELEVANT (op_def_info)
                          != vect_used_in_outer_by_reduction))
                    return opt_result::failure_at (phi, "unsupported phi\n");

                  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
                       || (STMT_VINFO_DEF_TYPE (stmt_info)
                           == vect_double_reduction_def))
                      && !vectorizable_lc_phi (loop_vinfo,
                                               stmt_info, NULL, NULL))
                    return opt_result::failure_at (phi, "unsupported phi\n");
                }

              continue;
            }

          gcc_assert (stmt_info);

          if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
               || STMT_VINFO_LIVE_P (stmt_info))
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
            /* A scalar-dependence cycle that we don't support.  */
            return opt_result::failure_at (phi,
                                           "not vectorized:"
                                           " scalar dependence cycle.\n");

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
                  && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_induction (loop_vinfo,
                                             stmt_info, NULL, NULL,
                                             &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
                        || (STMT_VINFO_DEF_TYPE (stmt_info)
                            == vect_double_reduction_def)
                        || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
                       && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_reduction (loop_vinfo,
                                             stmt_info, NULL, NULL, &cost_vec);
              else if ((STMT_VINFO_DEF_TYPE (stmt_info)
                        == vect_first_order_recurrence)
                       && ! PURE_SLP_STMT (stmt_info))
                ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
                                          &cost_vec);
            }

          /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
          if (ok
              && STMT_VINFO_LIVE_P (stmt_info)
              && !PURE_SLP_STMT (stmt_info))
            ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
                                              -1, false, &cost_vec);

          if (!ok)
            return opt_result::failure_at (phi,
                                           "not vectorized: relevant phi not "
                                           "supported: %G",
                                           static_cast <gimple *> (phi));
        }

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
           gsi_next (&si))
        {
          gimple *stmt = gsi_stmt (si);
          if (!gimple_clobber_p (stmt)
              && !is_gimple_debug (stmt))
            {
              opt_result res
                = vect_analyze_stmt (loop_vinfo,
                                     loop_vinfo->lookup_stmt (stmt),
                                     &need_to_vectorize,
                                     NULL, NULL, &cost_vec);
              if (!res)
                return res;
            }
        }
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
        (vect_location,
         "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}

/* Return true if we know that the iteration count is smaller than the
   vectorization factor.  Return false if it isn't, or if we can't be sure
   either way.  */

static bool
vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
{
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  HOST_WIDE_INT max_niter;
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
  else
    max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
    return true;

  return false;
}

/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
                           unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && vect_known_niters_smaller_than_vf (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
                         "vectorization factor.\n");
      return 0;
    }

  /* If we know the number of iterations we can do better, for the
     epilogue we can also decide whether the main loop leaves us
     with enough iterations, preferring a smaller vector epilogue that is
     then also possibly used for the case in which we skip the vector
     loop.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      widest_int scalar_niters
        = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
        {
          loop_vec_info orig_loop_vinfo
            = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
          unsigned lowest_vf
            = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
          int prolog_peeling = 0;
          if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
            prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
          if (prolog_peeling >= 0
              && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
                           lowest_vf))
            {
              unsigned gap
                = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
              scalar_niters = ((scalar_niters - gap - prolog_peeling)
                               % lowest_vf + gap);
            }
        }
      /* Reject vectorizing for a single scalar iteration, even if
         we could in principle implement that using partial vectors.  */
      unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
      if (scalar_niters <= peeling_gap + 1)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: loop only has a single "
                             "scalar iteration.\n");
          return 0;
        }

      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        {
          /* Check that the loop processes at least one full vector.  */
          poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
          if (known_lt (scalar_niters, vf))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "loop does not have enough iterations "
                                 "to support vectorization.\n");
              return 0;
            }

          /* If we need to peel an extra epilogue iteration to handle data
             accesses with gaps, check that there are enough scalar iterations
             available.

             The check above is redundant with this one when peeling for gaps,
             but the distinction is useful for diagnostics.  */
          if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
              && known_le (scalar_niters, vf))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "loop does not have enough iterations "
                                 "to support peeling for gaps.\n");
              return 0;
            }
        }
    }

  /* If using the "very cheap" model, reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
          || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate,
                                      suggested_unroll_factor);

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
                               * assumed_vf);

  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
                                    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
                         "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
                         " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "one iteration of the vector loop would be"
                         " more expensive than the equivalent number of"
                         " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
        estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
          < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: estimated iteration count too "
                         "small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: estimated iteration count smaller "
                         "than specified loop bound parameter or minimum "
                         "profitable iterations (whichever is more "
                         "conservative).\n");
      return -1;
    }

  return 1;
}
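
/* As an illustrative example (numbers made up for this comment): if the
   cost model computes min_profitable_iters = 12 for an assumed VF of 4 and
   --param min-vect-loop-bound is left at 0, the threshold above becomes
   th = MAX (0 * 4, 12) = 12.  A loop known to run only 10 iterations is
   then rejected, while a loop with an unknown count keeps the threshold
   for a runtime check of roughly "niters >= 12" before entering the
   vector code.  */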
static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
                           vec<data_reference_p> *datarefs,
                           unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        if (is_gimple_debug (stmt))
          continue;
        ++(*n_stmts);
        opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
                                                        NULL, 0);
        if (!res)
          {
            if (is_gimple_call (stmt) && loop->safelen)
              {
                tree fndecl = gimple_call_fndecl (stmt), op;
                if (fndecl == NULL_TREE
                    && gimple_call_internal_p (stmt, IFN_MASK_CALL))
                  {
                    fndecl = gimple_call_arg (stmt, 0);
                    gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
                    fndecl = TREE_OPERAND (fndecl, 0);
                    gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
                  }
                if (fndecl != NULL_TREE)
                  {
                    cgraph_node *node = cgraph_node::get (fndecl);
                    if (node != NULL && node->simd_clones != NULL)
                      {
                        unsigned int j, n = gimple_call_num_args (stmt);
                        for (j = 0; j < n; j++)
                          {
                            op = gimple_call_arg (stmt, j);
                            if (DECL_P (op)
                                || (REFERENCE_CLASS_P (op)
                                    && get_base_address (op)))
                              break;
                          }
                        op = gimple_call_lhs (stmt);
                        /* Ignore #pragma omp declare simd functions
                           if they don't have data references in the
                           call stmt itself.  */
                        if (j == n
                            && !(op
                                 && (DECL_P (op)
                                     || (REFERENCE_CLASS_P (op)
                                         && get_base_address (op)))))
                          continue;
                      }
                  }
              }
            return res;
          }
        /* If dependence analysis will give up due to the limit on the
           number of datarefs stop here and fail fatally.  */
        if (datarefs->length ()
            > (unsigned) param_loop_max_datarefs_for_datadeps)
          return opt_result::failure_at (stmt, "exceeded param "
                                         "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
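
/* For instance (an illustrative sketch assuming a user-declared simd
   function f), in

       #pragma omp declare simd
       extern int f (int);
       ...
       #pragma omp simd
       for (i = 0; i < n; i++)
         {
           int t = f (i);
           b[i] = t;
         }

   the call to f carries no data reference of its own, so with
   loop->safelen set the failed data-reference analysis of the call is
   ignored above and the loop remains a candidate for using f's simd
   clone.  */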

/* Look for SLP-only access groups and turn each individual access into its own
   group.  */
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info
        = vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
          dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
          unsigned int group_size = DR_GROUP_SIZE (first_element);

          /* Check if SLP-only groups.  */
          if (!STMT_SLP_TYPE (stmt_info)
              && STMT_VINFO_SLP_VECT_ONLY (first_element))
            {
              /* Dissolve the group.  */
              STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

              stmt_vec_info vinfo = first_element;
              while (vinfo)
                {
                  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
                  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
                  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
                  DR_GROUP_SIZE (vinfo) = 1;
                  if (STMT_VINFO_STRIDED_P (first_element)
                      /* We cannot handle stores with gaps.  */
                      || DR_IS_WRITE (dr_info->dr))
                    {
                      STMT_VINFO_STRIDED_P (vinfo) = true;
                      DR_GROUP_GAP (vinfo) = 0;
                    }
                  else
                    DR_GROUP_GAP (vinfo) = group_size - 1;
                  /* Duplicate and adjust alignment info, it needs to
                     be present on each group leader, see dr_misalignment.  */
                  if (vinfo != first_element)
                    {
                      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
                      dr_info2->target_alignment = dr_info->target_alignment;
                      int misalignment = dr_info->misalignment;
                      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
                        {
                          HOST_WIDE_INT diff
                            = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
                               - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
                          unsigned HOST_WIDE_INT align_c
                            = dr_info->target_alignment.to_constant ();
                          misalignment = (misalignment + diff) % align_c;
                        }
                      dr_info2->misalignment = misalignment;
                    }
                  vinfo = next;
                }
            }
        }
    }
}

/* Determine if operating on full vectors for LOOP_VINFO might leave
   some scalar iterations still to do.  If so, decide how we should
   handle those scalar iterations.  The possibilities are:

   (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
       In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
         LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == false

   (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
       to handle the remaining scalar iterations.  In this case:

         LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
         LOOP_VINFO_PEELING_FOR_NITER == true

       There are two choices:

       (2a) Consider vectorizing the epilogue loop at the same VF as the
            main loop, but using partial vectors instead of full vectors.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true

       (2b) Consider vectorizing the epilogue loop at lower VFs only.
            In this case:

              LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false  */
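
/* As a worked example (illustrative numbers only): with VF = 4 and a loop
   of 10 scalar iterations, option (1) runs three partial-vector iterations
   covering 4 + 4 + 2 elements and needs no scalar epilogue, while option
   (2) runs two full-vector iterations covering 8 elements and leaves 2
   iterations for the epilogue loop, which can itself be vectorized as in
   (2a) or (2b) or left scalar.  */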

static opt_result
vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
{
  /* Determine whether there would be any scalar iterations left over.  */
  bool need_peeling_or_partial_vectors_p
    = vect_need_peeling_or_partial_vectors_p (loop_vinfo);

  /* Decide whether to vectorize the loop with partial vectors.  */
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && need_peeling_or_partial_vectors_p)
    {
      /* For partial-vector-usage=1, try to push the handling of partial
         vectors to the epilogue, with the main loop continuing to operate
         on full vectors.

         If we are unrolling we also do not want to use partial vectors.
         This is to avoid the overhead of generating multiple masks and also
         to avoid having to execute entire iterations of FALSE masked
         instructions when dealing with one or fewer full iterations.

         ??? We could then end up failing to use partial vectors if we
         decide to peel iterations into a prologue, and if the main loop
         then ends up processing fewer than VF iterations.  */
      if ((param_vect_partial_vector_usage == 1
           || loop_vinfo->suggested_unroll_factor > 1)
          && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
          && !vect_known_niters_smaller_than_vf (loop_vinfo))
        LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
      else
        LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "operating on %s vectors%s.\n",
                     LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
                     ? "partial" : "full",
                     LOOP_VINFO_EPILOGUE_P (loop_vinfo)
                     ? " for epilogue loop" : "");

  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
    = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
       && need_peeling_or_partial_vectors_p);

  /* We set LOOP_VINFO_USING_SELECT_VL_P to true before loop vectorization
     analysis, when we do not yet know whether the loop will be vectorized
     with partial vectors (for more details see tree-vect-loop-manip.cc).

     However, the SELECT_VL vectorization style should only be applied with
     partial vectorization, since SELECT_VL is the GIMPLE IR that computes
     the number of elements to be processed in each iteration.

     After loop vectorization analysis, clear LOOP_VINFO_USING_SELECT_VL_P
     if the loop is not using partial vectors.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;

  return opt_result::success ();
}

/* Function vect_analyze_loop_2.

   Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
   analyses will record information in some members of LOOP_VINFO.  FATAL
   indicates if some analysis meets fatal error.  If a non-NULL pointer
   SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled with
   the worked-out suggested unroll factor, while a NULL pointer shows we
   are going to apply the suggested unroll factor.  SLP_DONE_FOR_SUGGESTED_UF
   is to hold the slp decision when the suggested unroll factor is worked
   out.  */
static opt_result
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
                     unsigned *suggested_unroll_factor,
                     unsigned& slp_done_for_suggested_uf)
{
  opt_result ok = opt_result::success ();
  int res;
  unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
  poly_uint64 min_vf = 2;
  loop_vec_info orig_loop_vinfo = NULL;

  /* If we are dealing with an epilogue then orig_loop_vinfo points to the
     loop_vec_info of the first vectorized loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  else
    orig_loop_vinfo = loop_vinfo;
  gcc_assert (orig_loop_vinfo);

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
      && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
    return opt_result::failure_at (vect_location,
                                   "not vectorized: simd if(0)\n");

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  */

  loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Gather the data references and count stmts in the loop.  */
  if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
    {
      opt_result res
        = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
                                     &LOOP_VINFO_DATAREFS (loop_vinfo),
                                     &LOOP_VINFO_N_STMTS (loop_vinfo));
      if (!res)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: loop contains function "
                             "calls or data references that cannot "
                             "be analyzed\n");
          return res;
        }
      loop_vinfo->shared->save_datarefs ();
    }
  else
    loop_vinfo->shared->check_datarefs ();

  /* Analyze the data references and also adjust the minimal
     vectorization factor according to the loads and stores.  */

  ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.\n");
      return ok;
    }

  /* Check if we are applying the unroll factor now.  */
  bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
  gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);

  /* If the slp decision is false when the suggested unroll factor is worked
     out, and we are applying the suggested unroll factor, we can simply skip
     all slp related analyses this time.  */
  unsigned slp = !applying_suggested_uf ? 2 : slp_done_for_suggested_uf;

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
  vect_analyze_scalar_cycles (loop_vinfo, slp == 2);

  vect_pattern_recog (loop_vinfo);

  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.).  FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.\n");
      return ok;
    }

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.\n");
      return ok;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.\n");
      return ok;
    }
  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, min_vf))
    return opt_result::failure_at (vect_location, "bad data dependence.\n");
  LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.\n");
      return ok;
    }

  /* Compute the scalar iteration cost.  */
  vect_compute_single_scalar_iteration_cost (loop_vinfo);

  poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  bool saved_can_use_partial_vectors_p
    = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);

  /* This is the point where we can re-start analysis with SLP forced off.  */
start_over:

  /* Check the SLP opportunities in the loop, analyze and build
     SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo),
                         slp == 1);
  if (!ok)
    return ok;

  /* If there are any SLP instances mark them as pure_slp.  */
  if (vect_make_slp_decision (loop_vinfo))
    {
      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);

      /* Update the vectorization factor based on the SLP decision.  */
      vect_update_vf_for_slp (loop_vinfo);

      /* Optimize the SLP graph with the vectorization factor fixed.  */
      vect_optimize_slp (loop_vinfo);

      /* Gather the loads reachable from the SLP graph entries.  */
      vect_gather_slp_loads (loop_vinfo);
    }

  /* We don't expect to have to roll back to anything other than an empty
     set of rgroups.  */
  gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());

  /* When we arrive here with SLP disabled and we are supposed
     to use SLP for everything fail vectorization.  */
  if (!slp && param_vect_force_slp)
    return opt_result::failure_at (vect_location,
                                   "may need non-SLP handling\n");

  /* Apply the suggested unrolling factor, this was determined by the backend
     during finish_cost the first time we ran the analysis for this
     vector mode.  */
  if (applying_suggested_uf)
    LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;

  /* Now the vectorization factor is final.  */
  poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (known_ne (vectorization_factor, 0U));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "vectorization_factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, ", niters = %wd\n",
                   LOOP_VINFO_INT_NITERS (loop_vinfo));
    }

  if (max_vf != MAX_VECTORIZATION_FACTOR
      && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
    return opt_result::failure_at (vect_location, "bad data dependence.\n");

  loop_vinfo->vector_costs = init_cost (loop_vinfo, false);

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return ok;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    return ok;

  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization, since we do not want to add extra peeling or
     add versioning for alignment.  */
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    /* This pass will decide on using loop versioning and/or loop peeling in
       order to enhance the alignment of data references in the loop.  */
    ok = vect_enhance_data_refs_alignment (loop_vinfo);
  if (!ok)
    return ok;

  if (slp)
    {
      /* Analyze operations in the SLP instances.  We can't simply
         remove unsupported SLP instances as this makes the above
         SLP kind detection invalid and might also affect the VF.  */
      if (! vect_slp_analyze_operations (loop_vinfo))
        {
          ok = opt_result::failure_at (vect_location,
                                       "unsupported SLP instances\n");
          goto again;
        }
    }

  /* Dissolve SLP-only groups.  */
  vect_dissolve_slp_only_groups (loop_vinfo);

  /* Scan all the remaining operations in the loop that are not subject
     to SLP and make sure they are vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
      return ok;
    }

  /* For now, we don't expect to mix both masking and length approaches for one
     loop, disable it if both are recorded.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
      && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't vectorize a loop with partial vectors"
                         " because we don't expect to mix different"
                         " approaches with partial vectors for the"
                         " same loop.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }

  /* If we still have the option of using partial vectors,
     check whether we can generate the necessary loop controls.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    {
      if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
        {
          if (!vect_verify_full_masking (loop_vinfo)
              && !vect_verify_full_masking_avx512 (loop_vinfo))
            LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
        }
      else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
        if (!vect_verify_loop_lens (loop_vinfo))
          LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }

  /* If we're vectorizing a loop that uses length "controls" and
     can iterate more than once, we apply decrementing IV approach
     to the loop.  */
  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
      && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
      && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
      && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
           && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
                        LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
    LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;

  /* If a loop uses length controls and has a decrementing loop control IV,
     we will normally pass that IV through a MIN_EXPR to calculate the
     basis for the length controls.  E.g. in a loop that processes one
     element per scalar iteration, the number of elements would be
     MIN_EXPR <N, VF>, where N is the number of scalar iterations left.

     This MIN_EXPR approach allows us to use pointer IVs with an invariant
     step, since only the final iteration of the vector loop can have
     inactive lanes.

     However, some targets have a dedicated instruction for calculating the
     preferred length, given the total number of elements that still need to
     be processed.  This is encapsulated in the SELECT_VL internal function.

     If the target supports SELECT_VL, we can use it instead of MIN_EXPR
     to determine the basis for the length controls.  However, unlike the
     MIN_EXPR calculation, the SELECT_VL calculation can decide to make
     lanes inactive in any iteration of the vector loop, not just the last
     iteration.  This SELECT_VL approach therefore requires us to use pointer
     IVs with variable steps.

     Once we've decided how many elements should be processed by one
     iteration of the vector loop, we need to populate the rgroup controls.
     If a loop has multiple rgroups, we need to make sure that those rgroups
     "line up" (that is, they must be consistent about which elements are
     active and which aren't).  This is done by vect_adjust_loop_lens_control.

     In principle, it would be possible to use vect_adjust_loop_lens_control
     on either the result of a MIN_EXPR or the result of a SELECT_VL.
     However:

     (1) In practice, it only makes sense to use SELECT_VL when a vector
         operation will be controlled directly by the result.  It is not
         worth using SELECT_VL if it would only be the input to other
         calculations.

     (2) If we use SELECT_VL for an rgroup that has N controls, each associated
         pointer IV will need N updates by a variable amount (N-1 updates
         within the iteration and 1 update to move to the next iteration).

     Because of this, we prefer to use the MIN_EXPR approach whenever there
     is more than one length control.

     In addition, SELECT_VL always operates to a granularity of 1 unit.
     If we wanted to use it to control an SLP operation on N consecutive
     elements, we would need to make the SELECT_VL inputs measure scalar
     iterations (rather than elements) and then multiply the SELECT_VL
     result by N.  But using SELECT_VL this way is inefficient because
     of the extra multiplication it would require in every iteration.

     2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
     satisfied:

      (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
      (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.

     Since SELECT_VL (variable step) will make SCEV analysis fail, and we
     would then miss the benefits of subsequent unroll optimizations, we
     prefer using the MIN_EXPR approach in this situation.  */
  if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
    {
      tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
      if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
                                          OPTIMIZE_FOR_SPEED)
          && LOOP_VINFO_LENS (loop_vinfo).length () == 1
          && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1
          && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
              || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
        LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;

      /* If any of the SLP instances cover more than a single lane
         we cannot use .SELECT_VL at the moment, even if the number
         of lanes is uniform throughout the SLP graph.  */
      if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
        for (slp_instance inst : LOOP_VINFO_SLP_INSTANCES (loop_vinfo))
          if (SLP_TREE_LANES (SLP_INSTANCE_TREE (inst)) != 1
              && !(SLP_INSTANCE_KIND (inst) == slp_inst_kind_store
                   && SLP_INSTANCE_TREE (inst)->ldst_lanes))
            {
              LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
              break;
            }
    }

  /* Decide whether this loop_vinfo should use partial vectors or peeling,
     assuming that the loop will be used as a main loop.  We will redo
     this analysis later if we instead decide to use the loop as an
     epilogue loop.  */
  ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
  if (!ok)
    return ok;

  /* If we're vectorizing an epilogue loop, the vectorized loop either needs
     to be able to handle fewer than VF scalars, or needs to have a lower VF
     than the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
      && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      poly_uint64 unscaled_vf
        = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
                     orig_loop_vinfo->suggested_unroll_factor);
      if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
        return opt_result::failure_at (vect_location,
                                       "Vectorization factor too high for"
                                       " epilogue loop.\n");
    }

  /* Check the costings of the loop make vectorizing worthwhile.  */
  res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
  if (res < 0)
    {
      ok = opt_result::failure_at (vect_location,
                                   "Loop costings may not be worthwhile.\n");
      goto again;
    }
  if (!res)
    return opt_result::failure_at (vect_location,
                                   "Loop costings not worthwhile.\n");

  /* If an epilogue loop is required make sure we can create one.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
      if (!vect_can_advance_ivs_p (loop_vinfo)
          || !slpeel_can_duplicate_loop_p (loop,
                                           LOOP_VINFO_IV_EXIT (loop_vinfo),
                                           LOOP_VINFO_IV_EXIT (loop_vinfo)))
        {
          ok = opt_result::failure_at (vect_location,
                                       "not vectorized: can't create required "
                                       "epilog loop\n");
          goto again;
        }
    }

  /* During peeling, we need to check if number of loop iterations is
     enough for both peeled prolog loop and vector loop.  This check
     can be merged along with threshold check of loop versioning, so
     increase threshold for this case if necessary.

     If we are analyzing an epilogue we still want to check what its
     versioning threshold would be.  If we decide to vectorize the epilogues we
     will want to use the lowest versioning threshold of all epilogues and main
     loop.  This will enable us to enter a vectorized epilogue even when
     versioning the loop.  We can't simply check whether the epilogue requires
     versioning though since we may have skipped some versioning checks when
     analyzing the epilogue.  For instance, checks for alias versioning will be
     skipped when dealing with epilogues as we assume we already checked them
     for the main loop.  So instead we always check the 'orig_loop_vinfo'.  */
  if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
    {
      poly_uint64 niters_th = 0;
      unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);

      if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
        {
          /* Niters for peeled prolog loop.  */
          if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
            {
              dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
              tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
              niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
            }
          else
            niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
        }

      /* Niters for at least one iteration of vectorized loop.  */
      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
        niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* One additional iteration because of peeling for gap.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
        niters_th += 1;

      /* Use the same condition as vect_transform_loop to decide when to use
         the cost to determine a versioning threshold.  */
      if (vect_apply_runtime_profitability_check_p (loop_vinfo)
          && ordered_p (th, niters_th))
        niters_th = ordered_max (poly_uint64 (th), niters_th);

      LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
    }

  gcc_assert (known_eq (vectorization_factor,
                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)));

  slp_done_for_suggested_uf = slp;

  /* Ok to vectorize!  */
  LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
  return opt_result::success ();

again:
  /* Ensure that "ok" is false (with an opt_problem if dumping is enabled).  */
  gcc_assert (!ok);

  /* Try again with SLP degraded, but if we didn't do any SLP there is
     no point in re-trying.  */
  if (!slp)
    return ok;

  /* If we are applying the suggested unroll factor, we don't need to
     re-try any more as we want to keep the SLP mode fixed.  */
  if (applying_suggested_uf)
    return ok;

  /* If there are reduction chains, re-trying will fail anyway.  */
  if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
    return ok;

  /* Likewise if the grouped loads or stores in the SLP cannot be handled
     via interleaving or lane instructions.  */
  slp_instance instance;
  slp_tree node;
  unsigned i, j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    {
      if (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance)) != vect_internal_def)
        continue;

      stmt_vec_info vinfo;
      vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
      if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
        continue;
      vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
      unsigned int size = DR_GROUP_SIZE (vinfo);
      tree vectype = STMT_VINFO_VECTYPE (vinfo);
      if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
          && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
          && ! vect_grouped_store_supported (vectype, size))
        return opt_result::failure_at (vinfo->stmt,
                                       "unsupported grouped store\n");
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
        {
          vinfo = SLP_TREE_REPRESENTATIVE (node);
          if (STMT_VINFO_GROUPED_ACCESS (vinfo))
            {
              vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
              bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
              size = DR_GROUP_SIZE (vinfo);
              vectype = STMT_VINFO_VECTYPE (vinfo);
              if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
                  && ! vect_grouped_load_supported (vectype, single_element_p,
                                                    size))
                return opt_result::failure_at (vinfo->stmt,
                                               "unsupported grouped load\n");
            }
        }
    }

  /* Roll back state appropriately.  Degrade SLP this time.  From multi-
     to single-lane to disabled.  */
  --slp;
  if (dump_enabled_p ())
    {
      if (slp)
        dump_printf_loc (MSG_NOTE, vect_location,
                         "re-trying with single-lane SLP\n");
      else
        dump_printf_loc (MSG_NOTE, vect_location,
                         "re-trying with SLP disabled\n");
    }

  /* Restore vectorization factor as it were without SLP.  */
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
  /* Free the SLP instances.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Reset SLP type to loop_vect on all stmts.  */
  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gimple_stmt_iterator si = gsi_start_phis (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
              || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
            {
              /* vectorizable_reduction adjusts reduction stmt def-types,
                 restore them to that of the PHI.  */
              STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
                = STMT_VINFO_DEF_TYPE (stmt_info);
              STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
                                     (STMT_VINFO_REDUC_DEF (stmt_info)))
                = STMT_VINFO_DEF_TYPE (stmt_info);
            }
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          if (is_gimple_debug (gsi_stmt (si)))
            continue;
          stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
              stmt_vec_info pattern_stmt_info
                = STMT_VINFO_RELATED_STMT (stmt_info);
              if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
                STMT_VINFO_IN_PATTERN_P (stmt_info) = false;

              gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
              STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
              for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
                   !gsi_end_p (pi); gsi_next (&pi))
                STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
                  = loop_vect;
            }
        }
    }
  /* Free optimized alias test DDRS.  */
  LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
  LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
  /* Reset target cost data.  */
  delete loop_vinfo->vector_costs;
  loop_vinfo->vector_costs = nullptr;
  /* Reset accumulated rgroup information.  */
  LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
  release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
  release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
  /* Reset assorted flags.  */
  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
  LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
    = saved_can_use_partial_vectors_p;
  LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
  if (loop_vinfo->scan_map)
    loop_vinfo->scan_map->empty ();

  goto start_over;
}

/* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
   to be better than vectorizing it using OLD_LOOP_VINFO.  Assume that
   OLD_LOOP_VINFO is better unless something specifically indicates
   otherwise.

   Note that this deliberately isn't a partial order.  */

static bool
vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
                          loop_vec_info old_loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
  gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);

  poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
  poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);

  /* Always prefer a VF of loop->simdlen over any other VF.  */
  if (loop->simdlen)
    {
      bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
      bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
      if (new_simdlen_p != old_simdlen_p)
        return new_simdlen_p;
    }

  const auto *old_costs = old_loop_vinfo->vector_costs;
  const auto *new_costs = new_loop_vinfo->vector_costs;
  if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
    return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);

  return new_costs->better_main_loop_than_p (old_costs);
}

/* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO.  Return
   true if we should.  */

static bool
vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
                        loop_vec_info old_loop_vinfo)
{
  if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Preferring vector mode %s to vector mode %s\n",
                     GET_MODE_NAME (new_loop_vinfo->vector_mode),
                     GET_MODE_NAME (old_loop_vinfo->vector_mode));
  return true;
}

/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
   not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
   MODE_I to the next mode useful to analyze.
   Return the loop_vinfo on success and wrapped null on failure.  */

static opt_loop_vec_info
vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
                     const vect_loop_form_info *loop_form_info,
                     loop_vec_info main_loop_vinfo,
                     const vector_modes &vector_modes, unsigned &mode_i,
                     machine_mode &autodetected_vector_mode,
                     bool &fatal)
{
  loop_vec_info loop_vinfo
    = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);

  machine_mode vector_mode = vector_modes[mode_i];
  loop_vinfo->vector_mode = vector_mode;
  unsigned int suggested_unroll_factor = 1;
  unsigned slp_done_for_suggested_uf = 0;

  /* Run the main analysis.  */
  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
                                        &suggested_unroll_factor,
                                        slp_done_for_suggested_uf);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "***** Analysis %s with vector mode %s\n",
                     res ? "succeeded" : "failed",
                     GET_MODE_NAME (loop_vinfo->vector_mode));

  if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Re-trying analysis for unrolling"
                         " with unroll factor %d and slp %s.\n",
                         suggested_unroll_factor,
                         slp_done_for_suggested_uf ? "on" : "off");
      loop_vec_info unroll_vinfo
        = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
      unroll_vinfo->vector_mode = vector_mode;
      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
                                                slp_done_for_suggested_uf);
      if (new_res)
        {
          delete loop_vinfo;
          loop_vinfo = unroll_vinfo;
        }
      else
        delete unroll_vinfo;
    }

  /* Remember the autodetected vector mode.  */
  if (vector_mode == VOIDmode)
    autodetected_vector_mode = loop_vinfo->vector_mode;

  /* Advance mode_i, first skipping modes that would result in the
     same analysis result.  */
  while (mode_i + 1 < vector_modes.length ()
         && vect_chooses_same_modes_p (loop_vinfo,
                                       vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** The result for vector mode %s would"
                         " be the same\n",
                         GET_MODE_NAME (vector_modes[mode_i + 1]));
      mode_i += 1;
    }
  if (mode_i + 1 < vector_modes.length ()
      && VECTOR_MODE_P (autodetected_vector_mode)
      && (related_vector_mode (vector_modes[mode_i + 1],
                               GET_MODE_INNER (autodetected_vector_mode))
          == autodetected_vector_mode)
      && (related_vector_mode (autodetected_vector_mode,
                               GET_MODE_INNER (vector_modes[mode_i + 1]))
          == vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "***** Skipping vector mode %s, which would"
                         " repeat the analysis for %s\n",
                         GET_MODE_NAME (vector_modes[mode_i + 1]),
                         GET_MODE_NAME (autodetected_vector_mode));
      mode_i += 1;
    }
  mode_i++;

  if (!res)
    {
      delete loop_vinfo;
      if (fatal)
        gcc_checking_assert (main_loop_vinfo == NULL);
      return opt_loop_vec_info::propagate_failure (res);
    }

  return opt_loop_vec_info::success (loop_vinfo);
}
3533 /* Function vect_analyze_loop.
3535 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3536 for it. The different analyses will record information in the
3537 loop_vec_info struct. */
3539 vect_analyze_loop (class loop
*loop
, gimple
*loop_vectorized_call
,
3540 vec_info_shared
*shared
)
3542 DUMP_VECT_SCOPE ("analyze_loop_nest");
3544 if (loop_outer (loop
)
3545 && loop_vec_info_for_loop (loop_outer (loop
))
3546 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop
))))
3547 return opt_loop_vec_info::failure_at (vect_location
,
3548 "outer-loop already vectorized.\n");
3550 if (!find_loop_nest (loop
, &shared
->loop_nest
))
3551 return opt_loop_vec_info::failure_at
3553 "not vectorized: loop nest containing two or more consecutive inner"
3554 " loops cannot be vectorized\n");
3556 /* Analyze the loop form. */
3557 vect_loop_form_info loop_form_info
;
3558 opt_result res
= vect_analyze_loop_form (loop
, loop_vectorized_call
,
3562 if (dump_enabled_p ())
3563 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
3564 "bad loop form.\n");
3565 return opt_loop_vec_info::propagate_failure (res
);
3567 if (!integer_onep (loop_form_info
.assumptions
))
3569 /* We consider to vectorize this loop by versioning it under
3570 some assumptions. In order to do this, we need to clear
3571 existing information computed by scev and niter analyzer. */
3573 free_numbers_of_iterations_estimates (loop
);
3574 /* Also set flag for this loop so that following scev and niter
3575 analysis are done under the assumptions. */
3576 loop_constraint_set (loop
, LOOP_C_FINITE
);
  /* Clear the existing niter information to make sure the nonwrapping flag
     will be calculated and set appropriately.  */
  free_numbers_of_iterations_estimates (loop);
3583 auto_vector_modes vector_modes
;
3584 /* Autodetect first vector size we try. */
3585 vector_modes
.safe_push (VOIDmode
);
3586 unsigned int autovec_flags
3587 = targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
,
3588 loop
->simdlen
!= 0);
3589 bool pick_lowest_cost_p
= ((autovec_flags
& VECT_COMPARE_COSTS
)
3590 && !unlimited_cost_model (loop
));
3591 machine_mode autodetected_vector_mode
= VOIDmode
;
3592 opt_loop_vec_info first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3593 unsigned int mode_i
= 0;
3594 unsigned HOST_WIDE_INT simdlen
= loop
->simdlen
;
3596 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3597 a mode has not been analyzed. */
3598 auto_vec
<poly_uint64
, 8> cached_vf_per_mode
;
3599 for (unsigned i
= 0; i
< vector_modes
.length (); ++i
)
3600 cached_vf_per_mode
.safe_push (0);
  /* First determine the main loop vectorization mode, either the first
     one that works, starting with auto-detecting the vector mode and then
     following the target's order of preference, or the one with the
     lowest cost if pick_lowest_cost_p.  */
3609 unsigned int last_mode_i
= mode_i
;
3610 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3612 cached_vf_per_mode
[last_mode_i
] = -1;
3613 opt_loop_vec_info loop_vinfo
3614 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3615 NULL
, vector_modes
, mode_i
,
3616 autodetected_vector_mode
, fatal
);
	  /* Analysis has been successful so update the VF value.  The
	     VF should always be a multiple of unroll_factor and we want to
	     capture the original VF here.  */
3625 cached_vf_per_mode
[last_mode_i
]
3626 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo
),
3627 loop_vinfo
->suggested_unroll_factor
);
3628 /* Once we hit the desired simdlen for the first time,
3629 discard any previous attempts. */
3631 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo
), simdlen
))
3633 delete first_loop_vinfo
;
3634 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3637 else if (pick_lowest_cost_p
3639 && vect_joust_loop_vinfos (loop_vinfo
, first_loop_vinfo
))
3641 /* Pick loop_vinfo over first_loop_vinfo. */
3642 delete first_loop_vinfo
;
3643 first_loop_vinfo
= opt_loop_vec_info::success (NULL
);
3645 if (first_loop_vinfo
== NULL
)
3646 first_loop_vinfo
= loop_vinfo
;
3650 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3653 /* Commit to first_loop_vinfo if we have no reason to try
3655 if (!simdlen
&& !pick_lowest_cost_p
)
3658 if (mode_i
== vector_modes
.length ()
3659 || autodetected_vector_mode
== VOIDmode
)
3662 /* Try the next biggest vector size. */
3663 if (dump_enabled_p ())
3664 dump_printf_loc (MSG_NOTE
, vect_location
,
3665 "***** Re-trying analysis with vector mode %s\n",
3666 GET_MODE_NAME (vector_modes
[mode_i
]));
3668 if (!first_loop_vinfo
)
3669 return opt_loop_vec_info::propagate_failure (res
);
3671 if (dump_enabled_p ())
3672 dump_printf_loc (MSG_NOTE
, vect_location
,
3673 "***** Choosing vector mode %s\n",
3674 GET_MODE_NAME (first_loop_vinfo
->vector_mode
));
  /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
     enabled, SIMDUID is not set, it is the innermost loop and we have
     either already found the loop's SIMDLEN or there was no SIMDLEN to
     begin with.
     TODO: Enable epilogue vectorization for loops with SIMDUID set.  */
3681 bool vect_epilogues
= (!simdlen
3682 && loop
->inner
== NULL
3683 && param_vect_epilogues_nomask
3684 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo
)
3685 /* No code motion support for multiple epilogues so for now
3686 not supported when multiple exits. */
3687 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo
)
3689 && loop_cost_model (loop
) > VECT_COST_MODEL_VERY_CHEAP
);
3690 if (!vect_epilogues
)
3691 return first_loop_vinfo
;
3693 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3694 poly_uint64 lowest_th
= LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
);
3696 /* For epilogues start the analysis from the first mode. The motivation
3697 behind starting from the beginning comes from cases where the VECTOR_MODES
3698 array may contain length-agnostic and length-specific modes. Their
3699 ordering is not guaranteed, so we could end up picking a mode for the main
3700 loop that is after the epilogue's optimal mode. */
3701 vector_modes
[0] = autodetected_vector_mode
;
3704 bool supports_partial_vectors
=
3705 partial_vectors_supported_p () && param_vect_partial_vector_usage
!= 0;
3706 poly_uint64 first_vinfo_vf
= LOOP_VINFO_VECT_FACTOR (first_loop_vinfo
);
      /* If the target does not support partial vectors we can shorten the
	 number of modes to analyze for the epilogue as we know we can't pick a
	 mode that would lead to a VF at least as big as the
	 FIRST_VINFO_VF.  */
3714 if (!supports_partial_vectors
3715 && maybe_ge (cached_vf_per_mode
[mode_i
], first_vinfo_vf
))
3718 if (mode_i
== vector_modes
.length ())
3723 if (dump_enabled_p ())
3724 dump_printf_loc (MSG_NOTE
, vect_location
,
3725 "***** Re-trying epilogue analysis with vector "
3726 "mode %s\n", GET_MODE_NAME (vector_modes
[mode_i
]));
3729 opt_loop_vec_info loop_vinfo
3730 = vect_analyze_loop_1 (loop
, shared
, &loop_form_info
,
3732 vector_modes
, mode_i
,
3733 autodetected_vector_mode
, fatal
);
3739 if (pick_lowest_cost_p
)
3741 /* Keep trying to roll back vectorization attempts while the
3742 loop_vec_infos they produced were worse than this one. */
3743 vec
<loop_vec_info
> &vinfos
= first_loop_vinfo
->epilogue_vinfos
;
3744 while (!vinfos
.is_empty ()
3745 && vect_joust_loop_vinfos (loop_vinfo
, vinfos
.last ()))
3747 gcc_assert (vect_epilogues
);
3748 delete vinfos
.pop ();
3751 /* For now only allow one epilogue loop. */
3752 if (first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3754 first_loop_vinfo
->epilogue_vinfos
.safe_push (loop_vinfo
);
3755 poly_uint64 th
= LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo
);
3756 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo
)
3757 || maybe_ne (lowest_th
, 0U));
3758 /* Keep track of the known smallest versioning
3760 if (ordered_p (lowest_th
, th
))
3761 lowest_th
= ordered_min (lowest_th
, th
);
3766 loop_vinfo
= opt_loop_vec_info::success (NULL
);
3769 /* For now only allow one epilogue loop, but allow
3770 pick_lowest_cost_p to replace it, so commit to the
3771 first epilogue if we have no reason to try alternatives. */
3772 if (!pick_lowest_cost_p
)
3776 if (mode_i
== vector_modes
.length ())
3781 if (!first_loop_vinfo
->epilogue_vinfos
.is_empty ())
3783 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo
) = lowest_th
;
3784 if (dump_enabled_p ())
3785 dump_printf_loc (MSG_NOTE
, vect_location
,
3786 "***** Choosing epilogue vector mode %s\n",
3788 (first_loop_vinfo
->epilogue_vinfos
[0]->vector_mode
));
3791 return first_loop_vinfo
;
3794 /* Return true if there is an in-order reduction function for CODE, storing
3795 it in *REDUC_FN if so. */
3798 fold_left_reduction_fn (code_helper code
, internal_fn
*reduc_fn
)
  /* We support MINUS_EXPR by negating the operand.  This also preserves an
     initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
     (-0.0) == -0.0.  */
3803 if (code
== PLUS_EXPR
|| code
== MINUS_EXPR
)
3805 *reduc_fn
= IFN_FOLD_LEFT_PLUS
;
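
/* Illustrative sketch (not from the source, an assumption for exposition):
   an in-order (fold-left) reduction keeps the scalar evaluation order, e.g.
   for a hypothetical 4-element vector v and accumulator acc:

     acc = (((acc + v[0]) + v[1]) + v[2]) + v[3];

   whereas a tree reduction would first compute (v[0] + v[1]) and
   (v[2] + v[3]), which for floating point is only valid when
   -fassociative-math permits reassociation.  */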
/* Function reduction_fn_for_scalar_code

   CODE - tree_code of a reduction operation.

   REDUC_FN - the corresponding internal function to be used to reduce the
   vector of partial results into a single scalar result, or IFN_LAST
   if the operation is a supported reduction operation, but does not have
   such an internal function.

   Return FALSE if CODE currently cannot be vectorized as a reduction.  */
3825 reduction_fn_for_scalar_code (code_helper code
, internal_fn
*reduc_fn
)
3827 if (code
.is_tree_code ())
3828 switch (tree_code (code
))
3831 *reduc_fn
= IFN_REDUC_MAX
;
3835 *reduc_fn
= IFN_REDUC_MIN
;
3839 *reduc_fn
= IFN_REDUC_PLUS
;
3843 *reduc_fn
= IFN_REDUC_AND
;
3847 *reduc_fn
= IFN_REDUC_IOR
;
3851 *reduc_fn
= IFN_REDUC_XOR
;
3856 *reduc_fn
= IFN_LAST
;
3863 switch (combined_fn (code
))
3866 *reduc_fn
= IFN_REDUC_FMAX
;
3870 *reduc_fn
= IFN_REDUC_FMIN
;
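
/* Illustrative example (for exposition only, not exhaustive): for a scalar
   reduction such as

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   the PLUS_EXPR case above selects IFN_REDUC_PLUS, which reduces the vector
   of partial sums to a single scalar in the loop epilogue.  */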
/* If there is a neutral value X such that a reduction would not be affected
   by the introduction of additional X elements, return that X, otherwise
   return null.  CODE is the code of the reduction and SCALAR_TYPE is the
   type of the scalar elements.  If the reduction has just a single initial
   value then INITIAL_VALUE is that value, otherwise it is null.
   If AS_INITIAL is TRUE the value is supposed to be used as the initial
   value.  In that case no signed zero is returned.  */
3887 neutral_op_for_reduction (tree scalar_type
, code_helper code
,
3888 tree initial_value
, bool as_initial
)
3890 if (code
.is_tree_code ())
3891 switch (tree_code (code
))
3898 return build_zero_cst (scalar_type
);
3899 case WIDEN_SUM_EXPR
:
3901 if (!as_initial
&& HONOR_SIGNED_ZEROS (scalar_type
))
3902 return build_real (scalar_type
, dconstm0
);
3904 return build_zero_cst (scalar_type
);
3907 return build_one_cst (scalar_type
);
3910 return build_all_ones_cst (scalar_type
);
3914 return initial_value
;
3920 switch (combined_fn (code
))
3924 return initial_value
;
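
/* Worked examples of the neutral values chosen above (illustration only):
   PLUS_EXPR and BIT_XOR_EXPR use 0, MULT_EXPR uses 1, BIT_AND_EXPR uses
   all-ones, MIN_EXPR/MAX_EXPR fall back to the single initial value, and
   a floating-point PLUS_EXPR that honors signed zeros (and is not used as
   the initial value) uses -0.0, since x + (-0.0) == x even for x == -0.0.  */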
3931 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3932 STMT is printed with a message MSG. */
3935 report_vect_op (dump_flags_t msg_type
, gimple
*stmt
, const char *msg
)
3937 dump_printf_loc (msg_type
, vect_location
, "%s%G", msg
, stmt
);
3940 /* Return true if we need an in-order reduction for operation CODE
3941 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3942 overflow must wrap. */
3945 needs_fold_left_reduction_p (tree type
, code_helper code
)
3947 /* CHECKME: check for !flag_finite_math_only too? */
3948 if (SCALAR_FLOAT_TYPE_P (type
))
3950 if (code
.is_tree_code ())
3951 switch (tree_code (code
))
3958 return !flag_associative_math
;
3961 switch (combined_fn (code
))
3968 return !flag_associative_math
;
3972 if (INTEGRAL_TYPE_P (type
))
3973 return (!code
.is_tree_code ()
3974 || !operation_no_trapping_overflow (type
, tree_code (code
)));
3976 if (SAT_FIXED_POINT_TYPE_P (type
))
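
/* Example of the policy above (illustration only): a floating-point
   accumulation

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   requires a fold-left reduction unless -fassociative-math is in effect,
   while the same loop over a wrapping unsigned integer type can be
   reassociated freely because the addition cannot trap on overflow.  */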
/* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
   has a handled computation expression.  Store the main reduction
   operation in *CODE.  */
3987 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
3988 tree loop_arg
, code_helper
*code
,
3989 vec
<std::pair
<ssa_op_iter
, use_operand_p
> > &path
)
3991 auto_bitmap visited
;
3992 tree lookfor
= PHI_RESULT (phi
);
3994 use_operand_p curr
= op_iter_init_phiuse (&curri
, phi
, SSA_OP_USE
);
3995 while (USE_FROM_PTR (curr
) != loop_arg
)
3996 curr
= op_iter_next_use (&curri
);
3997 curri
.i
= curri
.numops
;
4000 path
.safe_push (std::make_pair (curri
, curr
));
4001 tree use
= USE_FROM_PTR (curr
);
4004 gimple
*def
= SSA_NAME_DEF_STMT (use
);
4005 if (gimple_nop_p (def
)
4006 || ! flow_bb_inside_loop_p (loop
, gimple_bb (def
)))
4011 std::pair
<ssa_op_iter
, use_operand_p
> x
= path
.pop ();
4015 curr
= op_iter_next_use (&curri
);
	  /* Skip already visited or non-SSA operands (from iterating
	     over PHI args).  */
4018 while (curr
!= NULL_USE_OPERAND_P
4019 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
4020 || ! bitmap_set_bit (visited
,
4022 (USE_FROM_PTR (curr
)))));
4024 while (curr
== NULL_USE_OPERAND_P
&& ! path
.is_empty ());
4025 if (curr
== NULL_USE_OPERAND_P
)
4030 if (gimple_code (def
) == GIMPLE_PHI
)
4031 curr
= op_iter_init_phiuse (&curri
, as_a
<gphi
*>(def
), SSA_OP_USE
);
4033 curr
= op_iter_init_use (&curri
, def
, SSA_OP_USE
);
4034 while (curr
!= NULL_USE_OPERAND_P
4035 && (TREE_CODE (USE_FROM_PTR (curr
)) != SSA_NAME
4036 || ! bitmap_set_bit (visited
,
4038 (USE_FROM_PTR (curr
)))))
4039 curr
= op_iter_next_use (&curri
);
4040 if (curr
== NULL_USE_OPERAND_P
)
4045 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
4047 dump_printf_loc (MSG_NOTE
, loc
, "reduction path: ");
4049 std::pair
<ssa_op_iter
, use_operand_p
> *x
;
4050 FOR_EACH_VEC_ELT (path
, i
, x
)
4051 dump_printf (MSG_NOTE
, "%T ", USE_FROM_PTR (x
->second
));
4052 dump_printf (MSG_NOTE
, "\n");
4055 /* Check whether the reduction path detected is valid. */
4056 bool fail
= path
.length () == 0;
4060 for (unsigned i
= 1; i
< path
.length (); ++i
)
4062 gimple
*use_stmt
= USE_STMT (path
[i
].second
);
4064 if (!gimple_extract_op (use_stmt
, &op
))
4069 unsigned int opi
= op
.num_ops
;
4070 if (gassign
*assign
= dyn_cast
<gassign
*> (use_stmt
))
	  /* The following makes sure we can compute the operand index
	     easily plus it mostly disallows chaining via COND_EXPR condition
	     operands.  */
4075 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
4076 if (gimple_assign_rhs1_ptr (assign
) + opi
== path
[i
].second
->use
)
4079 else if (gcall
*call
= dyn_cast
<gcall
*> (use_stmt
))
4081 for (opi
= 0; opi
< op
.num_ops
; ++opi
)
4082 if (gimple_call_arg_ptr (call
, opi
) == path
[i
].second
->use
)
4085 if (opi
== op
.num_ops
)
4090 op
.code
= canonicalize_code (op
.code
, op
.type
);
4091 if (op
.code
== MINUS_EXPR
)
4093 op
.code
= PLUS_EXPR
;
4094 /* Track whether we negate the reduction value each iteration. */
4095 if (op
.ops
[1] == op
.ops
[opi
])
4098 else if (op
.code
== IFN_COND_SUB
)
4100 op
.code
= IFN_COND_ADD
;
4101 /* Track whether we negate the reduction value each iteration. */
4102 if (op
.ops
[2] == op
.ops
[opi
])
4105 if (CONVERT_EXPR_CODE_P (op
.code
)
4106 && tree_nop_conversion_p (op
.type
, TREE_TYPE (op
.ops
[0])))
4108 else if (*code
== ERROR_MARK
)
4111 sign
= TYPE_SIGN (op
.type
);
4113 else if (op
.code
!= *code
)
4118 else if ((op
.code
== MIN_EXPR
4119 || op
.code
== MAX_EXPR
)
4120 && sign
!= TYPE_SIGN (op
.type
))
      /* Check that the op is used in only a single stmt.  For the
	 non-value-changing tail and the last stmt allow out-of-loop uses.
	 ??? We could relax this and handle arbitrary live stmts by
	 forcing a scalar epilogue for example.  */
4129 imm_use_iterator imm_iter
;
4130 use_operand_p use_p
;
4131 gimple
*op_use_stmt
;
4133 bool cond_fn_p
= op
.code
.is_internal_fn ()
4134 && (conditional_internal_fn_code (internal_fn (op
.code
))
4137 FOR_EACH_IMM_USE_STMT (op_use_stmt
, imm_iter
, op
.ops
[opi
])
4139 /* In case of a COND_OP (mask, op1, op2, op1) reduction we should
4140 have op1 twice (once as definition, once as else) in the same
4141 operation. Enforce this. */
4142 if (cond_fn_p
&& op_use_stmt
== use_stmt
)
4144 gcall
*call
= as_a
<gcall
*> (use_stmt
);
4146 = internal_fn_else_index (internal_fn (op
.code
));
4147 if (gimple_call_arg (call
, else_pos
) != op
.ops
[opi
])
4152 for (unsigned int j
= 0; j
< gimple_call_num_args (call
); ++j
)
4156 if (gimple_call_arg (call
, j
) == op
.ops
[opi
])
4160 else if (!is_gimple_debug (op_use_stmt
)
4161 && (*code
!= ERROR_MARK
4162 || flow_bb_inside_loop_p (loop
,
4163 gimple_bb (op_use_stmt
))))
4164 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
4174 return ! fail
&& ! neg
&& *code
!= ERROR_MARK
;
4178 check_reduction_path (dump_user_location_t loc
, loop_p loop
, gphi
*phi
,
4179 tree loop_arg
, enum tree_code code
)
4181 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
4183 return (check_reduction_path (loc
, loop
, phi
, loop_arg
, &code_
, path
)
/* Function vect_is_simple_reduction

   (1) Detect a cross-iteration def-use cycle that represents a simple
   reduction computation.  We look for the following pattern:

     a1 = phi < a0, a2 >
     a3 = ...
     a2 = operation (a3, a1)

   or

     a3 = ...
     a1 = phi < a0, a2 >
     a2 = operation (a3, a1)

   such that:

   1. operation is commutative and associative and it is safe to
      change the order of the computation
   2. no uses for a2 in the loop (a2 is used out of the loop)
   3. no uses of a1 in the loop besides the reduction operation
   4. no uses of a1 outside the loop.

   Conditions 1,4 are tested here.
   Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.

   (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
   nested cycles.

   (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
   reductions:

     a1 = phi < a0, a2 >
     inner loop (def of a3)
     a2 = phi < a3 >

   (4) Detect condition expressions, i.e.:

     for (int i = 0; i < N; i++)
       if (a[i] < val)
	 ret_val = a[i];  */
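
/* Source-level sketch of pattern (3) above (illustration only; the variable
   names are assumptions): a double reduction accumulates across both loops
   of a nest, e.g.

     int sum = 0;
     for (int i = 0; i < N; i++)
       for (int j = 0; j < M; j++)
	 sum += a[i][j];

   where the outer-loop PHI for sum cycles through the inner loop's
   reduction result.  */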
4233 static stmt_vec_info
4234 vect_is_simple_reduction (loop_vec_info loop_info
, stmt_vec_info phi_info
,
4235 bool *double_reduc
, bool *reduc_chain_p
, bool slp
)
4237 gphi
*phi
= as_a
<gphi
*> (phi_info
->stmt
);
4238 gimple
*phi_use_stmt
= NULL
;
4239 imm_use_iterator imm_iter
;
4240 use_operand_p use_p
;
4242 *double_reduc
= false;
4243 *reduc_chain_p
= false;
4244 STMT_VINFO_REDUC_TYPE (phi_info
) = TREE_CODE_REDUCTION
;
4246 tree phi_name
= PHI_RESULT (phi
);
4247 /* ??? If there are no uses of the PHI result the inner loop reduction
4248 won't be detected as possibly double-reduction by vectorizable_reduction
4249 because that tries to walk the PHI arg from the preheader edge which
4250 can be constant. See PR60382. */
4251 if (has_zero_uses (phi_name
))
4253 class loop
*loop
= (gimple_bb (phi
))->loop_father
;
4254 unsigned nphi_def_loop_uses
= 0;
4255 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, phi_name
)
4257 gimple
*use_stmt
= USE_STMT (use_p
);
4258 if (is_gimple_debug (use_stmt
))
4261 if (!flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
4263 if (dump_enabled_p ())
4264 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4265 "intermediate value used outside loop.\n");
4270 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4271 op1 twice (once as definition, once as else) in the same operation.
4272 Only count it as one. */
4273 if (use_stmt
!= phi_use_stmt
)
4275 nphi_def_loop_uses
++;
4276 phi_use_stmt
= use_stmt
;
4280 tree latch_def
= PHI_ARG_DEF_FROM_EDGE (phi
, loop_latch_edge (loop
));
4281 if (TREE_CODE (latch_def
) != SSA_NAME
)
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4285 "reduction: not ssa_name: %T\n", latch_def
);
4289 stmt_vec_info def_stmt_info
= loop_info
->lookup_def (latch_def
);
4291 || !flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt_info
->stmt
)))
4294 bool nested_in_vect_loop
4295 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info
), loop
);
4296 unsigned nlatch_def_loop_uses
= 0;
4297 auto_vec
<gphi
*, 3> lcphis
;
4298 bool inner_loop_of_double_reduc
= false;
4299 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, latch_def
)
4301 gimple
*use_stmt
= USE_STMT (use_p
);
4302 if (is_gimple_debug (use_stmt
))
4304 if (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
4305 nlatch_def_loop_uses
++;
4308 /* We can have more than one loop-closed PHI. */
4309 lcphis
.safe_push (as_a
<gphi
*> (use_stmt
));
4310 if (nested_in_vect_loop
4311 && (STMT_VINFO_DEF_TYPE (loop_info
->lookup_stmt (use_stmt
))
4312 == vect_double_reduction_def
))
4313 inner_loop_of_double_reduc
= true;
4317 /* If we are vectorizing an inner reduction we are executing that
4318 in the original order only in case we are not dealing with a
4319 double reduction. */
4320 if (nested_in_vect_loop
&& !inner_loop_of_double_reduc
)
4322 if (dump_enabled_p ())
4323 report_vect_op (MSG_NOTE
, def_stmt_info
->stmt
,
4324 "detected nested cycle: ");
4325 return def_stmt_info
;
4328 /* When the inner loop of a double reduction ends up with more than
4329 one loop-closed PHI we have failed to classify alternate such
4330 PHIs as double reduction, leading to wrong code. See PR103237. */
4331 if (inner_loop_of_double_reduc
&& lcphis
.length () != 1)
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "unhandled double reduction\n");
  /* If this isn't a nested cycle or if the nested cycle reduction value
     is used outside of the inner loop we cannot handle uses of the reduction
     value.  */
  if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4344 if (dump_enabled_p ())
4345 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4346 "reduction used in loop.\n");
4350 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4351 defined in the inner loop. */
4352 if (gphi
*def_stmt
= dyn_cast
<gphi
*> (def_stmt_info
->stmt
))
4354 tree op1
= PHI_ARG_DEF (def_stmt
, 0);
4355 if (gimple_phi_num_args (def_stmt
) != 1
4356 || TREE_CODE (op1
) != SSA_NAME
)
4358 if (dump_enabled_p ())
4359 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4360 "unsupported phi node definition.\n");
4365 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4366 and the latch definition op1. */
4367 gimple
*def1
= SSA_NAME_DEF_STMT (op1
);
4368 if (gimple_bb (def1
)
4369 && flow_bb_inside_loop_p (loop
, gimple_bb (def_stmt
))
4371 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (def1
))
4372 && (is_gimple_assign (def1
) || is_gimple_call (def1
))
4373 && is_a
<gphi
*> (phi_use_stmt
)
4374 && flow_bb_inside_loop_p (loop
->inner
, gimple_bb (phi_use_stmt
))
4375 && (op1
== PHI_ARG_DEF_FROM_EDGE (phi_use_stmt
,
4376 loop_latch_edge (loop
->inner
)))
4377 && lcphis
.length () == 1)
4379 if (dump_enabled_p ())
4380 report_vect_op (MSG_NOTE
, def_stmt
,
4381 "detected double reduction: ");
4383 *double_reduc
= true;
4384 return def_stmt_info
;
  /* Look for the expression computing latch_def from the loop PHI result.  */
4391 auto_vec
<std::pair
<ssa_op_iter
, use_operand_p
> > path
;
4393 if (check_reduction_path (vect_location
, loop
, phi
, latch_def
, &code
,
4396 STMT_VINFO_REDUC_CODE (phi_info
) = code
;
4397 if (code
== COND_EXPR
&& !nested_in_vect_loop
)
4398 STMT_VINFO_REDUC_TYPE (phi_info
) = COND_REDUCTION
;
4400 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4401 reduction chain for which the additional restriction is that
4402 all operations in the chain are the same. */
4403 auto_vec
<stmt_vec_info
, 8> reduc_chain
;
4405 bool is_slp_reduc
= !nested_in_vect_loop
&& code
!= COND_EXPR
;
4406 for (i
= path
.length () - 1; i
>= 1; --i
)
4408 gimple
*stmt
= USE_STMT (path
[i
].second
);
4409 stmt_vec_info stmt_info
= loop_info
->lookup_stmt (stmt
);
4411 if (!gimple_extract_op (stmt
, &op
))
4413 if (gassign
*assign
= dyn_cast
<gassign
*> (stmt
))
4414 STMT_VINFO_REDUC_IDX (stmt_info
)
4415 = path
[i
].second
->use
- gimple_assign_rhs1_ptr (assign
);
4418 gcall
*call
= as_a
<gcall
*> (stmt
);
4419 STMT_VINFO_REDUC_IDX (stmt_info
)
4420 = path
[i
].second
->use
- gimple_call_arg_ptr (call
, 0);
4422 bool leading_conversion
= (CONVERT_EXPR_CODE_P (op
.code
)
4423 && (i
== 1 || i
== path
.length () - 1));
4424 if ((op
.code
!= code
&& !leading_conversion
)
4425 /* We can only handle the final value in epilogue
4426 generation for reduction chains. */
4427 || (i
!= 1 && !has_single_use (gimple_get_lhs (stmt
))))
4428 is_slp_reduc
= false;
      /* For reduction chains we support trailing/leading
	 conversions.  We do not store those in the actual chain.  */
4431 if (leading_conversion
)
4433 reduc_chain
.safe_push (stmt_info
);
4435 if (slp
&& is_slp_reduc
&& reduc_chain
.length () > 1)
4437 for (unsigned i
= 0; i
< reduc_chain
.length () - 1; ++i
)
4439 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
[i
]) = reduc_chain
[0];
4440 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
[i
]) = reduc_chain
[i
+1];
4442 REDUC_GROUP_FIRST_ELEMENT (reduc_chain
.last ()) = reduc_chain
[0];
4443 REDUC_GROUP_NEXT_ELEMENT (reduc_chain
.last ()) = NULL
;
4445 /* Save the chain for further analysis in SLP detection. */
4446 LOOP_VINFO_REDUCTION_CHAINS (loop_info
).safe_push (reduc_chain
[0]);
4447 REDUC_GROUP_SIZE (reduc_chain
[0]) = reduc_chain
.length ();
4449 *reduc_chain_p
= true;
4450 if (dump_enabled_p ())
4451 dump_printf_loc (MSG_NOTE
, vect_location
,
4452 "reduction: detected reduction chain\n");
4454 else if (dump_enabled_p ())
4455 dump_printf_loc (MSG_NOTE
, vect_location
,
4456 "reduction: detected reduction\n");
4458 return def_stmt_info
;
4461 if (dump_enabled_p ())
4462 dump_printf_loc (MSG_NOTE
, vect_location
,
4463 "reduction: unknown pattern\n");
4468 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4469 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4470 or -1 if not known. */
4473 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo
, int peel_iters_prologue
)
4475 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4476 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) || peel_iters_prologue
== -1)
4478 if (dump_enabled_p ())
4479 dump_printf_loc (MSG_NOTE
, vect_location
,
4480 "cost model: epilogue peel iters set to vf/2 "
4481 "because loop iterations are unknown .\n");
4482 return assumed_vf
/ 2;
4486 int niters
= LOOP_VINFO_INT_NITERS (loop_vinfo
);
4487 peel_iters_prologue
= MIN (niters
, peel_iters_prologue
);
4488 int peel_iters_epilogue
= (niters
- peel_iters_prologue
) % assumed_vf
;
4489 /* If we need to peel for gaps, but no peeling is required, we have to
4490 peel VF iterations. */
4491 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) && !peel_iters_epilogue
)
4492 peel_iters_epilogue
= assumed_vf
;
4493 return peel_iters_epilogue
;
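
/* Worked example for the computation above (numbers are assumptions for
   illustration): with NITERS = 100, ASSUMED_VF = 8 and
   PEEL_ITERS_PROLOGUE = 3, the epilogue peels (100 - 3) % 8 = 1 iteration;
   if that remainder were 0 but PEELING_FOR_GAPS is set, a full VF of 8
   iterations would be peeled instead.  */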
4497 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4499 vect_get_known_peeling_cost (loop_vec_info loop_vinfo
, int peel_iters_prologue
,
4500 int *peel_iters_epilogue
,
4501 stmt_vector_for_cost
*scalar_cost_vec
,
4502 stmt_vector_for_cost
*prologue_cost_vec
,
4503 stmt_vector_for_cost
*epilogue_cost_vec
)
4507 *peel_iters_epilogue
4508 = vect_get_peel_iters_epilogue (loop_vinfo
, peel_iters_prologue
);
4510 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
))
4512 /* If peeled iterations are known but number of scalar loop
4513 iterations are unknown, count a taken branch per peeled loop. */
4514 if (peel_iters_prologue
> 0)
4515 retval
= record_stmt_cost (prologue_cost_vec
, 1, cond_branch_taken
,
4517 if (*peel_iters_epilogue
> 0)
4518 retval
+= record_stmt_cost (epilogue_cost_vec
, 1, cond_branch_taken
,
4522 stmt_info_for_cost
*si
;
4524 if (peel_iters_prologue
)
4525 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4526 retval
+= record_stmt_cost (prologue_cost_vec
,
4527 si
->count
* peel_iters_prologue
,
4528 si
->kind
, si
->stmt_info
, si
->misalign
,
4530 if (*peel_iters_epilogue
)
4531 FOR_EACH_VEC_ELT (*scalar_cost_vec
, j
, si
)
4532 retval
+= record_stmt_cost (epilogue_cost_vec
,
4533 si
->count
* *peel_iters_epilogue
,
4534 si
->kind
, si
->stmt_info
, si
->misalign
,
4540 /* Function vect_estimate_min_profitable_iters
4542 Return the number of iterations required for the vector version of the
4543 loop to be profitable relative to the cost of the scalar version of the
4546 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4547 of iterations for vectorization. -1 value means loop vectorization
4548 is not profitable. This returned value may be used for dynamic
4549 profitability check.
4551 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4552 for static check against estimated number of iterations. */
4555 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo
,
4556 int *ret_min_profitable_niters
,
4557 int *ret_min_profitable_estimate
,
4558 unsigned *suggested_unroll_factor
)
4560 int min_profitable_iters
;
4561 int min_profitable_estimate
;
4562 int peel_iters_prologue
;
4563 int peel_iters_epilogue
;
4564 unsigned vec_inside_cost
= 0;
4565 int vec_outside_cost
= 0;
4566 unsigned vec_prologue_cost
= 0;
4567 unsigned vec_epilogue_cost
= 0;
4568 int scalar_single_iter_cost
= 0;
4569 int scalar_outside_cost
= 0;
4570 int assumed_vf
= vect_vf_for_cost (loop_vinfo
);
4571 int npeel
= LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
);
4572 vector_costs
*target_cost_data
= loop_vinfo
->vector_costs
;
4574 /* Cost model disabled. */
4575 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo
)))
4577 if (dump_enabled_p ())
4578 dump_printf_loc (MSG_NOTE
, vect_location
, "cost model disabled.\n");
4579 *ret_min_profitable_niters
= 0;
4580 *ret_min_profitable_estimate
= 0;
4584 /* Requires loop versioning tests to handle misalignment. */
4585 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo
))
4587 /* FIXME: Make cost depend on complexity of individual check. */
4588 unsigned len
= LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo
).length ();
4589 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4590 if (dump_enabled_p ())
4591 dump_printf (MSG_NOTE
,
4592 "cost model: Adding cost of checks for loop "
4593 "versioning to treat misalignment.\n");
4596 /* Requires loop versioning with alias checks. */
4597 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo
))
4599 /* FIXME: Make cost depend on complexity of individual check. */
4600 unsigned len
= LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo
).length ();
4601 (void) add_stmt_cost (target_cost_data
, len
, scalar_stmt
, vect_prologue
);
4602 len
= LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo
).length ();
4604 /* Count LEN - 1 ANDs and LEN comparisons. */
4605 (void) add_stmt_cost (target_cost_data
, len
* 2 - 1,
4606 scalar_stmt
, vect_prologue
);
4607 len
= LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
).length ();
4610 /* Count LEN - 1 ANDs and LEN comparisons. */
4611 unsigned int nstmts
= len
* 2 - 1;
4612 /* +1 for each bias that needs adding. */
4613 for (unsigned int i
= 0; i
< len
; ++i
)
4614 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo
)[i
].unsigned_p
)
4616 (void) add_stmt_cost (target_cost_data
, nstmts
,
4617 scalar_stmt
, vect_prologue
);
4619 if (dump_enabled_p ())
4620 dump_printf (MSG_NOTE
,
4621 "cost model: Adding cost of checks for loop "
4622 "versioning aliasing.\n");
4625 /* Requires loop versioning with niter checks. */
4626 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo
))
4628 /* FIXME: Make cost depend on complexity of individual check. */
4629 (void) add_stmt_cost (target_cost_data
, 1, vector_stmt
,
4630 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4631 if (dump_enabled_p ())
4632 dump_printf (MSG_NOTE
,
4633 "cost model: Adding cost of checks for loop "
4634 "versioning niters.\n");
4637 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4638 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4651 /* Add additional cost for the peeled instructions in prologue and epilogue
4652 loop. (For fully-masked loops there will be no peeling.)
4654 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4655 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4657 TODO: Build an expression that represents peel_iters for prologue and
4658 epilogue to be used in a run-time test. */
4660 bool prologue_need_br_taken_cost
= false;
4661 bool prologue_need_br_not_taken_cost
= false;
4663 /* Calculate peel_iters_prologue. */
4664 if (vect_use_loop_mask_for_alignment_p (loop_vinfo
))
4665 peel_iters_prologue
= 0;
4668 peel_iters_prologue
= assumed_vf
/ 2;
4669 if (dump_enabled_p ())
4670 dump_printf (MSG_NOTE
, "cost model: "
4671 "prologue peel iters set to vf/2.\n");
4673 /* If peeled iterations are unknown, count a taken branch and a not taken
4674 branch per peeled loop. Even if scalar loop iterations are known,
4675 vector iterations are not known since peeled prologue iterations are
4676 not known. Hence guards remain the same. */
4677 prologue_need_br_taken_cost
= true;
4678 prologue_need_br_not_taken_cost
= true;
4682 peel_iters_prologue
= npeel
;
4683 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_prologue
> 0)
4684 /* If peeled iterations are known but number of scalar loop
4685 iterations are unknown, count a taken branch per peeled loop. */
4686 prologue_need_br_taken_cost
= true;
4689 bool epilogue_need_br_taken_cost
= false;
4690 bool epilogue_need_br_not_taken_cost
= false;
4692 /* Calculate peel_iters_epilogue. */
4693 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
4694 /* We need to peel exactly one iteration for gaps. */
4695 peel_iters_epilogue
= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo
) ? 1 : 0;
      /* If peeling for alignment is unknown, loop bound of main loop
	 becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
4701 if (dump_enabled_p ())
4702 dump_printf (MSG_NOTE
, "cost model: "
4703 "epilogue peel iters set to vf/2 because "
4704 "peeling for alignment is unknown.\n");
4706 /* See the same reason above in peel_iters_prologue calculation. */
4707 epilogue_need_br_taken_cost
= true;
4708 epilogue_need_br_not_taken_cost
= true;
4712 peel_iters_epilogue
= vect_get_peel_iters_epilogue (loop_vinfo
, npeel
);
4713 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
) && peel_iters_epilogue
> 0)
4714 /* If peeled iterations are known but number of scalar loop
4715 iterations are unknown, count a taken branch per peeled loop. */
4716 epilogue_need_br_taken_cost
= true;
4719 stmt_info_for_cost
*si
;
4721 /* Add costs associated with peel_iters_prologue. */
4722 if (peel_iters_prologue
)
4723 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4725 (void) add_stmt_cost (target_cost_data
,
4726 si
->count
* peel_iters_prologue
, si
->kind
,
4727 si
->stmt_info
, si
->node
, si
->vectype
,
4728 si
->misalign
, vect_prologue
);
4731 /* Add costs associated with peel_iters_epilogue. */
4732 if (peel_iters_epilogue
)
4733 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo
), j
, si
)
4735 (void) add_stmt_cost (target_cost_data
,
4736 si
->count
* peel_iters_epilogue
, si
->kind
,
4737 si
->stmt_info
, si
->node
, si
->vectype
,
4738 si
->misalign
, vect_epilogue
);
4741 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4743 if (prologue_need_br_taken_cost
)
4744 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4747 if (prologue_need_br_not_taken_cost
)
4748 (void) add_stmt_cost (target_cost_data
, 1,
4749 cond_branch_not_taken
, vect_prologue
);
4751 if (epilogue_need_br_taken_cost
)
4752 (void) add_stmt_cost (target_cost_data
, 1, cond_branch_taken
,
4755 if (epilogue_need_br_not_taken_cost
)
4756 (void) add_stmt_cost (target_cost_data
, 1,
4757 cond_branch_not_taken
, vect_epilogue
);
4759 /* Take care of special costs for rgroup controls of partial vectors. */
4760 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4761 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
4762 == vect_partial_vectors_avx512
))
4764 /* Calculate how many masks we need to generate. */
4765 unsigned int num_masks
= 0;
4766 bool need_saturation
= false;
4767 for (auto rgm
: LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
)
4770 unsigned nvectors
= rgm
.factor
;
4771 num_masks
+= nvectors
;
4772 if (TYPE_PRECISION (TREE_TYPE (rgm
.compare_type
))
4773 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
)))
4774 need_saturation
= true;
      /* ??? The target isn't able to identify the costs below as
	 producing masks so it cannot penalize cases where we'd run
	 out of mask registers for example.  */

      /* ??? We are also failing to account for smaller vector masks
	 we generate by splitting larger masks in vect_get_loop_mask.  */
4784 /* In the worst case, we need to generate each mask in the prologue
4785 and in the loop body. We need one splat per group and one
4788 Sometimes the prologue mask will fold to a constant,
4789 so the actual prologue cost might be smaller. However, it's
4790 simpler and safer to use the worst-case cost; if this ends up
4791 being the tie-breaker between vectorizing or not, then it's
4792 probably better not to vectorize. */
4793 (void) add_stmt_cost (target_cost_data
,
4795 + LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.length (),
4796 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4798 (void) add_stmt_cost (target_cost_data
,
4800 + LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
.length (),
4801 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0, vect_body
);
      /* When we need saturation we need it both in the prologue and
	 the loop body.  */
      if (need_saturation)
4807 (void) add_stmt_cost (target_cost_data
, 1, scalar_stmt
,
4808 NULL
, NULL
, NULL_TREE
, 0, vect_prologue
);
4809 (void) add_stmt_cost (target_cost_data
, 1, scalar_stmt
,
4810 NULL
, NULL
, NULL_TREE
, 0, vect_body
);
4813 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
4814 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
4815 == vect_partial_vectors_while_ult
))
4817 /* Calculate how many masks we need to generate. */
4818 unsigned int num_masks
= 0;
4819 rgroup_controls
*rgm
;
4820 unsigned int num_vectors_m1
;
4821 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo
).rgc_vec
,
4822 num_vectors_m1
, rgm
)
4824 num_masks
+= num_vectors_m1
+ 1;
4825 gcc_assert (num_masks
> 0);
4827 /* In the worst case, we need to generate each mask in the prologue
4828 and in the loop body. One of the loop body mask instructions
4829 replaces the comparison in the scalar loop, and since we don't
4830 count the scalar comparison against the scalar body, we shouldn't
4831 count that vector instruction against the vector body either.
4833 Sometimes we can use unpacks instead of generating prologue
4834 masks and sometimes the prologue mask will fold to a constant,
4835 so the actual prologue cost might be smaller. However, it's
4836 simpler and safer to use the worst-case cost; if this ends up
4837 being the tie-breaker between vectorizing or not, then it's
4838 probably better not to vectorize. */
4839 (void) add_stmt_cost (target_cost_data
, num_masks
,
4840 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4842 (void) add_stmt_cost (target_cost_data
, num_masks
- 1,
4843 vector_stmt
, NULL
, NULL
, NULL_TREE
, 0,
4846 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
      /* Referring to the functions vect_set_loop_condition_partial_vectors
	 and vect_set_loop_controls_directly, we need to generate each
	 length in the prologue and in the loop body if required.  Although
	 there are some possible optimizations, we consider the worst case
	 for now.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4855 signed char partial_load_store_bias
4856 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
4858 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo
)
4859 && !vect_known_niters_smaller_than_vf (loop_vinfo
));
4861 /* Calculate how many statements to be added. */
4862 unsigned int prologue_stmts
= 0;
4863 unsigned int body_stmts
= 0;
4865 rgroup_controls
*rgc
;
4866 unsigned int num_vectors_m1
;
4867 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo
), num_vectors_m1
, rgc
)
4870 /* May need one SHIFT for nitems_total computation. */
4871 unsigned nitems
= rgc
->max_nscalars_per_iter
* rgc
->factor
;
4872 if (nitems
!= 1 && !niters_known_p
)
4873 prologue_stmts
+= 1;
4875 /* May need one MAX and one MINUS for wrap around. */
4876 if (vect_rgroup_iv_might_wrap_p (loop_vinfo
, rgc
))
4877 prologue_stmts
+= 2;
	  /* Need one MAX and one MINUS for each batch limit excepting for
	     the 1st one.  */
	  prologue_stmts += num_vectors_m1 * 2;
4883 unsigned int num_vectors
= num_vectors_m1
+ 1;
4885 /* Need to set up lengths in prologue, only one MIN required
4886 for each since start index is zero. */
4887 prologue_stmts
+= num_vectors
;
4889 /* If we have a non-zero partial load bias, we need one PLUS
4890 to adjust the load length. */
4891 if (partial_load_store_bias
!= 0)
4894 unsigned int length_update_cost
= 0;
4895 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo
))
	/* For the decrement IV style, each iteration needs only a single
	   SELECT_VL or MIN to calculate the number of elements to be
	   processed in the current iteration.  */
4899 length_update_cost
= 1;
	/* For the increment IV style, each iteration may need two MINs
	   and one MINUS to update the lengths in the body for the next
	   iteration.  */
4903 length_update_cost
= 3;
4906 body_stmts
+= length_update_cost
* num_vectors
;
4909 (void) add_stmt_cost (target_cost_data
, prologue_stmts
,
4910 scalar_stmt
, vect_prologue
);
4911 (void) add_stmt_cost (target_cost_data
, body_stmts
,
4912 scalar_stmt
, vect_body
);
4915 /* FORNOW: The scalar outside cost is incremented in one of the
4918 1. The vectorizer checks for alignment and aliasing and generates
4919 a condition that allows dynamic vectorization. A cost model
4920 check is ANDED with the versioning condition. Hence scalar code
4921 path now has the added cost of the versioning check.
4923 if (cost > th & versioning_check)
4926 Hence run-time scalar is incremented by not-taken branch cost.
4928 2. The vectorizer then checks if a prologue is required. If the
4929 cost model check was not done before during versioning, it has to
4930 be done before the prologue check.
4933 prologue = scalar_iters
4938 if (prologue == num_iters)
4941 Hence the run-time scalar cost is incremented by a taken branch,
4942 plus a not-taken branch, plus a taken branch cost.
4944 3. The vectorizer then checks if an epilogue is required. If the
4945 cost model check was not done before during prologue check, it
4946 has to be done with the epilogue check.
4952 if (prologue == num_iters)
4955 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4958 Hence the run-time scalar cost should be incremented by 2 taken
4961 TODO: The back end may reorder the BBS's differently and reverse
4962 conditions/branch directions. Change the estimates below to
4963 something more reasonable. */
  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     does not carry cost model guard costs.  */
4968 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo
)
4969 || LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4971 /* Cost model check occurs at versioning. */
4972 if (LOOP_REQUIRES_VERSIONING (loop_vinfo
))
4973 scalar_outside_cost
+= vect_get_stmt_cost (cond_branch_not_taken
);
4976 /* Cost model check occurs at prologue generation. */
4977 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo
) < 0)
4978 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
)
4979 + vect_get_stmt_cost (cond_branch_not_taken
);
4980 /* Cost model check occurs at epilogue generation. */
4982 scalar_outside_cost
+= 2 * vect_get_stmt_cost (cond_branch_taken
);
4986 /* Complete the target-specific cost calculations. */
4987 finish_cost (loop_vinfo
->vector_costs
, loop_vinfo
->scalar_costs
,
4988 &vec_prologue_cost
, &vec_inside_cost
, &vec_epilogue_cost
,
4989 suggested_unroll_factor
);
4991 if (suggested_unroll_factor
&& *suggested_unroll_factor
> 1
4992 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
) != MAX_VECTORIZATION_FACTOR
4993 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo
) *
4994 *suggested_unroll_factor
,
4995 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
)))
4997 if (dump_enabled_p ())
4998 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
4999 "can't unroll as unrolled vectorization factor larger"
5000 " than maximum vectorization factor: "
5001 HOST_WIDE_INT_PRINT_UNSIGNED
"\n",
5002 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo
));
5003 *suggested_unroll_factor
= 1;
5006 vec_outside_cost
= (int)(vec_prologue_cost
+ vec_epilogue_cost
);
5008 if (dump_enabled_p ())
5010 dump_printf_loc (MSG_NOTE
, vect_location
, "Cost model analysis: \n");
5011 dump_printf (MSG_NOTE
, " Vector inside of loop cost: %d\n",
5013 dump_printf (MSG_NOTE
, " Vector prologue cost: %d\n",
5015 dump_printf (MSG_NOTE
, " Vector epilogue cost: %d\n",
5017 dump_printf (MSG_NOTE
, " Scalar iteration cost: %d\n",
5018 scalar_single_iter_cost
);
5019 dump_printf (MSG_NOTE
, " Scalar outside cost: %d\n",
5020 scalar_outside_cost
);
5021 dump_printf (MSG_NOTE
, " Vector outside cost: %d\n",
5023 dump_printf (MSG_NOTE
, " prologue iterations: %d\n",
5024 peel_iters_prologue
);
5025 dump_printf (MSG_NOTE
, " epilogue iterations: %d\n",
5026 peel_iters_epilogue
);
  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:

     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC

     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */
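
  /* Worked example (all numbers are assumptions for illustration): with
     SIC = 4, VIC = 6, VF = 4, NPEEL = 2, VOC = 30 and SOC = 10, the
     condition 4 * niters + 10 > 6 * ((niters - 2) / 4) + 30 first holds
     at niters = 7, so the runtime threshold would be around seven scalar
     iterations.  */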
5039 int saving_per_viter
= (scalar_single_iter_cost
* assumed_vf
5041 if (saving_per_viter
<= 0)
5043 if (LOOP_VINFO_LOOP (loop_vinfo
)->force_vectorize
)
5044 warning_at (vect_location
.get_location_t (), OPT_Wopenmp_simd
,
5045 "vectorization did not happen for a simd loop");
5047 if (dump_enabled_p ())
5048 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
5049 "cost model: the vector iteration cost = %d "
5050 "divided by the scalar iteration cost = %d "
5051 "is greater or equal to the vectorization factor = %d"
5053 vec_inside_cost
, scalar_single_iter_cost
, assumed_vf
);
5054 *ret_min_profitable_niters
= -1;
5055 *ret_min_profitable_estimate
= -1;
5059 /* ??? The "if" arm is written to handle all cases; see below for what
5060 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5061 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5063 /* Rewriting the condition above in terms of the number of
5064 vector iterations (vniters) rather than the number of
5065 scalar iterations (niters) gives:
5067 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
5069 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
5071 For integer N, X and Y when X > 0:
5073 N * X > Y <==> N >= (Y /[floor] X) + 1. */
5074 int outside_overhead
= (vec_outside_cost
5075 - scalar_single_iter_cost
* peel_iters_prologue
5076 - scalar_single_iter_cost
* peel_iters_epilogue
5077 - scalar_outside_cost
);
5078 /* We're only interested in cases that require at least one
5079 vector iteration. */
5080 int min_vec_niters
= 1;
5081 if (outside_overhead
> 0)
5082 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
5084 if (dump_enabled_p ())
5085 dump_printf (MSG_NOTE
, " Minimum number of vector iterations: %d\n",
5088 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5090 /* Now that we know the minimum number of vector iterations,
5091 find the minimum niters for which the scalar cost is larger:
5093 SIC * niters > VIC * vniters + VOC - SOC
5095 We know that the minimum niters is no more than
5096 vniters * VF + NPEEL, but it might be (and often is) less
5097 than that if a partial vector iteration is cheaper than the
5098 equivalent scalar code. */
5099 int threshold
= (vec_inside_cost
* min_vec_niters
5101 - scalar_outside_cost
);
5103 min_profitable_iters
= 1;
5105 min_profitable_iters
= threshold
/ scalar_single_iter_cost
+ 1;
5108 /* Convert the number of vector iterations into a number of
5109 scalar iterations. */
5110 min_profitable_iters
= (min_vec_niters
* assumed_vf
5111 + peel_iters_prologue
5112 + peel_iters_epilogue
);
5116 min_profitable_iters
= ((vec_outside_cost
- scalar_outside_cost
)
5118 - vec_inside_cost
* peel_iters_prologue
5119 - vec_inside_cost
* peel_iters_epilogue
);
5120 if (min_profitable_iters
<= 0)
5121 min_profitable_iters
= 0;
5124 min_profitable_iters
/= saving_per_viter
;
5126 if ((scalar_single_iter_cost
* assumed_vf
* min_profitable_iters
)
5127 <= (((int) vec_inside_cost
* min_profitable_iters
)
5128 + (((int) vec_outside_cost
- scalar_outside_cost
)
5130 min_profitable_iters
++;
5134 if (dump_enabled_p ())
5135 dump_printf (MSG_NOTE
,
5136 " Calculated minimum iters for profitability: %d\n",
5137 min_profitable_iters
);
5139 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
)
5140 && min_profitable_iters
< (assumed_vf
+ peel_iters_prologue
))
5141 /* We want the vectorized loop to execute at least once. */
5142 min_profitable_iters
= assumed_vf
+ peel_iters_prologue
;
5143 else if (min_profitable_iters
< peel_iters_prologue
)
5144 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
5145 vectorized loop executes at least once. */
5146 min_profitable_iters
= peel_iters_prologue
;
5148 if (dump_enabled_p ())
5149 dump_printf_loc (MSG_NOTE
, vect_location
,
5150 " Runtime profitability threshold = %d\n",
5151 min_profitable_iters
);
5153 *ret_min_profitable_niters
= min_profitable_iters
;
5155 /* Calculate number of iterations required to make the vector version
5156 profitable, relative to the loop bodies only.
5158 Non-vectorized variant is SIC * niters and it must win over vector
5159 variant on the expected loop trip count. The following condition must hold true:
5160 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
5162 if (vec_outside_cost
<= 0)
5163 min_profitable_estimate
= 0;
5164 /* ??? This "else if" arm is written to handle all cases; see below for
5165 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
5166 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
      /* This is a repeat of the code above, but with + SOC rather
	 than - SOC.  */
5170 int outside_overhead
= (vec_outside_cost
5171 - scalar_single_iter_cost
* peel_iters_prologue
5172 - scalar_single_iter_cost
* peel_iters_epilogue
5173 + scalar_outside_cost
);
5174 int min_vec_niters
= 1;
5175 if (outside_overhead
> 0)
5176 min_vec_niters
= outside_overhead
/ saving_per_viter
+ 1;
5178 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo
))
5180 int threshold
= (vec_inside_cost
* min_vec_niters
5182 + scalar_outside_cost
);
5183 min_profitable_estimate
= threshold
/ scalar_single_iter_cost
+ 1;
5186 min_profitable_estimate
= (min_vec_niters
* assumed_vf
5187 + peel_iters_prologue
5188 + peel_iters_epilogue
);
5192 min_profitable_estimate
= ((vec_outside_cost
+ scalar_outside_cost
)
5194 - vec_inside_cost
* peel_iters_prologue
5195 - vec_inside_cost
* peel_iters_epilogue
)
5196 / ((scalar_single_iter_cost
* assumed_vf
)
5199 min_profitable_estimate
= MAX (min_profitable_estimate
, min_profitable_iters
);
5200 if (dump_enabled_p ())
5201 dump_printf_loc (MSG_NOTE
, vect_location
,
5202 " Static estimate profitability threshold = %d\n",
5203 min_profitable_estimate
);
5205 *ret_min_profitable_estimate
= min_profitable_estimate
;
5208 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5209 vector elements (not bits) for a vector with NELT elements. */
5211 calc_vec_perm_mask_for_shift (unsigned int offset
, unsigned int nelt
,
5212 vec_perm_builder
*sel
)
5214 /* The encoding is a single stepped pattern. Any wrap-around is handled
5215 by vec_perm_indices. */
5216 sel
->new_vector (nelt
, 1, 3);
5217 for (unsigned int i
= 0; i
< 3; i
++)
5218 sel
->quick_push (i
+ offset
);
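
/* Illustrative example (numbers are assumptions): for NELT = 8 and
   OFFSET = 2 the three encoded elements are {2, 3, 4}; vec_perm_indices
   extends the stepped pattern to {2, 3, 4, 5, 6, 7, 8, 9}, i.e. the
   selection that shifts the vector down by two elements.  */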
5221 /* Checks whether the target supports whole-vector shifts for vectors of mode
5222 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5223 it supports vec_perm_const with masks for all necessary shift amounts. */
5225 have_whole_vector_shift (machine_mode mode
)
5227 if (optab_handler (vec_shr_optab
, mode
) != CODE_FOR_nothing
)
5230 /* Variable-length vectors should be handled via the optab. */
5232 if (!GET_MODE_NUNITS (mode
).is_constant (&nelt
))
5235 vec_perm_builder sel
;
5236 vec_perm_indices indices
;
5237 for (unsigned int i
= nelt
/ 2; i
>= 1; i
/= 2)
5239 calc_vec_perm_mask_for_shift (i
, nelt
, &sel
);
5240 indices
.new_vector (sel
, 2, nelt
);
5241 if (!can_vec_perm_const_p (mode
, mode
, indices
, false))
5247 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5248 multiplication operands have differing signs and (b) we intend
5249 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5250 See vect_emulate_mixed_dot_prod for the actual sequence used. */
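
/* Illustrative case (an assumption for exposition): a dot product such as

     int acc = 0;
     for (int i = 0; i < n; i++)
       acc += (int) s8[i] * (int) u8[i];

   has mixed-sign multiplication operands; if the target only supports the
   signed form, the operation is emulated with a series of signed
   DOT_PROD_EXPRs as described above.  */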
5253 vect_is_emulated_mixed_dot_prod (stmt_vec_info stmt_info
)
5255 gassign
*assign
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
5256 if (!assign
|| gimple_assign_rhs_code (assign
) != DOT_PROD_EXPR
)
5259 tree rhs1
= gimple_assign_rhs1 (assign
);
5260 tree rhs2
= gimple_assign_rhs2 (assign
);
5261 if (TYPE_SIGN (TREE_TYPE (rhs1
)) == TYPE_SIGN (TREE_TYPE (rhs2
)))
5264 gcc_assert (STMT_VINFO_REDUC_VECTYPE_IN (stmt_info
));
5265 return !directly_supported_p (DOT_PROD_EXPR
,
5266 STMT_VINFO_VECTYPE (stmt_info
),
5267 STMT_VINFO_REDUC_VECTYPE_IN (stmt_info
),
5268 optab_vector_mixed_sign
);
5271 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5272 functions. Design better to avoid maintenance issues. */
5274 /* Function vect_model_reduction_cost.
5276 Models cost for a reduction operation, including the vector ops
5277 generated within the strip-mine loop in some cases, the initial
5278 definition before the loop, and the epilogue code that must be generated. */
5281 vect_model_reduction_cost (loop_vec_info loop_vinfo
,
5282 stmt_vec_info stmt_info
, internal_fn reduc_fn
,
5283 vect_reduction_type reduction_type
,
5284 int ncopies
, stmt_vector_for_cost
*cost_vec
)
5286 int prologue_cost
= 0, epilogue_cost
= 0, inside_cost
= 0;
5289 class loop
*loop
= NULL
;
5292 loop
= LOOP_VINFO_LOOP (loop_vinfo
);
5294 /* Condition reductions generate two reductions in the loop. */
5295 if (reduction_type
== COND_REDUCTION
)
5298 vectype
= STMT_VINFO_VECTYPE (stmt_info
);
5299 mode
= TYPE_MODE (vectype
);
5300 stmt_vec_info orig_stmt_info
= vect_orig_stmt (stmt_info
);
5303 if (!gimple_extract_op (orig_stmt_info
->stmt
, &op
))
5306 if (reduction_type
== EXTRACT_LAST_REDUCTION
)
5307 /* No extra instructions are needed in the prologue. The loop body
5308 operations are costed in vectorizable_condition. */
5310 else if (reduction_type
== FOLD_LEFT_REDUCTION
)
5312 /* No extra instructions needed in the prologue. */
5315 if (reduc_fn
!= IFN_LAST
)
5316 /* Count one reduction-like operation per vector. */
5317 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vec_to_scalar
,
5318 stmt_info
, 0, vect_body
);
5321 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5322 unsigned int nelements
= ncopies
* vect_nunits_for_cost (vectype
);
5323 inside_cost
= record_stmt_cost (cost_vec
, nelements
,
5324 vec_to_scalar
, stmt_info
, 0,
5326 inside_cost
+= record_stmt_cost (cost_vec
, nelements
,
5327 scalar_stmt
, stmt_info
, 0,
5333 /* Add in the cost of the initial definitions. */
5335 if (reduction_type
== COND_REDUCTION
)
5336 /* For cond reductions we have four vectors: initial index, step,
5337 initial result of the data reduction, initial value of the index
5341 /* We need the initial reduction value. */
5343 prologue_cost
+= record_stmt_cost (cost_vec
, prologue_stmts
,
5344 scalar_to_vec
, stmt_info
, 0,
5348 /* Determine cost of epilogue code.
5350 We have a reduction operator that will reduce the vector in one statement.
5351 Also requires scalar extract. */
5353 if (!loop
|| !nested_in_vect_loop_p (loop
, orig_stmt_info
))
5355 if (reduc_fn
!= IFN_LAST
)
5357 if (reduction_type
== COND_REDUCTION
)
	      /* An EQ stmt and a COND_EXPR stmt.  */
5360 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
5361 vector_stmt
, stmt_info
, 0,
	      /* Reduction of the max index and a reduction of the found
		 values.  */
5365 epilogue_cost
+= record_stmt_cost (cost_vec
, 2,
5366 vec_to_scalar
, stmt_info
, 0,
5368 /* A broadcast of the max value. */
5369 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5370 scalar_to_vec
, stmt_info
, 0,
5375 epilogue_cost
+= record_stmt_cost (cost_vec
, 1, vector_stmt
,
5376 stmt_info
, 0, vect_epilogue
);
5377 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5378 vec_to_scalar
, stmt_info
, 0,
5382 else if (reduction_type
== COND_REDUCTION
)
5384 unsigned estimated_nunits
= vect_nunits_for_cost (vectype
);
5385 /* Extraction of scalar elements. */
5386 epilogue_cost
+= record_stmt_cost (cost_vec
,
5387 2 * estimated_nunits
,
5388 vec_to_scalar
, stmt_info
, 0,
5390 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5391 epilogue_cost
+= record_stmt_cost (cost_vec
,
5392 2 * estimated_nunits
- 3,
5393 scalar_stmt
, stmt_info
, 0,
5396 else if (reduction_type
== EXTRACT_LAST_REDUCTION
5397 || reduction_type
== FOLD_LEFT_REDUCTION
)
    /* No extra instructions are needed in the epilogue.  */
5402 int vec_size_in_bits
= tree_to_uhwi (TYPE_SIZE (vectype
));
5403 tree bitsize
= TYPE_SIZE (op
.type
);
5404 int element_bitsize
= tree_to_uhwi (bitsize
);
5405 int nelements
= vec_size_in_bits
/ element_bitsize
;
5407 if (op
.code
== COND_EXPR
)
5410 /* We have a whole vector shift available. */
5411 if (VECTOR_MODE_P (mode
)
5412 && directly_supported_p (op
.code
, vectype
)
5413 && have_whole_vector_shift (mode
))
5415 /* Final reduction via vector shifts and the reduction operator.
5416 Also requires scalar extract. */
5417 epilogue_cost
+= record_stmt_cost (cost_vec
,
5418 exact_log2 (nelements
) * 2,
5419 vector_stmt
, stmt_info
, 0,
5421 epilogue_cost
+= record_stmt_cost (cost_vec
, 1,
5422 vec_to_scalar
, stmt_info
, 0,
5426 /* Use extracts and reduction op for final reduction. For N
5427 elements, we have N extracts and N-1 reduction ops. */
5428 epilogue_cost
+= record_stmt_cost (cost_vec
,
5429 nelements
+ nelements
- 1,
5430 vector_stmt
, stmt_info
, 0,
5435 if (dump_enabled_p ())
5436 dump_printf (MSG_NOTE
,
5437 "vect_model_reduction_cost: inside_cost = %d, "
5438 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost
,
5439 prologue_cost
, epilogue_cost
);
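/* A worked example of the epilogue arithmetic above (illustration only):
   for a 4-element vector with no target reduction function, the
   whole-vector-shift scheme is charged exact_log2 (4) * 2 = 4 vector
   statements plus one extract, while the extract-based fallback is charged
   4 + 4 - 1 = 7 vector statements.  */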
/* SEQ is a sequence of instructions that initialize the reduction
   described by REDUC_INFO.  Emit them in the appropriate place.  */

static void
vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info, gimple *seq)
{
  if (reduc_info->reused_accumulator)
    {
      /* When reusing an accumulator from the main loop, we only need
         initialization instructions if the main loop can be skipped.
         In that case, emit the initialization instructions at the end
         of the guard block that does the skip.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      gcc_assert (skip_edge);
      gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
      gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      /* The normal case: emit the initialization instructions on the
         preheader edge.  */
      class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
      gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
    }
}
/* Function get_initial_def_for_reduction

   Input:
   REDUC_INFO - the info_for_reduction
   INIT_VAL - the initial value of the reduction variable
   NEUTRAL_OP - a value that has no effect on the reduction, as per
                neutral_op_for_reduction

   Output:
   Return a vector variable, initialized according to the operation that
   STMT_VINFO performs.  This vector will be used as the initial value
   of the vector of partial results.

   The value we need is a vector in which element 0 has value INIT_VAL
   and every other element has value NEUTRAL_OP.  */

static tree
get_initial_def_for_reduction (loop_vec_info loop_vinfo,
                               stmt_vec_info reduc_info,
                               tree init_val, tree neutral_op)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree scalar_type = TREE_TYPE (init_val);
  tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
  tree init_def;
  gimple_seq stmts = NULL;

  gcc_assert (vectype);

  gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
              || SCALAR_FLOAT_TYPE_P (scalar_type));

  gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
              || loop == (gimple_bb (reduc_info->stmt))->loop_father);

  if (operand_equal_p (init_val, neutral_op))
    {
      /* If both elements are equal then the vector described above is
         just a splat.  */
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
    }
  else
    {
      neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
      init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
      if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
        {
          /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
             element 0.  */
          init_def = gimple_build_vector_from_val (&stmts, vectype,
                                                   neutral_op);
          init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
                                   vectype, init_def, init_val);
        }
      else
        {
          /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}.  */
          tree_vector_builder elts (vectype, 1, 2);
          elts.quick_push (init_val);
          elts.quick_push (neutral_op);
          init_def = gimple_build_vector (&stmts, &elts);
        }
    }

  if (stmts)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
  return init_def;
}
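/* For example (a worked case of the function above): for a PLUS reduction
   with INIT_VAL x the neutral value is 0, so for a 4-element vector the
   initial def built here is {x, 0, 0, 0}; for a MIN or MAX reduction the
   initial value is itself the neutral value, so the result is simply the
   splat {x, x, x, x}.  */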
/* Get at the initial defs for the reduction PHIs for REDUC_INFO,
   which performs a reduction involving GROUP_SIZE scalar statements.
   NUMBER_OF_VECTORS is the number of vector defs to create.  If NEUTRAL_OP
   is nonnull, introducing extra elements of that value will not change the
   result.  */

static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info,
                                vec<tree> *vec_oprnds,
                                unsigned int number_of_vectors,
                                unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);

  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */

  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  if (neutral_op
      && !useless_type_conversion_p (TREE_TYPE (vector_type),
                                     TREE_TYPE (neutral_op)))
    neutral_op = gimple_convert (&ctor_seq,
                                 TREE_TYPE (vector_type),
                                 neutral_op);
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      unsigned i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
         one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
        op = neutral_op;
      else
        {
          if (!useless_type_conversion_p (TREE_TYPE (vector_type),
                                          TREE_TYPE (initial_values[i])))
            initial_values[i] = gimple_convert (&ctor_seq,
                                                TREE_TYPE (vector_type),
                                                initial_values[i]);
          op = initial_values[i];
        }

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
        constant_p = false;

      if (number_of_places_left_in_vector == 0)
        {
          tree init;
          if (constant_p && !neutral_op
              ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
              : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
            /* Build the vector directly from ELTS.  */
            init = gimple_build_vector (&ctor_seq, &elts);
          else if (neutral_op)
            {
              /* Build a vector of the neutral value and shift the
                 other elements into place.  */
              init = gimple_build_vector_from_val (&ctor_seq, vector_type,
                                                   neutral_op);
              int k = nunits;
              while (k > 0 && operand_equal_p (elts[k - 1], neutral_op))
                k -= 1;
              while (k > 0)
                {
                  k -= 1;
                  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
                                       vector_type, init, elts[k]);
                }
            }
          else
            {
              /* First time round, duplicate ELTS to fill the
                 required number of vectors.  */
              duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
                                        elts, number_of_vectors, *vec_oprnds);
              break;
            }
          vec_oprnds->quick_push (init);

          number_of_places_left_in_vector = nunits;
          elts.new_vector (vector_type, nunits, 1);
          elts.quick_grow (nunits);
          constant_p = true;
        }
    }
  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
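/* A small illustration of the CFN_VEC_SHL_INSERT path above (assuming a
   neutral value n and a filled ELTS of {a, b, n, n}): K first skips the
   trailing neutral elements, then the loop shift-inserts elts[1] = b and
   finally elts[0] = a into a splat of n, yielding {a, b, n, n, ...}
   regardless of the (possibly variable) number of vector elements.  */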
/* For a statement STMT_INFO taking part in a reduction operation return
   the stmt_vec_info the meta information is stored on.  */

stmt_vec_info
info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
{
  stmt_info = vect_orig_stmt (stmt_info);
  gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
  if (!is_a <gphi *> (stmt_info->stmt)
      || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
    stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_phi_num_args (phi) == 1)
        stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
    }
  else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
      if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
        stmt_info = info;
    }
  return stmt_info;
}
/* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
   REDUC_INFO can build on.  Adjust REDUC_INFO and return true if so, otherwise
   return false.  */

static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
                                stmt_vec_info reduc_info)
{
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
         from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
        {
          /* Look for:

               INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
                                    INITIAL_VALUE(guard block)>.  */
          gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

          gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
          gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

          tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
          tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

          main_loop_results.quick_push (from_main_loop);
          initial_values.quick_push (from_skip);
        }
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
                      accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
                            TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
          || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
                                    intermediate_vectype)
          || !can_vec_extract (TYPE_MODE (prev_vectype),
                               TYPE_MODE (intermediate_vectype)))
        return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
         initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
        return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
                                                    code, initial_value);
    }
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (0);
  reduc_info->reduc_initial_values.splice (initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
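/* For instance (an illustrative case): if the main loop accumulated in a
   V8SI vector and the epilogue uses V4SI, M is 2 and the loop above checks
   that a V4SI form of the reduction operation and a V8SI -> V4SI vector
   extract are both supported before the accumulator is reused.  */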
/* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
   CODE emitting stmts before GSI.  Returns a vector def of VECTYPE.  */

static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
                            gimple_seq *seq)
{
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                           stype, nunits);
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
         extraction, either via direct vector extract or through
         an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
                                 TYPE_MODE (TREE_TYPE (new_temp)),
                                 TYPE_MODE (vectype1))
          != CODE_FOR_nothing)
        {
          /* Extract sub-vectors directly once vec_extract becomes
             a conversion optab.  */
          dst1 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst1, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt
              = gimple_build_assign (dst2, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, vectype1,
                                             new_temp, TYPE_SIZE (vectype1),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }
      else
        {
          /* Extract via punning to appropriately sized integer mode
             vector.  */
          tree eltype = build_nonstandard_integer_type (bitsize, 1);
          tree etype = build_vector_type (eltype, 2);
          gcc_assert (convert_optab_handler (vec_extract_optab,
                                             TYPE_MODE (etype),
                                             TYPE_MODE (eltype))
                      != CODE_FOR_nothing);
          tree tem = make_ssa_name (etype);
          epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     etype, new_temp));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          new_temp = tem;
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (0)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst1 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          tem = make_ssa_name (eltype);
          epilog_stmt
              = gimple_build_assign (tem, BIT_FIELD_REF,
                                     build3 (BIT_FIELD_REF, eltype,
                                             new_temp, TYPE_SIZE (eltype),
                                             bitsize_int (bitsize)));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
          dst2 = make_ssa_name (vectype1);
          epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
                                             build1 (VIEW_CONVERT_EXPR,
                                                     vectype1, tem));
          gimple_seq_add_stmt_without_update (seq, epilog_stmt);
        }

      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }

  return new_temp;
}
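/* For example (a sketch of the loop above): reducing a V8SI def to V4SI
   with PLUS extracts the low and high V4SI halves of the V8SI value
   (directly via vec_extract, or by punning through a two-element integer
   vector) and adds them; a V16SI input simply goes through the loop twice,
   halving the width each time.  */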
/* Function vect_create_epilog_for_reduction

   Create code at the loop-epilog to finalize the result of a reduction
   computation.

   STMT_INFO is the scalar reduction stmt that is being vectorized.
   SLP_NODE is an SLP node containing a group of reduction statements. The
     first one in this group is STMT_INFO.
   SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
   REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
     (counting from 0)
   LOOP_EXIT is the edge to update in the merge block.  In the case of a single
     exit this edge is always the main loop exit.

   This function:
   1. Completes the reduction def-use cycles.
   2. "Reduces" each vector of partial results VECT_DEFS into a single result,
      by calling the function specified by REDUC_FN if available, or by
      other means (whole-vector shifts or a scalar loop).
      The function also creates a new phi node at the loop exit to preserve
      loop-closed form, as illustrated below.

   The flow at the entry to this function:

        loop:
          vec_def = phi <vec_init, null>    # REDUCTION_PHI
          VECT_DEF = vector_stmt            # vectorized form of STMT_INFO
          s_loop = scalar_stmt              # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>             # (scalar) EXIT_PHI
          use <s_out0>
          use <s_out0>

   The above is transformed by this function into:

        loop:
          vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
          VECT_DEF = vector_stmt             # vectorized form of STMT_INFO
          s_loop = scalar_stmt               # (scalar) STMT_INFO
        loop_exit:
          s_out0 = phi <s_loop>              # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>            # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */
static void
vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
                                  stmt_vec_info stmt_info,
                                  slp_tree slp_node,
                                  slp_instance slp_node_instance,
                                  edge loop_exit)
{
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);
  /* For double reductions we need to get at the inner loop reduction
     stmt which has the meta info attached.  Our stmt_info is that of the
     loop-closed PHI of the inner loop which we remember as
     def for the reduction PHI generation.  */
  bool double_reduc = false;
  stmt_vec_info rdef_info = stmt_info;
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      double_reduc = true;
      stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
                                            (stmt_info->stmt, 0));
      stmt_info = vect_stmt_to_vectorize (stmt_info);
    }
  code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
  tree vectype;
  machine_mode mode;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
  basic_block exit_bb;
  tree scalar_dest;
  tree scalar_type;
  gimple *new_phi = NULL, *phi = NULL;
  gimple_stmt_iterator exit_gsi;
  tree new_temp = NULL_TREE, new_name, new_scalar_dest;
  gimple *epilog_stmt = NULL;
  gimple *exit_phi;
  tree bitsize;
  tree def;
  tree orig_name, scalar_result;
  imm_use_iterator imm_iter, phi_imm_iter;
  use_operand_p use_p, phi_use_p;
  gimple *use_stmt;
  auto_vec<tree> reduc_inputs;
  int j, i;
  vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
  unsigned int group_size = 1, k;
  /* SLP reduction without reduction chain, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (b1)  */
  bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  bool direct_slp_reduc;
  tree induction_index = NULL_TREE;

  if (slp_node)
    group_size = SLP_TREE_LANES (slp_node);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      outer_loop = loop;
      loop = loop->inner;
      gcc_assert (double_reduc);
    }

  vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
  gcc_assert (vectype);
  mode = TYPE_MODE (vectype);

  tree induc_val = NULL_TREE;
  tree adjustment_def = NULL;
  /* Optimize: for induction condition reduction, if we can't use zero
     for induc_val, use initial_def.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
    induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
  else if (double_reduc)
    ;
  else
    adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);

  stmt_vec_info single_live_out_stmt[] = { stmt_info };
  array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
  if (slp_reduc)
    /* All statements produce live-out values.  */
    live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);

  unsigned vec_num;
  int ncopies;
  if (slp_node)
    {
      vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
    }

  /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
     which is updated with the current index of the loop for every match of
     the original loop's cond_expr (VEC_STMT).  This results in a vector
     containing the last time the condition passed for that vector lane.
     The first match will be a 1 to allow 0 to be used for non-matching
     indexes.  If there are no matches at all then the vector will be all
     zeroes.

     PR92772: This algorithm is broken for architectures that support
     masked vectors, but do not provide fold_extract_last.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
    {
      auto_vec<std::pair<tree, bool>, 2> ccompares;
      if (slp_node)
        {
          slp_tree cond_node = slp_node_instance->root;
          while (cond_node != slp_node_instance->reduc_phis)
            {
              stmt_vec_info cond_info = SLP_TREE_REPRESENTATIVE (cond_node);
              int slp_reduc_idx;
              if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
                {
                  gimple *vec_stmt
                    = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (cond_node)[0]);
                  gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
                  ccompares.safe_push
                    (std::make_pair (gimple_assign_rhs1 (vec_stmt),
                                     STMT_VINFO_REDUC_IDX (cond_info) == 2));
                  /* ??? We probably want to have REDUC_IDX on the SLP node?
                     We have both three and four children COND_EXPR nodes
                     dependent on whether the comparison is still embedded
                     as GENERIC.  So work backwards.  */
                  slp_reduc_idx = (SLP_TREE_CHILDREN (cond_node).length () - 3
                                   + STMT_VINFO_REDUC_IDX (cond_info));
                }
              else
                slp_reduc_idx = STMT_VINFO_REDUC_IDX (cond_info);
              cond_node = SLP_TREE_CHILDREN (cond_node)[slp_reduc_idx];
            }
        }
      else
        {
          stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
          cond_info = vect_stmt_to_vectorize (cond_info);
          while (cond_info != reduc_info)
            {
              if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
                {
                  gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
                  gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
                  ccompares.safe_push
                    (std::make_pair (gimple_assign_rhs1 (vec_stmt),
                                     STMT_VINFO_REDUC_IDX (cond_info) == 2));
                }
              cond_info
                = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
                                                     1 + STMT_VINFO_REDUC_IDX
                                                           (cond_info)));
              cond_info = vect_stmt_to_vectorize (cond_info);
            }
        }
      gcc_assert (ccompares.length () != 0);

      tree indx_before_incr, indx_after_incr;
      poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
      tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
      tree cr_index_vector_type = get_related_vectype_for_scalar_type
        (TYPE_MODE (vectype), cr_index_scalar_type,
         TYPE_VECTOR_SUBPARTS (vectype));

      /* First we create a simple vector induction variable which starts
         with the values {1,2,3,...} (SERIES_VECT) and increments by the
         vector size (STEP).  */

      /* Create a {1,2,3,...} vector.  */
      tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);

      /* Create a vector of the step value.  */
      tree step = build_int_cst (cr_index_scalar_type, nunits_out);
      tree vec_step = build_vector_from_val (cr_index_vector_type, step);

      /* Create an induction variable.  */
      gimple_stmt_iterator incr_gsi;
      bool insert_after;
      vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
      create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
                 insert_after, &indx_before_incr, &indx_after_incr);

      /* Next create a new phi node vector (NEW_PHI_TREE) which starts
         filled with zeros (VEC_ZERO).  */

      /* Create a vector of 0s.  */
      tree zero = build_zero_cst (cr_index_scalar_type);
      tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);

      /* Create a vector phi node.  */
      tree new_phi_tree = make_ssa_name (cr_index_vector_type);
      new_phi = create_phi_node (new_phi_tree, loop->header);
      add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
                   loop_preheader_edge (loop), UNKNOWN_LOCATION);

      /* Now take the condition from the loops original cond_exprs
         and produce a new cond_exprs (INDEX_COND_EXPR) which for
         every match uses values from the induction variable
         (INDEX_BEFORE_INCR) otherwise uses values from the phi node
         (NEW_PHI_TREE).
         Finally, we update the phi (NEW_PHI_TREE) to take the value of
         the new cond_expr (INDEX_COND_EXPR).  */
      gimple_seq stmts = NULL;
      for (int i = ccompares.length () - 1; i != -1; --i)
        {
          tree ccompare = ccompares[i].first;
          if (ccompares[i].second)
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         indx_before_incr, new_phi_tree);
          else
            new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
                                         cr_index_vector_type,
                                         ccompare,
                                         new_phi_tree, indx_before_incr);
        }
      gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);

      /* Update the phi with the vec cond.  */
      induction_index = new_phi_tree;
      add_phi_arg (as_a <gphi *> (new_phi), induction_index,
                   loop_latch_edge (loop), UNKNOWN_LOCATION);
    }
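  /* To illustrate the index vector built above (an example, assuming a
     4-lane vector and a single condition): with the IV producing
     {1, 2, 3, 4} in the first iteration, lanes whose condition matched
     take the IV value while the others keep the PHI value, so after an
     iteration in which only lane 2 matched the PHI carries {0, 0, 3, 0};
     the maximum over all lanes at the end is therefore the index of the
     last match, or 0 if there was none.  */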
  /* 2. Create epilog code.
        The reduction epilog code operates across the elements of the vector
        of partial results computed by the vectorized loop.
        The reduction epilog code consists of:

        step 1: compute the scalar result in a vector (v_out2)
        step 2: extract the scalar result (s_out3) from the vector (v_out2)
        step 3: adjust the scalar result (s_out3) if needed.

        Step 1 can be accomplished using one the following three schemes:
          (scheme 1) using reduc_fn, if available.
          (scheme 2) using whole-vector shifts, if available.
          (scheme 3) using a scalar loop. In this case steps 1+2 above are
                     combined.

        The overall epilog code looks like this:

          s_out0 = phi <s_loop>                 # original EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>              # step 1
          s_out3 = extract_field <v_out2, 0>    # step 2
          s_out4 = adjust_result <s_out3>       # step 3

          (step 3 is optional, and steps 1 and 2 may be combined).
          Lastly, the uses of s_out0 are replaced by s_out4.  */


  /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
         v_out1 = phi <VECT_DEF>
         Store them in NEW_PHIS.  */
  if (double_reduc)
    loop = outer_loop;
  /* We need to reduce values in all exits.  */
  exit_bb = loop_exit->dest;
  exit_gsi = gsi_after_labels (exit_bb);
  reduc_inputs.create (slp_node ? vec_num : ncopies);
  for (unsigned i = 0; i < vec_num; i++)
    {
      gimple_seq stmts = NULL;
      if (slp_node)
        def = vect_get_slp_vect_def (slp_node, i);
      else
        def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
      for (j = 0; j < ncopies; j++)
        {
          tree new_def = copy_ssa_name (def);
          phi = create_phi_node (new_def, exit_bb);
          if (j)
            def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
          if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
            SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
          else
            for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
              SET_PHI_ARG_DEF (phi, k, def);
          new_def = gimple_convert (&stmts, vectype, new_def);
          reduc_inputs.quick_push (new_def);
        }
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
    }

  /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
         (i.e. when reduc_fn is not available) and in the final adjustment
         code (if needed).  Also get the original scalar reduction variable as
         defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
         represents a reduction pattern), the tree-code and scalar-def are
         taken from the original stmt that the pattern-stmt (STMT) replaces.
         Otherwise (it is a regular reduction) - the tree-code and scalar-def
         are taken from STMT.  */
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
  if (orig_stmt_info != stmt_info)
    {
      /* Reduction pattern  */
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
    }

  scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
  scalar_type = TREE_TYPE (scalar_dest);
  scalar_results.truncate (0);
  scalar_results.reserve_exact (group_size);
  new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
  bitsize = TYPE_SIZE (scalar_type);

  /* True if we should implement SLP_REDUC using native reduction operations
     instead of scalar operations.  */
  direct_slp_reduc = (reduc_fn != IFN_LAST
                      && slp_reduc
                      && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());

  /* In case of reduction chain, e.g.,
     # a1 = phi <a3, a0>
     a2 = operation (a1)
     a3 = operation (a2),

     we may end up with more than one vector result.  Here we reduce them
     to one vector.

     The same is true for a SLP reduction, e.g.,
     # a1 = phi <a2, a0>
     # b1 = phi <b2, b0>
     a2 = operation (a1)
     b2 = operation (a2),

     where we can end up with more than one vector as well.  We can
     easily accumulate vectors when the number of vector elements is
     a multiple of the SLP group size.

     The same is true if we couldn't use a single defuse cycle.  */
  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      || direct_slp_reduc
      || (slp_reduc
          && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
      || ncopies > 1)
    {
      gimple_seq stmts = NULL;
      tree single_input = reduc_inputs[0];
      for (k = 1; k < reduc_inputs.length (); k++)
        single_input = gimple_build (&stmts, code, vectype,
                                     single_input, reduc_inputs[k]);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      reduc_inputs.truncate (0);
      reduc_inputs.safe_push (single_input);
    }

  tree orig_reduc_input = reduc_inputs[0];

  /* If this loop is an epilogue loop that can be skipped after the
     main loop, we can only share a reduction operation between the
     main loop and the epilogue if we put it at the target of the
     skip edge.

     We can still reuse accumulators if this check fails.  Doing so has
     the minor(?) benefit of making the epilogue loop's scalar result
     independent of the main loop's scalar result.  */
  bool unify_with_main_loop_p = false;
  if (reduc_info->reused_accumulator
      && loop_vinfo->skip_this_loop_edge
      && single_succ_p (exit_bb)
      && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
    {
      unify_with_main_loop_p = true;

      basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
      reduc_inputs[0] = make_ssa_name (vectype);
      gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
      add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
                   UNKNOWN_LOCATION);
      add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
                   loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
      exit_gsi = gsi_after_labels (reduc_block);
    }

  /* Shouldn't be used beyond this point.  */
  exit_bb = nullptr;
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
      && reduc_fn != IFN_LAST)
    {
      /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
         various data values where the condition matched and another vector
         (INDUCTION_INDEX) containing all the indexes of those matches.  We
         need to extract the last matching index (which will be the index with
         highest value) and use this to index into the data vector.
         For the case where there were no matches, the data vector will contain
         all default values and the index vector will be all zeros.  */

      /* Get various versions of the type of the vector of indexes.  */
      tree index_vec_type = TREE_TYPE (induction_index);
      gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
      tree index_scalar_type = TREE_TYPE (index_vec_type);
      tree index_vec_cmp_type = truth_type_for (index_vec_type);

      /* Get an unsigned integer version of the type of the data vector.  */
      int scalar_precision
        = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
      tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
      tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
                                                      vectype);

      /* First we need to create a vector (ZERO_VEC) of zeros and another
         vector (MAX_INDEX_VEC) filled with the last matching index, which we
         can create using a MAX reduction and then expanding.
         In the case where the loop never made any matches, the max index will
         be zero.  */

      /* Vector of {0, 0, 0,...}.  */
      tree zero_vec = build_zero_cst (vectype);

      /* Find maximum value from the vector of found indexes.  */
      tree max_index = make_ssa_name (index_scalar_type);
      gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
                                                          1, induction_index);
      gimple_call_set_lhs (max_index_stmt, max_index);
      gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);

      /* Vector of {max_index, max_index, max_index,...}.  */
      tree max_index_vec = make_ssa_name (index_vec_type);
      tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
                                                      max_index);
      gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
                                                        max_index_vec_rhs);
      gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);

      /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
         with the vector (INDUCTION_INDEX) of found indexes, choosing values
         from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
         otherwise.  Only one value should match, resulting in a vector
         (VEC_COND) with one data value and the rest zeros.
         In the case where the loop never made any matches, every index will
         match, resulting in a vector with all data values (which will all be
         the default value).  */

      /* Compare the max index vector to the vector of found indexes to find
         the position of the max value.  */
      tree vec_compare = make_ssa_name (index_vec_cmp_type);
      gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
                                                      induction_index,
                                                      max_index_vec);
      gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);

      /* Use the compare to choose either values from the data vector or
         zero.  */
      tree vec_cond = make_ssa_name (vectype);
      gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
                                                   vec_compare,
                                                   reduc_inputs[0],
                                                   zero_vec);
      gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);

      /* Finally we need to extract the data value from the vector (VEC_COND)
         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
         reduction, but because this doesn't exist, we can use a MAX reduction
         instead.  The data value might be signed or a float so we need to cast
         it first.
         In the case where the loop never made any matches, the data values are
         all identical, and so will reduce down correctly.  */

      /* Make the matched data values unsigned.  */
      tree vec_cond_cast = make_ssa_name (vectype_unsigned);
      tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
                                       vec_cond);
      gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
                                                        vec_cond_cast_rhs);
      gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);

      /* Reduce down to a scalar value.  */
      tree data_reduc = make_ssa_name (scalar_type_unsigned);
      gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
                                                           1, vec_cond_cast);
      gimple_call_set_lhs (data_reduc_stmt, data_reduc);
      gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);

      /* Convert the reduced value back to the result type and set as the
         result.  */
      gimple_seq stmts = NULL;
      new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
                               data_reduc);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (new_temp);
    }
  else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
           && reduc_fn == IFN_LAST)
    {
      /* Condition reduction without supported IFN_REDUC_MAX.  Generate
         idx = 0;
         idx_val = induction_index[0];
         val = data_reduc[0];
         for (idx = 0, val = init, i = 0; i < nelts; ++i)
           if (induction_index[i] > idx_val)
             val = data_reduc[i], idx_val = induction_index[i];
         return val;  */

      tree data_eltype = TREE_TYPE (vectype);
      tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
      unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
      poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
      /* Enforced by vectorizable_reduction, which ensures we have target
         support before allowing a conditional reduction on variable-length
         vectors.  */
      unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
      tree idx_val = NULL_TREE, val = NULL_TREE;
      for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
        {
          tree old_idx_val = idx_val;
          tree old_val = val;
          idx_val = make_ssa_name (idx_eltype);
          epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
                                             build3 (BIT_FIELD_REF, idx_eltype,
                                                     induction_index,
                                                     bitsize_int (el_size),
                                                     bitsize_int (off)));
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          val = make_ssa_name (data_eltype);
          epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
                                             build3 (BIT_FIELD_REF,
                                                     data_eltype,
                                                     reduc_inputs[0],
                                                     bitsize_int (el_size),
                                                     bitsize_int (off)));
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          if (off != 0)
            {
              tree new_idx_val = idx_val;
              if (off != v_size - el_size)
                {
                  new_idx_val = make_ssa_name (idx_eltype);
                  epilog_stmt = gimple_build_assign (new_idx_val,
                                                     MAX_EXPR, idx_val,
                                                     old_idx_val);
                  gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
                }
              tree cond = make_ssa_name (boolean_type_node);
              epilog_stmt = gimple_build_assign (cond, GT_EXPR,
                                                 idx_val, old_idx_val);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
              tree new_val = make_ssa_name (data_eltype);
              epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
                                                 cond, val, old_val);
              gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
              idx_val = new_idx_val;
              val = new_val;
            }
        }
      /* Convert the reduced value back to the result type and set as the
         result.  */
      gimple_seq stmts = NULL;
      val = gimple_convert (&stmts, scalar_type, val);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results.safe_push (val);
    }
  /* 2.3 Create the reduction code, using one of the three schemes described
         above.  In SLP we simply need to extract all the elements from the
         vector (without reducing them), so we use scalar shifts.  */
  else if (reduc_fn != IFN_LAST && (!slp_reduc || group_size == 1))
    {
      tree tmp;
      tree vec_elem_type;

      /* Case 1:  Create:
         v_out2 = reduc_expr <v_out1>  */

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Reduce using direct vector reduction.\n");

      gimple_seq stmts = NULL;
      vec_elem_type = TREE_TYPE (vectype);
      new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
                               vec_elem_type, reduc_inputs[0]);
      new_temp = gimple_convert (&stmts, scalar_type, new_temp);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
          && induc_val)
        {
          /* Earlier we set the initial value to be a vector if induc_val
             values.  Check the result and if it is induc_val then replace
             with the original initial value, unless induc_val is
             the same as initial_def already.  */
          tree zcompare = make_ssa_name (boolean_type_node);
          epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
                                             new_temp, induc_val);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          tree initial_def = reduc_info->reduc_initial_values[0];
          tmp = make_ssa_name (new_scalar_dest);
          epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
                                             initial_def, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          new_temp = tmp;
        }

      scalar_results.safe_push (new_temp);
    }
  else if (direct_slp_reduc)
    {
      /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
         with the elements for other SLP statements replaced with the
         neutral value.  We can then do a normal reduction on each vector.  */

      /* Enforced by vectorizable_reduction.  */
      gcc_assert (reduc_inputs.length () == 1);
      gcc_assert (pow2p_hwi (group_size));

      gimple_seq seq = NULL;

      /* Build a vector {0, 1, 2, ...}, with the same number of elements
         and the same element size as VECTYPE.  */
      tree index = build_index_vector (vectype, 0, 1);
      tree index_type = TREE_TYPE (index);
      tree index_elt_type = TREE_TYPE (index_type);
      tree mask_type = truth_type_for (index_type);

      /* Create a vector that, for each element, identifies which of
         the REDUC_GROUP_SIZE results should use it.  */
      tree index_mask = build_int_cst (index_elt_type, group_size - 1);
      index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
                            build_vector_from_val (index_type, index_mask));

      /* Get a neutral vector value.  This is simply a splat of the neutral
         scalar value if we have one, otherwise the initial scalar value
         is itself a neutral value.  */
      tree vector_identity = NULL_TREE;
      tree neutral_op = NULL_TREE;
      if (slp_node)
        {
          tree initial_value = NULL_TREE;
          if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
            initial_value = reduc_info->reduc_initial_values[0];
          neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
                                                 initial_value, false);
        }
      if (neutral_op)
        vector_identity = gimple_build_vector_from_val (&seq, vectype,
                                                        neutral_op);
      for (unsigned int i = 0; i < group_size; ++i)
        {
          /* If there's no univeral neutral value, we can use the
             initial scalar value from the original PHI.  This is used
             for MIN and MAX reduction, for example.  */
          if (!neutral_op)
            {
              tree scalar_value = reduc_info->reduc_initial_values[i];
              scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
                                             scalar_value);
              vector_identity = gimple_build_vector_from_val (&seq, vectype,
                                                              scalar_value);
            }

          /* Calculate the equivalent of:

             sel[j] = (index[j] == i);

             which selects the elements of REDUC_INPUTS[0] that should
             be included in the result.  */
          tree compare_val = build_int_cst (index_elt_type, i);
          compare_val = build_vector_from_val (index_type, compare_val);
          tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
                                   index, compare_val);

          /* Calculate the equivalent of:

             vec = seq ? reduc_inputs[0] : vector_identity;

             VEC is now suitable for a full vector reduction.  */
          tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
                                   sel, reduc_inputs[0], vector_identity);

          /* Do the reduction and convert it to the appropriate type.  */
          tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
                                      TREE_TYPE (vectype), vec);
          scalar = gimple_convert (&seq, scalar_type, scalar);
          scalar_results.safe_push (scalar);
        }
      gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
    }
  else
    {
      bool reduce_with_shift;
      tree vec_temp;

      gcc_assert (slp_reduc || reduc_inputs.length () == 1);

      /* See if the target wants to do the final (shift) reduction
         in a vector mode of smaller size and first reduce upper/lower
         halves against each other.  */
      enum machine_mode mode1 = mode;
      tree stype = TREE_TYPE (vectype);
      unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
      unsigned nunits1 = nunits;
      if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
          && reduc_inputs.length () == 1)
        {
          nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
          /* For SLP reductions we have to make sure lanes match up, but
             since we're doing individual element final reduction reducing
             vector width here is even more important.
             ??? We can also separate lanes with permutes, for the common
             case of power-of-two group-size odd/even extracts would work.  */
          if (slp_reduc && nunits != nunits1)
            {
              nunits1 = least_common_multiple (nunits1, group_size);
              gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
            }
        }
      if (!slp_reduc
          && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
        nunits1 = GET_MODE_NUNITS (mode1).to_constant ();

      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
                                                           stype, nunits1);
      reduce_with_shift = have_whole_vector_shift (mode1);
      if (!VECTOR_MODE_P (mode1)
          || !directly_supported_p (code, vectype1))
        reduce_with_shift = false;

      /* First reduce the vector to the desired vector size we should
         do shift reduction on by combining upper and lower halves.  */
      gimple_seq stmts = NULL;
      new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
                                             code, &stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      reduc_inputs[0] = new_temp;

      if (reduce_with_shift && (!slp_reduc || group_size == 1))
        {
          int element_bitsize = tree_to_uhwi (bitsize);
          /* Enforced by vectorizable_reduction, which disallows SLP reductions
             for variable-length vectors and also requires direct target support
             for loop reductions.  */
          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
          int nelements = vec_size_in_bits / element_bitsize;
          vec_perm_builder sel;
          vec_perm_indices indices;
          int elt_offset;
          tree rhs;

          tree zero_vec = build_zero_cst (vectype1);
          /* Case 2: Create:
             for (offset = nelements/2; offset >= 1; offset/=2)
               {
                 Create:  va' = vec_shift <va, offset>
                 Create:  va = vop <va, va'>
               }  */

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using vector shifts\n");

          gimple_seq stmts = NULL;
          new_temp = gimple_convert (&stmts, vectype1, new_temp);
          for (elt_offset = nelements / 2;
               elt_offset >= 1;
               elt_offset /= 2)
            {
              calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
              indices.new_vector (sel, 2, nelements);
              tree mask = vect_gen_perm_mask_any (vectype1, indices);
              new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
                                       new_temp, zero_vec, mask);
              new_temp = gimple_build (&stmts, code,
                                       vectype1, new_name, new_temp);
            }
          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);

          /* 2.4  Extract the final scalar result.  Create:
             s_out3 = extract_field <v_out2, bitpos>  */

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "extract scalar result\n");

          rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
                        bitsize, bitsize_zero_node);
          epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
          new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
          gimple_assign_set_lhs (epilog_stmt, new_temp);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          scalar_results.safe_push (new_temp);
        }
      else
        {
          /* Case 3: Create:
             s = extract_field <v_out2, 0>
             for (offset = element_size;
                  offset < vector_size;
                  offset += element_size;)
               {
                 Create:  s' = extract_field <v_out2, offset>
                 Create:  s = op <s, s'>  // For non SLP cases
               }  */

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "Reduce using scalar code.\n");

          int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
          int element_bitsize = tree_to_uhwi (bitsize);
          tree compute_type = TREE_TYPE (vectype);
          gimple_seq stmts = NULL;
          int bit_offset;
          FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
            {
              new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
                                       vec_temp, bitsize, bitsize_zero_node);

              /* In SLP we don't need to apply reduction operation, so we just
                 collect s' values in SCALAR_RESULTS.  */
              if (slp_reduc)
                scalar_results.safe_push (new_temp);

              for (bit_offset = element_bitsize;
                   bit_offset < vec_size_in_bits;
                   bit_offset += element_bitsize)
                {
                  tree bitpos = bitsize_int (bit_offset);
                  new_name = gimple_build (&stmts, BIT_FIELD_REF,
                                           compute_type, vec_temp,
                                           bitsize, bitpos);
                  if (slp_reduc)
                    {
                      /* In SLP we don't need to apply reduction operation, so
                         we just collect s' values in SCALAR_RESULTS.  */
                      new_temp = new_name;
                      scalar_results.safe_push (new_name);
                    }
                  else
                    new_temp = gimple_build (&stmts, code, compute_type,
                                             new_name, new_temp);
                }
            }

          /* The only case where we need to reduce scalar results in SLP, is
             unrolling.  If the size of SCALAR_RESULTS is greater than
             REDUC_GROUP_SIZE, we reduce them combining elements modulo
             REDUC_GROUP_SIZE.  */
          if (slp_reduc)
            {
              tree res, first_res, new_res;

              /* Reduce multiple scalar results in case of SLP unrolling.  */
              for (j = group_size; scalar_results.iterate (j, &res);
                   j++)
                {
                  first_res = scalar_results[j % group_size];
                  new_res = gimple_build (&stmts, code, compute_type,
                                          first_res, res);
                  scalar_results[j % group_size] = new_res;
                }
              scalar_results.truncate (group_size);
              for (k = 0; k < group_size; k++)
                scalar_results[k] = gimple_convert (&stmts, scalar_type,
                                                    scalar_results[k]);
            }
          else
            {
              /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
              new_temp = gimple_convert (&stmts, scalar_type, new_temp);
              scalar_results.safe_push (new_temp);
            }

          gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
        }

      if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
          && induc_val)
        {
          /* Earlier we set the initial value to be a vector if induc_val
             values.  Check the result and if it is induc_val then replace
             with the original initial value, unless induc_val is
             the same as initial_def already.  */
          tree zcompare = make_ssa_name (boolean_type_node);
          epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
                                             scalar_results[0], induc_val);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          tree initial_def = reduc_info->reduc_initial_values[0];
          tree tmp = make_ssa_name (new_scalar_dest);
          epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
                                             initial_def, scalar_results[0]);
          gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
          scalar_results[0] = tmp;
        }
    }
  /* 2.5 Adjust the final result by the initial value of the reduction
         variable. (When such adjustment is not needed, then
         'adjustment_def' is zero).  For example, if code is PLUS we create:
         new_temp = loop_exit_def + adjustment_def  */

  if (adjustment_def)
    {
      gcc_assert (!slp_reduc || group_size == 1);
      gimple_seq stmts = NULL;
      if (double_reduc)
        {
          gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
          adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
          new_temp = gimple_build (&stmts, code, vectype,
                                   reduc_inputs[0], adjustment_def);
        }
      else
        {
          new_temp = scalar_results[0];
          gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
          adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
                                           adjustment_def);
          new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
          new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
                                   new_temp, adjustment_def);
          new_temp = gimple_convert (&stmts, scalar_type, new_temp);
        }

      epilog_stmt = gimple_seq_last_stmt (stmts);
      gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
      scalar_results[0] = new_temp;
    }

  /* Record this operation if it could be reused by the epilogue loop.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
      && reduc_inputs.length () == 1)
    loop_vinfo->reusable_accumulators.put (scalar_results[0],
                                           { orig_reduc_input, reduc_info });
  /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
          phis with new adjusted scalar results, i.e., replace use <s_out0>
          with use <s_out4>.

     Transform:
        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out0>
          use <s_out0>

     into:

        loop_exit:
          s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
          v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
          v_out2 = reduce <v_out1>
          s_out3 = extract_field <v_out2, 0>
          s_out4 = adjust_result <s_out3>
          use <s_out4>
          use <s_out4>  */

  gcc_assert (live_out_stmts.size () == scalar_results.length ());
  auto_vec<gimple *> phis;
  for (k = 0; k < live_out_stmts.size (); k++)
    {
      stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
      scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);

      /* Find the loop-closed-use at the loop exit of the original scalar
         result.  (The reduction result is expected to have two immediate uses,
         one at the latch block, and one at the loop exit).  For double
         reductions we are looking for exit phis of the outer loop.  */
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
        {
          if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
            {
              if (!is_gimple_debug (USE_STMT (use_p))
                  && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
                phis.safe_push (USE_STMT (use_p));
            }
          else
            {
              if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
                {
                  tree phi_res = PHI_RESULT (USE_STMT (use_p));

                  FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
                    {
                      if (!flow_bb_inside_loop_p (loop,
                                                  gimple_bb (USE_STMT (phi_use_p)))
                          && !is_gimple_debug (USE_STMT (phi_use_p)))
                        phis.safe_push (USE_STMT (phi_use_p));
                    }
                }
            }
        }

      FOR_EACH_VEC_ELT (phis, i, exit_phi)
        {
          /* Replace the uses:  */
          orig_name = PHI_RESULT (exit_phi);

          /* Look for a single use at the target of the skip edge.  */
          if (unify_with_main_loop_p)
            {
              use_operand_p use_p;
              gimple *user;
              if (!single_imm_use (orig_name, &use_p, &user))
                gcc_unreachable ();
              orig_name = gimple_get_lhs (user);
            }

          scalar_result = scalar_results[k];
          FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
            {
              FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
                SET_USE (use_p, scalar_result);
              update_stmt (use_stmt);
            }
        }

      phis.truncate (0);
    }
}
/* Return a vector of type VECTYPE that is equal to the vector select
   operation "MASK ? VEC : IDENTITY".  Insert the select statements
   before GSI.  */

static tree
merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
                     tree vec, tree identity)
{
  tree cond = make_temp_ssa_name (vectype, NULL, "cond");
  gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
                                          mask, vec, identity);
  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
  return cond;
}
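/* For example (usage sketch): the in-order reduction code below uses this
   helper to turn a partially-masked vector into one that can be reduced
   unconditionally, by substituting the identity value (e.g. 0 for PLUS)
   into the inactive lanes.  */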
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   If MASK is nonzero mask the input and then operate on it unconditionally.
   Return the SSA name for the result.  */

static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
                       tree_code code, tree lhs, tree vector_rhs,
                       tree mask)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
     to perform an unconditional element-wise reduction of it.  */
  if (mask)
    {
      tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
                                                   "masked_vector_rhs");
      tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
                                                  false);
      tree vector_identity = build_vector_from_val (vectype, neutral_op);
      gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
                                             mask, vector_rhs, vector_identity);
      gsi_insert_before (gsi, select, GSI_SAME_STMT);
      vector_rhs = masked_vector_rhs;
    }

  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
                         bitsize, bitpos);

      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }

  return lhs;
}
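/* As an illustration of the expansion above (assuming a 4-element vector
   v and operation OP): the emitted scalar sequence computes
   (((lhs OP v[0]) OP v[1]) OP v[2]) OP v[3], preserving the left-to-right
   association that an in-order (fold-left) reduction requires.  */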
/* Get a masked internal function equivalent to REDUC_FN.  VECTYPE_IN is the
   type of the vector input.  */

static internal_fn
get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
{
  internal_fn mask_reduc_fn;
  internal_fn mask_len_reduc_fn;

  switch (reduc_fn)
    {
    case IFN_FOLD_LEFT_PLUS:
      mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
      mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
      break;

    default:
      return IFN_LAST;
    }

  if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
                                      OPTIMIZE_FOR_SPEED))
    return mask_reduc_fn;
  if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
                                      OPTIMIZE_FOR_SPEED))
    return mask_len_reduc_fn;
  return IFN_LAST;
}
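/* For example: for IFN_FOLD_LEFT_PLUS this returns IFN_MASK_FOLD_LEFT_PLUS
   when the target supports it directly for VECTYPE_IN, otherwise
   IFN_MASK_LEN_FOLD_LEFT_PLUS if that is supported, and IFN_LAST if
   neither is available.  */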
7067 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7068 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7069 statement. CODE is the operation performed by STMT_INFO and OPS are
7070 its scalar operands. REDUC_INDEX is the index of the operand in
7071 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7072 implements in-order reduction, or IFN_LAST if we should open-code it.
7073 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7074 that should be used to control the operation in a fully-masked loop. */
static bool
vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info,
			       gimple_stmt_iterator *gsi,
			       gimple **vec_stmt, slp_tree slp_node,
			       gimple *reduc_def_stmt,
			       code_helper code, internal_fn reduc_fn,
			       tree *ops, int num_ops, tree vectype_in,
			       int reduc_index, vec_loop_masks *masks,
			       vec_loop_lens *lens)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);

  int i;
  int ncopies;
  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
  gcc_assert (ncopies == 1);

  bool is_cond_op = false;
  if (!code.is_tree_code ())
    {
      code = conditional_internal_fn_code (internal_fn (code));
      gcc_assert (code != ERROR_MARK);
      is_cond_op = true;
    }

  gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);

  if (slp_node)
    gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
			  TYPE_VECTOR_SUBPARTS (vectype_in)));

  /* The operands either come from a binary operation or an IFN_COND operation.
     The former is a gimple assign with binary rhs and the latter is a
     gimple call with four arguments.  */
  gcc_assert (num_ops == 2 || num_ops == 4);

  int group_size = 1;
  stmt_vec_info scalar_dest_def_info;
  auto_vec<tree> vec_oprnds0, vec_opmask;
  if (slp_node)
    {
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[(is_cond_op ? 2 : 0)
						      + (1 - reduc_index)],
			 &vec_oprnds0);
      group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
      scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
      /* For an IFN_COND_OP we also need the vector mask operand.  */
      if (is_cond_op)
	vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[0], &vec_opmask);
    }
  else
    {
      tree op0, opmask;
      if (!is_cond_op)
	op0 = ops[1 - reduc_index];
      else
	{
	  op0 = ops[2 + (1 - reduc_index)];
	  opmask = ops[0];
	}
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     op0, &vec_oprnds0);
      scalar_dest_def_info = stmt_info;

      /* For an IFN_COND_OP we also need the vector mask operand.  */
      if (is_cond_op)
	vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				       opmask, &vec_opmask);
    }

  gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
  tree scalar_dest = gimple_get_lhs (sdef);
  tree scalar_type = TREE_TYPE (scalar_dest);
  tree reduc_var = gimple_phi_result (reduc_def_stmt);

  int vec_num = vec_oprnds0.length ();
  gcc_assert (vec_num == 1 || slp_node);
  tree vec_elem_type = TREE_TYPE (vectype_out);
  gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));

  tree vector_identity = NULL_TREE;
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      vector_identity = build_zero_cst (vectype_out);
      if (!HONOR_SIGNED_ZEROS (vectype_out))
	;
      else
	{
	  gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
	  vector_identity = const_unop (NEGATE_EXPR, vectype_out,
					vector_identity);
	}
    }

  tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
  tree def0;
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree mask = NULL_TREE;
      tree len = NULL_TREE;
      tree bias = NULL_TREE;
      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
	{
	  tree loop_mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					       vec_num, vectype_in, i);
	  if (is_cond_op)
	    mask = prepare_vec_mask (loop_vinfo, TREE_TYPE (loop_mask),
				     loop_mask, vec_opmask[i], gsi);
	  else
	    mask = loop_mask;
	}
      else if (is_cond_op)
	mask = vec_opmask[i];
      if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
	{
	  len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
				   i, 1);
	  signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
	  bias = build_int_cst (intQI_type_node, biasval);
	  if (!is_cond_op)
	    mask = build_minus_one_cst (truth_type_for (vectype_in));
	}

      /* Handle MINUS by adding the negative.  */
      if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
	{
	  tree negated = make_ssa_name (vectype_out);
	  new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
	  gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
	  def0 = negated;
	}

      if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	  && mask && mask_reduc_fn == IFN_LAST)
	def0 = merge_with_identity (gsi, mask, vectype_out, def0,
				    vector_identity);

      /* On the first iteration the input is simply the scalar phi
	 result, and for subsequent iterations it is the output of
	 the preceding operation.  */
      if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
	{
	  if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
						   def0, mask, len, bias);
	  else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
	    new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
						   def0, mask);
	  else
	    new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
						   def0);

	  /* For chained SLP reductions the output of the previous reduction
	     operation serves as the input of the next.  For the final
	     statement the output cannot be a temporary - we reuse the original
	     scalar destination of the last statement.  */
	  if (i != vec_num - 1)
	    {
	      gimple_set_lhs (new_stmt, scalar_dest_var);
	      reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
	      gimple_set_lhs (new_stmt, reduc_var);
	    }
	}
      else
	{
	  reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
					     tree_code (code), reduc_var, def0,
					     mask);
	  new_stmt = SSA_NAME_DEF_STMT (reduc_var);
	  /* Remove the statement, so that we can use the same code paths
	     as for statements that we've just created.  */
	  gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
	  gsi_remove (&tmp_gsi, true);
	}

      if (i == vec_num - 1)
	{
	  gimple_set_lhs (new_stmt, scalar_dest);
	  vect_finish_replace_stmt (loop_vinfo,
				    scalar_dest_def_info,
				    new_stmt);
	}
      else
	vect_finish_stmt_generation (loop_vinfo,
				     scalar_dest_def_info,
				     new_stmt, gsi);

      if (slp_node)
	slp_node->push_vec_def (new_stmt);
      else
	{
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	  *vec_stmt = new_stmt;
	}
    }

  return true;
}
/* Function is_nonwrapping_integer_induction.

   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */
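/* E.g. an induction with base 0 and step 4 in a loop that executes at most
   1000 times reaches at most 4000, which needs far fewer bits than a 32-bit
   int, so such an induction is known not to wrap.  */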
static bool
is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
{
  gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
  tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
  tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
  widest_int ni, max_loop_value, lhs_max;
  wi::overflow_type overflow = wi::OVF_NONE;

  /* Make sure the loop is integer based.  */
  if (TREE_CODE (base) != INTEGER_CST
      || TREE_CODE (step) != INTEGER_CST)
    return false;

  /* Check that the max size of the loop will not wrap.  */

  if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
    return true;

  if (! max_stmt_executions (loop, &ni))
    return false;

  max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
			    &overflow);
  if (overflow)
    return false;

  max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
			    TYPE_SIGN (lhs_type), &overflow);
  if (overflow)
    return false;

  return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
	  <= TYPE_PRECISION (lhs_type));
}
/* Check if masking can be supported by inserting a conditional expression.
   CODE is the code for the operation.  COND_FN is the conditional internal
   function, if it exists.  VECTYPE_IN is the type of the vector input.  */
static bool
use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
			 tree vectype_in)
{
  if (cond_fn != IFN_LAST
      && direct_internal_fn_supported_p (cond_fn, vectype_in,
					 OPTIMIZE_FOR_SPEED))
    return false;

  if (code.is_tree_code ())
    switch (tree_code (code))
      {
      case DOT_PROD_EXPR:
      case SAD_EXPR:
	return true;

      default:
	break;
      }
  return false;
}
/* Insert a conditional expression to enable masked vectorization.  CODE is
   the code for the operation.  VOP is the array of operands.  MASK is the
   loop mask.  GSI is a statement iterator used to place the new conditional
   expression.  */

static void
build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
		      gimple_stmt_iterator *gsi)
{
  switch (tree_code (code))
    {
    case DOT_PROD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree zero = build_zero_cst (vectype);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], zero);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    case SAD_EXPR:
      {
	tree vectype = TREE_TYPE (vop[1]);
	tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
	gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
					       mask, vop[1], vop[0]);
	gsi_insert_before (gsi, select, GSI_SAME_STMT);
	vop[1] = masked_op1;
	break;
      }

    default:
      gcc_unreachable ();
    }
}
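/* For illustration: for a DOT_PROD_EXPR the selection built above has the
   form

     masked_op1 = VEC_COND_EXPR <mask, vop[1], { 0, ... }>;

   so inactive lanes add zero to the accumulator, while for a SAD_EXPR the
   inactive lanes of vop[1] are replaced by vop[0], making their absolute
   difference zero.  */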
/* Given an operation with CODE in a loop reduction path whose reduction PHI
   is specified by REDUC_INFO, the operation has TYPE of scalar result, and
   its input vectype is represented by VECTYPE_IN.  The vectype of the
   vectorized result may be different from VECTYPE_IN, either in base type or
   in number of lanes, as is the case for a lane-reducing operation.  This
   function checks whether it is possible, and how, to perform partial
   vectorization of the operation in the context of LOOP_VINFO.  */
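/* For instance, an in-order float add reduction for which the target only
   provides IFN_MASK_LEN_FOLD_LEFT_PLUS ends up recording a loop length,
   whereas a reduction that can be masked through a conditional internal
   function or a VEC_COND_EXPR records a loop mask instead.  */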
static void
vect_reduction_update_partial_vector_usage (loop_vec_info loop_vinfo,
					    stmt_vec_info reduc_info,
					    slp_tree slp_node,
					    code_helper code, tree type,
					    tree vectype_in)
{
  enum vect_reduction_type reduc_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
  internal_fn cond_fn = get_conditional_internal_fn (code, type);

  if (reduc_type != FOLD_LEFT_REDUCTION
      && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
      && (cond_fn == IFN_LAST
	  || !direct_internal_fn_supported_p (cond_fn, vectype_in,
					      OPTIMIZE_FOR_SPEED)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " no conditional operation is available.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else if (reduc_type == FOLD_LEFT_REDUCTION
	   && reduc_fn == IFN_LAST
	   && !expand_vec_cond_expr_p (vectype_in, truth_type_for (vectype_in),
				       SSA_NAME))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " no conditional operation is available.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else if (reduc_type == FOLD_LEFT_REDUCTION
	   && internal_fn_mask_index (reduc_fn) == -1
	   && FLOAT_TYPE_P (vectype_in)
	   && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't operate on partial vectors because"
			 " signed zeros cannot be preserved.\n");
      LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
    }
  else
    {
      internal_fn mask_reduc_fn
	= get_masked_reduction_fn (reduc_fn, vectype_in);
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
      unsigned nvectors = vect_get_num_copies (loop_vinfo, slp_node,
					       vectype_in);

      if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
	vect_record_loop_len (loop_vinfo, lens, nvectors, vectype_in, 1);
      else
	vect_record_loop_mask (loop_vinfo, masks, nvectors, vectype_in, NULL);
    }
}
/* Check if STMT_INFO is a lane-reducing operation that can be vectorized in
   the context of LOOP_VINFO; the vector cost will be recorded in COST_VEC,
   and the analysis is for slp if SLP_NODE is not NULL.

   For a lane-reducing operation, the loop reduction path that it lies in
   may contain a normal operation, or another lane-reducing operation with a
   different input type size, for example:

     int sum = 1;
     for (i)
       {
	 sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	 sum += w[i];               // widen-sum <vector(16) char>
	 sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	 sum += n[i];               // normal <vector(4) int>
       }

   The vectorization factor is essentially determined by the operation whose
   input vectype has the most lanes ("vector(16) char" in the example), while
   we need to choose the input vectype with the least lanes ("vector(4) int"
   in the example) to determine the effective number of vector reduction
   PHIs.  */
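/* In the example above, with 128-bit vectors the "vector(16) char" inputs
   fix the vectorization factor at 16, while the "vector(4) int" input means
   that 16 / 4 = 4 vector reduction PHIs are needed to hold the partial
   sums.  */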
bool
vectorizable_lane_reducing (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			    slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  gimple *stmt = stmt_info->stmt;

  if (!lane_reducing_stmt_p (stmt))
    return false;

  tree type = TREE_TYPE (gimple_assign_lhs (stmt));

  if (!INTEGRAL_TYPE_P (type))
    return false;

  /* Do not try to vectorize bit-precision reductions.  */
  if (!type_has_mode_precision_p (type))
    return false;

  stmt_vec_info reduc_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));

  /* TODO: Support lane-reducing operation that does not directly participate
     in loop reduction.  */
  if (!reduc_info || STMT_VINFO_REDUC_IDX (stmt_info) < 0)
    return false;

  /* Lane-reducing pattern inside any inner loop of LOOP_VINFO is not
     recognized.  */
  gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_reduction_def);
  gcc_assert (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION);

  for (int i = 0; i < (int) gimple_num_ops (stmt) - 1; i++)
    {
      stmt_vec_info def_stmt_info;
      slp_tree slp_op;
      tree op;
      tree vectype;
      enum vect_def_type dt;

      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_node, i, &op,
			       &slp_op, &dt, &vectype, &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      if (!vectype)
	{
	  vectype = get_vectype_for_scalar_type (loop_vinfo, TREE_TYPE (op),
						 slp_op);
	  if (!vectype)
	    return false;
	}

      if (slp_node && !vect_maybe_update_slp_op_vectype (slp_op, vectype))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}

      if (i == STMT_VINFO_REDUC_IDX (stmt_info))
	continue;

      /* There should be at most one cycle def in the stmt.  */
      if (VECTORIZABLE_CYCLE_DEF (dt))
	return false;
    }

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);

  gcc_assert (vectype_in);

  /* Compute number of effective vector statements for costing.  */
  unsigned int ncopies_for_cost = vect_get_num_copies (loop_vinfo, slp_node,
						       vectype_in);
  gcc_assert (ncopies_for_cost >= 1);

  if (vect_is_emulated_mixed_dot_prod (stmt_info))
    {
      /* We need extra two invariants: one that contains the minimum signed
	 value and one that contains half of its negative.  */
      int prologue_stmts = 2;
      unsigned cost = record_stmt_cost (cost_vec, prologue_stmts,
					scalar_to_vec, stmt_info, 0,
					vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "vectorizable_lane_reducing: "
		     "extra prologue_cost = %d .\n", cost);

      /* Three dot-products and a subtraction.  */
      ncopies_for_cost *= 4;
    }

  record_stmt_cost (cost_vec, (int) ncopies_for_cost, vector_stmt, stmt_info,
		    0, vect_body);

  if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    {
      enum tree_code code = gimple_assign_rhs_code (stmt);
      vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
						  slp_node, code, type,
						  vectype_in);
    }

  /* Transform via vect_transform_reduction.  */
  STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
  return true;
}
/* Function vectorizable_reduction.

   Check if STMT_INFO performs a reduction operation that can be vectorized.
   If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
   stmt to replace it, put it in VEC_STMT, and insert it at GSI.
   Return true if STMT_INFO is vectorizable in this way.

   This function also handles reduction idioms (patterns) that have been
   recognized in advance during vect_pattern_recog.  In this case, STMT_INFO
   may be of this form:
     X = pattern_expr (arg0, arg1, ..., X)
   and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
   sequence that had been detected and replaced by the pattern-stmt
   (STMT_INFO).

   This function also handles reduction of condition expressions, for example:
     for (int i = 0; i < N; i++)
       if (a[i] < value)
	 last = i;

   This is handled by vectorising the loop and creating an additional vector
   containing the loop indexes for which "a[i] < value" was true.  In the
   function epilogue this is reduced to a single max value and then used to
   index into the vector of results.

   In some cases of reduction patterns, the type of the reduction variable X
   is different than the type of the other arguments of STMT_INFO.
   In such cases, the vectype that is used when transforming STMT_INFO into
   a vector stmt is different than the vectype that is used to determine the
   vectorization factor, because it consists of a different number of elements
   than the actual number of elements that are being operated upon in parallel.

   For example, consider an accumulation of shorts into an int accumulator.
   On some targets it's possible to vectorize this pattern operating on 8
   shorts at a time (hence, the vectype for purposes of determining the
   vectorization factor should be V8HI); on the other hand, the vectype that
   is used to create the vector form is actually V4SI (the type of the result).

   Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
   indicates what is the actual level of parallelism (V8HI in the example), so
   that the right vectorization factor would be derived.  This vectype
   corresponds to the type of arguments to the reduction stmt, and should *NOT*
   be used to create the vectorized stmt.  The right vectype for the vectorized
   stmt is obtained from the type of the result X:
     get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))

   This means that, contrary to "regular" reductions (or "regular" stmts in
   general), the following equation:
     STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
   does *NOT* necessarily hold for reduction patterns.  */
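/* Concretely, for the short-into-int accumulation above with 128-bit
   vectors, STMT_VINFO_VECTYPE of the pattern statement is V8HI - eight
   shorts are consumed per vector iteration, which is what determines the
   vectorization factor - while the vectorized statement itself is created
   with the V4SI type obtained from the type of the int result X.  */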
bool
vectorizable_reduction (loop_vec_info loop_vinfo,
			stmt_vec_info stmt_info, slp_tree slp_node,
			slp_instance slp_node_instance,
			stmt_vector_for_cost *cost_vec)
{
  tree vectype_in = NULL_TREE;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
  stmt_vec_info cond_stmt_vinfo = NULL;
  int i;
  int ncopies;
  bool single_defuse_cycle = false;
  bool nested_cycle = false;
  bool double_reduc = false;
  tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
  tree cond_reduc_val = NULL_TREE;

  /* Make sure it was already recognized as a reduction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
    return false;

  /* The stmt we store reduction analysis meta on.  */
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  reduc_info->is_reduc_info = true;

  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
    {
      if (is_a <gphi *> (stmt_info->stmt))
	{
	  if (slp_node)
	    {
	      /* We eventually need to set a vector type on invariant
		 arguments.  */
	      unsigned j;
	      slp_tree child;
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
		if (!vect_maybe_update_slp_op_vectype
		       (child, SLP_TREE_VECTYPE (slp_node)))
		  {
		    if (dump_enabled_p ())
		      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				       "incompatible vector types for "
				       "invariants\n");
		    return false;
		  }
	    }
	  /* Analysis for double-reduction is done on the outer
	     loop PHI, nested cycles have no further restrictions.  */
	  STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
	}
      else
	STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }

  stmt_vec_info orig_stmt_of_analysis = stmt_info;
  stmt_vec_info phi_info = stmt_info;
  if (!is_a <gphi *> (stmt_info->stmt))
    {
      STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
      return true;
    }
  if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
    {
      if (gimple_bb (stmt_info->stmt) != loop->header)
	{
	  /* For SLP we arrive here for both the inner loop LC PHI and
	     the outer loop PHI.  The latter is what we want to analyze
	     the reduction with.  */
	  gcc_assert (slp_node);
	  return true;
	}
      use_operand_p use_p;
      gimple *use_stmt;
      bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
				 &use_p, &use_stmt);
      gcc_assert (res);
      phi_info = loop_vinfo->lookup_stmt (use_stmt);
    }
  if (slp_node)
    {
      slp_node_instance->reduc_phis = slp_node;
      /* ??? We're leaving slp_node to point to the PHIs, we only
	 need it to get at the number of vector stmts which wasn't
	 yet initialized for the instance root.  */
    }

  /* PHIs should not participate in patterns.  */
  gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  /* Verify following REDUC_IDX from the latch def leads us back to the PHI
     and compute the reduction chain length.  Discover the real
     reduction operation stmt on the way (stmt_info and slp_for_stmt_info).  */
  tree reduc_def
    = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
			     loop_latch_edge
			       (gimple_bb (reduc_def_phi)->loop_father));
  unsigned reduc_chain_length = 0;
  bool only_slp_reduc_chain = true;
  stmt_info = NULL;
  slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
  /* For double-reductions we start SLP analysis at the inner loop LC PHI
     which is the def of the outer loop live stmt.  */
  if (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def
      && slp_node)
    slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
  while (reduc_def != PHI_RESULT (reduc_def_phi))
    {
      stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
      stmt_vec_info vdef = vect_stmt_to_vectorize (def);
      int reduc_idx = STMT_VINFO_REDUC_IDX (vdef);

      if (reduc_idx == -1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction chain broken by patterns.\n");
	  return false;
	}
      if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
	only_slp_reduc_chain = false;
      /* For epilogue generation live members of the chain need
	 to point back to the PHI via their original stmt for
	 info_for_reduction to work.  For SLP we need to look at
	 all lanes here - even though we only will vectorize from
	 the SLP node with live lane zero the other live lanes also
	 need to be identified as part of a reduction to be able
	 to skip code generation for them.  */
      if (slp_for_stmt_info)
	{
	  for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
	    if (STMT_VINFO_LIVE_P (s))
	      STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
	}
      else if (STMT_VINFO_LIVE_P (vdef))
	STMT_VINFO_REDUC_DEF (def) = phi_info;
      gimple_match_op op;
      if (!gimple_extract_op (vdef->stmt, &op))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction chain includes unsupported"
			     " statement type.\n");
	  return false;
	}
      if (CONVERT_EXPR_CODE_P (op.code))
	{
	  if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "conversion in the reduction chain.\n");
	      return false;
	    }
	}
      else
	{
	  /* First non-conversion stmt.  */
	  if (!stmt_info)
	    stmt_info = vdef;

	  if (lane_reducing_op_p (op.code))
	    {
	      enum vect_def_type dt;
	      tree vectype_op;

	      /* The last operand of lane-reducing operation is for
		 reduction.  */
	      gcc_assert (reduc_idx > 0 && reduc_idx == (int) op.num_ops - 1);

	      if (!vect_is_simple_use (op.ops[0], loop_vinfo, &dt, &vectype_op))
		return false;

	      tree type_op = TREE_TYPE (op.ops[0]);
	      if (!vectype_op)
		{
		  vectype_op = get_vectype_for_scalar_type (loop_vinfo,
							    type_op);
		  if (!vectype_op)
		    return false;
		}

	      /* For lane-reducing operation vectorizable analysis needs the
		 reduction PHI information.  */
	      STMT_VINFO_REDUC_DEF (def) = phi_info;

	      /* Each lane-reducing operation has its own input vectype, while
		 reduction PHI will record the input vectype with the least
		 lanes.  */
	      STMT_VINFO_REDUC_VECTYPE_IN (vdef) = vectype_op;

	      /* To accommodate lane-reducing operations of mixed input
		 vectypes, choose input vectype with the least lanes for the
		 reduction PHI statement, which would result in the most
		 ncopies for vectorized reduction results.  */
	      if (!vectype_in
		  || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
		      < GET_MODE_SIZE (SCALAR_TYPE_MODE (type_op))))
		vectype_in = vectype_op;
	    }
	  else
	    vectype_in = STMT_VINFO_VECTYPE (phi_info);
	}

      reduc_def = op.ops[reduc_idx];
      reduc_chain_length++;
      if (!stmt_info && slp_node)
	slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
    }
  /* PHIs should not participate in patterns.  */
  gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
     element.  */
  if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
      stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
    }
  if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    gcc_assert (slp_node
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);

  /* 1. Is vectorizable reduction?  */
  /* Not supportable if the reduction variable is used in the loop, unless
     it's a reduction chain.  */
  if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    return false;

  /* Reductions that are not used even in an enclosing outer-loop,
     are expected to be "live" (used out of the loop).  */
  if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
      && !STMT_VINFO_LIVE_P (stmt_info))
    return false;

  /* 2. Has this been recognized as a reduction pattern?

     Check if STMT represents a pattern that has been recognized
     in earlier analysis stages.  For stmts that represent a pattern,
     the STMT_VINFO_RELATED_STMT field records the last stmt in
     the original sequence that constitutes the pattern.  */

  stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
  if (orig_stmt_info)
    {
      gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
      gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
    }

  /* 3. Check the operands of the operation.  The first operands are defined
	inside the loop body.  The last operand is the reduction variable,
	which is defined by the loop-header-phi.  */

  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
  STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();
  bool lane_reducing = lane_reducing_op_p (op.code);

  if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
      && !SCALAR_FLOAT_TYPE_P (op.type))
    return false;

  /* Do not try to vectorize bit-precision reductions.  */
  if (!type_has_mode_precision_p (op.type))
    return false;

  /* Lane-reducing ops also never can be used in a SLP reduction group
     since we'll mix lanes belonging to different reductions.  But it's
     OK to use them in a reduction chain or when the reduction group
     has just one element.  */
  if (lane_reducing
      && slp_node
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && SLP_TREE_LANES (slp_node) > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "lane-reducing reduction in reduction group.\n");
      return false;
    }

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
  tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
  /* We need to skip an extra operand for COND_EXPRs with embedded
     comparison.  */
  unsigned opno_adjust = 0;
  if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
    opno_adjust = 1;
  for (i = 0; i < (int) op.num_ops; i++)
    {
      /* The condition of COND_EXPR is checked in vectorizable_condition().  */
      if (i == 0 && op.code == COND_EXPR)
	continue;

      stmt_vec_info def_stmt_info;
      enum vect_def_type dt;
      if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
			       i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
			       &vectype_op[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "use not simple.\n");
	  return false;
	}

      /* Skip reduction operands, and for an IFN_COND_OP we might hit the
	 reduction operand twice (once as definition, once as else).  */
      if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
	continue;

      /* There should be only one cycle def in the stmt, the one
	 leading to reduc_def.  */
      if (VECTORIZABLE_CYCLE_DEF (dt))
	return false;

      if (!vectype_op[i])
	vectype_op[i]
	  = get_vectype_for_scalar_type (loop_vinfo,
					 TREE_TYPE (op.ops[i]), slp_op[i]);

      /* Record how the non-reduction-def value of COND_EXPR is defined.
	 ??? For a chain of multiple CONDs we'd have to match them up all.  */
      if (op.code == COND_EXPR && reduc_chain_length == 1)
	{
	  if (dt == vect_constant_def)
	    {
	      cond_reduc_dt = dt;
	      cond_reduc_val = op.ops[i];
	    }
	  else if (dt == vect_induction_def
		   && def_stmt_info
		   && is_nonwrapping_integer_induction (def_stmt_info, loop))
	    {
	      cond_reduc_dt = dt;
	      cond_stmt_vinfo = def_stmt_info;
	    }
	}
    }

  enum vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (phi_info);
  STMT_VINFO_REDUC_TYPE (reduc_info) = reduction_type;
  /* If we have a condition reduction, see if we can simplify it further.  */
  if (reduction_type == COND_REDUCTION)
    {
      if (slp_node && SLP_TREE_LANES (slp_node) != 1)
	return false;

      /* When the condition uses the reduction value in the condition, fail.  */
      if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "condition depends on previous iteration\n");
	  return false;
	}

      if (reduc_chain_length == 1
	  && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
					      OPTIMIZE_FOR_SPEED)
	      || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
						 vectype_in,
						 OPTIMIZE_FOR_SPEED)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "optimizing condition reduction with"
			     " FOLD_EXTRACT_LAST.\n");
	  STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
	}
      else if (cond_reduc_dt == vect_induction_def)
	{
	  tree base
	    = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
	  tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);

	  gcc_assert (TREE_CODE (base) == INTEGER_CST
		      && TREE_CODE (step) == INTEGER_CST);
	  cond_reduc_val = NULL_TREE;
	  enum tree_code cond_reduc_op_code = ERROR_MARK;
	  tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
	  if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
	    ;
	  /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
	     above base; punt if base is the minimum value of the type for
	     MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
	  else if (tree_int_cst_sgn (step) == -1)
	    {
	      cond_reduc_op_code = MIN_EXPR;
	      if (tree_int_cst_sgn (base) == -1)
		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
	      else if (tree_int_cst_lt (base,
					TYPE_MAX_VALUE (TREE_TYPE (base))))
		cond_reduc_val
		  = int_const_binop (PLUS_EXPR, base, integer_one_node);
	    }
	  else
	    {
	      cond_reduc_op_code = MAX_EXPR;
	      if (tree_int_cst_sgn (base) == 1)
		cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
	      else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
					base))
		cond_reduc_val
		  = int_const_binop (MINUS_EXPR, base, integer_one_node);
	    }
	  if (cond_reduc_val)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "condition expression based on "
				 "integer induction.\n");
	      STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
		= cond_reduc_val;
	      STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
	    }
	}
      else if (cond_reduc_dt == vect_constant_def)
	{
	  enum vect_def_type cond_initial_dt;
	  tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
	  vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
	  if (cond_initial_dt == vect_constant_def
	      && types_compatible_p (TREE_TYPE (cond_initial_val),
				     TREE_TYPE (cond_reduc_val)))
	    {
	      tree e = fold_binary (LE_EXPR, boolean_type_node,
				    cond_initial_val, cond_reduc_val);
	      if (e && (integer_onep (e) || integer_zerop (e)))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "condition expression based on "
				     "compile time constant.\n");
		  /* Record reduction code at analysis stage.  */
		  STMT_VINFO_REDUC_CODE (reduc_info)
		    = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
		  STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
		}
	    }
	}
    }

  if (STMT_VINFO_LIVE_P (phi_info))
    return false;

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype_in);

  gcc_assert (ncopies >= 1);

  poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);

  if (nested_cycle)
    {
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
		  == vect_double_reduction_def);
      double_reduc = true;
    }

  /* 4.2. Check support for the epilog operation.

	  If STMT represents a reduction pattern, then the type of the
	  reduction variable may be different than the type of the rest
	  of the arguments.  For example, consider the case of accumulation
	  of shorts into an int accumulator; The original code:
			S1: int_a = (int) short_a;
	  orig_stmt->	S2: int_acc = plus <int_a ,int_acc>;

	  was replaced with:
			STMT: int_acc = widen_sum <short_a, int_acc>

	  This means that:
	  1. The tree-code that is used to create the vector operation in the
	     epilog code (that reduces the partial results) is not the
	     tree-code of STMT, but is rather the tree-code of the original
	     stmt from the pattern that STMT is replacing.  I.e, in the example
	     above we want to use 'widen_sum' in the loop, but 'plus' in the
	     epilog.
	  2. The type (mode) we use to check available target support
	     for the vector operation to be created in the *epilog*, is
	     determined by the type of the reduction variable (in the example
	     above we'd check this: optab_handler (plus_optab, vect_int_mode])).
	     However the type (mode) we use to check available target support
	     for the vector operation to be created *inside the loop*, is
	     determined by the type of the other arguments to STMT (in the
	     example we'd check this: optab_handler (widen_sum_optab,
	     vect_short_mode)).

	  This is contrary to "regular" reductions, in which the types of all
	  the arguments are the same as the type of the reduction variable.
	  For "regular" reductions we can therefore use the same vector type
	  (and also the same tree-code) when generating the epilog code and
	  when generating the code inside the loop.  */

  code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);

  /* A conversion might have created a conditional operation like
     IFN_COND_ADD already.  Use the internal code for the following checks.  */
  if (orig_code.is_internal_fn ())
    {
      tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
      orig_code = new_code != ERROR_MARK ? new_code : orig_code;
    }

  STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;

  reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == TREE_CODE_REDUCTION)
    {
      /* Check whether it's ok to change the order of the computation.
	 Generally, when vectorizing a reduction we change the order of the
	 computation.  This may change the behavior of the program in some
	 cases, so we need to check that this is ok.  One exception is when
	 vectorizing an outer-loop: the inner-loop is executed sequentially,
	 and therefore vectorizing reductions in the inner-loop during
	 outer-loop vectorization is safe.  Likewise when we are vectorizing
	 a series of reductions using SLP and the VF is one the reductions
	 are performed in scalar order.  */
      if (slp_node
	  && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	  && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
	;
      else if (needs_fold_left_reduction_p (op.type, orig_code))
	{
	  /* When vectorizing a reduction chain w/o SLP the reduction PHI
	     is not directly used in stmt.  */
	  if (!only_slp_reduc_chain
	      && reduc_chain_length != 1)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "in-order reduction chain without SLP.\n");
	      return false;
	    }
	  STMT_VINFO_REDUC_TYPE (reduc_info)
	    = reduction_type = FOLD_LEFT_REDUCTION;
	}
      else if (!commutative_binary_op_p (orig_code, op.type)
	       || !associative_binary_op_p (orig_code, op.type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "reduction: not commutative/associative\n");
	  return false;
	}
    }

  if ((reduction_type == COND_REDUCTION
       || reduction_type == INTEGER_INDUC_COND_REDUCTION
       || reduction_type == CONST_COND_REDUCTION
       || reduction_type == EXTRACT_LAST_REDUCTION)
      && slp_node
      && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "multiple types in condition reduction.\n");
      return false;
    }

  if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
      && ncopies > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "multiple types in double reduction or condition "
			 "reduction or fold-left reduction.\n");
      return false;
    }

  internal_fn reduc_fn = IFN_LAST;
  if (reduction_type == TREE_CODE_REDUCTION
      || reduction_type == FOLD_LEFT_REDUCTION
      || reduction_type == INTEGER_INDUC_COND_REDUCTION
      || reduction_type == CONST_COND_REDUCTION)
    {
      if (reduction_type == FOLD_LEFT_REDUCTION
	  ? fold_left_reduction_fn (orig_code, &reduc_fn)
	  : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
	{
	  if (reduc_fn != IFN_LAST
	      && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
						  OPTIMIZE_FOR_SPEED))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "reduc op not supported by target.\n");

	      reduc_fn = IFN_LAST;
	    }
	}
      else
	{
	  if (!nested_cycle || double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "no reduc code for scalar code.\n");

	      return false;
	    }
	}
    }
  else if (reduction_type == COND_REDUCTION)
    {
      int scalar_precision
	= GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
      cr_index_scalar_type = make_unsigned_type (scalar_precision);
      cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
						     vectype_out);

      if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
					  OPTIMIZE_FOR_SPEED))
	reduc_fn = IFN_REDUC_MAX;
    }
  STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
  if (reduction_type != EXTRACT_LAST_REDUCTION
      && (!nested_cycle || double_reduc)
      && reduc_fn == IFN_LAST
      && !nunits_out.is_constant ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "missing target support for reduction on"
			 " variable-length vectors.\n");
      return false;
    }

  /* For SLP reductions, see if there is a neutral value we can use.  */
  tree neutral_op = NULL_TREE;
  if (slp_node)
    {
      tree initial_value = NULL_TREE;
      if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
	initial_value = vect_phi_initial_value (reduc_def_phi);
      neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
					     orig_code, initial_value);
    }

  if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* We can't support in-order reductions of code such as this:

	   for (int i = 0; i < n1; ++i)
	     for (int j = 0; j < n2; ++j)
	       l += a[j];

	 since GCC effectively transforms the loop when vectorizing:

	   for (int i = 0; i < n1 / VF; ++i)
	     for (int j = 0; j < n2; ++j)
	       for (int k = 0; k < VF; ++k)
		 l += a[j];

	 which is a reassociation of the original operation.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "in-order double reduction not supported.\n");

      return false;
    }

  if (reduction_type == FOLD_LEFT_REDUCTION
      && (slp_node && SLP_TREE_LANES (slp_node) > 1)
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
    {
      /* We cannot use in-order reductions in this case because there is
	 an implicit reassociation of the operations involved.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "in-order unchained SLP reductions not supported.\n");
      return false;
    }

  /* For double reductions, and for SLP reductions with a neutral value,
     we construct a variable-length initial vector by loading a vector
     full of the neutral value and then shift-and-inserting the start
     values into the low-numbered elements.  */
  if ((double_reduc || neutral_op)
      && !nunits_out.is_constant ()
      && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
					  vectype_out, OPTIMIZE_FOR_SPEED))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "reduction on variable-length vectors requires"
			 " target support for a vector-shift-and-insert"
			 " operation.\n");
      return false;
    }

  /* Check extra constraints for variable-length unchained SLP reductions.  */
  if (slp_node
      && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
      && !nunits_out.is_constant ())
    {
      /* We checked above that we could build the initial vector when
	 there's a neutral element value.  Check here for the case in
	 which each SLP statement has its own initial value and in which
	 that value needs to be repeated for every instance of the
	 statement within the initial vector.  */
      unsigned int group_size = SLP_TREE_LANES (slp_node);
      if (!neutral_op
	  && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
					      TREE_TYPE (vectype_out)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported form of SLP reduction for"
			     " variable-length vectors: cannot build"
			     " initial vector.\n");
	  return false;
	}
      /* The epilogue code relies on the number of elements being a multiple
	 of the group size.  The duplicate-and-interleave approach to setting
	 up the initial vector does too.  */
      if (!multiple_p (nunits_out, group_size))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "unsupported form of SLP reduction for"
			     " variable-length vectors: the vector size"
			     " is not a multiple of the number of results.\n");
	  return false;
	}
    }

  if (reduction_type == COND_REDUCTION)
    {
      widest_int ni;

      if (! max_loop_iterations (loop, &ni))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "loop count not known, cannot create cond "
			     "reduction.\n");
	  return false;
	}
      /* Convert backedges to iterations.  */
      ni += 1;

      /* The additional index will be the same type as the condition.  Check
	 that the loop can fit into this less one (because we'll use up the
	 zero slot for when there are no matches).  */
      tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
      if (wi::geu_p (ni, wi::to_widest (max_index)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "loop size is greater than data size.\n");
	  return false;
	}
    }

  /* In case the vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  /* If the reduction is used in an outer loop we need to generate
     VF intermediate results, like so (e.g. for ncopies=2):
	r0 = phi (init, r0)
	r1 = phi (init, r1)
	r0 = x0 + r0;
	r1 = x1 + r1;
    (i.e. we generate VF results in 2 registers).
    In this case we have a separate def-use cycle for each copy, and therefore
    for each copy we get the vector def for the reduction variable from the
    respective phi node created for this copy.

    Otherwise (the reduction is unused in the loop nest), we can combine
    together intermediate results, like so (e.g. for ncopies=2):
	r = phi (init, r)
	r = x0 + r;
	r = x1 + r;
   (i.e. we generate VF/2 results in a single register).
   In this case for each copy we get the vector def for the reduction variable
   from the vectorized reduction operation generated in the previous iteration.

   This only works when we see both the reduction PHI and its only consumer
   in vectorizable_reduction and there are no intermediate stmts
   participating.  When unrolling we want each unrolled iteration to have its
   own reduction accumulator since one of the main goals of unrolling a
   reduction is to reduce the aggregate loop-carried latency.  */
  if ((ncopies > 1
       || (slp_node
	   && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	   && SLP_TREE_LANES (slp_node) == 1
	   && vect_get_num_copies (loop_vinfo, vectype_in) > 1))
      && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
      && reduc_chain_length == 1
      && loop_vinfo->suggested_unroll_factor == 1)
    single_defuse_cycle = true;
  if (single_defuse_cycle && !lane_reducing)
    {
      gcc_assert (op.code != COND_EXPR);

      /* 4. check support for the operation in the loop

	 This isn't necessary for the lane reduction codes, since they
	 can only be produced by pattern matching, and it's up to the
	 pattern matcher to test for support.  The main reason for
	 specifically skipping this step is to avoid rechecking whether
	 mixed-sign dot-products can be implemented using signed
	 dot-products.  */
      machine_mode vec_mode = TYPE_MODE (vectype_in);
      if (!directly_supported_p (op.code, vectype_in, optab_vector))
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "op not supported by target.\n");
	  if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
	      || !vect_can_vectorize_without_simd_p (op.code))
	    single_defuse_cycle = false;
	  else
	    if (dump_enabled_p ())
	      dump_printf (MSG_NOTE, "proceeding using word mode.\n");
	}

      if (vect_emulated_vector_p (vectype_in)
	  && !vect_can_vectorize_without_simd_p (op.code))
	{
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "using word mode not possible.\n");
	  return false;
	}
    }

  if (dump_enabled_p () && single_defuse_cycle)
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using single def-use cycle for reduction by reducing "
		     "multiple vectors to one in the loop body\n");
  STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;

  /* For lane-reducing operation, the below processing related to single
     defuse-cycle will be done in its own vectorizable function.  One more
     thing to note is that the operation must not be involved in fold-left
     reduction.  */
  single_defuse_cycle &= !lane_reducing;

  if (slp_node
      && (single_defuse_cycle || reduction_type == FOLD_LEFT_REDUCTION))
    for (i = 0; i < (int) op.num_ops; i++)
      if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}

  vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
			     reduction_type, ncopies, cost_vec);
  /* Cost the reduction op inside the loop if transformed via
     vect_transform_reduction for non-lane-reducing operation.  Otherwise
     this is costed by the separate vectorizable_* routines.  */
  if (single_defuse_cycle)
    record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);

  if (dump_enabled_p ()
      && reduction_type == FOLD_LEFT_REDUCTION)
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using an in-order (fold-left) reduction.\n");
  STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;

  /* All but single defuse-cycle optimized and fold-left reductions go
     through their own vectorizable_* routines.  */
  if (!single_defuse_cycle && reduction_type != FOLD_LEFT_REDUCTION)
    {
      stmt_vec_info tem
	= vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
      if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
	{
	  gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
	  tem = REDUC_GROUP_FIRST_ELEMENT (tem);
	}
      STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
      STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
    }
  else if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
    vect_reduction_update_partial_vector_usage (loop_vinfo, reduc_info,
						slp_node, op.code, op.type,
						vectype_in);
  return true;
}
/* STMT_INFO is a dot-product reduction whose multiplication operands
   have different signs.  Emit a sequence to emulate the operation
   using a series of signed DOT_PROD_EXPRs and return the last
   statement generated.  VEC_DEST is the result of the vector operation
   and VOP lists its inputs.  */
static gassign *
vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
			     gimple_stmt_iterator *gsi, tree vec_dest,
			     tree vop[3])
{
  tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
  tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
  tree narrow_elttype = TREE_TYPE (narrow_vectype);
  gimple *new_stmt;

  /* Make VOP[0] the unsigned operand VOP[1] the signed operand.  */
  if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
    std::swap (vop[0], vop[1]);

  /* Convert all inputs to signed types.  */
  for (int i = 0; i < 3; ++i)
    if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
      {
	tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
	new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
	vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	vop[i] = tmp;
      }

  /* In the comments below we assume 8-bit inputs for simplicity,
     but the approach works for any full integer type.  */

  /* Create a vector of -128.  */
  tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
  tree min_narrow = build_vector_from_val (narrow_vectype,
					   min_narrow_elttype);

  /* Create a vector of 64.  */
  auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
  tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
  half_narrow = build_vector_from_val (narrow_vectype, half_narrow);

  /* Emit: SUB_RES = VOP[0] - 128.  */
  tree sub_res = make_ssa_name (narrow_vectype);
  new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  /* Emit:

       STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
       STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
       STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;

     on the basis that x * y == (x - 128) * y + 64 * y + 64 * y.
     Doing the two 64 * y steps first allows more time to compute x.  */
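  /* A quick sanity check of the identity with 8-bit inputs: for x = 200 and
     y = -3, x * y = -600, while (200 - 128) * -3 + 64 * -3 + 64 * -3
     = -216 - 192 - 192 = -600 as well, and 200 - 128 = 72 now fits in a
     signed char.  */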
  tree stage1 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
				  vop[1], half_narrow, vop[2]);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  tree stage2 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
				  vop[1], half_narrow, stage1);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  tree stage3 = make_ssa_name (wide_vectype);
  new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
				  sub_res, vop[1], stage2);
  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);

  /* Convert STAGE3 to the reduction type.  */
  return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
}
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  gimple **vec_stmt, slp_tree slp_node)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  unsigned vec_num;

  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
    }

  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (stmt_info);

  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);

  if (slp_node)
    {
      ncopies = 1;
      vec_num = vect_get_num_copies (loop_vinfo, slp_node, vectype_in);
    }
  else
    {
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
      vec_num = 1;
    }

  code_helper code = canonicalize_code (op.code, op.type);
  internal_fn cond_fn = get_conditional_internal_fn (code, op.type);

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds[3];

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* A binary COND_OP reduction must have the same definition and else
     value.  */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR
		  || code == IFN_COND_MIN || code == IFN_COND_MAX);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }
8726 bool single_defuse_cycle
= STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
);
8727 bool lane_reducing
= lane_reducing_op_p (code
);
8728 gcc_assert (single_defuse_cycle
|| lane_reducing
);
8732 /* The last operand of lane-reducing op is for reduction. */
8733 gcc_assert (reduc_index
== (int) op
.num_ops
- 1);
  /* Create the destination vector.  */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  if (lane_reducing && !slp_node && !single_defuse_cycle)
    {
      /* Note: there are still vectorizable cases that can not be handled by
	 single-lane slp.  Probably it would take some time to evolve the
	 feature to a mature state.  So we have to keep the below non-slp code
	 path as failsafe for lane-reducing support.  */
      gcc_assert (op.num_ops <= 3);
      for (unsigned i = 0; i < op.num_ops; i++)
	{
	  unsigned oprnd_ncopies = ncopies;

	  if ((int) i == reduc_index)
	    {
	      tree vectype = STMT_VINFO_VECTYPE (stmt_info);
	      oprnd_ncopies = vect_get_num_copies (loop_vinfo, vectype);
	    }

	  vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, oprnd_ncopies,
					 op.ops[i], &vec_oprnds[i]);
	}
    }
  /* Get NCOPIES vector definitions for all operands except the reduction
     definition.  */
  else if (!cond_fn_p)
    {
      gcc_assert (reduc_index >= 0 && reduc_index <= 2);
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds[0],
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds[1],
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds[2]);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype.  */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies, op.ops[0],
			 truth_type_for (vectype_in), &vec_oprnds[0],
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 NULL_TREE, &vec_oprnds[1],
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 NULL_TREE, &vec_oprnds[2]);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition.  */
  if (single_defuse_cycle)
    {
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, 1,
			 reduc_index == 0 ? op.ops[0] : NULL_TREE,
			 &vec_oprnds[0],
			 reduc_index == 1 ? op.ops[1] : NULL_TREE,
			 &vec_oprnds[1],
			 reduc_index == 2 ? op.ops[2] : NULL_TREE,
			 &vec_oprnds[2]);
    }
  else if (lane_reducing)
    {
      /* For a normal reduction, consistency between vectorized def/use is
	 naturally ensured when mapping from the scalar statement.  But if a
	 lane-reducing op is involved in the reduction, things become more
	 complicated: the op's result and accumulation operand are limited to
	 fewer lanes than the other operands, which causes a def/use mismatch
	 on adjacent statements around the op if no specific adjustment is
	 made.  One approach is to refit the lane-reducing op by introducing
	 new trivial pass-through copies to fix the possible def/use gap, so
	 that it behaves like a normal op.  Vector reduction PHIs are always
	 generated to the full extent, no matter whether a lane-reducing op
	 exists or not.  If some copies or PHIs are actually superfluous, they
	 are cleaned up by passes after vectorization.  An example for
	 single-lane slp, lane-reducing ops with mixed input vectypes in a
	 reduction chain, is given below.  The same handling applies to
	 multiple-lane slp as well.

	   int sum = 1;
	   for (i)
	     {
	       sum += d0[i] * d1[i];      // dot-prod <vector(16) char>
	       sum += w[i];               // widen-sum <vector(16) char>
	       sum += abs(s0[i] - s1[i]); // sad <vector(8) short>
	       sum += n[i];               // normal <vector(4) int>
	     }

	 The vector size is 128-bit, the vectorization factor is 16.  The
	 reduction statements would be transformed as:

	   vector<4> int sum_v0 = { 0, 0, 0, 1 };
	   vector<4> int sum_v1 = { 0, 0, 0, 0 };
	   vector<4> int sum_v2 = { 0, 0, 0, 0 };
	   vector<4> int sum_v3 = { 0, 0, 0, 0 };

	   for (i / 16)
	     {
	       sum_v0 = DOT_PROD (d0_v0[i: 0 ~ 15], d1_v0[i: 0 ~ 15], sum_v0);
	       sum_v1 = sum_v1;  // copy
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = WIDEN_SUM (w_v1[i: 0 ~ 15], sum_v1);
	       sum_v2 = sum_v2;  // copy
	       sum_v3 = sum_v3;  // copy

	       sum_v0 = sum_v0;  // copy
	       sum_v1 = SAD (s0_v1[i: 0 ~ 7 ], s1_v1[i: 0 ~ 7 ], sum_v1);
	       sum_v2 = SAD (s0_v2[i: 8 ~ 15], s1_v2[i: 8 ~ 15], sum_v2);
	       sum_v3 = sum_v3;  // copy

	       sum_v0 += n_v0[i: 0 ~ 3 ];
	       sum_v1 += n_v1[i: 4 ~ 7 ];
	       sum_v2 += n_v2[i: 8 ~ 11];
	       sum_v3 += n_v3[i: 12 ~ 15];
	     }

	 Moreover, to get higher instruction parallelism in the final
	 vectorized loop, the effective vector lane-reducing ops are
	 distributed evenly among all def-use cycles.  In the above example,
	 DOT_PROD, WIDEN_SUM and the SADs are generated into disparate cycles,
	 so instruction dependences among them can be eliminated.  */
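      /* Illustrative sketch only (the counts below are assumed, not taken
	 from the example above): with effec_ncopies == 1 and
	 effec_reduc_ncopies == 4, the first lane-reducing statement is
	 emitted into def-use cycle 0 and reduc_result_pos advances to 1, so
	 the next lane-reducing statement lands in cycle 1, the next in
	 cycle 2, and so on.  Cycles that receive no effective statement only
	 get the trivial pass-through copies shown above.  */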
      unsigned effec_ncopies = vec_oprnds[0].length ();
      unsigned total_ncopies = vec_oprnds[reduc_index].length ();

      gcc_assert (effec_ncopies <= total_ncopies);

      if (effec_ncopies < total_ncopies)
	{
	  for (unsigned i = 0; i < op.num_ops - 1; i++)
	    {
	      gcc_assert (vec_oprnds[i].length () == effec_ncopies);
	      vec_oprnds[i].safe_grow_cleared (total_ncopies);
	    }
	}

      tree reduc_vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
      gcc_assert (reduc_vectype_in);

      unsigned effec_reduc_ncopies
	= vect_get_num_copies (loop_vinfo, slp_node, reduc_vectype_in);

      gcc_assert (effec_ncopies <= effec_reduc_ncopies);

      if (effec_ncopies < effec_reduc_ncopies)
	{
	  /* Find suitable def-use cycles to generate vectorized statements
	     into, and reorder operands based on the selection.  */
	  unsigned curr_pos = reduc_info->reduc_result_pos;
	  unsigned next_pos = (curr_pos + effec_ncopies) % effec_reduc_ncopies;

	  gcc_assert (curr_pos < effec_reduc_ncopies);
	  reduc_info->reduc_result_pos = next_pos;

	  if (curr_pos)
	    {
	      unsigned count = effec_reduc_ncopies - effec_ncopies;
	      unsigned start = curr_pos - count;

	      if ((int) start < 0)
		{
		  count = curr_pos;
		  start = 0;
		}

	      for (unsigned i = 0; i < op.num_ops - 1; i++)
		{
		  for (unsigned j = effec_ncopies; j > start; j--)
		    {
		      unsigned k = j - 1;

		      std::swap (vec_oprnds[i][k], vec_oprnds[i][k + count]);
		      gcc_assert (!vec_oprnds[i][k]);
		    }
		}
	    }
	}
    }
  bool emulated_mixed_dot_prod = vect_is_emulated_mixed_dot_prod (stmt_info);
  unsigned num = vec_oprnds[reduc_index == 0 ? 1 : 0].length ();
  unsigned mask_index = 0;

  for (unsigned i = 0; i < num; ++i)
    {
      gimple *new_stmt;
      tree vop[3] = { vec_oprnds[0][i], vec_oprnds[1][i], NULL_TREE };
      if (!vop[0] || !vop[1])
	{
	  tree reduc_vop = vec_oprnds[reduc_index][i];

	  /* If we could not generate an effective vector statement for the
	     current portion of the reduction operand, insert a trivial copy
	     to simply hand the operand over to other dependent
	     statements.  */
	  gcc_assert (reduc_vop);

	  if (slp_node && TREE_CODE (reduc_vop) == SSA_NAME
	      && !SSA_NAME_IS_DEFAULT_DEF (reduc_vop))
	    new_stmt = SSA_NAME_DEF_STMT (reduc_vop);
	  else
	    {
	      new_temp = make_ssa_name (vec_dest);
	      new_stmt = gimple_build_assign (new_temp, reduc_vop);
	      vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt,
					   gsi);
	    }
	}
      else if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for lane-reducing ops
	     yet.  */
	  gcc_assert (!lane_reducing);

	  /* Make sure that the reduction accumulator is vop[0].  */
	  if (reduc_index == 1)
	    {
	      gcc_assert (commutative_binary_op_p (code, op.type));
	      std::swap (vop[0], vop[1]);
	    }
	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num * ncopies, vectype_in,
					  mask_index++);
	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
						    vop[0], vop[1], vop[0]);
	  new_temp = make_ssa_name (vec_dest, call);
	  gimple_call_set_lhs (call, new_temp);
	  gimple_call_set_nothrow (call, true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds[2][i];

	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num * ncopies, vectype_in,
					      mask_index++);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);
	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[reduc_index]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (vec_dest, new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      if (single_defuse_cycle && i < num - 1)
	vec_oprnds[reduc_index].safe_push (gimple_get_lhs (new_stmt));
      else if (slp_node)
	slp_node->push_vec_def (new_stmt);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
9019 /* Transform phase of a cycle PHI. */
9022 vect_transform_cycle_phi (loop_vec_info loop_vinfo
,
9023 stmt_vec_info stmt_info
, gimple
**vec_stmt
,
9024 slp_tree slp_node
, slp_instance slp_node_instance
)
9026 tree vectype_out
= STMT_VINFO_VECTYPE (stmt_info
);
9027 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9031 bool nested_cycle
= false;
9034 if (nested_in_vect_loop_p (loop
, stmt_info
))
9037 nested_cycle
= true;
9040 stmt_vec_info reduc_stmt_info
= STMT_VINFO_REDUC_DEF (stmt_info
);
9041 reduc_stmt_info
= vect_stmt_to_vectorize (reduc_stmt_info
);
9042 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
9043 gcc_assert (reduc_info
->is_reduc_info
);
9045 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
9046 || STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
)
9047 /* Leave the scalar phi in place. */
9052 vec_num
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
9058 ncopies
= vect_get_num_copies (loop_vinfo
,
9059 STMT_VINFO_VECTYPE (stmt_info
));
9062 /* Check whether we should use a single PHI node and accumulate
9063 vectors to one before the backedge. */
9064 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info
))
9070 /* Create the destination vector */
9071 gphi
*phi
= as_a
<gphi
*> (stmt_info
->stmt
);
9072 tree vec_dest
= vect_create_destination_var (gimple_phi_result (phi
),
9075 /* Get the loop-entry arguments. */
9076 tree vec_initial_def
= NULL_TREE
;
9077 auto_vec
<tree
> vec_initial_defs
;
9080 vec_initial_defs
.reserve (vec_num
);
9081 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9082 and we can't use zero for induc_val, use initial_def. Similarly
9083 for REDUC_MIN and initial_def larger than the base. */
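	  /* For illustration only (the numbers are assumed): for a REDUC_MAX
	     condition reduction with initial_def = 5 and induc_val = 10,
	     initial_def is smaller, so the code below switches induc_val to
	     the initial value 5 and clears
	     STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL so the epilogue knows the
	     initial_def was used; the mirrored case applies to REDUC_MIN
	     when initial_def is larger.  */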
9084 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
9086 gcc_assert (SLP_TREE_LANES (slp_node
) == 1);
9087 tree initial_def
= vect_phi_initial_value (phi
);
9088 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
9089 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
9090 if (TREE_CODE (initial_def
) == INTEGER_CST
9091 && !integer_zerop (induc_val
)
9092 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
9093 && tree_int_cst_lt (initial_def
, induc_val
))
9094 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
9095 && tree_int_cst_lt (induc_val
, initial_def
))))
9097 induc_val
= initial_def
;
	      /* Communicate we used the initial_def to epilogue
9100 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
9102 vec_initial_defs
.quick_push
9103 (build_vector_from_val (vectype_out
, induc_val
));
9105 else if (nested_cycle
)
9107 unsigned phi_idx
= loop_preheader_edge (loop
)->dest_idx
;
9108 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node
)[phi_idx
],
9113 gcc_assert (slp_node
== slp_node_instance
->reduc_phis
);
9114 vec
<tree
> &initial_values
= reduc_info
->reduc_initial_values
;
9115 vec
<stmt_vec_info
> &stmts
= SLP_TREE_SCALAR_STMTS (slp_node
);
9117 unsigned int num_phis
= stmts
.length ();
9118 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info
))
9120 initial_values
.reserve (num_phis
);
9121 for (unsigned int i
= 0; i
< num_phis
; ++i
)
9123 gphi
*this_phi
= as_a
<gphi
*> (stmts
[i
]->stmt
);
9124 initial_values
.quick_push (vect_phi_initial_value (this_phi
));
9127 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
9128 if (!initial_values
.is_empty ())
9131 = (num_phis
== 1 ? initial_values
[0] : NULL_TREE
);
9132 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
9134 = neutral_op_for_reduction (TREE_TYPE (vectype_out
),
9135 code
, initial_value
);
9136 get_initial_defs_for_reduction (loop_vinfo
, reduc_info
,
9137 &vec_initial_defs
, vec_num
,
9138 stmts
.length (), neutral_op
);
9144 /* Get at the scalar def before the loop, that defines the initial
9145 value of the reduction variable. */
9146 tree initial_def
= vect_phi_initial_value (phi
);
9147 reduc_info
->reduc_initial_values
.safe_push (initial_def
);
9148 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
9149 and we can't use zero for induc_val, use initial_def. Similarly
9150 for REDUC_MIN and initial_def larger than the base. */
9151 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == INTEGER_INDUC_COND_REDUCTION
)
9153 tree induc_val
= STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
);
9154 if (TREE_CODE (initial_def
) == INTEGER_CST
9155 && !integer_zerop (induc_val
)
9156 && ((STMT_VINFO_REDUC_CODE (reduc_info
) == MAX_EXPR
9157 && tree_int_cst_lt (initial_def
, induc_val
))
9158 || (STMT_VINFO_REDUC_CODE (reduc_info
) == MIN_EXPR
9159 && tree_int_cst_lt (induc_val
, initial_def
))))
9161 induc_val
= initial_def
;
	  /* Communicate we used the initial_def to epilogue
9164 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info
) = NULL_TREE
;
9166 vec_initial_def
= build_vector_from_val (vectype_out
, induc_val
);
9168 else if (nested_cycle
)
9170 /* Do not use an adjustment def as that case is not supported
9171 correctly if ncopies is not one. */
9172 vect_get_vec_defs_for_operand (loop_vinfo
, reduc_stmt_info
,
9173 ncopies
, initial_def
,
9176 else if (STMT_VINFO_REDUC_TYPE (reduc_info
) == CONST_COND_REDUCTION
9177 || STMT_VINFO_REDUC_TYPE (reduc_info
) == COND_REDUCTION
)
9178 /* Fill the initial vector with the initial scalar value. */
9180 = get_initial_def_for_reduction (loop_vinfo
, reduc_stmt_info
,
9181 initial_def
, initial_def
);
9185 vect_find_reusable_accumulator (loop_vinfo
, reduc_info
);
9186 if (!reduc_info
->reduc_initial_values
.is_empty ())
9188 initial_def
= reduc_info
->reduc_initial_values
[0];
9189 code_helper code
= STMT_VINFO_REDUC_CODE (reduc_info
);
9191 = neutral_op_for_reduction (TREE_TYPE (initial_def
),
9193 gcc_assert (neutral_op
);
9194 /* Try to simplify the vector initialization by applying an
9195 adjustment after the reduction has been performed. */
9196 if (!reduc_info
->reused_accumulator
9197 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_reduction_def
9198 && !operand_equal_p (neutral_op
, initial_def
))
9200 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info
)
9202 initial_def
= neutral_op
;
9205 = get_initial_def_for_reduction (loop_vinfo
, reduc_info
,
9206 initial_def
, neutral_op
);
9211 if (vec_initial_def
)
9213 vec_initial_defs
.create (ncopies
);
9214 for (i
= 0; i
< ncopies
; ++i
)
9215 vec_initial_defs
.quick_push (vec_initial_def
);
9218 if (auto *accumulator
= reduc_info
->reused_accumulator
)
9220 tree def
= accumulator
->reduc_input
;
9221 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
9223 unsigned int nreduc
;
9224 bool res
= constant_multiple_p (TYPE_VECTOR_SUBPARTS
9226 TYPE_VECTOR_SUBPARTS (vectype_out
),
9229 gimple_seq stmts
= NULL
;
9230 /* Reduce the single vector to a smaller one. */
9233 /* Perform the reduction in the appropriate type. */
9234 tree rvectype
= vectype_out
;
9235 if (!useless_type_conversion_p (TREE_TYPE (vectype_out
),
9236 TREE_TYPE (TREE_TYPE (def
))))
9237 rvectype
= build_vector_type (TREE_TYPE (TREE_TYPE (def
)),
9238 TYPE_VECTOR_SUBPARTS
9240 def
= vect_create_partial_epilog (def
, rvectype
,
9241 STMT_VINFO_REDUC_CODE
9245 /* The epilogue loop might use a different vector mode, like
9247 if (TYPE_MODE (vectype_out
) != TYPE_MODE (TREE_TYPE (def
)))
9249 tree reduc_type
= build_vector_type_for_mode
9250 (TREE_TYPE (TREE_TYPE (def
)), TYPE_MODE (vectype_out
));
9251 def
= gimple_convert (&stmts
, reduc_type
, def
);
9253 /* Adjust the input so we pick up the partially reduced value
9254 for the skip edge in vect_create_epilog_for_reduction. */
9255 accumulator
->reduc_input
= def
;
9256 /* And the reduction could be carried out using a different sign. */
9257 if (!useless_type_conversion_p (vectype_out
, TREE_TYPE (def
)))
9258 def
= gimple_convert (&stmts
, vectype_out
, def
);
9260 if ((e
= loop_vinfo
->main_loop_edge
)
9261 || (e
= loop_vinfo
->skip_this_loop_edge
))
9263 /* While we'd like to insert on the edge this will split
9264 blocks and disturb bookkeeping, we also will eventually
9265 need this on the skip edge. Rely on sinking to
9266 fixup optimal placement and insert in the pred. */
9267 gimple_stmt_iterator gsi
= gsi_last_bb (e
->src
);
9268 /* Insert before a cond that eventually skips the
9270 if (!gsi_end_p (gsi
) && stmt_ends_bb_p (gsi_stmt (gsi
)))
9272 gsi_insert_seq_after (&gsi
, stmts
, GSI_CONTINUE_LINKING
);
9275 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop
),
9278 if (loop_vinfo
->main_loop_edge
)
9280 = vect_get_main_loop_result (loop_vinfo
, def
,
9281 vec_initial_defs
[0]);
9283 vec_initial_defs
.safe_push (def
);
9286 /* Generate the reduction PHIs upfront. */
9287 for (i
= 0; i
< vec_num
; i
++)
9289 tree vec_init_def
= vec_initial_defs
[i
];
9290 for (j
= 0; j
< ncopies
; j
++)
9292 /* Create the reduction-phi that defines the reduction
9294 gphi
*new_phi
= create_phi_node (vec_dest
, loop
->header
);
9296 /* Set the loop-entry arg of the reduction-phi. */
9297 if (j
!= 0 && nested_cycle
)
9298 vec_init_def
= vec_initial_defs
[j
];
9299 add_phi_arg (new_phi
, vec_init_def
, loop_preheader_edge (loop
),
9302 /* The loop-latch arg is set in epilogue processing. */
9305 slp_node
->push_vec_def (new_phi
);
9309 *vec_stmt
= new_phi
;
9310 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_phi
);
/* Vectorizes LC PHIs.  */

bool
vectorizable_lc_phi (loop_vec_info loop_vinfo,
		     stmt_vec_info stmt_info, gimple **vec_stmt,
		     slp_tree slp_node)
{
  if (!loop_vinfo
      || !is_a <gphi *> (stmt_info->stmt)
      || gimple_phi_num_args (stmt_info->stmt) != 1)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      /* Deal with copies from externs or constants that disguise as
	 loop-closed PHI nodes (PR97886).  */
      if (slp_node
	  && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
						SLP_TREE_VECTYPE (slp_node)))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "incompatible vector types for invariants\n");
	  return false;
	}
      STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
      return true;
    }

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  edge e = single_pred_edge (bb);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<tree> vec_oprnds;
  vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
		     !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
		     gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
  for (unsigned i = 0; i < vec_oprnds.length (); i++)
    {
      /* Create the vectorized LC PHI node.  */
      gphi *new_phi = create_phi_node (vec_dest, bb);
      add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
      if (slp_node)
	slp_node->push_vec_def (new_phi);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
    }
  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
/* Vectorizes PHIs.  */

bool
vectorizable_phi (vec_info *,
		  stmt_vec_info stmt_info, gimple **vec_stmt,
		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);

  if (!vec_stmt) /* transformation not required.  */
    {
      slp_tree child;
      unsigned i;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
	if (!child)
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "PHI node with unvectorized backedge def\n");
	    return false;
	  }
	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for invariants\n");
	    return false;
	  }
	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
		 && !useless_type_conversion_p (vectype,
						SLP_TREE_VECTYPE (child)))
	  {
	    /* With bools we can have mask and non-mask precision vectors
	       or different non-mask precisions.  While pattern recog is
	       supposed to guarantee consistency here, bugs in it can cause
	       mismatches (PR103489 and PR103800 for example).
	       Deal with them here instead of ICEing later.  */
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector type setup from "
			       "bool pattern detection\n");
	    return false;
	  }

      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
			  vector_stmt, stmt_info, vectype, 0, vect_body);
      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
      return true;
    }

  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<gphi *> new_phis;
  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];

      /* Skip not yet vectorized defs.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	  && SLP_TREE_VEC_DEFS (child).is_empty ())
	continue;

      auto_vec<tree> vec_oprnds;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
      if (!new_phis.exists ())
	{
	  new_phis.create (vec_oprnds.length ());
	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
	    {
	      /* Create the vectorized LC PHI node.  */
	      new_phis.quick_push (create_phi_node (vec_dest, bb));
	      slp_node->push_vec_def (new_phis[j]);
	    }
	}
      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
      for (unsigned j = 0; j < vec_oprnds.length (); j++)
	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    }
  /* We should have at least one already vectorized child.  */
  gcc_assert (new_phis.exists ());

  return true;
}
/* Vectorizes first order recurrences.  An overview of the transformation
   is described below.  Suppose we have the following loop.

     int t = 0;
     for (int i = 0; i < n; ++i)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   There is a first-order recurrence on 'a'.  For this loop, the scalar IR
   looks (simplified) like:

    scalar.preheader:
      init = 0;

    scalar.body:
      i = PHI <0(scalar.preheader), i+1(scalar.body)>
      _2 = PHI <init(scalar.preheader), _1(scalar.body)>
      _1 = a[i]
      b[i] = _1 - _2
      if (i < n) goto scalar.body

   In this example, _2 is a recurrence because its value depends on the
   previous iteration.  We vectorize this as (VF = 4)

    vector.preheader:
      vect_init = vect_cst(..., ..., ..., 0)

    vector.body:
      i = PHI <0(vector.preheader), i+4(vector.body)>
      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
      vect_2 = a[i, i+1, i+2, i+3];
      vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
      b[i, i+1, i+2, i+3] = vect_2 - vect_3
      if (..) goto vector.body

   In this function, vectorizable_recurr, we code generate both the
   vector PHI node and the permute since those together compute the
   vectorized value of the scalar PHI.  We do not yet have the
   backedge value to fill in there nor into the vec_perm.  Those
   are filled in maybe_set_vectorized_backedge_value and
   vect_schedule_scc.

   TODO:  Since the scalar loop does not have a use of the recurrence
   outside of the loop the natural way to implement peeling via
   vectorizing the live value doesn't work.  For now peeling of loops
   with a recurrence is not implemented.  For SLP the supported cases
   are restricted to those requiring a single vector recurrence PHI.  */
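/* A small sketch of the permute selector used below (the concrete vector
   type is assumed, not taken from a particular target): for a four-lane
   recurrence with a single SLP lane (dist == 1) the builder pushes
   nunits - dist + i for i = 0..2, i.e. { 3, 4, 5 }, which vec_perm_indices
   extends to the series { 3, 4, 5, 6 } used in the example above: lane 3 of
   the previous vector followed by lanes 0..2 of the current one.  */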
bool
vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
		     gimple **vec_stmt, slp_tree slp_node,
		     stmt_vector_for_cost *cost_vec)
{
  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
    return false;

  gphi *phi = as_a<gphi *> (stmt_info->stmt);

  /* So far we only support first-order recurrence auto-vectorization.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
    return false;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned ncopies;
  if (slp_node)
    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
  /* We need to be able to make progress with a single vector.  */
  if (maybe_gt (dist * 2, nunits))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "first order recurrence exceeds half of "
			 "a vector\n");
      return false;
    }

  /* First-order recurrence autovectorization needs to handle permutation
     with indices = [nunits-1, nunits, nunits+1, ...].  */
  vec_perm_builder sel (nunits, 1, 3);
  for (int i = 0; i < 3; ++i)
    sel.quick_push (nunits - dist + i);
  vec_perm_indices indices (sel, 2, nunits);

  if (!vec_stmt) /* transformation not required.  */
    {
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
				 indices))
	return false;

      if (slp_node)
	{
	  /* We eventually need to set a vector type on invariant
	     arguments.  */
	  unsigned j;
	  slp_tree child;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	    if (!vect_maybe_update_slp_op_vectype
		  (child, SLP_TREE_VECTYPE (slp_node)))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "incompatible vector types for "
				   "invariants\n");
		return false;
	      }
	}

      /* Verify we have set up compatible types.  */
      edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree latch_vectype = NULL_TREE;
      if (slp_node)
	{
	  slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
	  latch_vectype = SLP_TREE_VECTYPE (latch_def);
	}
      else
	{
	  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
	  if (TREE_CODE (latch_def) == SSA_NAME)
	    {
	      stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
	      latch_def_info = vect_stmt_to_vectorize (latch_def_info);
	      latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
	    }
	}
      if (!types_compatible_p (latch_vectype, vectype))
	return false;

      /* The recurrence costs the initialization vector and one permute
	 for each copy.  */
      unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
						 stmt_info, 0, vect_prologue);
      unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					       stmt_info, 0, vect_body);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizable_recurr: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
      return true;
    }

  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  basic_block bb = gimple_bb (phi);
  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
  if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
    {
      gimple_seq stmts = NULL;
      preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
    }
  tree vec_init = build_vector_from_val (vectype, preheader);
  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);

  /* Create the vectorized first-order PHI node.  */
  tree vec_dest = vect_get_new_vect_var (vectype,
					 vect_simple_var, "vec_recur_");
  gphi *new_phi = create_phi_node (vec_dest, bb);
  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);

  /* Insert shuffles for the first-order recurrence autovectorization.
       result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
  tree perm = vect_gen_perm_mask_checked (vectype, indices);

  /* Insert the required permute after the latch definition.  The
     second and later operands are tentative and will be updated when we have
     vectorized the latch definition.  */
  edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
  gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
  gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
  gsi_next (&gsi2);

  for (unsigned i = 0; i < ncopies; ++i)
    {
      vec_dest = make_ssa_name (vectype);
      gassign *vperm
	  = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
				 i == 0 ? gimple_phi_result (new_phi) : NULL,
				 NULL, perm);
      vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);

      if (slp_node)
	slp_node->push_vec_def (vperm);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
  return true;
}
/* Return true if VECTYPE represents a vector that requires lowering
   by the vector lowering pass.  */

bool
vect_emulated_vector_p (tree vectype)
{
  return (!VECTOR_MODE_P (TYPE_MODE (vectype))
	  && (!VECTOR_BOOLEAN_TYPE_P (vectype)
	      || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
}

/* Return true if we can emulate CODE on an integer mode representation
   of the vectors.  */

bool
vect_can_vectorize_without_simd_p (tree_code code)
{
  switch (code)
    {
    case PLUS_EXPR:
    case MINUS_EXPR:
    case NEGATE_EXPR:
    case BIT_AND_EXPR:
    case BIT_IOR_EXPR:
    case BIT_XOR_EXPR:
    case BIT_NOT_EXPR:
      return true;

    default:
      return false;
    }
}

/* Likewise, but taking a code_helper.  */

bool
vect_can_vectorize_without_simd_p (code_helper code)
{
  return (code.is_tree_code ()
	  && vect_can_vectorize_without_simd_p (tree_code (code)));
}
9713 /* Create vector init for vectorized iv. */
9715 vect_create_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
9716 tree step_expr
, poly_uint64 nunits
,
9718 enum vect_induction_op_type induction_type
)
9720 unsigned HOST_WIDE_INT const_nunits
;
9721 tree vec_shift
, vec_init
, new_name
;
9723 tree itype
= TREE_TYPE (vectype
);
9725 /* iv_loop is the loop to be vectorized. Create:
9726 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr). */
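  /* For illustration only (the values are assumed): for vect_step_op_shr
     with X = 64, S = 1 and four lanes this builds 64 >> { 0, 1, 2, 3 },
     i.e. { 64, 32, 16, 8 }; the other cases below build the analogous
     negated or multiplied series.  */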
9727 new_name
= gimple_convert (stmts
, itype
, init_expr
);
9728 switch (induction_type
)
9730 case vect_step_op_shr
:
9731 case vect_step_op_shl
:
9732 /* Build the Initial value from shift_expr. */
9733 vec_init
= gimple_build_vector_from_val (stmts
,
9736 vec_shift
= gimple_build (stmts
, VEC_SERIES_EXPR
, vectype
,
9737 build_zero_cst (itype
), step_expr
);
9738 vec_init
= gimple_build (stmts
,
9739 (induction_type
== vect_step_op_shr
9740 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
9741 vectype
, vec_init
, vec_shift
);
9744 case vect_step_op_neg
:
9746 vec_init
= gimple_build_vector_from_val (stmts
,
9749 tree vec_neg
= gimple_build (stmts
, NEGATE_EXPR
,
9751 /* The encoding has 2 interleaved stepped patterns. */
9752 vec_perm_builder
sel (nunits
, 2, 3);
9754 for (i
= 0; i
< 3; i
++)
9757 sel
[2 * i
+ 1] = i
+ nunits
;
9759 vec_perm_indices
indices (sel
, 2, nunits
);
9760 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9761 fail when vec_init is const vector. In that situation vec_perm is not
9764 = vect_gen_perm_mask_any (vectype
, indices
);
9765 vec_init
= gimple_build (stmts
, VEC_PERM_EXPR
,
9772 case vect_step_op_mul
:
9774 /* Use unsigned mult to avoid UD integer overflow. */
9775 gcc_assert (nunits
.is_constant (&const_nunits
));
9776 tree utype
= unsigned_type_for (itype
);
9777 tree uvectype
= build_vector_type (utype
,
9778 TYPE_VECTOR_SUBPARTS (vectype
));
9779 new_name
= gimple_convert (stmts
, utype
, new_name
);
9780 vec_init
= gimple_build_vector_from_val (stmts
,
9783 tree_vector_builder
elts (uvectype
, const_nunits
, 1);
9784 tree elt_step
= build_one_cst (utype
);
9786 elts
.quick_push (elt_step
);
9787 for (i
= 1; i
< const_nunits
; i
++)
9789 /* Create: new_name_i = new_name + step_expr. */
9790 elt_step
= gimple_build (stmts
, MULT_EXPR
,
9791 utype
, elt_step
, step_expr
);
9792 elts
.quick_push (elt_step
);
9794 /* Create a vector from [new_name_0, new_name_1, ...,
9795 new_name_nunits-1]. */
9796 tree vec_mul
= gimple_build_vector (stmts
, &elts
);
9797 vec_init
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
9799 vec_init
= gimple_convert (stmts
, vectype
, vec_init
);
9810 /* Peel init_expr by skip_niter for induction_type. */
9812 vect_peel_nonlinear_iv_init (gimple_seq
* stmts
, tree init_expr
,
9813 tree skip_niters
, tree step_expr
,
9814 enum vect_induction_op_type induction_type
)
9816 gcc_assert (TREE_CODE (skip_niters
) == INTEGER_CST
);
9817 tree type
= TREE_TYPE (init_expr
);
9818 unsigned prec
= TYPE_PRECISION (type
);
9819 switch (induction_type
)
9821 case vect_step_op_neg
:
9822 if (TREE_INT_CST_LOW (skip_niters
) % 2)
9823 init_expr
= gimple_build (stmts
, NEGATE_EXPR
, type
, init_expr
);
9824 /* else no change. */
9827 case vect_step_op_shr
:
9828 case vect_step_op_shl
:
9829 skip_niters
= gimple_convert (stmts
, type
, skip_niters
);
9830 step_expr
= gimple_build (stmts
, MULT_EXPR
, type
, step_expr
, skip_niters
);
      /* When the shift amount >= precision, we need to avoid UD.
	 In the original loop there is no UD, and according to the semantics
	 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for
	 ashr.  */
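      /* For illustration only (the values are assumed): peeling
	 skip_niters = 3 iterations of a uint8_t >>= 3 induction gives a
	 total shift of 9, which is >= the precision of 8, so the peeled
	 initial value is simply 0 (or init >> 7 for a signed arithmetic
	 shift).  */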
9834 if (!tree_fits_uhwi_p (step_expr
)
9835 || tree_to_uhwi (step_expr
) >= prec
)
9837 if (induction_type
== vect_step_op_shl
9838 || TYPE_UNSIGNED (type
))
9839 init_expr
= build_zero_cst (type
);
9841 init_expr
= gimple_build (stmts
, RSHIFT_EXPR
, type
,
9843 wide_int_to_tree (type
, prec
- 1));
9846 init_expr
= gimple_build (stmts
, (induction_type
== vect_step_op_shr
9847 ? RSHIFT_EXPR
: LSHIFT_EXPR
),
9848 type
, init_expr
, step_expr
);
9851 case vect_step_op_mul
:
9853 tree utype
= unsigned_type_for (type
);
9854 init_expr
= gimple_convert (stmts
, utype
, init_expr
);
9855 wide_int skipn
= wi::to_wide (skip_niters
);
9856 wide_int begin
= wi::to_wide (step_expr
);
9857 auto_mpz base
, exp
, mod
, res
;
9858 wi::to_mpz (begin
, base
, TYPE_SIGN (type
));
9859 wi::to_mpz (skipn
, exp
, UNSIGNED
);
9860 mpz_ui_pow_ui (mod
, 2, TYPE_PRECISION (type
));
9861 mpz_powm (res
, base
, exp
, mod
);
9862 begin
= wi::from_mpz (utype
, res
, true);
9863 tree mult_expr
= wide_int_to_tree (utype
, begin
);
9864 init_expr
= gimple_build (stmts
, MULT_EXPR
, utype
,
9865 init_expr
, mult_expr
);
9866 init_expr
= gimple_convert (stmts
, type
, init_expr
);
9877 /* Create vector step for vectorized iv. */
9879 vect_create_nonlinear_iv_step (gimple_seq
* stmts
, tree step_expr
,
9881 enum vect_induction_op_type induction_type
)
9883 tree expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
9884 tree new_name
= NULL
;
9885 /* Step should be pow (step, vf) for mult induction. */
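  /* For illustration only (the values are assumed): with step_expr = 3 and
     VF = 4 the per-vector-iteration step below becomes 3 * 3 * 3 * 3 = 81,
     since each scalar iteration multiplies by 3 and one vector iteration
     covers four of them.  */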
9886 if (induction_type
== vect_step_op_mul
)
9888 gcc_assert (vf
.is_constant ());
9889 wide_int begin
= wi::to_wide (step_expr
);
9891 for (unsigned i
= 0; i
!= vf
.to_constant () - 1; i
++)
9892 begin
= wi::mul (begin
, wi::to_wide (step_expr
));
9894 new_name
= wide_int_to_tree (TREE_TYPE (step_expr
), begin
);
9896 else if (induction_type
== vect_step_op_neg
)
9900 new_name
= gimple_build (stmts
, MULT_EXPR
, TREE_TYPE (step_expr
),
9906 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo
,
9907 stmt_vec_info stmt_info
,
9908 tree new_name
, tree vectype
,
9909 enum vect_induction_op_type induction_type
)
9911 /* No step is needed for neg induction. */
9912 if (induction_type
== vect_step_op_neg
)
9915 tree t
= unshare_expr (new_name
);
9916 gcc_assert (CONSTANT_CLASS_P (new_name
)
9917 || TREE_CODE (new_name
) == SSA_NAME
);
9918 tree new_vec
= build_vector_from_val (vectype
, t
);
9919 tree vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
9920 new_vec
, vectype
, NULL
);
9924 /* Update vectorized iv with vect_step, induc_def is init. */
9926 vect_update_nonlinear_iv (gimple_seq
* stmts
, tree vectype
,
9927 tree induc_def
, tree vec_step
,
9928 enum vect_induction_op_type induction_type
)
9930 tree vec_def
= induc_def
;
9931 switch (induction_type
)
9933 case vect_step_op_mul
:
9935 /* Use unsigned mult to avoid UD integer overflow. */
9937 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype
)),
9938 TYPE_VECTOR_SUBPARTS (vectype
));
9939 vec_def
= gimple_convert (stmts
, uvectype
, vec_def
);
9940 vec_step
= gimple_convert (stmts
, uvectype
, vec_step
);
9941 vec_def
= gimple_build (stmts
, MULT_EXPR
, uvectype
,
9943 vec_def
= gimple_convert (stmts
, vectype
, vec_def
);
9947 case vect_step_op_shr
:
9948 vec_def
= gimple_build (stmts
, RSHIFT_EXPR
, vectype
,
9952 case vect_step_op_shl
:
9953 vec_def
= gimple_build (stmts
, LSHIFT_EXPR
, vectype
,
9956 case vect_step_op_neg
:
9957 vec_def
= induc_def
;
9968 /* Function vectorizable_induction
   Check if STMT_INFO performs a nonlinear induction computation that can be
9971 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9972 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9974 Return true if STMT_INFO is vectorizable in this way. */
9977 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo
,
9978 stmt_vec_info stmt_info
,
9979 gimple
**vec_stmt
, slp_tree slp_node
,
9980 stmt_vector_for_cost
*cost_vec
)
9982 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
9984 bool nested_in_vect_loop
= false;
9985 class loop
*iv_loop
;
9987 edge pe
= loop_preheader_edge (loop
);
9989 tree vec_init
, vec_step
;
9992 gphi
*induction_phi
;
9993 tree induc_def
, vec_dest
;
9994 tree init_expr
, step_expr
;
9996 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
9998 gimple_stmt_iterator si
;
10000 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
10002 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
10003 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10004 enum vect_induction_op_type induction_type
10005 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
10007 gcc_assert (induction_type
> vect_step_op_add
);
10012 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
10013 gcc_assert (ncopies
>= 1);
10015 /* FORNOW. Only handle nonlinear induction in the same loop. */
10016 if (nested_in_vect_loop_p (loop
, stmt_info
))
10018 if (dump_enabled_p ())
10019 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10020 "nonlinear induction in nested loop.\n");
10025 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
10027 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
10028 update for each iv and a permutation to generate wanted vector iv. */
10031 if (dump_enabled_p ())
10032 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10033 "SLP induction not supported for nonlinear"
10038 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype
)))
10040 if (dump_enabled_p ())
10041 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10042 "floating point nonlinear induction vectorization"
10043 " not supported.\n");
10047 step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
10048 init_expr
= vect_phi_initial_value (phi
);
10049 gcc_assert (step_expr
!= NULL_TREE
&& init_expr
!= NULL
10050 && TREE_CODE (step_expr
) == INTEGER_CST
);
  /* step_expr should be aligned with init_expr,
     i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
10053 step_expr
= fold_convert (TREE_TYPE (vectype
), step_expr
);
10055 if (TREE_CODE (init_expr
) == INTEGER_CST
)
10056 init_expr
= fold_convert (TREE_TYPE (vectype
), init_expr
);
10057 else if (!tree_nop_conversion_p (TREE_TYPE (vectype
), TREE_TYPE (init_expr
)))
10059 /* INIT_EXPR could be a bit_field, bail out for such case. */
10060 if (dump_enabled_p ())
10061 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10062 "nonlinear induction vectorization failed:"
10063 " component type of vectype is not a nop conversion"
10064 " from type of init_expr.\n");
10068 switch (induction_type
)
10070 case vect_step_op_neg
:
10071 if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype
), 1u))
10073 if (TREE_CODE (init_expr
) != INTEGER_CST
10074 && TREE_CODE (init_expr
) != REAL_CST
)
10076 /* Check for backend support of NEGATE_EXPR and vec_perm. */
10077 if (!directly_supported_p (NEGATE_EXPR
, vectype
))
10080 /* The encoding has 2 interleaved stepped patterns. */
10081 vec_perm_builder
sel (nunits
, 2, 3);
10082 machine_mode mode
= TYPE_MODE (vectype
);
10083 sel
.quick_grow (6);
10084 for (i
= 0; i
< 3; i
++)
10087 sel
[i
* 2 + 1] = i
+ nunits
;
10089 vec_perm_indices
indices (sel
, 2, nunits
);
10090 if (!can_vec_perm_const_p (mode
, mode
, indices
))
10095 case vect_step_op_mul
:
10097 /* Check for backend support of MULT_EXPR. */
10098 if (!directly_supported_p (MULT_EXPR
, vectype
))
10101 /* ?? How to construct vector step for variable number vector.
10102 [ 1, step, pow (step, 2), pow (step, 4), .. ]. */
10103 if (!vf
.is_constant ())
10108 case vect_step_op_shr
:
10109 /* Check for backend support of RSHIFT_EXPR. */
10110 if (!directly_supported_p (RSHIFT_EXPR
, vectype
, optab_vector
))
10113 /* Don't shift more than type precision to avoid UD. */
10114 if (!tree_fits_uhwi_p (step_expr
)
10115 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
10116 TYPE_PRECISION (TREE_TYPE (init_expr
))))
10120 case vect_step_op_shl
:
      /* Check for backend support of LSHIFT_EXPR.  */
10122 if (!directly_supported_p (LSHIFT_EXPR
, vectype
, optab_vector
))
10125 /* Don't shift more than type precision to avoid UD. */
10126 if (!tree_fits_uhwi_p (step_expr
)
10127 || maybe_ge (nunits
* tree_to_uhwi (step_expr
),
10128 TYPE_PRECISION (TREE_TYPE (init_expr
))))
10134 gcc_unreachable ();
10137 if (!vec_stmt
) /* transformation not required. */
10139 unsigned inside_cost
= 0, prologue_cost
= 0;
10140 /* loop cost for vec_loop. Neg induction doesn't have any
10142 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
10143 stmt_info
, 0, vect_body
);
10145 /* loop cost for vec_loop. Neg induction doesn't have any
10147 if (induction_type
== vect_step_op_neg
)
10150 /* prologue cost for vec_init and vec_step. */
10151 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
10152 stmt_info
, 0, vect_prologue
);
10154 if (dump_enabled_p ())
10155 dump_printf_loc (MSG_NOTE
, vect_location
,
10156 "vect_model_induction_cost: inside_cost = %d, "
10157 "prologue_cost = %d. \n", inside_cost
,
10160 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
10161 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
10167 /* Compute a vector variable, initialized with the first VF values of
10168 the induction variable. E.g., for an iv with IV_PHI='X' and
10169 evolution S, for a vector of 4 units, we want to compute:
10170 [X, X + S, X + 2*S, X + 3*S]. */
10172 if (dump_enabled_p ())
10173 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
10175 pe
= loop_preheader_edge (iv_loop
);
10176 /* Find the first insertion point in the BB. */
10177 basic_block bb
= gimple_bb (phi
);
10178 si
= gsi_after_labels (bb
);
10180 gimple_seq stmts
= NULL
;
10182 niters_skip
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10183 /* If we are using the loop mask to "peel" for alignment then we need
10184 to adjust the start value here. */
10185 if (niters_skip
!= NULL_TREE
)
10186 init_expr
= vect_peel_nonlinear_iv_init (&stmts
, init_expr
, niters_skip
,
10187 step_expr
, induction_type
);
10189 vec_init
= vect_create_nonlinear_iv_init (&stmts
, init_expr
,
10190 step_expr
, nunits
, vectype
,
10194 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10195 gcc_assert (!new_bb
);
10199 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
10200 vf
, induction_type
);
10203 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10204 gcc_assert (!new_bb
);
10207 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
10210 /* Create the following def-use cycle:
10215 vec_iv = PHI <vec_init, vec_loop>
10219 vec_loop = vec_iv + vec_step; */
10221 /* Create the induction-phi that defines the induction-operand. */
10222 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
10223 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
10224 induc_def
= PHI_RESULT (induction_phi
);
10226 /* Create the iv update inside the loop. */
10228 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
10229 induc_def
, vec_step
,
10232 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10233 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
10235 /* Set the arguments of the phi node: */
10236 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
10237 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10240 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
10241 *vec_stmt
= induction_phi
;
10243 /* In case that vectorization factor (VF) is bigger than the number
10244 of elements that we can fit in a vectype (nunits), we have to generate
10245 more than one vector stmt - i.e - we need to "unroll" the
10246 vector stmt by a factor VF/nunits. For more details see documentation
10247 in vectorizable_operation. */
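  /* For illustration only (the counts are assumed): with VF = 8 and a
     four-lane vectype we have ncopies = 2, so besides the PHI result we
     emit one extra copy per iteration, each previous value updated by a
     step that itself covers nunits scalar iterations.  */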
10252 /* FORNOW. This restriction should be relaxed. */
10253 gcc_assert (!nested_in_vect_loop
);
10255 new_name
= vect_create_nonlinear_iv_step (&stmts
, step_expr
,
10256 nunits
, induction_type
);
10258 vec_step
= vect_create_nonlinear_iv_vec_step (loop_vinfo
, stmt_info
,
10261 vec_def
= induc_def
;
10262 for (i
= 1; i
< ncopies
; i
++)
10264 /* vec_i = vec_prev + vec_step. */
10266 vec_def
= vect_update_nonlinear_iv (&stmts
, vectype
,
10269 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10270 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
10271 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
10275 if (dump_enabled_p ())
10276 dump_printf_loc (MSG_NOTE
, vect_location
,
10277 "transform induction: created def-use cycle: %G%G",
10278 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
10283 /* Function vectorizable_induction
10285 Check if STMT_INFO performs an induction computation that can be vectorized.
10286 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
10287 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
10288 Return true if STMT_INFO is vectorizable in this way. */
10291 vectorizable_induction (loop_vec_info loop_vinfo
,
10292 stmt_vec_info stmt_info
,
10293 gimple
**vec_stmt
, slp_tree slp_node
,
10294 stmt_vector_for_cost
*cost_vec
)
10296 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
10298 bool nested_in_vect_loop
= false;
10299 class loop
*iv_loop
;
10301 edge pe
= loop_preheader_edge (loop
);
10302 basic_block new_bb
;
10303 tree new_vec
, vec_init
= NULL_TREE
, vec_step
, t
;
10306 gphi
*induction_phi
;
10307 tree induc_def
, vec_dest
;
10308 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
10311 gimple_stmt_iterator si
;
10312 enum vect_induction_op_type induction_type
10313 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info
);
10315 gphi
*phi
= dyn_cast
<gphi
*> (stmt_info
->stmt
);
10319 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
10322 /* Make sure it was recognized as induction computation. */
10323 if (STMT_VINFO_DEF_TYPE (stmt_info
) != vect_induction_def
)
10326 /* Handle nonlinear induction in a separate place. */
10327 if (induction_type
!= vect_step_op_add
)
10328 return vectorizable_nonlinear_induction (loop_vinfo
, stmt_info
,
10329 vec_stmt
, slp_node
, cost_vec
);
10331 tree vectype
= STMT_VINFO_VECTYPE (stmt_info
);
10332 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10337 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
10338 gcc_assert (ncopies
>= 1);
10340 /* FORNOW. These restrictions should be relaxed. */
10341 if (nested_in_vect_loop_p (loop
, stmt_info
))
10343 imm_use_iterator imm_iter
;
10344 use_operand_p use_p
;
10351 if (dump_enabled_p ())
10352 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10353 "multiple types in nested loop.\n");
10358 latch_e
= loop_latch_edge (loop
->inner
);
10359 loop_arg
= PHI_ARG_DEF_FROM_EDGE (phi
, latch_e
);
10360 FOR_EACH_IMM_USE_FAST (use_p
, imm_iter
, loop_arg
)
10362 gimple
*use_stmt
= USE_STMT (use_p
);
10363 if (is_gimple_debug (use_stmt
))
10366 if (!flow_bb_inside_loop_p (loop
->inner
, gimple_bb (use_stmt
)))
10368 exit_phi
= use_stmt
;
10374 stmt_vec_info exit_phi_vinfo
= loop_vinfo
->lookup_stmt (exit_phi
);
10375 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo
)
10376 && !STMT_VINFO_LIVE_P (exit_phi_vinfo
)))
10378 if (dump_enabled_p ())
10379 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10380 "inner-loop induction only used outside "
10381 "of the outer vectorized loop.\n");
10386 nested_in_vect_loop
= true;
10387 iv_loop
= loop
->inner
;
10391 gcc_assert (iv_loop
== (gimple_bb (phi
))->loop_father
);
10393 if (slp_node
&& (!nunits
.is_constant () && SLP_TREE_LANES (slp_node
) != 1))
10395 /* The current SLP code creates the step value element-by-element. */
10396 if (dump_enabled_p ())
10397 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10398 "SLP induction not supported for variable-length"
10403 if (FLOAT_TYPE_P (vectype
) && !param_vect_induction_float
)
10405 if (dump_enabled_p ())
10406 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10407 "floating point induction vectorization disabled\n");
10411 tree step_expr
= STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info
);
10412 gcc_assert (step_expr
!= NULL_TREE
);
10413 if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
))
10414 && !type_has_mode_precision_p (TREE_TYPE (step_expr
)))
10416 if (dump_enabled_p ())
10417 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10418 "bit-precision induction vectorization not "
10422 tree step_vectype
= get_same_sized_vectype (TREE_TYPE (step_expr
), vectype
);
10424 /* Check for backend support of PLUS/MINUS_EXPR. */
10425 if (!directly_supported_p (PLUS_EXPR
, step_vectype
)
10426 || !directly_supported_p (MINUS_EXPR
, step_vectype
))
10429 if (!vec_stmt
) /* transformation not required. */
10431 unsigned inside_cost
= 0, prologue_cost
= 0;
10434 /* We eventually need to set a vector type on invariant
10438 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node
), j
, child
)
10439 if (!vect_maybe_update_slp_op_vectype
10440 (child
, SLP_TREE_VECTYPE (slp_node
)))
10442 if (dump_enabled_p ())
10443 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10444 "incompatible vector types for "
10448 /* loop cost for vec_loop. */
10450 = record_stmt_cost (cost_vec
,
10451 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
),
10452 vector_stmt
, stmt_info
, 0, vect_body
);
10453 /* prologue cost for vec_init (if not nested) and step. */
10454 prologue_cost
= record_stmt_cost (cost_vec
, 1 + !nested_in_vect_loop
,
10456 stmt_info
, 0, vect_prologue
);
10458 else /* if (!slp_node) */
10460 /* loop cost for vec_loop. */
10461 inside_cost
= record_stmt_cost (cost_vec
, ncopies
, vector_stmt
,
10462 stmt_info
, 0, vect_body
);
10463 /* prologue cost for vec_init and vec_step. */
10464 prologue_cost
= record_stmt_cost (cost_vec
, 2, scalar_to_vec
,
10465 stmt_info
, 0, vect_prologue
);
10467 if (dump_enabled_p ())
10468 dump_printf_loc (MSG_NOTE
, vect_location
,
10469 "vect_model_induction_cost: inside_cost = %d, "
10470 "prologue_cost = %d .\n", inside_cost
,
10473 STMT_VINFO_TYPE (stmt_info
) = induc_vec_info_type
;
10474 DUMP_VECT_SCOPE ("vectorizable_induction");
10480 /* Compute a vector variable, initialized with the first VF values of
10481 the induction variable. E.g., for an iv with IV_PHI='X' and
10482 evolution S, for a vector of 4 units, we want to compute:
10483 [X, X + S, X + 2*S, X + 3*S]. */
10485 if (dump_enabled_p ())
10486 dump_printf_loc (MSG_NOTE
, vect_location
, "transform induction phi.\n");
10488 pe
= loop_preheader_edge (iv_loop
);
10489 /* Find the first insertion point in the BB. */
10490 basic_block bb
= gimple_bb (phi
);
10491 si
= gsi_after_labels (bb
);
10493 /* For SLP induction we have to generate several IVs as for example
10494 with group size 3 we need
10495 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
10496 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
10499 /* The initial values are vectorized, but any lanes > group_size
10500 need adjustment. */
10502 = SLP_TREE_CHILDREN (slp_node
)[pe
->dest_idx
];
10504 /* Gather steps. Since we do not vectorize inductions as
10505 cycles we have to reconstruct the step from SCEV data. */
10506 unsigned group_size
= SLP_TREE_LANES (slp_node
);
10507 tree
*steps
= XALLOCAVEC (tree
, group_size
);
10508 tree
*inits
= XALLOCAVEC (tree
, group_size
);
10509 stmt_vec_info phi_info
;
10510 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node
), i
, phi_info
)
10512 steps
[i
] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info
);
10514 inits
[i
] = gimple_phi_arg_def (as_a
<gphi
*> (phi_info
->stmt
),
10518 /* Now generate the IVs. */
10519 unsigned nvects
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
10520 gcc_assert (multiple_p (nunits
* nvects
, group_size
));
10522 unsigned HOST_WIDE_INT const_nunits
;
10523 if (nested_in_vect_loop
)
10525 else if (nunits
.is_constant (&const_nunits
))
10527 /* Compute the number of distinct IVs we need. First reduce
10528 group_size if it is a multiple of const_nunits so we get
10529 one IV for a group_size of 4 but const_nunits 2. */
10530 unsigned group_sizep
= group_size
;
10531 if (group_sizep
% const_nunits
== 0)
10532 group_sizep
= group_sizep
/ const_nunits
;
10533 nivs
= least_common_multiple (group_sizep
,
10534 const_nunits
) / const_nunits
;
10538 gcc_assert (SLP_TREE_LANES (slp_node
) == 1);
10541 gimple_seq init_stmts
= NULL
;
10542 tree stept
= TREE_TYPE (step_vectype
);
10543 tree lupdate_mul
= NULL_TREE
;
10544 if (!nested_in_vect_loop
)
10546 if (nunits
.is_constant (&const_nunits
))
10548 /* The number of iterations covered in one vector iteration. */
10549 unsigned lup_mul
= (nvects
* const_nunits
) / group_size
;
10551 = build_vector_from_val (step_vectype
,
10552 SCALAR_FLOAT_TYPE_P (stept
)
10553 ? build_real_from_wide (stept
, lup_mul
,
10555 : build_int_cstu (stept
, lup_mul
));
10559 if (SCALAR_FLOAT_TYPE_P (stept
))
10561 tree tem
= build_int_cst (integer_type_node
, vf
);
10562 lupdate_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
,
10566 lupdate_mul
= build_int_cst (stept
, vf
);
10567 lupdate_mul
= gimple_build_vector_from_val (&init_stmts
,
10572 tree peel_mul
= NULL_TREE
;
10573 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
))
10575 if (SCALAR_FLOAT_TYPE_P (stept
))
10576 peel_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
, stept
,
10577 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
10579 peel_mul
= gimple_convert (&init_stmts
, stept
,
10580 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
));
10581 peel_mul
= gimple_build_vector_from_val (&init_stmts
,
10582 step_vectype
, peel_mul
);
10584 tree step_mul
= NULL_TREE
;
10586 auto_vec
<tree
> vec_steps
;
10587 for (ivn
= 0; ivn
< nivs
; ++ivn
)
10589 gimple_seq stmts
= NULL
;
10590 bool invariant
= true;
10591 if (nunits
.is_constant (&const_nunits
))
10593 tree_vector_builder
step_elts (step_vectype
, const_nunits
, 1);
10594 tree_vector_builder
init_elts (vectype
, const_nunits
, 1);
10595 tree_vector_builder
mul_elts (step_vectype
, const_nunits
, 1);
10596 for (unsigned eltn
= 0; eltn
< const_nunits
; ++eltn
)
10598 /* The scalar steps of the IVs. */
10599 tree elt
= steps
[(ivn
*const_nunits
+ eltn
) % group_size
];
10600 elt
= gimple_convert (&init_stmts
,
10601 TREE_TYPE (step_vectype
), elt
);
10602 step_elts
.quick_push (elt
);
10605 /* The scalar inits of the IVs if not vectorized. */
10606 elt
= inits
[(ivn
*const_nunits
+ eltn
) % group_size
];
10607 if (!useless_type_conversion_p (TREE_TYPE (vectype
),
10609 elt
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10610 TREE_TYPE (vectype
), elt
);
10611 init_elts
.quick_push (elt
);
10613 /* The number of steps to add to the initial values. */
10614 unsigned mul_elt
= (ivn
*const_nunits
+ eltn
) / group_size
;
10615 mul_elts
.quick_push (SCALAR_FLOAT_TYPE_P (stept
)
10616 ? build_real_from_wide (stept
, mul_elt
,
10618 : build_int_cstu (stept
, mul_elt
));
10620 vec_step
= gimple_build_vector (&init_stmts
, &step_elts
);
10621 step_mul
= gimple_build_vector (&init_stmts
, &mul_elts
);
10623 vec_init
= gimple_build_vector (&init_stmts
, &init_elts
);
10629 else if (INTEGRAL_TYPE_P (TREE_TYPE (steps
[0])))
10631 new_name
= gimple_convert (&init_stmts
, stept
, inits
[0]);
10632 /* Build the initial value directly as a VEC_SERIES_EXPR. */
10633 vec_init
= gimple_build (&init_stmts
, VEC_SERIES_EXPR
,
10634 step_vectype
, new_name
, steps
[0]);
10635 if (!useless_type_conversion_p (vectype
, step_vectype
))
10636 vec_init
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10637 vectype
, vec_init
);
10642 [base, base, base, ...]
10643 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10644 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (steps
[0])));
10645 gcc_assert (flag_associative_math
);
10646 tree index
= build_index_vector (step_vectype
, 0, 1);
10647 new_name
= gimple_convert (&init_stmts
, TREE_TYPE (steps
[0]),
10649 tree base_vec
= gimple_build_vector_from_val (&init_stmts
,
10652 tree step_vec
= gimple_build_vector_from_val (&init_stmts
,
10655 vec_init
= gimple_build (&init_stmts
, FLOAT_EXPR
,
10656 step_vectype
, index
);
10657 vec_init
= gimple_build (&init_stmts
, MULT_EXPR
,
10658 step_vectype
, vec_init
, step_vec
);
10659 vec_init
= gimple_build (&init_stmts
, PLUS_EXPR
,
10660 step_vectype
, vec_init
, base_vec
);
10661 if (!useless_type_conversion_p (vectype
, step_vectype
))
10662 vec_init
= gimple_build (&init_stmts
, VIEW_CONVERT_EXPR
,
10663 vectype
, vec_init
);
10665 /* iv_loop is nested in the loop to be vectorized. Generate:
10666 vec_step = [S, S, S, S] */
10667 t
= unshare_expr (steps
[0]);
10668 gcc_assert (CONSTANT_CLASS_P (t
)
10669 || TREE_CODE (t
) == SSA_NAME
);
10670 vec_step
= gimple_build_vector_from_val (&init_stmts
,
10673 vec_steps
.safe_push (vec_step
);
10677 step_mul
= peel_mul
;
10679 step_mul
= gimple_build (&init_stmts
,
10680 MINUS_EXPR
, step_vectype
,
10681 step_mul
, peel_mul
);
10684 /* Create the induction-phi that defines the induction-operand. */
10685 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
,
10687 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
10688 induc_def
= PHI_RESULT (induction_phi
);
10690 /* Create the iv update inside the loop */
10691 tree up
= vec_step
;
10694 if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
))
          /* When we're using loop_len produced by SELECT_VL, the
             non-final iterations are not always processing VF
             elements.  So vectorize induction variable instead of

               _21 = vect_vec_iv_.6_22 + { VF, ... };

             We should generate:

               _35 = .SELECT_VL (ivtmp_33, VF);
               vect_cst__22 = [vec_duplicate_expr] _35;
               _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
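          /* As an illustration (the length value below is invented, not
             taken from a real dump): with VF = 4 and scalar step S, a
             full iteration would add { 4*S, 4*S, 4*S, 4*S }, but if
             .SELECT_VL returns 3 for the final iteration the update
             built here is

               _35 = .SELECT_VL (ivtmp_33, 4);
               vect_cst__22 = [vec_duplicate_expr] _35;
               up = { S, S, S, S } * vect_cst__22;

             so the IV advances only by the number of elements actually
             processed.  */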
10707 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
10708 tree len
= vect_get_loop_len (loop_vinfo
, NULL
, lens
, 1,
10710 if (SCALAR_FLOAT_TYPE_P (stept
))
10711 expr
= gimple_build (&stmts
, FLOAT_EXPR
, stept
, len
);
10713 expr
= gimple_convert (&stmts
, stept
, len
);
10714 lupdate_mul
= gimple_build_vector_from_val (&stmts
,
10717 up
= gimple_build (&stmts
, MULT_EXPR
,
10718 step_vectype
, vec_step
, lupdate_mul
);
10721 up
= gimple_build (&init_stmts
,
10722 MULT_EXPR
, step_vectype
,
10723 vec_step
, lupdate_mul
);
10725 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
10726 vec_def
= gimple_build (&stmts
,
10727 PLUS_EXPR
, step_vectype
, vec_def
, up
);
10728 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
10729 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10730 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
10734 vec_init
= vect_get_slp_vect_def (init_node
, ivn
);
10735 if (!nested_in_vect_loop
10737 && !integer_zerop (step_mul
))
10739 gcc_assert (invariant
);
10740 vec_def
= gimple_convert (&init_stmts
, step_vectype
, vec_init
);
10741 up
= gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10742 vec_step
, step_mul
);
10743 vec_def
= gimple_build (&init_stmts
, PLUS_EXPR
, step_vectype
,
10745 vec_init
= gimple_convert (&init_stmts
, vectype
, vec_def
);
10748 /* Set the arguments of the phi node: */
10749 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
10751 slp_node
->push_vec_def (induction_phi
);
10753 if (!nested_in_vect_loop
)
10755 /* Fill up to the number of vectors we need for the whole group. */
10756 if (nunits
.is_constant (&const_nunits
))
10757 nivs
= least_common_multiple (group_size
,
10758 const_nunits
) / const_nunits
;
10761 vec_steps
.reserve (nivs
-ivn
);
10762 for (; ivn
< nivs
; ++ivn
)
10764 slp_node
->push_vec_def (SLP_TREE_VEC_DEFS (slp_node
)[0]);
10765 vec_steps
.quick_push (vec_steps
[0]);
10769 /* Re-use IVs when we can. We are generating further vector
10770 stmts by adding VF' * stride to the IVs generated above. */
10773 if (nunits
.is_constant (&const_nunits
))
10775 unsigned vfp
= (least_common_multiple (group_size
, const_nunits
)
10778 = build_vector_from_val (step_vectype
,
10779 SCALAR_FLOAT_TYPE_P (stept
)
10780 ? build_real_from_wide (stept
,
10782 : build_int_cstu (stept
, vfp
));
10786 if (SCALAR_FLOAT_TYPE_P (stept
))
10788 tree tem
= build_int_cst (integer_type_node
, nunits
);
10789 lupdate_mul
= gimple_build (&init_stmts
, FLOAT_EXPR
,
10793 lupdate_mul
= build_int_cst (stept
, nunits
);
10794 lupdate_mul
= gimple_build_vector_from_val (&init_stmts
,
10798 for (; ivn
< nvects
; ++ivn
)
10801 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node
)[ivn
- nivs
]);
10802 tree def
= gimple_get_lhs (iv
);
10804 vec_steps
[ivn
- nivs
]
10805 = gimple_build (&init_stmts
, MULT_EXPR
, step_vectype
,
10806 vec_steps
[ivn
- nivs
], lupdate_mul
);
10807 gimple_seq stmts
= NULL
;
10808 def
= gimple_convert (&stmts
, step_vectype
, def
);
10809 def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
10810 def
, vec_steps
[ivn
% nivs
]);
10811 def
= gimple_convert (&stmts
, vectype
, def
);
10812 if (gimple_code (iv
) == GIMPLE_PHI
)
10813 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
10816 gimple_stmt_iterator tgsi
= gsi_for_stmt (iv
);
10817 gsi_insert_seq_after (&tgsi
, stmts
, GSI_CONTINUE_LINKING
);
10819 slp_node
->push_vec_def (def
);
10823 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, init_stmts
);
10824 gcc_assert (!new_bb
);
10829 tree init_expr
= vect_phi_initial_value (phi
);
10831 gimple_seq stmts
= NULL
;
10832 if (!nested_in_vect_loop
)
10834 /* Convert the initial value to the IV update type. */
10835 tree new_type
= TREE_TYPE (step_expr
);
10836 init_expr
= gimple_convert (&stmts
, new_type
, init_expr
);
10838 /* If we are using the loop mask to "peel" for alignment then we need
10839 to adjust the start value here. */
10840 tree skip_niters
= LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo
);
10841 if (skip_niters
!= NULL_TREE
)
10843 if (FLOAT_TYPE_P (vectype
))
10844 skip_niters
= gimple_build (&stmts
, FLOAT_EXPR
, new_type
,
10847 skip_niters
= gimple_convert (&stmts
, new_type
, skip_niters
);
10848 tree skip_step
= gimple_build (&stmts
, MULT_EXPR
, new_type
,
10849 skip_niters
, step_expr
);
10850 init_expr
= gimple_build (&stmts
, MINUS_EXPR
, new_type
,
10851 init_expr
, skip_step
);
10857 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10858 gcc_assert (!new_bb
);
10861 /* Create the vector that holds the initial_value of the induction. */
10862 if (nested_in_vect_loop
)
10864 /* iv_loop is nested in the loop to be vectorized. init_expr had already
10865 been created during vectorization of previous stmts. We obtain it
10866 from the STMT_VINFO_VEC_STMT of the defining stmt. */
10867 auto_vec
<tree
> vec_inits
;
10868 vect_get_vec_defs_for_operand (loop_vinfo
, stmt_info
, 1,
10869 init_expr
, &vec_inits
);
10870 vec_init
= vec_inits
[0];
10871 /* If the initial value is not of proper type, convert it. */
10872 if (!useless_type_conversion_p (vectype
, TREE_TYPE (vec_init
)))
10875 = gimple_build_assign (vect_get_new_ssa_name (vectype
,
10879 build1 (VIEW_CONVERT_EXPR
, vectype
,
10881 vec_init
= gimple_assign_lhs (new_stmt
);
10882 new_bb
= gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop
),
10884 gcc_assert (!new_bb
);
10889 /* iv_loop is the loop to be vectorized. Create:
10890 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
10892 new_name
= gimple_convert (&stmts
, TREE_TYPE (step_expr
), init_expr
);
10894 unsigned HOST_WIDE_INT const_nunits
;
10895 if (nunits
.is_constant (&const_nunits
))
10897 tree_vector_builder
elts (step_vectype
, const_nunits
, 1);
10898 elts
.quick_push (new_name
);
10899 for (i
= 1; i
< const_nunits
; i
++)
10901 /* Create: new_name_i = new_name + step_expr */
10902 new_name
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (new_name
),
10903 new_name
, step_expr
);
10904 elts
.quick_push (new_name
);
10906 /* Create a vector from [new_name_0, new_name_1, ...,
10907 new_name_nunits-1] */
10908 vec_init
= gimple_build_vector (&stmts
, &elts
);
10910 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr
)))
10911 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10912 vec_init
= gimple_build (&stmts
, VEC_SERIES_EXPR
, step_vectype
,
10913 new_name
, step_expr
);
10917 [base, base, base, ...]
10918 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
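      /* For example (values chosen only for illustration): with
         init_expr X = 1.0, step_expr S = 0.5 and four lanes this
         computes

           [1.0, 1.0, 1.0, 1.0]
             + (vectype) [0, 1, 2, 3] * [0.5, 0.5, 0.5, 0.5]
           = [1.0, 1.5, 2.0, 2.5],

         replacing the repeated additions of the scalar loop, which is
         why reassociation (flag_associative_math) is required here.  */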
10919 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)));
10920 gcc_assert (flag_associative_math
);
10921 tree index
= build_index_vector (step_vectype
, 0, 1);
10922 tree base_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
10924 tree step_vec
= gimple_build_vector_from_val (&stmts
, step_vectype
,
10926 vec_init
= gimple_build (&stmts
, FLOAT_EXPR
, step_vectype
, index
);
10927 vec_init
= gimple_build (&stmts
, MULT_EXPR
, step_vectype
,
10928 vec_init
, step_vec
);
10929 vec_init
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
,
10930 vec_init
, base_vec
);
10932 vec_init
= gimple_convert (&stmts
, vectype
, vec_init
);
10936 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, stmts
);
10937 gcc_assert (!new_bb
);
10942 /* Create the vector that holds the step of the induction. */
10943 gimple_stmt_iterator
*step_iv_si
= NULL
;
10944 if (nested_in_vect_loop
)
10945 /* iv_loop is nested in the loop to be vectorized. Generate:
10946 vec_step = [S, S, S, S] */
10947 new_name
= step_expr
;
10948 else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
))
      /* When we're using loop_len produced by SELECT_VL, the non-final
         iterations are not always processing VF elements.  So vectorize
         induction variable instead of

           _21 = vect_vec_iv_.6_22 + { VF, ... };

         We should generate:

           _35 = .SELECT_VL (ivtmp_33, VF);
           vect_cst__22 = [vec_duplicate_expr] _35;
           _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
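      /* For instance (length value invented): with scalar step S and
         .SELECT_VL returning 5 for this iteration, the scalar update
         amount becomes new_name = 5 * S, which is broadcast below in
         place of the compile-time VF * S.  */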
10961 gcc_assert (!slp_node
);
10962 gimple_seq seq
= NULL
;
10963 vec_loop_lens
*lens
= &LOOP_VINFO_LENS (loop_vinfo
);
10964 tree len
= vect_get_loop_len (loop_vinfo
, NULL
, lens
, 1, vectype
, 0, 0);
10965 expr
= force_gimple_operand (fold_convert (TREE_TYPE (step_expr
),
10966 unshare_expr (len
)),
10967 &seq
, true, NULL_TREE
);
10968 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
), expr
,
10970 gsi_insert_seq_before (&si
, seq
, GSI_SAME_STMT
);
10975 /* iv_loop is the loop to be vectorized. Generate:
10976 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10977 gimple_seq seq
= NULL
;
10978 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
10980 expr
= build_int_cst (integer_type_node
, vf
);
10981 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
10984 expr
= build_int_cst (TREE_TYPE (step_expr
), vf
);
10985 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
10989 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
10990 gcc_assert (!new_bb
);
10994 t
= unshare_expr (new_name
);
10995 gcc_assert (CONSTANT_CLASS_P (new_name
)
10996 || TREE_CODE (new_name
) == SSA_NAME
);
10997 new_vec
= build_vector_from_val (step_vectype
, t
);
10998 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
10999 new_vec
, step_vectype
, step_iv_si
);
11002 /* Create the following def-use cycle:
11007 vec_iv = PHI <vec_init, vec_loop>
11011 vec_loop = vec_iv + vec_step; */
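  /* A minimal illustration (numbers invented): for a scalar IV with
     init 0, step 1 and VF = 4 this materializes

       preheader:  vec_init = { 0, 1, 2, 3 }
                   vec_step = { 4, 4, 4, 4 }
       loop:       vec_iv   = PHI <vec_init (preheader), vec_loop (latch)>
                   ...
                   vec_loop = vec_iv + vec_step;

     so in vector iteration i, lane L of vec_iv holds the scalar IV value
     of scalar iteration 4*i + L.  */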
11013 /* Create the induction-phi that defines the induction-operand. */
11014 vec_dest
= vect_get_new_vect_var (vectype
, vect_simple_var
, "vec_iv_");
11015 induction_phi
= create_phi_node (vec_dest
, iv_loop
->header
);
11016 induc_def
= PHI_RESULT (induction_phi
);
11018 /* Create the iv update inside the loop */
11020 vec_def
= gimple_convert (&stmts
, step_vectype
, induc_def
);
11021 vec_def
= gimple_build (&stmts
, PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
11022 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
11023 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11024 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
11026 /* Set the arguments of the phi node: */
11027 add_phi_arg (induction_phi
, vec_init
, pe
, UNKNOWN_LOCATION
);
11028 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
11031 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (induction_phi
);
11032 *vec_stmt
= induction_phi
;
11034 /* In case that vectorization factor (VF) is bigger than the number
11035 of elements that we can fit in a vectype (nunits), we have to generate
11036 more than one vector stmt - i.e - we need to "unroll" the
11037 vector stmt by a factor VF/nunits. For more details see documentation
11038 in vectorizable_operation. */
11042 gimple_seq seq
= NULL
;
11043 /* FORNOW. This restriction should be relaxed. */
11044 gcc_assert (!nested_in_vect_loop
);
11045 /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1. */
11046 gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo
));
11048 /* Create the vector that holds the step of the induction. */
11049 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr
)))
11051 expr
= build_int_cst (integer_type_node
, nunits
);
11052 expr
= gimple_build (&seq
, FLOAT_EXPR
, TREE_TYPE (step_expr
), expr
);
11055 expr
= build_int_cst (TREE_TYPE (step_expr
), nunits
);
11056 new_name
= gimple_build (&seq
, MULT_EXPR
, TREE_TYPE (step_expr
),
11060 new_bb
= gsi_insert_seq_on_edge_immediate (pe
, seq
);
11061 gcc_assert (!new_bb
);
11064 t
= unshare_expr (new_name
);
11065 gcc_assert (CONSTANT_CLASS_P (new_name
)
11066 || TREE_CODE (new_name
) == SSA_NAME
);
11067 new_vec
= build_vector_from_val (step_vectype
, t
);
11068 vec_step
= vect_init_vector (loop_vinfo
, stmt_info
,
11069 new_vec
, step_vectype
, NULL
);
11071 vec_def
= induc_def
;
11072 for (i
= 1; i
< ncopies
+ 1; i
++)
11074 /* vec_i = vec_prev + vec_step */
11075 gimple_seq stmts
= NULL
;
11076 vec_def
= gimple_convert (&stmts
, step_vectype
, vec_def
);
11077 vec_def
= gimple_build (&stmts
,
11078 PLUS_EXPR
, step_vectype
, vec_def
, vec_step
);
11079 vec_def
= gimple_convert (&stmts
, vectype
, vec_def
);
11081 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11084 new_stmt
= SSA_NAME_DEF_STMT (vec_def
);
11085 STMT_VINFO_VEC_STMTS (stmt_info
).safe_push (new_stmt
);
11089 /* vec_1 = vec_iv + (VF/n * S)
11090 vec_2 = vec_1 + (VF/n * S)
11092 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
11094 vec_n is used as vec_loop to save the large step register and
11095 related operations. */
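      /* For example (made-up numbers): with VF = 8, nunits = 4 and hence
         ncopies = 2, scalar step S and vec_iv = { 0, 1, 2, 3 } * S, the
         copies are

           vec_1 = vec_iv + { 4*S, ... } = { 4, 5, 6, 7 } * S
           vec_2 = vec_1  + { 4*S, ... } = { 8, 9, 10, 11 } * S

         and vec_2 doubles as the latch value vec_loop of the PHI.  */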
11096 add_phi_arg (induction_phi
, vec_def
, loop_latch_edge (iv_loop
),
11102 if (dump_enabled_p ())
11103 dump_printf_loc (MSG_NOTE
, vect_location
,
11104 "transform induction: created def-use cycle: %G%G",
11105 (gimple
*) induction_phi
, SSA_NAME_DEF_STMT (vec_def
));
/* Function vectorizable_live_operation_1.

   Helper function for vectorizable_live_operation.  */
11115 vectorizable_live_operation_1 (loop_vec_info loop_vinfo
,
11116 stmt_vec_info stmt_info
, basic_block exit_bb
,
11117 tree vectype
, int ncopies
, slp_tree slp_node
,
11118 tree bitsize
, tree bitstart
, tree vec_lhs
,
11119 tree lhs_type
, gimple_stmt_iterator
*exit_gsi
)
11121 gcc_assert (single_pred_p (exit_bb
) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
));
11123 tree vec_lhs_phi
= copy_ssa_name (vec_lhs
);
11124 gimple
*phi
= create_phi_node (vec_lhs_phi
, exit_bb
);
11125 for (unsigned i
= 0; i
< gimple_phi_num_args (phi
); i
++)
11126 SET_PHI_ARG_DEF (phi
, i
, vec_lhs
);
11128 gimple_seq stmts
= NULL
;
11131 /* If bitstart is 0 then we can use a BIT_FIELD_REF */
11132 if (integer_zerop (bitstart
))
11134 tree scalar_res
= gimple_build (&stmts
, BIT_FIELD_REF
, TREE_TYPE (vectype
),
11135 vec_lhs_phi
, bitsize
, bitstart
);
11137 /* Convert the extracted vector element to the scalar type. */
11138 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
11140 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
11144 SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>
11146 where VEC_LHS is the vectorized live-out result and MASK is
11147 the loop mask for the final iteration. */
11148 gcc_assert (ncopies
== 1
11149 && (!slp_node
|| SLP_TREE_LANES (slp_node
) == 1));
11150 gimple_seq tem
= NULL
;
11151 gimple_stmt_iterator gsi
= gsi_last (tem
);
11152 tree len
= vect_get_loop_len (loop_vinfo
, &gsi
,
11153 &LOOP_VINFO_LENS (loop_vinfo
),
11157 signed char biasval
= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
);
11158 tree bias_minus_one
11159 = int_const_binop (MINUS_EXPR
,
11160 build_int_cst (TREE_TYPE (len
), biasval
),
11161 build_one_cst (TREE_TYPE (len
)));
11163 /* LAST_INDEX = LEN + (BIAS - 1). */
11164 tree last_index
= gimple_build (&stmts
, PLUS_EXPR
, TREE_TYPE (len
),
11165 len
, bias_minus_one
);
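      /* Concretely (illustrative values, assuming the common case of a
         zero load/store bias): if the final iteration processes LEN = 7
         elements, then

           LAST_INDEX = 7 + (0 - 1) = 6

         selects the last lane that was actually written.  */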
11167 /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>. */
11169 = gimple_build (&stmts
, CFN_VEC_EXTRACT
, TREE_TYPE (vectype
),
11170 vec_lhs_phi
, last_index
);
11172 /* Convert the extracted vector element to the scalar type. */
11173 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
11175 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
))
11179 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
11181 where VEC_LHS is the vectorized live-out result and MASK is
11182 the loop mask for the final iteration. */
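      /* For instance (mask values invented): with MASK = { 1, 1, 1, 0 }
         in the final iteration, EXTRACT_LAST returns the element of
         VEC_LHS in the last active lane, here lane 2.  */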
11183 gcc_assert (!slp_node
|| SLP_TREE_LANES (slp_node
) == 1);
11184 tree scalar_type
= TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info
));
11185 gimple_seq tem
= NULL
;
11186 gimple_stmt_iterator gsi
= gsi_last (tem
);
11187 tree mask
= vect_get_loop_mask (loop_vinfo
, &gsi
,
11188 &LOOP_VINFO_MASKS (loop_vinfo
),
11191 gimple_seq_add_seq (&stmts
, tem
);
11193 scalar_res
= gimple_build (&stmts
, CFN_EXTRACT_LAST
, scalar_type
,
11194 mask
, vec_lhs_phi
);
11196 /* Convert the extracted vector element to the scalar type. */
11197 new_tree
= gimple_convert (&stmts
, lhs_type
, scalar_res
);
11201 tree bftype
= TREE_TYPE (vectype
);
11202 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
11203 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
11204 new_tree
= build3 (BIT_FIELD_REF
, bftype
, vec_lhs_phi
, bitsize
, bitstart
);
11205 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
11206 &stmts
, true, NULL_TREE
);
11209 *exit_gsi
= gsi_after_labels (exit_bb
);
11211 gsi_insert_seq_before (exit_gsi
, stmts
, GSI_SAME_STMT
);
11216 /* Function vectorizable_live_operation.
11218 STMT_INFO computes a value that is used outside the loop. Check if
11219 it can be supported. */
11222 vectorizable_live_operation (vec_info
*vinfo
, stmt_vec_info stmt_info
,
11223 slp_tree slp_node
, slp_instance slp_node_instance
,
11224 int slp_index
, bool vec_stmt_p
,
11225 stmt_vector_for_cost
*cost_vec
)
11227 loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
);
11228 imm_use_iterator imm_iter
;
11229 tree lhs
, lhs_type
, bitsize
;
11230 tree vectype
= (slp_node
11231 ? SLP_TREE_VECTYPE (slp_node
)
11232 : STMT_VINFO_VECTYPE (stmt_info
));
11233 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
11236 use_operand_p use_p
;
11237 auto_vec
<tree
> vec_oprnds
;
11239 poly_uint64 vec_index
= 0;
11241 gcc_assert (STMT_VINFO_LIVE_P (stmt_info
)
11242 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo
));
11244 /* If a stmt of a reduction is live, vectorize it via
11245 vect_create_epilog_for_reduction. vectorizable_reduction assessed
11246 validity so just trigger the transform here. */
11247 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info
)))
11251 /* For SLP reductions we vectorize the epilogue for all involved stmts
11253 if (slp_node
&& !REDUC_GROUP_FIRST_ELEMENT (stmt_info
) && slp_index
!= 0)
11255 stmt_vec_info reduc_info
= info_for_reduction (loop_vinfo
, stmt_info
);
11256 gcc_assert (reduc_info
->is_reduc_info
);
11257 if (STMT_VINFO_REDUC_TYPE (reduc_info
) == FOLD_LEFT_REDUCTION
11258 || STMT_VINFO_REDUC_TYPE (reduc_info
) == EXTRACT_LAST_REDUCTION
)
11261 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo
)
11262 || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
11263 vect_create_epilog_for_reduction (loop_vinfo
, stmt_info
, slp_node
,
11265 LOOP_VINFO_IV_EXIT (loop_vinfo
));
11267 /* If early break we only have to materialize the reduction on the merge
11268 block, but we have to find an alternate exit first. */
11269 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo
))
11271 slp_tree phis_node
= slp_node
? slp_node_instance
->reduc_phis
: NULL
;
11272 for (auto exit
: get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo
)))
11273 if (exit
!= LOOP_VINFO_IV_EXIT (loop_vinfo
))
11275 vect_create_epilog_for_reduction (loop_vinfo
, reduc_info
,
11276 phis_node
, slp_node_instance
,
11280 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
))
11281 vect_create_epilog_for_reduction (loop_vinfo
, reduc_info
,
11282 phis_node
, slp_node_instance
,
11283 LOOP_VINFO_IV_EXIT (loop_vinfo
));
11289 /* If STMT is not relevant and it is a simple assignment and its inputs are
11290 invariant then it can remain in place, unvectorized. The original last
11291 scalar value that it computes will be used. */
11292 if (!STMT_VINFO_RELEVANT_P (stmt_info
))
11294 gcc_assert (is_simple_and_all_uses_invariant (stmt_info
, loop_vinfo
));
11295 if (dump_enabled_p ())
11296 dump_printf_loc (MSG_NOTE
, vect_location
,
11297 "statement is simple and uses invariant. Leaving in "
11305 ncopies
= vect_get_num_copies (loop_vinfo
, vectype
);
11309 gcc_assert (slp_index
>= 0);
11311 /* Get the last occurrence of the scalar index from the concatenation of
11312 all the slp vectors. Calculate which slp vector it is and the index
11314 int num_scalar
= SLP_TREE_LANES (slp_node
);
11315 int num_vec
= SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
);
11316 poly_uint64 pos
= (num_vec
* nunits
) - num_scalar
+ slp_index
;
11318 /* Calculate which vector contains the result, and which lane of
11319 that vector we need. */
11320 if (!can_div_trunc_p (pos
, nunits
, &vec_entry
, &vec_index
))
11322 if (dump_enabled_p ())
11323 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11324 "Cannot determine which vector holds the"
11325 " final result.\n");
11332 /* No transformation required. */
11333 if (loop_vinfo
&& LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
))
11335 if (slp_node
&& SLP_TREE_LANES (slp_node
) != 1)
11337 if (dump_enabled_p ())
11338 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11339 "can't operate on partial vectors "
11340 "because an SLP statement is live after "
11342 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
11344 else if (ncopies
> 1
11345 || (slp_node
&& SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node
) > 1))
11347 if (dump_enabled_p ())
11348 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11349 "can't operate on partial vectors "
11350 "because ncopies is greater than 1.\n");
11351 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
11355 gcc_assert (ncopies
== 1
11356 && (!slp_node
|| SLP_TREE_LANES (slp_node
) == 1));
11357 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST
, vectype
,
11358 OPTIMIZE_FOR_SPEED
))
11359 vect_record_loop_mask (loop_vinfo
,
11360 &LOOP_VINFO_MASKS (loop_vinfo
),
11362 else if (can_vec_extract_var_idx_p (
11363 TYPE_MODE (vectype
), TYPE_MODE (TREE_TYPE (vectype
))))
11364 vect_record_loop_len (loop_vinfo
,
11365 &LOOP_VINFO_LENS (loop_vinfo
),
11369 if (dump_enabled_p ())
11371 MSG_MISSED_OPTIMIZATION
, vect_location
,
11372 "can't operate on partial vectors "
11373 "because the target doesn't support extract "
11374 "last reduction.\n");
11375 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo
) = false;
11379 /* ??? Enable for loop costing as well. */
11381 record_stmt_cost (cost_vec
, 1, vec_to_scalar
, stmt_info
, NULL_TREE
,
11386 /* Use the lhs of the original scalar statement. */
11387 gimple
*stmt
= vect_orig_stmt (stmt_info
)->stmt
;
11388 if (dump_enabled_p ())
11389 dump_printf_loc (MSG_NOTE
, vect_location
, "extracting lane for live "
11392 lhs
= gimple_get_lhs (stmt
);
11393 lhs_type
= TREE_TYPE (lhs
);
11395 bitsize
= vector_element_bits_tree (vectype
);
11397 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
11398 tree vec_lhs
, vec_lhs0
, bitstart
;
11399 gimple
*vec_stmt
, *vec_stmt0
;
11402 gcc_assert (!loop_vinfo
11403 || ((!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo
)
11404 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo
))
11405 || SLP_TREE_LANES (slp_node
) == 1));
11407 /* Get the correct slp vectorized stmt. */
11408 vec_lhs
= SLP_TREE_VEC_DEFS (slp_node
)[vec_entry
];
11409 vec_stmt
= SSA_NAME_DEF_STMT (vec_lhs
);
11411 /* In case we need to early break vectorize also get the first stmt. */
11412 vec_lhs0
= SLP_TREE_VEC_DEFS (slp_node
)[0];
11413 vec_stmt0
= SSA_NAME_DEF_STMT (vec_lhs0
);
11415 /* Get entry to use. */
11416 bitstart
= bitsize_int (vec_index
);
11417 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitstart
);
11421 /* For multiple copies, get the last copy. */
11422 vec_stmt
= STMT_VINFO_VEC_STMTS (stmt_info
).last ();
11423 vec_lhs
= gimple_get_lhs (vec_stmt
);
11425 /* In case we need to early break vectorize also get the first stmt. */
11426 vec_stmt0
= STMT_VINFO_VEC_STMTS (stmt_info
)[0];
11427 vec_lhs0
= gimple_get_lhs (vec_stmt0
);
11429 /* Get the last lane in the vector. */
11430 bitstart
= int_const_binop (MULT_EXPR
, bitsize
, bitsize_int (nunits
- 1));
11435 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
11436 requirement, insert one phi node for it. It looks like:
11443 # vec_lhs' = PHI <vec_lhs>
11444 new_tree = lane_extract <vec_lhs', ...>;
11445 lhs' = new_tree; */
11447 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
  /* Check if we have a loop where the chosen exit is not the main exit;
     in these cases, for an early break we restart the iteration the
     vector code did.  For the live values we want the value at the start
     of the iteration rather than at the end.  */
11452 edge main_e
= LOOP_VINFO_IV_EXIT (loop_vinfo
);
11453 bool all_exits_as_early_p
= LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo
);
11454 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11455 if (!is_gimple_debug (use_stmt
)
11456 && !flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)))
11457 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
11459 edge e
= gimple_phi_arg_edge (as_a
<gphi
*> (use_stmt
),
11460 phi_arg_index_from_use (use_p
));
11461 gcc_assert (loop_exit_edge_p (loop
, e
));
11462 bool main_exit_edge
= e
== main_e
;
11463 tree tmp_vec_lhs
= vec_lhs
;
11464 tree tmp_bitstart
= bitstart
;
11466 /* For early exit where the exit is not in the BB that leads
11467 to the latch then we're restarting the iteration in the
11468 scalar loop. So get the first live value. */
11469 if ((all_exits_as_early_p
|| !main_exit_edge
)
11470 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_induction_def
)
11472 tmp_vec_lhs
= vec_lhs0
;
11473 tmp_bitstart
= build_zero_cst (TREE_TYPE (bitstart
));
11476 gimple_stmt_iterator exit_gsi
;
11478 = vectorizable_live_operation_1 (loop_vinfo
, stmt_info
,
11479 e
->dest
, vectype
, ncopies
,
11481 tmp_bitstart
, tmp_vec_lhs
,
11482 lhs_type
, &exit_gsi
);
11484 auto gsi
= gsi_for_stmt (use_stmt
);
11485 tree lhs_phi
= gimple_phi_result (use_stmt
);
11486 remove_phi_node (&gsi
, false);
11487 gimple
*copy
= gimple_build_assign (lhs_phi
, new_tree
);
11488 gsi_insert_before (&exit_gsi
, copy
, GSI_SAME_STMT
);
11492 /* There are no further out-of-loop uses of lhs by LC-SSA construction. */
11493 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11494 gcc_assert (flow_bb_inside_loop_p (loop
, gimple_bb (use_stmt
)));
11498 /* For basic-block vectorization simply insert the lane-extraction. */
11499 tree bftype
= TREE_TYPE (vectype
);
11500 if (VECTOR_BOOLEAN_TYPE_P (vectype
))
11501 bftype
= build_nonstandard_integer_type (tree_to_uhwi (bitsize
), 1);
11502 tree new_tree
= build3 (BIT_FIELD_REF
, bftype
,
11503 vec_lhs
, bitsize
, bitstart
);
11504 gimple_seq stmts
= NULL
;
11505 new_tree
= force_gimple_operand (fold_convert (lhs_type
, new_tree
),
11506 &stmts
, true, NULL_TREE
);
11507 if (TREE_CODE (new_tree
) == SSA_NAME
11508 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs
))
11509 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree
) = 1;
11510 if (is_a
<gphi
*> (vec_stmt
))
11512 gimple_stmt_iterator si
= gsi_after_labels (gimple_bb (vec_stmt
));
11513 gsi_insert_seq_before (&si
, stmts
, GSI_SAME_STMT
);
11517 gimple_stmt_iterator si
= gsi_for_stmt (vec_stmt
);
11518 gsi_insert_seq_after (&si
, stmts
, GSI_SAME_STMT
);
11521 /* Replace use of lhs with newly computed result. If the use stmt is a
11522 single arg PHI, just replace all uses of PHI result. It's necessary
11523 because lcssa PHI defining lhs may be before newly inserted stmt. */
11524 use_operand_p use_p
;
11525 stmt_vec_info use_stmt_info
;
11526 FOR_EACH_IMM_USE_STMT (use_stmt
, imm_iter
, lhs
)
11527 if (!is_gimple_debug (use_stmt
)
11528 && (!(use_stmt_info
= vinfo
->lookup_stmt (use_stmt
))
11529 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info
))))
11531 /* ??? This can happen when the live lane ends up being
11532 rooted in a vector construction code-generated by an
11533 external SLP node (and code-generation for that already
11534 happened). See gcc.dg/vect/bb-slp-47.c.
11535 Doing this is what would happen if that vector CTOR
11536 were not code-generated yet so it is not too bad.
11537 ??? In fact we'd likely want to avoid this situation
11538 in the first place. */
11539 if (TREE_CODE (new_tree
) == SSA_NAME
11540 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
11541 && gimple_code (use_stmt
) != GIMPLE_PHI
11542 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree
),
11545 if (dump_enabled_p ())
11546 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11547 "Using original scalar computation for "
11548 "live lane because use preceeds vector "
11552 /* ??? It can also happen that we end up pulling a def into
11553 a loop where replacing out-of-loop uses would require
11554 a new LC SSA PHI node. Retain the original scalar in
11555 those cases as well. PR98064. */
11556 if (TREE_CODE (new_tree
) == SSA_NAME
11557 && !SSA_NAME_IS_DEFAULT_DEF (new_tree
)
11558 && (gimple_bb (use_stmt
)->loop_father
11559 != gimple_bb (vec_stmt
)->loop_father
)
11560 && !flow_loop_nested_p (gimple_bb (vec_stmt
)->loop_father
,
11561 gimple_bb (use_stmt
)->loop_father
))
11563 if (dump_enabled_p ())
11564 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
11565 "Using original scalar computation for "
11566 "live lane because there is an out-of-loop "
11567 "definition for it\n");
11570 FOR_EACH_IMM_USE_ON_STMT (use_p
, imm_iter
)
11571 SET_USE (use_p
, new_tree
);
11572 update_stmt (use_stmt
);
11579 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11582 vect_loop_kill_debug_uses (class loop
*loop
, stmt_vec_info stmt_info
)
11584 ssa_op_iter op_iter
;
11585 imm_use_iterator imm_iter
;
11586 def_operand_p def_p
;
11589 FOR_EACH_PHI_OR_STMT_DEF (def_p
, stmt_info
->stmt
, op_iter
, SSA_OP_DEF
)
11591 FOR_EACH_IMM_USE_STMT (ustmt
, imm_iter
, DEF_FROM_PTR (def_p
))
11595 if (!is_gimple_debug (ustmt
))
11598 bb
= gimple_bb (ustmt
);
11600 if (!flow_bb_inside_loop_p (loop
, bb
))
11602 if (gimple_debug_bind_p (ustmt
))
11604 if (dump_enabled_p ())
11605 dump_printf_loc (MSG_NOTE
, vect_location
,
11606 "killing debug use\n");
11608 gimple_debug_bind_reset_value (ustmt
);
11609 update_stmt (ustmt
);
11612 gcc_unreachable ();
/* Given loop represented by LOOP_VINFO, return true if computation of
   LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
   otherwise.  */

static bool
loop_niters_no_overflow (loop_vec_info loop_vinfo)
{
  /* Constant case.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
      tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);

      gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
      gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
      if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
        return true;
    }

  widest_int max;
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  /* Check the upper bound of loop niters.  */
  if (get_max_loop_iterations (loop, &max))
    {
      tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
      signop sgn = TYPE_SIGN (type);
      widest_int type_max = widest_int::from (wi::max_value (type), sgn);
      if (max < type_max)
        return true;
    }
  return false;
}
/* Return a mask type with half the number of elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
  return build_truth_vector_type_for_mode (nunits, new_mode);
}
/* Return a mask type with twice as many elements as OLD_TYPE,
   given that it should have mode NEW_MODE.  */

tree
vect_double_mask_nunits (tree old_type, machine_mode new_mode)
{
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
  return build_truth_vector_type_for_mode (nunits, new_mode);
}
/* Record that a fully-masked version of LOOP_VINFO would need MASKS to
   contain a sequence of NVECTORS masks that each control a vector of type
   VECTYPE.  If SCALAR_MASK is nonnull, the fully-masked loop would AND
   these vector masks with the vector version of SCALAR_MASK.  */

void
vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
                       unsigned int nvectors, tree vectype, tree scalar_mask)
{
  gcc_assert (nvectors != 0);

  if (scalar_mask)
    {
      scalar_cond_masked_key cond (scalar_mask, nvectors);
      loop_vinfo->scalar_cond_masked_set.add (cond);
    }

  masks->mask_set.add (std::make_pair (vectype, nvectors));
}
11691 /* Given a complete set of masks MASKS, extract mask number INDEX
11692 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
11693 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
11695 See the comment above vec_loop_masks for more details about the mask
11699 vect_get_loop_mask (loop_vec_info loop_vinfo
,
11700 gimple_stmt_iterator
*gsi
, vec_loop_masks
*masks
,
11701 unsigned int nvectors
, tree vectype
, unsigned int index
)
11703 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
11704 == vect_partial_vectors_while_ult
)
11706 rgroup_controls
*rgm
= &(masks
->rgc_vec
)[nvectors
- 1];
11707 tree mask_type
= rgm
->type
;
11709 /* Populate the rgroup's mask array, if this is the first time we've
11711 if (rgm
->controls
.is_empty ())
11713 rgm
->controls
.safe_grow_cleared (nvectors
, true);
11714 for (unsigned int i
= 0; i
< nvectors
; ++i
)
11716 tree mask
= make_temp_ssa_name (mask_type
, NULL
, "loop_mask");
11717 /* Provide a dummy definition until the real one is available. */
11718 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
11719 rgm
->controls
[i
] = mask
;
11723 tree mask
= rgm
->controls
[index
];
11724 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type
),
11725 TYPE_VECTOR_SUBPARTS (vectype
)))
11727 /* A loop mask for data type X can be reused for data type Y
11728 if X has N times more elements than Y and if Y's elements
11729 are N times bigger than X's. In this case each sequence
11730 of N elements in the loop mask will be all-zero or all-one.
11731 We can then view-convert the mask so that each sequence of
11732 N elements is replaced by a single element. */
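          /* For example (types picked only for illustration): a mask with
             16 elements computed for chars can be reused for a vector of
             8 shorts, because each pair of adjacent mask elements is known
             to be all-zero or all-one; the VIEW_CONVERT_EXPR below simply
             reinterprets it as the 8-element mask type.  */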
11733 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type
),
11734 TYPE_VECTOR_SUBPARTS (vectype
)));
11735 gimple_seq seq
= NULL
;
11736 mask_type
= truth_type_for (vectype
);
11737 mask
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, mask
);
11739 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
11743 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo
)
11744 == vect_partial_vectors_avx512
)
11746 /* The number of scalars per iteration and the number of vectors are
11747 both compile-time constants. */
11748 unsigned int nscalars_per_iter
11749 = exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
11750 LOOP_VINFO_VECT_FACTOR (loop_vinfo
)).to_constant ();
11752 rgroup_controls
*rgm
= &masks
->rgc_vec
[nscalars_per_iter
- 1];
11754 /* The stored nV is dependent on the mask type produced. */
11755 gcc_assert (exact_div (nvectors
* TYPE_VECTOR_SUBPARTS (vectype
),
11756 TYPE_VECTOR_SUBPARTS (rgm
->type
)).to_constant ()
11758 nvectors
= rgm
->factor
;
11760 /* Populate the rgroup's mask array, if this is the first time we've
11762 if (rgm
->controls
.is_empty ())
11764 rgm
->controls
.safe_grow_cleared (nvectors
, true);
11765 for (unsigned int i
= 0; i
< nvectors
; ++i
)
11767 tree mask
= make_temp_ssa_name (rgm
->type
, NULL
, "loop_mask");
11768 /* Provide a dummy definition until the real one is available. */
11769 SSA_NAME_DEF_STMT (mask
) = gimple_build_nop ();
11770 rgm
->controls
[i
] = mask
;
11773 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm
->type
),
11774 TYPE_VECTOR_SUBPARTS (vectype
)))
11775 return rgm
->controls
[index
];
11777 /* Split the vector if needed. Since we are dealing with integer mode
11778 masks with AVX512 we can operate on the integer representation
11779 performing the whole vector shifting. */
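      /* A small worked example (numbers invented): if the rgroup mask
         type has 64 elements and VECTYPE has 16, then FACTOR is 4;
         asking for INDEX 6 selects stored control vi = 6 / 4 = 1 and
         sub-part vpart = 6 % 4 = 2, so the integer view of that control
         is shifted right by 16 * 2 = 32 bits before being converted back
         to a 16-element mask.  */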
11780 unsigned HOST_WIDE_INT factor
;
11781 bool ok
= constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm
->type
),
11782 TYPE_VECTOR_SUBPARTS (vectype
), &factor
);
11784 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm
->type
)) == MODE_INT
);
11785 tree mask_type
= truth_type_for (vectype
);
11786 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type
)) == MODE_INT
);
11787 unsigned vi
= index
/ factor
;
11788 unsigned vpart
= index
% factor
;
11789 tree vec
= rgm
->controls
[vi
];
11790 gimple_seq seq
= NULL
;
11791 vec
= gimple_build (&seq
, VIEW_CONVERT_EXPR
,
11792 lang_hooks
.types
.type_for_mode
11793 (TYPE_MODE (rgm
->type
), 1), vec
);
11794 /* For integer mode masks simply shift the right bits into position. */
11796 vec
= gimple_build (&seq
, RSHIFT_EXPR
, TREE_TYPE (vec
), vec
,
11797 build_int_cst (integer_type_node
,
11798 (TYPE_VECTOR_SUBPARTS (vectype
)
11800 vec
= gimple_convert (&seq
, lang_hooks
.types
.type_for_mode
11801 (TYPE_MODE (mask_type
), 1), vec
);
11802 vec
= gimple_build (&seq
, VIEW_CONVERT_EXPR
, mask_type
, vec
);
11804 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
11808 gcc_unreachable ();
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
                      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, the scalar occupied bytes and
     the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
                 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall
         back to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
                  || (rgl->factor == 1 && factor == 1)
                  || (rgl->max_nscalars_per_iter * rgl->factor
                      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
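/* As an illustration of the bookkeeping above (numbers invented): with
   VF = 16 and NVECTORS = 2 vectors of 8 elements each, the rgroup is
   recorded in (*LENS)[1] and its nscalars_per_iter works out to
   exact_div (2 * 8, 16) = 1, i.e. one scalar of this group per scalar
   iteration.  */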
/* Given a complete set of lengths LENS, extract length number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   multiplied by the number of elements that should be processed.
   Insert any set-up statements before GSI.  */
11852 vect_get_loop_len (loop_vec_info loop_vinfo
, gimple_stmt_iterator
*gsi
,
11853 vec_loop_lens
*lens
, unsigned int nvectors
, tree vectype
,
11854 unsigned int index
, unsigned int factor
)
11856 rgroup_controls
*rgl
= &(*lens
)[nvectors
- 1];
11857 bool use_bias_adjusted_len
=
11858 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo
) != 0;
11860 /* Populate the rgroup's len array, if this is the first time we've
11862 if (rgl
->controls
.is_empty ())
11864 rgl
->controls
.safe_grow_cleared (nvectors
, true);
11865 for (unsigned int i
= 0; i
< nvectors
; ++i
)
11867 tree len_type
= LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo
);
11868 gcc_assert (len_type
!= NULL_TREE
);
11870 tree len
= make_temp_ssa_name (len_type
, NULL
, "loop_len");
11872 /* Provide a dummy definition until the real one is available. */
11873 SSA_NAME_DEF_STMT (len
) = gimple_build_nop ();
11874 rgl
->controls
[i
] = len
;
11876 if (use_bias_adjusted_len
)
11878 gcc_assert (i
== 0);
11879 tree adjusted_len
=
11880 make_temp_ssa_name (len_type
, NULL
, "adjusted_loop_len");
11881 SSA_NAME_DEF_STMT (adjusted_len
) = gimple_build_nop ();
11882 rgl
->bias_adjusted_ctrl
= adjusted_len
;
11887 if (use_bias_adjusted_len
)
11888 return rgl
->bias_adjusted_ctrl
;
11890 tree loop_len
= rgl
->controls
[index
];
11891 if (rgl
->factor
== 1 && factor
== 1)
11893 poly_int64 nunits1
= TYPE_VECTOR_SUBPARTS (rgl
->type
);
11894 poly_int64 nunits2
= TYPE_VECTOR_SUBPARTS (vectype
);
11895 if (maybe_ne (nunits1
, nunits2
))
11897 /* A loop len for data type X can be reused for data type Y
11898 if X has N times more elements than Y and if Y's elements
11899 are N times bigger than X's. */
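      /* For example (types picked only for illustration): a length
         recorded for a 16-element byte vector can serve an 8-element
         halfword vector of the same size; nunits1 / nunits2 gives
         factor = 2, so a length of 12 bytes is divided down to the
         6 halfword elements that should be processed.  */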
11900 gcc_assert (multiple_p (nunits1
, nunits2
));
11901 factor
= exact_div (nunits1
, nunits2
).to_constant ();
11902 tree iv_type
= LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo
);
11903 gimple_seq seq
= NULL
;
11904 loop_len
= gimple_build (&seq
, RDIV_EXPR
, iv_type
, loop_len
,
11905 build_int_cst (iv_type
, factor
));
11907 gsi_insert_seq_before (gsi
, seq
, GSI_SAME_STMT
);
/* Generate the tree for the loop len mask and return it.  Given LENS,
   NVECTORS, VECTYPE, INDEX and FACTOR, generate the len mask as below.

     tree len_mask = VCOND_MASK_LEN (compare_mask, ones, zero, len, bias)  */

tree
vect_gen_loop_len_mask (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
                        gimple_stmt_iterator *cond_gsi, vec_loop_lens *lens,
                        unsigned int nvectors, tree vectype, tree stmt,
                        unsigned int index, unsigned int factor)
{
  tree all_one_mask = build_all_ones_cst (vectype);
  tree all_zero_mask = build_zero_cst (vectype);
  tree len = vect_get_loop_len (loop_vinfo, gsi, lens, nvectors, vectype,
                                index, factor);
  tree bias = build_int_cst (intQI_type_node,
                             LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo));
  tree len_mask = make_temp_ssa_name (TREE_TYPE (stmt), NULL, "vec_len_mask");
  gcall *call = gimple_build_call_internal (IFN_VCOND_MASK_LEN, 5, stmt,
                                            all_one_mask, all_zero_mask, len,
                                            bias);
  gimple_call_set_lhs (call, len_mask);
  gsi_insert_before (cond_gsi, call, GSI_SAME_STMT);

  return len_mask;
}
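/* For example (SSA names invented), for a comparison mask mask__5 this emits
   something along the lines of

     vec_len_mask_7 = .VCOND_MASK_LEN (mask__5, { -1, ... }, { 0, ... },
                                       loop_len_9, 0);

   so lanes at or beyond the (bias-adjusted) length read as inactive.  */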
/* Scale profiling counters by estimation for LOOP which is vectorized
   by factor of VF.
   If FLAT is true, the loop we started with had unrealistically flat
   profile.  */
11946 scale_profile_for_vect_loop (class loop
*loop
, edge exit_e
, unsigned vf
, bool flat
)
11948 /* For flat profiles do not scale down proportionally by VF and only
11949 cap by known iteration count bounds. */
11952 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
11953 fprintf (dump_file
,
11954 "Vectorized loop profile seems flat; not scaling iteration "
11955 "count down by the vectorization factor %i\n", vf
);
11956 scale_loop_profile (loop
, profile_probability::always (),
11957 get_likely_max_loop_iterations_int (loop
));
11960 /* Loop body executes VF fewer times and exit increases VF times. */
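  /* For instance (counts invented): with a reliable profile, a header
     count of 4000 and VF = 4, the header count is scaled down to roughly
     1000 while the exit edge becomes about four times more likely; the
     preheader (entry) count is left unchanged.  */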
11961 profile_count entry_count
= loop_preheader_edge (loop
)->count ();
  /* If we have unreliable loop profile avoid dropping entry
     count below header count.  This can happen since loops
     have unrealistically low trip counts.  */
11967 && loop
->header
->count
> entry_count
11968 && loop
->header
->count
< entry_count
* vf
)
11970 if (dump_file
&& (dump_flags
& TDF_DETAILS
))
11971 fprintf (dump_file
,
11972 "Vectorization factor %i seems too large for profile "
11973 "prevoiusly believed to be consistent; reducing.\n", vf
);
11977 if (entry_count
.nonzero_p ())
11978 set_edge_probability_and_rescale_others
11980 entry_count
.probability_in (loop
->header
->count
/ vf
));
11981 /* Avoid producing very large exit probability when we do not have
11982 sensible profile. */
11983 else if (exit_e
->probability
< profile_probability::always () / (vf
* 2))
11984 set_edge_probability_and_rescale_others (exit_e
, exit_e
->probability
* vf
);
11985 loop
->latch
->count
= single_pred_edge (loop
->latch
)->count ();
11987 scale_loop_profile (loop
, profile_probability::always () / vf
,
11988 get_likely_max_loop_iterations_int (loop
));
11991 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
11992 latch edge values originally defined by it. */
11995 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo
,
11996 stmt_vec_info def_stmt_info
)
11998 tree def
= gimple_get_lhs (vect_orig_stmt (def_stmt_info
)->stmt
);
11999 if (!def
|| TREE_CODE (def
) != SSA_NAME
)
12001 stmt_vec_info phi_info
;
12002 imm_use_iterator iter
;
12003 use_operand_p use_p
;
12004 FOR_EACH_IMM_USE_FAST (use_p
, iter
, def
)
12006 gphi
*phi
= dyn_cast
<gphi
*> (USE_STMT (use_p
));
12009 if (!(gimple_bb (phi
)->loop_father
->header
== gimple_bb (phi
)
12010 && (phi_info
= loop_vinfo
->lookup_stmt (phi
))
12011 && STMT_VINFO_RELEVANT_P (phi_info
)))
12013 loop_p loop
= gimple_bb (phi
)->loop_father
;
12014 edge e
= loop_latch_edge (loop
);
12015 if (PHI_ARG_DEF_FROM_EDGE (phi
, e
) != def
)
12018 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info
))
12019 && STMT_VINFO_REDUC_TYPE (phi_info
) != FOLD_LEFT_REDUCTION
12020 && STMT_VINFO_REDUC_TYPE (phi_info
) != EXTRACT_LAST_REDUCTION
)
12022 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
12023 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
12024 gcc_assert (phi_defs
.length () == latch_defs
.length ());
12025 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
12026 add_phi_arg (as_a
<gphi
*> (phi_defs
[i
]),
12027 gimple_get_lhs (latch_defs
[i
]), e
,
12028 gimple_phi_arg_location (phi
, e
->dest_idx
));
12030 else if (STMT_VINFO_DEF_TYPE (phi_info
) == vect_first_order_recurrence
)
12032 /* For first order recurrences we have to update both uses of
12033 the latch definition, the one in the PHI node and the one
12034 in the generated VEC_PERM_EXPR. */
12035 vec
<gimple
*> &phi_defs
= STMT_VINFO_VEC_STMTS (phi_info
);
12036 vec
<gimple
*> &latch_defs
= STMT_VINFO_VEC_STMTS (def_stmt_info
);
12037 gcc_assert (phi_defs
.length () == latch_defs
.length ());
12038 tree phidef
= gimple_assign_rhs1 (phi_defs
[0]);
12039 gphi
*vphi
= as_a
<gphi
*> (SSA_NAME_DEF_STMT (phidef
));
12040 for (unsigned i
= 0; i
< phi_defs
.length (); ++i
)
12042 gassign
*perm
= as_a
<gassign
*> (phi_defs
[i
]);
12044 gimple_assign_set_rhs1 (perm
, gimple_get_lhs (latch_defs
[i
-1]));
12045 gimple_assign_set_rhs2 (perm
, gimple_get_lhs (latch_defs
[i
]));
12046 update_stmt (perm
);
12048 add_phi_arg (vphi
, gimple_get_lhs (latch_defs
.last ()), e
,
12049 gimple_phi_arg_location (phi
, e
->dest_idx
));
12054 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
12055 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
12059 vect_transform_loop_stmt (loop_vec_info loop_vinfo
, stmt_vec_info stmt_info
,
12060 gimple_stmt_iterator
*gsi
, stmt_vec_info
*seen_store
)
12062 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
12063 poly_uint64 vf
= LOOP_VINFO_VECT_FACTOR (loop_vinfo
);
12065 if (dump_enabled_p ())
12066 dump_printf_loc (MSG_NOTE
, vect_location
,
12067 "------>vectorizing statement: %G", stmt_info
->stmt
);
12069 if (MAY_HAVE_DEBUG_BIND_STMTS
&& !STMT_VINFO_LIVE_P (stmt_info
))
12070 vect_loop_kill_debug_uses (loop
, stmt_info
);
12072 if (!STMT_VINFO_RELEVANT_P (stmt_info
)
12073 && !STMT_VINFO_LIVE_P (stmt_info
))
12075 if (is_gimple_call (stmt_info
->stmt
)
12076 && gimple_call_internal_p (stmt_info
->stmt
, IFN_MASK_CALL
))
12078 gcc_assert (!gimple_call_lhs (stmt_info
->stmt
));
12079 *seen_store
= stmt_info
;
12085 if (STMT_VINFO_VECTYPE (stmt_info
))
12088 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info
));
12089 if (!STMT_SLP_TYPE (stmt_info
)
12090 && maybe_ne (nunits
, vf
)
12091 && dump_enabled_p ())
12092 /* For SLP VF is set according to unrolling factor, and not
12093 to vector size, hence for SLP this print is not valid. */
12094 dump_printf_loc (MSG_NOTE
, vect_location
, "multiple-types.\n");
12097 /* Pure SLP statements have already been vectorized. We still need
12098 to apply loop vectorization to hybrid SLP statements. */
12099 if (PURE_SLP_STMT (stmt_info
))
12102 if (dump_enabled_p ())
12103 dump_printf_loc (MSG_NOTE
, vect_location
, "transform statement.\n");
12105 if (vect_transform_stmt (loop_vinfo
, stmt_info
, gsi
, NULL
, NULL
))
12106 *seen_store
= stmt_info
;
/* Helper function to pass to simplify_replace_tree to enable replacing trees
   in the hash_map with their corresponding values.  */

static tree
find_in_mapping (tree t, void *context)
{
  hash_map<tree, tree> *mapping = (hash_map<tree, tree> *) context;

  tree *value = mapping->get (t);
  return value ? *value : t;
}
12123 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
12124 original loop that has now been vectorized.
12126 The inits of the data_references need to be advanced with the number of
12127 iterations of the main loop. This has been computed in vect_do_peeling and
12128 is stored in parameter ADVANCE. We first restore the data_references
12129 initial offset with the values recored in ORIG_DRS_INIT.
12131 Since the loop_vec_info of this EPILOGUE was constructed for the original
12132 loop, its stmt_vec_infos all point to the original statements. These need
12133 to be updated to point to their corresponding copies as well as the SSA_NAMES
12134 in their PATTERN_DEF_SEQs and RELATED_STMTs.
12136 The data_reference's connections also need to be updated. Their
12137 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
12138 stmt_vec_infos, their statements need to point to their corresponding copy,
12139 if they are gather loads or scatter stores then their reference needs to be
12140 updated to point to its corresponding copy. */
12143 update_epilogue_loop_vinfo (class loop
*epilogue
, tree advance
)
12145 loop_vec_info epilogue_vinfo
= loop_vec_info_for_loop (epilogue
);
12146 auto_vec
<gimple
*> stmt_worklist
;
12147 hash_map
<tree
,tree
> mapping
;
12148 gimple
*orig_stmt
, *new_stmt
;
12149 gimple_stmt_iterator epilogue_gsi
;
12150 gphi_iterator epilogue_phi_gsi
;
12151 stmt_vec_info stmt_vinfo
= NULL
, related_vinfo
;
12152 basic_block
*epilogue_bbs
= get_loop_body (epilogue
);
12155 free (LOOP_VINFO_BBS (epilogue_vinfo
));
12156 LOOP_VINFO_BBS (epilogue_vinfo
) = epilogue_bbs
;
12157 LOOP_VINFO_NBBS (epilogue_vinfo
) = epilogue
->num_nodes
;
12159 /* Advance data_reference's with the number of iterations of the previous
12160 loop and its prologue. */
12161 vect_update_inits_of_drs (epilogue_vinfo
, advance
, PLUS_EXPR
);
12164 /* The EPILOGUE loop is a copy of the original loop so they share the same
12165 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
12166 point to the copied statements. We also create a mapping of all LHS' in
12167 the original loop and all the LHS' in the EPILOGUE and create worklists to
12168 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
12169 for (unsigned i
= 0; i
< epilogue
->num_nodes
; ++i
)
12171 for (epilogue_phi_gsi
= gsi_start_phis (epilogue_bbs
[i
]);
12172 !gsi_end_p (epilogue_phi_gsi
); gsi_next (&epilogue_phi_gsi
))
12174 new_stmt
= epilogue_phi_gsi
.phi ();
12176 gcc_assert (gimple_uid (new_stmt
) > 0);
12178 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
12180 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
12181 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
12183 mapping
.put (gimple_phi_result (orig_stmt
),
12184 gimple_phi_result (new_stmt
));
12185 /* PHI nodes can not have patterns or related statements. */
12186 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
) == NULL
12187 && STMT_VINFO_RELATED_STMT (stmt_vinfo
) == NULL
);
12190 for (epilogue_gsi
= gsi_start_bb (epilogue_bbs
[i
]);
12191 !gsi_end_p (epilogue_gsi
); gsi_next (&epilogue_gsi
))
12193 new_stmt
= gsi_stmt (epilogue_gsi
);
12194 if (is_gimple_debug (new_stmt
))
12197 gcc_assert (gimple_uid (new_stmt
) > 0);
12199 = epilogue_vinfo
->stmt_vec_infos
[gimple_uid (new_stmt
) - 1];
12201 orig_stmt
= STMT_VINFO_STMT (stmt_vinfo
);
12202 STMT_VINFO_STMT (stmt_vinfo
) = new_stmt
;
12204 if (tree old_lhs
= gimple_get_lhs (orig_stmt
))
12205 mapping
.put (old_lhs
, gimple_get_lhs (new_stmt
));
12207 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
))
12209 gimple_seq seq
= STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo
);
12210 for (gimple_stmt_iterator gsi
= gsi_start (seq
);
12211 !gsi_end_p (gsi
); gsi_next (&gsi
))
12212 stmt_worklist
.safe_push (gsi_stmt (gsi
));
12215 related_vinfo
= STMT_VINFO_RELATED_STMT (stmt_vinfo
);
12216 if (related_vinfo
!= NULL
&& related_vinfo
!= stmt_vinfo
)
12218 gimple
*stmt
= STMT_VINFO_STMT (related_vinfo
);
12219 stmt_worklist
.safe_push (stmt
);
12220 /* Set BB such that the assert in
12221 'get_initial_def_for_reduction' is able to determine that
12222 the BB of the related stmt is inside this loop. */
12223 gimple_set_bb (stmt
,
12224 gimple_bb (new_stmt
));
12225 related_vinfo
= STMT_VINFO_RELATED_STMT (related_vinfo
);
12226 gcc_assert (related_vinfo
== NULL
12227 || related_vinfo
== stmt_vinfo
);
12232 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
12233 using the original main loop and thus need to be updated to refer to the
12234 cloned variables used in the epilogue. */
12235 for (unsigned i
= 0; i
< stmt_worklist
.length (); ++i
)
12237 gimple
*stmt
= stmt_worklist
[i
];
12240 for (unsigned j
= 1; j
< gimple_num_ops (stmt
); ++j
)
12242 tree op
= gimple_op (stmt
, j
);
12243 if ((new_op
= mapping
.get(op
)))
12244 gimple_set_op (stmt
, j
, *new_op
);
12247 /* PR92429: The last argument of simplify_replace_tree disables
12248 folding when replacing arguments. This is required as
12249 otherwise you might end up with different statements than the
12250 ones analyzed in vect_loop_analyze, leading to different
12252 op
= simplify_replace_tree (op
, NULL_TREE
, NULL_TREE
,
12253 &find_in_mapping
, &mapping
, false);
12254 gimple_set_op (stmt
, j
, op
);
12259 struct data_reference
*dr
;
12260 vec
<data_reference_p
> datarefs
= LOOP_VINFO_DATAREFS (epilogue_vinfo
);
12261 FOR_EACH_VEC_ELT (datarefs
, i
, dr
)
12263 orig_stmt
= DR_STMT (dr
);
12264 gcc_assert (gimple_uid (orig_stmt
) > 0);
12265 stmt_vinfo
= epilogue_vinfo
->stmt_vec_infos
[gimple_uid (orig_stmt
) - 1];
12266 /* Data references for gather loads and scatter stores do not use the
12267 updated offset we set using ADVANCE. Instead we have to make sure the
12268 reference in the data references point to the corresponding copy of
12269 the original in the epilogue. Make sure to update both
12270 gather/scatters recognized by dataref analysis and also other
12271 refs that get_load_store_type classified as VMAT_GATHER_SCATTER. */
12272 auto vstmt_vinfo
= vect_stmt_to_vectorize (stmt_vinfo
);
12273 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo
) == VMAT_GATHER_SCATTER
12274 || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo
))
12277 = simplify_replace_tree (DR_REF (dr
), NULL_TREE
, NULL_TREE
,
12278 &find_in_mapping
, &mapping
);
12279 DR_BASE_ADDRESS (dr
)
12280 = simplify_replace_tree (DR_BASE_ADDRESS (dr
), NULL_TREE
, NULL_TREE
,
12281 &find_in_mapping
, &mapping
);
12283 DR_STMT (dr
) = STMT_VINFO_STMT (stmt_vinfo
);
12284 stmt_vinfo
->dr_aux
.stmt
= stmt_vinfo
;
12287 epilogue_vinfo
->shared
->datarefs_copy
.release ();
12288 epilogue_vinfo
->shared
->save_datarefs ();
/* When vectorizing early break statements, instructions that happen before
   the early break in the current BB need to be moved to after the early
   break.  This function deals with that and assumes that any validity
   checks have already been performed.

   While moving the instructions, if it encounters a VUSE or VDEF it then
   corrects the VUSES as it moves the statements along.  GDEST is the location
   in which to insert the new statements.  */
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if the statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}

/* Function vect_transform_loop.

   The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
   stmts in the loop, and update the loop exit condition.
   Returns the scalar epilogue loop if any.  */

class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;
  bool flat = maybe_flat_loop_profile (loop);

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
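  /* Illustrative sketch of the effect (not literal generated code): when the
     runtime check is kept, the peeling/versioning code guards the vector
     loop roughly as

       if (niters >= th)
	 ... vectorized loop ...
       else
	 ... scalar loop ...

     whereas for a compile-time constant iteration count the caller has
     already decided profitability and no such check is emitted.  */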
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.  */
  edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
  if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    {
      /* Ifcvt duplicates loop preheader, loop body and produces a basic
	 block after loop exit.  We need to scale all that.  */
      basic_block preheader
	= loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
      preheader->count
	= preheader->count.apply_probability
	    (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
    }

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
      else
	/* vect_do_peeling subtracted the number of peeled prologue
	   iterations from LOOP_VINFO_NITERS.  */
	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
				     &niters_vector, &step_vector,
				     niters_no_overflow);
    }
  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Handle any code motion that we need to for early-break vectorization
     after we've done peeling but just before we start vectorizing.  */
  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    move_early_exit_stmts (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* Generate the loop invariant statements.  */
  if (!gimple_seq_empty_p (LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "------>generating loop invariant statements\n");
      gimple_stmt_iterator gsi;
      gsi = gsi_after_labels (loop_preheader_edge (loop)->src);
      gsi_insert_seq_before (&gsi, LOOP_VINFO_INV_PATTERN_DEF_SEQ (loop_vinfo),
			     GSI_CONTINUE_LINKING);
    }

  /* FORNOW: the vectorizer supports only loops whose body consists
     of one basic block (header + empty latch).  When the vectorizer
     supports more involved loop forms, the order in which the BBs are
     traversed needs to be reconsidered.  */
  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", (gimple *) phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
	    }
	}

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
	      && ! PURE_SLP_STMT (stmt_info))
	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts and
	     prefetches.  */
	  if (gimple_clobber_p (stmt)
	      || gimple_call_builtin_p (stmt, BUILT_IN_PREFETCH))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      /* Ignore vector stmts created in the outer loop.  */
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* vector stmts created in the outer-loop during vectorization of
		 stmts in an inner-loop may not have a stmt_info, and do not
		 need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     pat_stmt_info);
		    }
		  else
		    {
		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
						    &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     stmt_info);
		    }
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  If IS_STORE is TRUE, the
		       vectorization of the interleaving chain was
		       completed - free all the stores in the chain.  */
		    vect_remove_stores (loop_vinfo,
					DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}

      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
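      /* Illustrative sketch of the stubbing below (SSA names invented): a
	 scalar call left over from pattern/group handling such as

	   _5 = .MASK_LOAD (ptr_1, 8B, mask_3);

	 becomes "_5 = 0;", and a scalar conditional internal function like

	   _7 = .COND_ADD (mask_3, _2, _4, else_6);

	 becomes "_7 = else_6;", since only the vector forms survive.  */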
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (!call || !gimple_call_internal_p (call))
	    continue;
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (ifn == IFN_MASK_LOAD)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  tree else_arg
		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }				/* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment
     of 1, a zero NITERS becomes a nonzero NITERS_VECTOR.  */
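  /* One way to read this (illustrative only): with a 32-bit IV and VF 4, a
     scalar trip count of 2^32 wraps NITERS to 0, but NITERS_VECTOR is 2^30,
     which is representable and nonzero, so the IV exit test cannot be
     confused by the overflow.  */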
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
			   niters_vector, step_vector, niters_vector_mult_vf,
			   !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);

  /* +1 to convert latch counts to loop iteration counts.  */
  int bias_for_lowest = 1;

  /* When we are peeling for gaps then we take away one scalar iteration
     from the vector loop.  Thus we can adjust the upper bound by one
     scalar iteration.  But only when we know the bound applies to the
     IV exit test which might not be true when we have multiple exits.  */
  if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;

  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
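  /* Worked example (illustrative): with VF 4, no peeling for gaps and a
     known latch bound of 11 (at most 12 scalar iterations), bias_for_lowest
     is 1 and the new vector latch bound below is
     floor ((11 + 1) / 4) - 1 = 2, i.e. at most 3 vector iterations covering
     the 12 scalar ones.  */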
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
	= (final_iter_may_be_partial
	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			    lowest_vf) - 1
	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			     lowest_vf) - 1);
      if (main_vinfo
	  /* Both peeling for alignment and peeling for gaps can end up
	     with the scalar epilogue running for more than VF-1 iterations.  */
	  && !main_vinfo->peeling_for_alignment
	  && !main_vinfo->peeling_for_gaps)
	{
	  unsigned int bound;
	  poly_uint64 main_iters
	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
	  main_iters
	    = upper_bound (main_iters,
			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
	  if (can_div_away_from_zero_p (main_iters,
					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
					&bound))
	    loop->nb_iterations_upper_bound
	      = wi::umin ((bound_wide_int) (bound - 1),
			  loop->nb_iterations_upper_bound);
	}
    }

  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);
  scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
			       assumed_vf, flat);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear the safelen field since its value is invalid after vectorization,
     as the vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}

/* The code below performs a simple optimization: revert if-conversion for
   masked stores, i.e. if the mask of a store is zero do not perform it,
   and likewise skip all stored-value producers where possible.
   For example,

     for (i=0; i<n; i++)
       if (c[i])
	{
	  p1[i] += 1;
	  p2[i] = p3[i] + 2;
	}

   this transformation will produce the following semi-hammock:

   if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
     {
       vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
       vect__12.22_172 = vect__11.19_170 + vect_cst__171;
       MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
       vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
       vect__19.28_184 = vect__18.25_182 + vect_cst__183;
       MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
     }
  */
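
/* For contrast (illustrative, not a literal dump): before this optimization
   the MASK_LOAD/MASK_STORE sequence above executes on every vector iteration
   even when mask__ifc__42.18_165 is all-zero; the guard introduced below
   simply branches around the whole block in that case.  */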
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      release_defs (stmt1);
		      continue;
		    }
		  break;
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}

/* Decide whether it is possible to use a zero-based induction variable
   when vectorizing LOOP_VINFO with partial vectors.  If it is, return
   the value that the induction variable must be able to hold in order
   to ensure that the rgroups eventually have no active vector elements.
   Return -1 otherwise.  */

widest_int
vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
{
  tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);

  /* Calculate the value that the induction variable must be able
     to hit in order to ensure that we end the loop with an all-false mask.
     This involves adding the maximum number of inactive trailing scalar
     iterations.  */
  widest_int iv_limit = -1;
  if (max_loop_iterations (loop, &iv_limit))
    {
      if (niters_skip)
	{
	  /* Add the maximum number of skipped iterations to the
	     maximum iteration count.  */
	  if (TREE_CODE (niters_skip) == INTEGER_CST)
	    iv_limit += wi::to_widest (niters_skip);
	  else
	    iv_limit += max_vf - 1;
	}
      else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
	/* Make a conservatively-correct assumption.  */
	iv_limit += max_vf - 1;

      /* IV_LIMIT is the maximum number of latch iterations, which is also
	 the maximum in-range IV value.  Round this value down to the previous
	 vector alignment boundary and then add an extra full iteration.  */
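      /* Worked example (illustrative): if IV_LIMIT is 1001 at this point and
	 VF is 4 (so known_alignment (vf) == 4) with max_vf 4, the computation
	 below yields (1001 & -4) + 4 = 1000 + 4 = 1004.  */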
      poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
    }
  return iv_limit;
}

/* For the given rgroup_controls RGC, check whether an induction variable
   would ever hit a value that produces a set of all-false masks or zero
   lengths before wrapping around.  Return true if it's possible to wrap
   around before hitting the desirable value, otherwise return false.  */
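/* Illustrative numbers: with an IV limit of 1004 and NITEMS of 4, the
   product 4016 needs 12 bits, so a 16-bit compare type is safe, while an
   8-bit one could wrap before all lanes become inactive.  */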
bool
vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
{
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);

  if (iv_limit == -1)
    return true;

  tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
  unsigned int compare_precision = TYPE_PRECISION (compare_type);
  unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;

  if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)