/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2025 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"
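
/* Forward declarations of static helpers defined later in this file.  */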
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					    load_permutation_t &,
					    const vec<tree> &,
					    gimple_stmt_iterator *,
					    poly_uint64, bool, bool,
					    unsigned *,
					    unsigned * = nullptr,
					    bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					    slp_tree, lane_permutation_t &,
					    vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
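
/* Object pool from which SLP tree nodes are allocated, and the head of a
   list chaining all nodes still allocated so that vect_slp_fini can
   release them.  */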
static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}
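
/* _slp_tree nodes are carved out of slp_tree_pool via these class-specific
   operator new/delete overloads.  */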
void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}

/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  this->ldst_lanes = false;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}

/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}

/* Create an SLP node with code CODE and room for NOPS children.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (stmt_info && is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
      /* We need to be able to fuse COUNT / NVECTORS elements together.  */
      if (!multiple_p (count, nvectors))
	return false;
    }
}

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
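
/* Operand index maps returned by vect_get_operand_map below.  Each map
   starts with the number of SLP child nodes, followed by the gimple
   argument index associated with each child (see the comment before
   vect_get_operand_map for the meaning of the negative indexes).  */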
static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int no_arg_map[] = { 0 };
static const int arg0_map[] = { 1, 0 };
static const int arg1_map[] = { 1, 1 };
static const int arg2_arg3_map[] = { 2, 2, 3 };
static const int arg1_arg3_map[] = { 2, 1, 3 };
static const int arg1_arg4_arg5_map[] = { 3, 1, 4, 5 };
static const int arg1_arg3_arg4_map[] = { 3, 1, 3, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_arg3_map[] = { 3, -3, 2, 3 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
		      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	gcc_unreachable ();
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
	  && swap)
	return op1_op0_map;
      if (gather_scatter_p)
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;

	  case IFN_GATHER_LOAD:
	    return arg1_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg1_arg4_arg5_map;

	  case IFN_SCATTER_STORE:
	    return arg1_arg3_map;

	  case IFN_MASK_SCATTER_STORE:
	  case IFN_MASK_LEN_SCATTER_STORE:
	    return arg1_arg3_arg4_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  case IFN_CLZ:
	  case IFN_CTZ:
	    return arg0_map;

	  case IFN_GOMP_SIMD_LANE:
	    return no_arg_map;

	  default:
	    break;
	  }
    }
  return nullptr;
}

/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
				  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!stmt_info)
    {
      for (auto oi : *oprnds_info)
	{
	  oi->def_stmts.quick_push (NULL);
	  oi->ops.quick_push (NULL_TREE);
	}
      return 0;
    }

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
			    STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
	commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
	{
	  gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
	  if (!is_a <loop_vec_info> (vinfo)
	      || !vect_check_gather_scatter (stmt_info,
					     as_a <loop_vec_info> (vinfo),
					     first ? &oprnd_info->first_gs_info
					     : &gs_info))
	    return -1;

	  if (first)
	    {
	      oprnd_info->first_gs_p = true;
	      oprnd = oprnd_info->first_gs_info.offset;
	    }
	  else
	    {
	      gs_op = i;
	      oprnd = gs_info.offset;
	    }
	}
      else if (opno < 0)
	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
	{
	  oprnd = gimple_arg (stmt_info->stmt, opno);
	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	    {
	      edge e = gimple_phi_arg_edge (stmt, opno);
	      backedge = (is_a <bb_vec_info> (vinfo)
			  ? e->flags & EDGE_DFS_BACK
			  : dominated_by_p (CDI_DOMINATORS, e->src,
					    gimple_bb (stmt_info->stmt)));
	    }
	}
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in an SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_double_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	    case vect_first_order_recurrence:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      if ((gs_op == i) != oprnd_info->first_gs_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: mixed gather and non-gather\n");
	  return 1;
	}
      else if (gs_op == i)
	{
	  if (!operand_equal_p (oprnd_info->first_gs_info.base,
				gs_info.base))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather base\n");
	      return 1;
	    }
	  if (oprnd_info->first_gs_info.scale != gs_info.scale)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather scale\n");
	      return 1;
	    }
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      /* After swapping some operands we lost track whether an
		 operand has any pattern defs so be conservative here.  */
	      if ((*oprnds_info)[i]->any_pattern
		  || (*oprnds_info)[i+1]->any_pattern)
		(*oprnds_info)[i]->any_pattern
		  = (*oprnds_info)[i+1]->any_pattern = true;
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern
	      && number_of_oprnds > 1)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand match.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
	if (mapi < nkept && map[mapi] == int (i))
	  mapi += 1;
	else if (!operand_equal_p (gimple_call_arg (call1, i),
				   gimple_call_arg (call2, i)))
	  return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
				       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
				  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype);
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      swap[i] = 0;
      matches[i] = false;
      if (!stmt_info)
	{
	  matches[i] = true;
	  continue;
	}

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that may
	 throw or that have volatile ops.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
	  && (!call_stmt
	      || !gimple_call_internal_p (stmt)
	      || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD
	      || cfn == CFN_MASK_LEN_GATHER_LOAD
	      || cfn == CFN_SCATTER_STORE
	      || cfn == CFN_MASK_SCATTER_STORE
	      || cfn == CFN_MASK_LEN_SCATTER_STORE)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if (cfn == CFN_GOMP_SIMD_LANE)
	    ;
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  first_lhs = lhs;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_phi_p = phi_p;
	  first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	  else if (rhs_code == CFN_GOMP_SIMD_LANE)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      /* For SLP reduction groups the index isn't necessarily
		 uniform but only that of the first stmt matters.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different reduc_idx "
				   "%d instead of %d in %G",
				   STMT_VINFO_REDUC_IDX (stmt_info),
				   first_reduc_idx, stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }
	  if (!ldst_p
	      && first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((!ldst_p
	       && first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code))))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!is_a <gcall *> (stmts[0]->stmt)
		  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
					  call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (first_lhs
	      && lhs
	      && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && rhs_code != CFN_SCATTER_STORE
	      && rhs_code != CFN_MASK_SCATTER_STORE
	      && rhs_code != CFN_MASK_LEN_SCATTER_STORE
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || stmt_info != first_stmt_info))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF
	      && rhs_code != SSA_NAME)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

/* Release BST_MAP.  */

static void
release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
{
  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second);
  delete bst_map;
}

/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}

/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
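
/* Build (or re-use a cached) SLP tree for the stmt group STMTS.  This is a
   wrapper around vect_build_slp_tree_2 that looks up and records results
   in BST_MAP and enforces the SLP discovery limit LIMIT.  */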
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  stmts.release ();
	  return *leader;
	}
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery limit exceeded\n");
	  memset (matches, 0, sizeof (bool) * group_size);
	  return NULL;
	}
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}

/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
1905 /* Recursively build an SLP tree starting from NODE.
1906 Fail (and return NULL) if def-stmts are not
1907 isomorphic, require data permutation or are of unsupported types of
1908 operation. Otherwise, return the built SLP tree.
1909 On failure MATCHES records for which lanes of the group a
1910 mismatch was found. */
1912 static slp_tree
1913 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1914 vec<stmt_vec_info> stmts, unsigned int group_size,
1915 poly_uint64 *max_nunits,
1916 bool *matches, unsigned *limit, unsigned *tree_size,
1917 scalar_stmts_to_slp_tree_map_t *bst_map)
1919 unsigned nops, i, this_tree_size = 0;
1920 poly_uint64 this_max_nunits = *max_nunits;
1922 matches[0] = false;
1924 stmt_vec_info stmt_info = stmts[0];
1925 if (!is_a<gcall *> (stmt_info->stmt)
1926 && !is_a<gassign *> (stmt_info->stmt)
1927 && !is_a<gphi *> (stmt_info->stmt))
1928 return NULL;
1930 nops = gimple_num_args (stmt_info->stmt);
1931 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1932 STMT_VINFO_GATHER_SCATTER_P
1933 (stmt_info)))
1934 nops = map[0];
1936 /* If the SLP node is a PHI (induction or reduction), terminate
1937 the recursion. */
1938 bool *skip_args = XALLOCAVEC (bool, nops);
1939 memset (skip_args, 0, sizeof (bool) * nops);
1940 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1941 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1943 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1944 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1945 group_size);
1946 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1947 max_nunits))
1948 return NULL;
1950 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1951 if (def_type == vect_induction_def)
1953 /* Induction PHIs are not cycles but walk the initial
1954 value. Only for inner loops though; for outer loops
1955 we need to pick up the value from the actual PHIs
1956 to more easily support peeling and epilogue vectorization. */
1957 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1958 if (!nested_in_vect_loop_p (loop, stmt_info))
1959 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1960 else
1961 loop = loop->inner;
1962 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1964 else if (def_type == vect_reduction_def
1965 || def_type == vect_double_reduction_def
1966 || def_type == vect_nested_cycle
1967 || def_type == vect_first_order_recurrence)
1969 /* Else def types have to match. */
1970 stmt_vec_info other_info;
1971 bool all_same = true;
1972 FOR_EACH_VEC_ELT (stmts, i, other_info)
1974 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1975 return NULL;
1976 if (other_info != stmt_info)
1977 all_same = false;
1979 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1980 /* Reduction initial values are not explicitly represented. */
1981 if (def_type != vect_first_order_recurrence
1982 && gimple_bb (stmt_info->stmt) == loop->header)
1983 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1984 /* Reduction chain backedge defs are filled manually.
1985 ??? Need a better way to identify a SLP reduction chain PHI.
1986 Or a better overall way to SLP match those. */
1987 if (stmts.length () > 1
1988 && all_same && def_type == vect_reduction_def)
1989 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1991 else if (def_type != vect_internal_def)
1992 return NULL;
1996 bool two_operators = false;
1997 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1998 tree vectype = NULL_TREE;
1999 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2000 &this_max_nunits, matches, &two_operators,
2001 &vectype))
2002 return NULL;
2004 /* If the SLP node is a load, terminate the recursion unless masked. */
2005 if (STMT_VINFO_DATA_REF (stmt_info)
2006 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2009 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2010 else
2012 *max_nunits = this_max_nunits;
2013 (*tree_size)++;
2014 node = vect_create_new_slp_node (node, stmts, 0);
2015 SLP_TREE_VECTYPE (node) = vectype;
2016 /* And compute the load permutation. Whether it is actually
2017 a permutation depends on the unrolling factor which is
2018 decided later. */
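/* Illustrative example (added for exposition, not part of the original
   source): for a group headed by the load of a[0] whose SLP lanes read
   a[1], a[0], a[3], a[2], the positions in the interleaving chain give
   load_permutation = { 1, 0, 3, 2 } and any_permute becomes true.  */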
2019 vec<unsigned> load_permutation;
2020 int j;
2021 stmt_vec_info load_info;
2022 load_permutation.create (group_size);
2023 stmt_vec_info first_stmt_info
2024 = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2026 bool any_permute = false;
2027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2029 int load_place;
2030 if (! load_info)
2032 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2033 load_place = j;
2034 else
2035 load_place = 0;
2037 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 load_place = vect_get_place_in_interleaving_chain
2039 (load_info, first_stmt_info);
2040 else
2041 load_place = 0;
2042 gcc_assert (load_place != -1);
2043 any_permute |= load_place != j;
2044 load_permutation.quick_push (load_place);
2047 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2049 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2050 bool has_gaps = false;
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2053 si; si = DR_GROUP_NEXT_ELEMENT (si))
2054 if (DR_GROUP_GAP (si) != 1)
2055 has_gaps = true;
2056 /* We cannot handle permuted masked loads directly, see
2057 PR114375. We cannot handle strided masked loads or masked
2058 loads with gaps unless the mask is uniform. */
2059 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2060 && (DR_GROUP_GAP (first_stmt_info) != 0
2061 || (has_gaps
2062 && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2063 || STMT_VINFO_STRIDED_P (stmt_info))
2065 load_permutation.release ();
2066 matches[0] = false;
2067 return NULL;
2070 /* For permuted masked loads do an unpermuted masked load of
2071 the whole group followed by a SLP permute node. */
2072 if (any_permute
2073 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2074 && DR_GROUP_SIZE (first_stmt_info) != group_size))
2076 /* Discover the whole unpermuted load. */
2077 vec<stmt_vec_info> stmts2;
2078 unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2079 ? DR_GROUP_SIZE (first_stmt_info) : 1;
2080 stmts2.create (dr_group_size);
2081 stmts2.quick_grow_cleared (dr_group_size);
2082 unsigned i = 0;
2083 for (stmt_vec_info si = first_stmt_info;
2084 si; si = DR_GROUP_NEXT_ELEMENT (si))
2086 if (si != first_stmt_info)
2087 for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2088 stmts2[i++] = NULL;
2089 stmts2[i++] = si;
2091 bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2092 slp_tree unperm_load
2093 = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2094 &this_max_nunits, matches2, limit,
2095 &this_tree_size, bst_map);
2096 /* When we are able to do the full masked load emit that
2097 followed by 'node' being the desired final permutation. */
2098 if (unperm_load)
2100 gcc_assert
2101 (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2102 lane_permutation_t lperm;
2103 lperm.create (group_size);
2104 for (unsigned j = 0; j < load_permutation.length (); ++j)
2105 lperm.quick_push
2106 (std::make_pair (0, load_permutation[j]));
2107 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2108 SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2109 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2110 load_permutation.release ();
2111 return node;
2113 stmts2.release ();
2114 load_permutation.release ();
2115 matches[0] = false;
2116 return NULL;
2118 load_permutation.release ();
2120 else
2122 if (!any_permute
2123 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2124 && group_size == DR_GROUP_SIZE (first_stmt_info))
2125 load_permutation.release ();
2126 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2127 return node;
2131 else if (gimple_assign_single_p (stmt_info->stmt)
2132 && !gimple_vuse (stmt_info->stmt)
2133 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2135 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2136 the same SSA name vector of a type compatible with vectype. */
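/* Illustrative sketch (added for exposition, not part of the original
   source): for scalar stmts such as
     x0 = BIT_FIELD_REF <v, 32, 0>;
     x1 = BIT_FIELD_REF <v, 32, 32>;
   extracting consecutive lanes of the SSA name v, the node built below is a
   VEC_PERM_EXPR with lane permutation { 0[0], 0[1], ... } over a child that
   simply records v as an existing vector def.  */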
2137 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2138 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2139 stmt_vec_info estmt_info;
2140 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2142 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2143 tree bfref = gimple_assign_rhs1 (estmt);
2144 HOST_WIDE_INT lane;
2145 if (!known_eq (bit_field_size (bfref),
2146 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2147 || !constant_multiple_p (bit_field_offset (bfref),
2148 bit_field_size (bfref), &lane))
2150 lperm.release ();
2151 matches[0] = false;
2152 return NULL;
2154 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2156 slp_tree vnode = vect_create_new_slp_node (vNULL);
2157 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2158 /* ??? We record vectype here but hide any punning that might
2159 eventually be necessary, relying on code generation to materialize
2160 VIEW_CONVERT_EXPRs as needed. We should instead make
2161 this explicit somehow. */
2162 SLP_TREE_VECTYPE (vnode) = vectype;
2163 else
2165 /* For different size but compatible elements we can still
2166 use VEC_PERM_EXPR without punning. */
2167 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2168 && types_compatible_p (TREE_TYPE (vectype),
2169 TREE_TYPE (TREE_TYPE (vec))));
2170 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2172 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2173 unsigned HOST_WIDE_INT const_nunits;
2174 if (nunits.is_constant (&const_nunits))
2175 SLP_TREE_LANES (vnode) = const_nunits;
2176 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2177 /* We are always building a permutation node even if it is an identity
2178 permute to shield the rest of the vectorizer from the odd node
2179 representing an actual vector without any scalar ops.
2180 ??? We could hide it completely by making the permute node
2181 external? */
2182 node = vect_create_new_slp_node (node, stmts, 1);
2183 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2184 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2185 SLP_TREE_VECTYPE (node) = vectype;
2186 SLP_TREE_CHILDREN (node).quick_push (vnode);
2187 return node;
2189 /* When discovery reaches an associatable operation see whether we can
2190 improve that to match up lanes in a way superior to the operand
2191 swapping code which at most looks at two defs.
2192 ??? For BB vectorization we cannot do the brute-force search
2193 for matching as we can succeed by means of builds from scalars
2194 and have no good way to "cost" one build against another. */
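/* Illustrative sketch (added for exposition, not part of the original
   source): for two lanes
     a0 = (x + y) - z;
     a1 = (x - z) + y;
   each lane linearizes to a length-three chain, { +x, +y, -z } and
   { +x, -z, +y } respectively.  After pre-sorting and per-lane operand
   swapping both chains match up as { x, y, z } with identical plus/minus
   codes, something the two-operand swapping fallback cannot discover.  */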
2195 else if (is_a <loop_vec_info> (vinfo)
2196 /* Do not bother for single-lane SLP. */
2197 && group_size > 1
2198 /* ??? We don't handle !vect_internal_def defs below. */
2199 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2200 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2201 mapping as long as that exists on the stmt_info level. */
2202 && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2203 && is_gimple_assign (stmt_info->stmt)
2204 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2205 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2206 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2207 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2208 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2210 /* See if we have a chain of (mixed) adds or subtracts or other
2211 associatable ops. */
2212 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2213 if (code == MINUS_EXPR)
2214 code = PLUS_EXPR;
2215 stmt_vec_info other_op_stmt_info = NULL;
2216 stmt_vec_info op_stmt_info = NULL;
2217 unsigned chain_len = 0;
2218 auto_vec<chain_op_t> chain;
2219 auto_vec<std::pair<tree_code, gimple *> > worklist;
2220 auto_vec<vec<chain_op_t> > chains (group_size);
2221 auto_vec<slp_tree, 4> children;
2222 bool hard_fail = true;
2223 for (unsigned lane = 0; lane < group_size; ++lane)
2225 if (!stmts[lane])
2227 /* ??? Below we require lane zero is present. */
2228 if (lane == 0)
2230 hard_fail = false;
2231 break;
2233 chains.quick_push (vNULL);
2234 continue;
2236 /* For each lane linearize the addition/subtraction (or other
2237 uniform associatable operation) expression tree. */
2238 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2239 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2240 stmts[lane]->stmt, op_stmt, other_op_stmt,
2241 NULL);
2242 if (!op_stmt_info && op_stmt)
2243 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2244 if (!other_op_stmt_info && other_op_stmt)
2245 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2246 if (chain.length () == 2)
2248 /* In a chain of just two elements resort to the regular
2249 operand swapping scheme. Likewise if we run into a
2250 length mismatch process regularly as well; since we did not
2251 process the other lanes we cannot report a good hint as to which
2252 lanes to try swapping in the parent. */
2253 hard_fail = false;
2254 break;
2256 else if (chain_len == 0)
2257 chain_len = chain.length ();
2258 else if (chain.length () != chain_len)
2260 /* ??? Here we could slip in magic to compensate with
2261 neutral operands. */
2262 matches[lane] = false;
2263 if (lane != group_size - 1)
2264 matches[0] = false;
2265 break;
2267 chains.quick_push (chain.copy ());
2268 chain.truncate (0);
2270 if (chains.length () == group_size)
2272 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2273 if (!op_stmt_info)
2275 hard_fail = false;
2276 goto out;
2278 /* Now we have a set of chains with the same length. */
2279 /* 1. pre-sort according to def_type and operation. */
2280 for (unsigned lane = 0; lane < group_size; ++lane)
2281 chains[lane].stablesort (dt_sort_cmp, vinfo);
2282 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE, vect_location,
2285 "pre-sorted chains of %s\n",
2286 get_tree_code_name (code));
2287 for (unsigned lane = 0; lane < group_size; ++lane)
2289 if (!stmts[lane])
2290 dump_printf (MSG_NOTE, "--");
2291 else
2292 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2293 dump_printf (MSG_NOTE, "%s %T ",
2294 get_tree_code_name (chains[lane][opnum].code),
2295 chains[lane][opnum].op);
2296 dump_printf (MSG_NOTE, "\n");
2299 /* 2. try to build children nodes, associating as necessary. */
2300 /* 2a. prepare and perform early checks to avoid eating into
2301 discovery limit unnecessarily. */
2302 vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2303 for (unsigned n = 0; n < chain_len; ++n)
2305 vect_def_type dt = chains[0][n].dt;
2306 unsigned lane;
2307 for (lane = 0; lane < group_size; ++lane)
2308 if (stmts[lane] && chains[lane][n].dt != dt)
2310 if (dt == vect_constant_def
2311 && chains[lane][n].dt == vect_external_def)
2312 dt = vect_external_def;
2313 else if (dt == vect_external_def
2314 && chains[lane][n].dt == vect_constant_def)
2316 else
2317 break;
2319 if (lane != group_size)
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "giving up on chain due to mismatched "
2324 "def types\n");
2325 matches[lane] = false;
2326 if (lane != group_size - 1)
2327 matches[0] = false;
2328 goto out;
2330 dts[n] = dt;
2331 if (dt == vect_constant_def
2332 || dt == vect_external_def)
2334 /* Check whether we can build the invariant. If we can't
2335 we never will be able to. */
2336 tree type = TREE_TYPE (chains[0][n].op);
2337 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2338 && (TREE_CODE (type) == BOOLEAN_TYPE
2339 || !can_duplicate_and_interleave_p (vinfo, group_size,
2340 type)))
2342 matches[0] = false;
2343 goto out;
2346 else if (dt != vect_internal_def)
2348 /* Not sure, we might need something special.
2349 gcc.dg/vect/pr96854.c,
2350 gfortran.dg/vect/fast-math-pr37021.f90
2351 and gfortran.dg/vect/pr61171.f trigger. */
2352 /* Soft-fail for now. */
2353 hard_fail = false;
2354 goto out;
2357 /* 2b. do the actual build. */
2358 for (unsigned n = 0; n < chain_len; ++n)
2360 vect_def_type dt = dts[n];
2361 unsigned lane;
2362 if (dt == vect_constant_def
2363 || dt == vect_external_def)
2365 vec<tree> ops;
2366 ops.create (group_size);
2367 for (lane = 0; lane < group_size; ++lane)
2368 if (stmts[lane])
2369 ops.quick_push (chains[lane][n].op);
2370 else
2371 ops.quick_push (NULL_TREE);
2372 slp_tree child = vect_create_new_slp_node (ops);
2373 SLP_TREE_DEF_TYPE (child) = dt;
2374 children.safe_push (child);
2376 else
2378 vec<stmt_vec_info> op_stmts;
2379 op_stmts.create (group_size);
2380 slp_tree child = NULL;
2381 /* Brute-force our way. We have to consider a lane
2382 failing after fixing an earlier fail higher up in the
2383 SLP discovery recursion. So track the current
2384 permute per lane. */
2385 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2386 memset (perms, 0, sizeof (unsigned) * group_size);
2389 op_stmts.truncate (0);
2390 for (lane = 0; lane < group_size; ++lane)
2391 if (stmts[lane])
2392 op_stmts.quick_push
2393 (vinfo->lookup_def (chains[lane][n].op));
2394 else
2395 op_stmts.quick_push (NULL);
2396 child = vect_build_slp_tree (vinfo, op_stmts,
2397 group_size, &this_max_nunits,
2398 matches, limit,
2399 &this_tree_size, bst_map);
2400 /* ??? We're likely getting too many fatal mismatches
2401 here so maybe we want to ignore them (but then we
2402 have no idea which lanes fatally mismatched). */
2403 if (child || !matches[0])
2404 break;
2405 /* Swap another lane we have not yet matched up into
2406 lanes that did not match. If we run out of
2407 permute possibilities for a lane terminate the
2408 search. */
2409 bool term = false;
2410 for (lane = 1; lane < group_size; ++lane)
2411 if (!matches[lane])
2413 if (n + perms[lane] + 1 == chain_len)
2415 term = true;
2416 break;
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location,
2420 "swapping operand %d and %d "
2421 "of lane %d\n",
2422 n, n + perms[lane] + 1, lane);
2423 std::swap (chains[lane][n],
2424 chains[lane][n + perms[lane] + 1]);
2425 perms[lane]++;
2427 if (term)
2428 break;
2430 while (1);
2431 if (!child)
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE, vect_location,
2435 "failed to match up op %d\n", n);
2436 op_stmts.release ();
2437 if (lane != group_size - 1)
2438 matches[0] = false;
2439 else
2440 matches[lane] = false;
2441 goto out;
2443 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_NOTE, vect_location,
2446 "matched up op %d to\n", n);
2447 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2449 children.safe_push (child);
2452 /* 3. build SLP nodes to combine the chain. */
2453 for (unsigned lane = 0; lane < group_size; ++lane)
2454 if (stmts[lane] && chains[lane][0].code != code)
2456 /* See if there's any alternate all-PLUS entry. */
2457 unsigned n;
2458 for (n = 1; n < chain_len; ++n)
2460 for (lane = 0; lane < group_size; ++lane)
2461 if (stmts[lane] && chains[lane][n].code != code)
2462 break;
2463 if (lane == group_size)
2464 break;
2466 if (n != chain_len)
2468 /* Swap that in at first position. */
2469 std::swap (children[0], children[n]);
2470 for (lane = 0; lane < group_size; ++lane)
2471 if (stmts[lane])
2472 std::swap (chains[lane][0], chains[lane][n]);
2474 else
2476 /* ??? When this triggers and we end up with two
2477 vect_constant/external_def up-front things break (ICE)
2478 spectacularly finding an insertion place for the
2479 all-constant op. We should have a fully
2480 vect_internal_def operand though(?) so we can swap
2481 that into first place and then prepend the all-zero
2482 constant. */
2483 if (dump_enabled_p ())
2484 dump_printf_loc (MSG_NOTE, vect_location,
2485 "inserting constant zero to compensate "
2486 "for (partially) negated first "
2487 "operand\n");
2488 chain_len++;
2489 for (lane = 0; lane < group_size; ++lane)
2490 if (stmts[lane])
2491 chains[lane].safe_insert
2492 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2493 vec<tree> zero_ops;
2494 zero_ops.create (group_size);
2495 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2496 for (lane = 1; lane < group_size; ++lane)
2497 if (stmts[lane])
2498 zero_ops.quick_push (zero_ops[0]);
2499 else
2500 zero_ops.quick_push (NULL_TREE);
2501 slp_tree zero = vect_create_new_slp_node (zero_ops);
2502 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2503 children.safe_insert (0, zero);
2505 break;
2507 for (unsigned i = 1; i < children.length (); ++i)
2509 slp_tree op0 = children[i - 1];
2510 slp_tree op1 = children[i];
2511 bool this_two_op = false;
2512 for (unsigned lane = 0; lane < group_size; ++lane)
2513 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2515 this_two_op = true;
2516 break;
2518 slp_tree child;
2519 if (i == children.length () - 1)
2520 child = vect_create_new_slp_node (node, stmts, 2);
2521 else
2522 child = vect_create_new_slp_node (2, ERROR_MARK);
2523 if (this_two_op)
2525 vec<std::pair<unsigned, unsigned> > lperm;
2526 lperm.create (group_size);
2527 for (unsigned lane = 0; lane < group_size; ++lane)
2528 lperm.quick_push (std::make_pair
2529 (chains[lane][i].code != chains[0][i].code, lane));
2530 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2531 (chains[0][i].code == code
2532 ? op_stmt_info
2533 : other_op_stmt_info),
2534 (chains[0][i].code == code
2535 ? other_op_stmt_info
2536 : op_stmt_info),
2537 lperm);
2539 else
2541 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2542 SLP_TREE_VECTYPE (child) = vectype;
2543 SLP_TREE_LANES (child) = group_size;
2544 SLP_TREE_CHILDREN (child).quick_push (op0);
2545 SLP_TREE_CHILDREN (child).quick_push (op1);
2546 SLP_TREE_REPRESENTATIVE (child)
2547 = (chains[0][i].code == code
2548 ? op_stmt_info : other_op_stmt_info);
2550 children[i] = child;
2552 *tree_size += this_tree_size + 1;
2553 *max_nunits = this_max_nunits;
2554 while (!chains.is_empty ())
2555 chains.pop ().release ();
2556 return node;
2558 out:
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "failed to line up SLP graph by re-associating "
2562 "operations in lanes%s\n",
2563 !hard_fail ? " trying regular discovery" : "");
2564 while (!children.is_empty ())
2565 vect_free_slp_tree (children.pop ());
2566 while (!chains.is_empty ())
2567 chains.pop ().release ();
2568 /* Hard-fail, otherwise we might run into quadratic processing of the
2569 chains starting one stmt into the chain again. */
2570 if (hard_fail)
2571 return NULL;
2572 /* Fall thru to normal processing. */
2575 /* Get at the operands, verifying they are compatible. */
2576 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2577 slp_oprnd_info oprnd_info;
2578 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2580 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2581 stmts, i, &oprnds_info);
2582 if (res != 0)
2583 matches[(res == -1) ? 0 : i] = false;
2584 if (!matches[0])
2585 break;
2587 for (i = 0; i < group_size; ++i)
2588 if (!matches[i])
2590 vect_free_oprnd_info (oprnds_info);
2591 return NULL;
2593 swap = NULL;
2595 bool has_two_operators_perm = false;
2596 auto_vec<unsigned> two_op_perm_indices[2];
2597 vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2599 if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2601 unsigned idx = 0;
2602 hash_map<gimple *, unsigned> seen;
2603 vec<slp_oprnd_info> new_oprnds_info
2604 = vect_create_oprnd_info (1, group_size);
2605 bool success = true;
2607 enum tree_code code = ERROR_MARK;
2608 if (oprnds_info[0]->def_stmts[0]
2609 && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2610 code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2612 for (unsigned j = 0; j < group_size; ++j)
2614 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2616 stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2617 if (!stmt_info || !stmt_info->stmt
2618 || !is_a<gassign *> (stmt_info->stmt)
2619 || gimple_assign_rhs_code (stmt_info->stmt) != code
2620 || skip_args[i])
2622 success = false;
2623 break;
2626 bool exists;
2627 unsigned &stmt_idx
2628 = seen.get_or_insert (stmt_info->stmt, &exists);
2630 if (!exists)
2632 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2633 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2634 stmt_idx = idx;
2635 idx++;
2638 two_op_perm_indices[i].safe_push (stmt_idx);
2641 if (!success)
2642 break;
2645 if (success && idx == group_size)
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Replace two_operators operands:\n");
2652 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "Operand %u:\n", i);
2656 for (unsigned j = 0; j < group_size; j++)
2657 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2658 j, oprnd_info->def_stmts[j]->stmt);
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "With a single operand:\n");
2663 for (unsigned j = 0; j < group_size; j++)
2664 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2665 j, new_oprnds_info[0]->def_stmts[j]->stmt);
2668 two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2669 two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2671 new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2672 new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2673 new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2674 new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2675 new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2677 vect_free_oprnd_info (oprnds_info);
2678 oprnds_info = new_oprnds_info;
2679 nops = 1;
2680 has_two_operators_perm = true;
2682 else
2683 vect_free_oprnd_info (new_oprnds_info);
2686 auto_vec<slp_tree, 4> children;
2688 stmt_info = stmts[0];
2690 /* Create SLP_TREE nodes for the definition node/s. */
2691 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2693 slp_tree child = nullptr;
2694 unsigned int j;
2696 /* We're skipping certain operands from processing, for example
2697 outer loop reduction initial defs. */
2698 if (skip_args[i])
2700 children.safe_push (NULL);
2701 continue;
2704 if (oprnd_info->first_dt == vect_uninitialized_def)
2706 /* COND_EXPRs end up with one operand too many if the condition
2707 is an SSA name. */
2708 gcc_assert (i == 3 && nops == 4);
2709 continue;
2712 if (is_a <bb_vec_info> (vinfo)
2713 && oprnd_info->first_dt == vect_internal_def
2714 && !oprnd_info->any_pattern)
2716 /* For BB vectorization, if all defs are the same do not
2717 bother to continue the build along the single-lane
2718 graph but use a splat of the scalar value. */
2719 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2720 for (j = 1; j < group_size; ++j)
2721 if (oprnd_info->def_stmts[j] != first_def)
2722 break;
2723 if (j == group_size
2724 /* But avoid doing this for loads where we may be
2725 able to CSE things, unless the stmt is not
2726 vectorizable. */
2727 && (!STMT_VINFO_VECTORIZABLE (first_def)
2728 || !gimple_vuse (first_def->stmt)))
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_NOTE, vect_location,
2732 "Using a splat of the uniform operand %G",
2733 first_def->stmt);
2734 oprnd_info->first_dt = vect_external_def;
2738 if (oprnd_info->first_dt == vect_external_def
2739 || oprnd_info->first_dt == vect_constant_def)
2741 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2743 tree op0;
2744 tree uniform_val = op0 = oprnd_info->ops[0];
2745 for (j = 1; j < oprnd_info->ops.length (); ++j)
2746 if (oprnd_info->ops[j]
2747 && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2749 uniform_val = NULL_TREE;
2750 break;
2752 if (!uniform_val
2753 && !can_duplicate_and_interleave_p (vinfo,
2754 oprnd_info->ops.length (),
2755 TREE_TYPE (op0)))
2757 matches[j] = false;
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "Build SLP failed: invalid type of def "
2761 "for variable-length SLP %T\n", op0);
2762 goto fail;
2765 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2766 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2767 oprnd_info->ops = vNULL;
2768 children.safe_push (invnode);
2769 continue;
2772 /* When we have a masked load with uniform mask discover this
2773 as a single-lane mask with a splat permute. This way we can
2774 recognize this as a masked load-lane by stripping the splat. */
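/* Illustrative example (added for exposition, not part of the original
   source): for a four-lane group whose .MASK_LOAD calls all use the same
   mask m, the mask operand is discovered as a single-lane child { m } and
   the node built below is a VEC_PERM_EXPR splat with lane permutation
   { 0[0], 0[0], 0[0], 0[0] }.  */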
2775 if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2776 && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2777 IFN_MASK_LOAD)
2778 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2779 && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2781 vec<stmt_vec_info> def_stmts2;
2782 def_stmts2.create (1);
2783 def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2784 child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2785 &this_max_nunits,
2786 matches, limit,
2787 &this_tree_size, bst_map);
2788 if (child)
2790 slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2791 SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2792 SLP_TREE_LANES (pnode) = group_size;
2793 SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2794 SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2795 for (unsigned k = 0; k < group_size; ++k)
2797 SLP_TREE_SCALAR_STMTS (pnode)
2798 .quick_push (oprnd_info->def_stmts[0]);
2799 SLP_TREE_LANE_PERMUTATION (pnode)
2800 .quick_push (std::make_pair (0u, 0u));
2802 SLP_TREE_CHILDREN (pnode).quick_push (child);
2803 pnode->max_nunits = child->max_nunits;
2804 children.safe_push (pnode);
2805 oprnd_info->def_stmts = vNULL;
2806 continue;
2808 else
2809 def_stmts2.release ();
2812 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2813 group_size, &this_max_nunits,
2814 matches, limit,
2815 &this_tree_size, bst_map)) != NULL)
2817 oprnd_info->def_stmts = vNULL;
2818 children.safe_push (child);
2819 continue;
2822 /* If the SLP build for operand zero failed and operands zero
2823 and one can be commuted, try that for the scalar stmts
2824 that failed the match. */
2825 if (i == 0
2826 /* A first scalar stmt mismatch signals a fatal mismatch. */
2827 && matches[0]
2828 /* ??? For COND_EXPRs we can swap the comparison operands
2829 as well as the arms under some constraints. */
2830 && nops == 2
2831 && oprnds_info[1]->first_dt == vect_internal_def
2832 && is_gimple_assign (stmt_info->stmt)
2833 /* Swapping operands for reductions breaks assumptions later on. */
2834 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2836 /* See whether we can swap the matching or the non-matching
2837 stmt operands. */
2838 bool swap_not_matching = true;
2841 for (j = 0; j < group_size; ++j)
2843 if (matches[j] != !swap_not_matching)
2844 continue;
2845 stmt_vec_info stmt_info = stmts[j];
2846 /* Verify if we can swap operands of this stmt. */
2847 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2848 if (!stmt
2849 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2851 if (!swap_not_matching)
2852 goto fail;
2853 swap_not_matching = false;
2854 break;
2858 while (j != group_size);
2860 /* Swap mismatched definition stmts. */
2861 if (dump_enabled_p ())
2862 dump_printf_loc (MSG_NOTE, vect_location,
2863 "Re-trying with swapped operands of stmts ");
2864 for (j = 0; j < group_size; ++j)
2865 if (matches[j] == !swap_not_matching)
2867 std::swap (oprnds_info[0]->def_stmts[j],
2868 oprnds_info[1]->def_stmts[j]);
2869 std::swap (oprnds_info[0]->ops[j],
2870 oprnds_info[1]->ops[j]);
2871 if (dump_enabled_p ())
2872 dump_printf (MSG_NOTE, "%d ", j);
2874 if (dump_enabled_p ())
2875 dump_printf (MSG_NOTE, "\n");
2876 /* After swapping some operands we lost track of whether an
2877 operand has any pattern defs so be conservative here. */
2878 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2879 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2880 /* And try again with scratch 'matches' ... */
2881 bool *tem = XALLOCAVEC (bool, group_size);
2882 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2883 group_size, &this_max_nunits,
2884 tem, limit,
2885 &this_tree_size, bst_map)) != NULL)
2887 oprnd_info->def_stmts = vNULL;
2888 children.safe_push (child);
2889 continue;
2892 fail:
2894 /* If the SLP build failed and we analyze a basic-block
2895 simply treat nodes we fail to build as externally defined
2896 (and thus build vectors from the scalar defs).
2897 The cost model will reject expensive cases outright.
2898 ??? This doesn't treat cases where permutation ultimately
2899 fails (or we don't try permutation below). Ideally we'd
2900 even compute a permutation that will end up with the maximum
2901 SLP tree size... */
2902 if (is_a <bb_vec_info> (vinfo)
2903 /* ??? Rejecting patterns this way doesn't work. We'd have to
2904 do extra work to cancel the pattern so the uses see the
2905 scalar version. */
2906 && !is_pattern_stmt_p (stmt_info)
2907 && !oprnd_info->any_pattern)
2909 /* But if there's a leading vector sized set of matching stmts
2910 fail here so we can split the group. This matches the condition
2911 vect_analyze_slp_instance uses. */
2912 /* ??? We might want to split here and combine the results to support
2913 multiple vector sizes better. */
2914 for (j = 0; j < group_size; ++j)
2915 if (!matches[j])
2916 break;
2917 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
2918 && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_NOTE, vect_location,
2922 "Building vector operands from scalars\n");
2923 this_tree_size++;
2924 child = vect_create_new_slp_node (oprnd_info->ops);
2925 children.safe_push (child);
2926 oprnd_info->ops = vNULL;
2927 continue;
2931 gcc_assert (child == NULL);
2932 FOR_EACH_VEC_ELT (children, j, child)
2933 if (child)
2934 vect_free_slp_tree (child);
2935 vect_free_oprnd_info (oprnds_info);
2936 return NULL;
2939 vect_free_oprnd_info (oprnds_info);
2941 /* If all children of a node are built up from uniform scalars, or if
2942 building them requires more than one possibly expensive vector
2943 construction, then just throw that away and cause the node to be
2944 built up from scalars. The exception is the SLP node for the vector store. */
2945 if (is_a <bb_vec_info> (vinfo)
2946 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2947 /* ??? Rejecting patterns this way doesn't work. We'd have to
2948 do extra work to cancel the pattern so the uses see the
2949 scalar version. */
2950 && !is_pattern_stmt_p (stmt_info))
2952 slp_tree child;
2953 unsigned j;
2954 bool all_uniform_p = true;
2955 unsigned n_vector_builds = 0;
2956 FOR_EACH_VEC_ELT (children, j, child)
2958 if (!child)
2960 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2961 all_uniform_p = false;
2962 else if (!vect_slp_tree_uniform_p (child))
2964 all_uniform_p = false;
2965 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2966 n_vector_builds++;
2969 if (all_uniform_p
2970 || n_vector_builds > 1
2971 || (n_vector_builds == children.length ()
2972 && is_a <gphi *> (stmt_info->stmt)))
2974 /* Roll back. */
2975 matches[0] = false;
2976 FOR_EACH_VEC_ELT (children, j, child)
2977 if (child)
2978 vect_free_slp_tree (child);
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "Building parent vector operands from "
2983 "scalars instead\n");
2984 return NULL;
2988 *tree_size += this_tree_size + 1;
2989 *max_nunits = this_max_nunits;
2991 if (two_operators)
2993 /* ??? We'd likely want to either cache in bst_map something like
2994 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2995 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2996 explicit stmts to put in so the keying on 'stmts' doesn't
2997 work (but we have the same issue with nodes that use 'ops'). */
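/* Illustrative example (added for exposition, not part of the original
   source): for scalar stmts { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the
   node 'one' built below gets code PLUS_EXPR, 'two' gets MINUS_EXPR, and
   the covering VEC_PERM_EXPR selects lanes { 0[0], 1[1], 0[2], 1[3] } to
   recreate the original mixed plus/minus lane configuration.  */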
2999 if (has_two_operators_perm)
3001 slp_tree child = children[0];
3002 children.truncate (0);
3003 for (i = 0; i < 2; i++)
3005 slp_tree pnode
3006 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3007 SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3008 SLP_TREE_VECTYPE (pnode) = vectype;
3009 SLP_TREE_CHILDREN (pnode).quick_push (child);
3010 SLP_TREE_CHILDREN (pnode).quick_push (child);
3011 lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3012 children.safe_push (pnode);
3014 for (unsigned j = 0; j < stmts.length (); j++)
3015 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3018 SLP_TREE_REF_COUNT (child) += 4;
3021 slp_tree one = new _slp_tree;
3022 slp_tree two = new _slp_tree;
3023 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3024 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3025 SLP_TREE_VECTYPE (one) = vectype;
3026 SLP_TREE_VECTYPE (two) = vectype;
3027 SLP_TREE_CHILDREN (one).safe_splice (children);
3028 SLP_TREE_CHILDREN (two).safe_splice (children);
3029 slp_tree child;
3030 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3031 SLP_TREE_REF_COUNT (child)++;
3033 /* Here we record the original defs since this
3034 node represents the final lane configuration. */
3035 node = vect_create_new_slp_node (node, stmts, 2);
3036 SLP_TREE_VECTYPE (node) = vectype;
3037 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3038 SLP_TREE_CHILDREN (node).quick_push (one);
3039 SLP_TREE_CHILDREN (node).quick_push (two);
3040 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
3041 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3042 enum tree_code ocode = ERROR_MARK;
3043 stmt_vec_info ostmt_info;
3044 unsigned j = 0;
3045 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3047 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3048 if (gimple_assign_rhs_code (ostmt) != code0)
3050 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
3051 ocode = gimple_assign_rhs_code (ostmt);
3052 j = i;
3054 else
3055 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
3058 SLP_TREE_CODE (one) = code0;
3059 SLP_TREE_CODE (two) = ocode;
3060 SLP_TREE_LANES (one) = stmts.length ();
3061 SLP_TREE_LANES (two) = stmts.length ();
3062 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3063 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3065 return node;
3068 node = vect_create_new_slp_node (node, stmts, nops);
3069 SLP_TREE_VECTYPE (node) = vectype;
3070 SLP_TREE_CHILDREN (node).splice (children);
3071 return node;
3074 /* Dump a single SLP tree NODE. */
3076 static void
3077 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3078 slp_tree node)
3080 unsigned i, j;
3081 slp_tree child;
3082 stmt_vec_info stmt_info;
3083 tree op;
3085 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3086 dump_user_location_t user_loc = loc.get_user_location ();
3087 dump_printf_loc (metadata, user_loc,
3088 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3089 ", refcnt=%u)",
3090 SLP_TREE_DEF_TYPE (node) == vect_external_def
3091 ? " (external)"
3092 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3093 ? " (constant)"
3094 : ""), (void *) node,
3095 estimated_poly_value (node->max_nunits),
3096 SLP_TREE_REF_COUNT (node));
3097 if (SLP_TREE_VECTYPE (node))
3098 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3099 dump_printf (metadata, "\n");
3100 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3102 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3103 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3104 else
3105 dump_printf_loc (metadata, user_loc, "op template: %G",
3106 SLP_TREE_REPRESENTATIVE (node)->stmt);
3108 if (SLP_TREE_SCALAR_STMTS (node).exists ())
3109 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3110 if (stmt_info)
3111 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3112 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
3113 i, stmt_info->stmt);
3114 else
3115 dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3116 else
3118 dump_printf_loc (metadata, user_loc, "\t{ ");
3119 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3120 dump_printf (metadata, "%T%s ", op,
3121 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3122 dump_printf (metadata, "}\n");
3124 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3126 dump_printf_loc (metadata, user_loc, "\tload permutation {");
3127 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3128 dump_printf (dump_kind, " %u", j);
3129 dump_printf (dump_kind, " }\n");
3131 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3133 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3134 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3135 dump_printf (dump_kind, " %u[%u]",
3136 SLP_TREE_LANE_PERMUTATION (node)[i].first,
3137 SLP_TREE_LANE_PERMUTATION (node)[i].second);
3138 dump_printf (dump_kind, " }%s\n",
3139 node->ldst_lanes ? " (load-lanes)" : "");
3141 if (SLP_TREE_CHILDREN (node).is_empty ())
3142 return;
3143 dump_printf_loc (metadata, user_loc, "\tchildren");
3144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3145 dump_printf (dump_kind, " %p", (void *)child);
3146 dump_printf (dump_kind, "%s\n",
3147 node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3148 ? " (store-lanes)" : "");
3151 DEBUG_FUNCTION void
3152 debug (slp_tree node)
3154 debug_dump_context ctx;
3155 vect_print_slp_tree (MSG_NOTE,
3156 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3157 node);
3160 /* Recursive helper for the dot producer below. */
3162 static void
3163 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3165 if (visited.add (node))
3166 return;
3168 fprintf (f, "\"%p\" [label=\"", (void *)node);
3169 vect_print_slp_tree (MSG_NOTE,
3170 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3171 node);
3172 fprintf (f, "\"];\n");
3175 for (slp_tree child : SLP_TREE_CHILDREN (node))
3176 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3178 for (slp_tree child : SLP_TREE_CHILDREN (node))
3179 if (child)
3180 dot_slp_tree (f, child, visited);
3183 DEBUG_FUNCTION void
3184 dot_slp_tree (const char *fname, slp_tree node)
3186 FILE *f = fopen (fname, "w");
3187 fprintf (f, "digraph {\n");
3188 fflush (f);
3190 debug_dump_context ctx (f);
3191 hash_set<slp_tree> visited;
3192 dot_slp_tree (f, node, visited);
3194 fflush (f);
3195 fprintf (f, "}\n");
3196 fclose (f);
3199 DEBUG_FUNCTION void
3200 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3202 FILE *f = fopen (fname, "w");
3203 fprintf (f, "digraph {\n");
3204 fflush (f);
3206 debug_dump_context ctx (f);
3207 hash_set<slp_tree> visited;
3208 for (auto inst : slp_instances)
3209 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3211 fflush (f);
3212 fprintf (f, "}\n");
3213 fclose (f);
3216 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3218 static void
3219 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3220 slp_tree node, hash_set<slp_tree> &visited)
3222 unsigned i;
3223 slp_tree child;
3225 if (visited.add (node))
3226 return;
3228 vect_print_slp_tree (dump_kind, loc, node);
3230 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3231 if (child)
3232 vect_print_slp_graph (dump_kind, loc, child, visited);
3235 static void
3236 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3237 slp_tree entry)
3239 hash_set<slp_tree> visited;
3240 vect_print_slp_graph (dump_kind, loc, entry, visited);
3243 DEBUG_FUNCTION void
3244 debug (slp_instance instance)
3246 debug_dump_context ctx;
3247 vect_print_slp_graph (MSG_NOTE,
3248 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3249 SLP_INSTANCE_TREE (instance));
3252 /* Mark the tree rooted at NODE with PURE_SLP. */
3254 static void
3255 vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
3256 hash_set<slp_tree> &visited)
3258 int i;
3259 stmt_vec_info stmt_info;
3260 slp_tree child;
3262 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3263 return;
3265 if (visited.add (node))
3266 return;
3268 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3269 if (stmt_info)
3271 STMT_SLP_TYPE (stmt_info) = pure_slp;
3272 /* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
3273 when there is the mask_conversion pattern applied we have lost the
3274 alternate lanes of the uniform mask which nevertheless
3275 have separate pattern defs. To not confuse hybrid
3276 analysis we mark those as covered as well here. */
3277 if (node->ldst_lanes)
3278 if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3279 if (gimple_call_internal_p (call, IFN_MASK_LOAD)
3280 || gimple_call_internal_p (call, IFN_MASK_STORE))
3282 tree mask = gimple_call_arg (call,
3283 internal_fn_mask_index
3284 (gimple_call_internal_fn (call)));
3285 if (TREE_CODE (mask) == SSA_NAME)
3286 if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
3288 mask_info = vect_stmt_to_vectorize (mask_info);
3289 STMT_SLP_TYPE (mask_info) = pure_slp;
3294 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3295 if (child)
3296 vect_mark_slp_stmts (vinfo, child, visited);
3299 static void
3300 vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
3302 hash_set<slp_tree> visited;
3303 vect_mark_slp_stmts (vinfo, node, visited);
3306 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3308 static void
3309 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3311 int i;
3312 stmt_vec_info stmt_info;
3313 slp_tree child;
3315 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3316 return;
3318 if (visited.add (node))
3319 return;
3321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3322 if (stmt_info)
3324 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3325 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3326 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3329 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3330 if (child)
3331 vect_mark_slp_stmts_relevant (child, visited);
3334 static void
3335 vect_mark_slp_stmts_relevant (slp_tree node)
3337 hash_set<slp_tree> visited;
3338 vect_mark_slp_stmts_relevant (node, visited);
3342 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
3344 static void
3345 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3346 hash_set<slp_tree> &visited)
3348 if (!node || visited.add (node))
3349 return;
3351 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3352 return;
3354 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3356 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3357 if (STMT_VINFO_DATA_REF (stmt_info)
3358 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3359 loads.safe_push (node);
3362 unsigned i;
3363 slp_tree child;
3364 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3365 vect_gather_slp_loads (loads, child, visited);
3369 /* Find the last scalar stmt in NODE. */
3371 stmt_vec_info
3372 vect_find_last_scalar_stmt_in_slp (slp_tree node)
3374 stmt_vec_info last = NULL;
3375 stmt_vec_info stmt_vinfo;
3377 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3378 if (stmt_vinfo)
3380 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3381 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3384 return last;
3387 /* Find the first stmt in NODE. */
3389 stmt_vec_info
3390 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3392 stmt_vec_info first = NULL;
3393 stmt_vec_info stmt_vinfo;
3395 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3396 if (stmt_vinfo)
3398 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3399 if (!first
3400 || get_later_stmt (stmt_vinfo, first) == first)
3401 first = stmt_vinfo;
3404 return first;
3407 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3408 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3409 (also containing the first GROUP1_SIZE stmts, since stores are
3410 consecutive), the second containing the remainder.
3411 Return the first stmt in the second group. */
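/* Illustrative example (added for exposition, not part of the original
   source): splitting a group of seven consecutive stores with
   GROUP1_SIZE = 4 leaves stores 0..3 in the group headed by FIRST_VINFO
   and returns the stmt for store 4 as head of a new group containing
   stores 4..6; the DR_GROUP_GAPs of both group heads are bumped so each
   group also skips over the other group's elements.  */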
3413 static stmt_vec_info
3414 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3416 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3417 gcc_assert (group1_size > 0);
3418 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3419 gcc_assert (group2_size > 0);
3420 DR_GROUP_SIZE (first_vinfo) = group1_size;
3422 stmt_vec_info stmt_info = first_vinfo;
3423 for (unsigned i = group1_size; i > 1; i--)
3425 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3426 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3428 /* STMT is now the last element of the first group. */
3429 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3430 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3432 DR_GROUP_SIZE (group2) = group2_size;
3433 for (stmt_info = group2; stmt_info;
3434 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3436 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3437 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3440 /* For the second group, the DR_GROUP_GAP is that before the original group,
3441 plus skipping over the first vector. */
3442 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3444 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3445 DR_GROUP_GAP (first_vinfo) += group2_size;
3447 if (dump_enabled_p ())
3448 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3449 group1_size, group2_size);
3451 return group2;
3454 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3455 statements and a vector of NUNITS elements. */
3457 static poly_uint64
3458 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3460 return exact_div (common_multiple (nunits, group_size), group_size);
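/* Worked example (added for exposition, not part of the original source):
   for GROUP_SIZE = 3 and NUNITS = 4 the least common multiple is 12, so
   the unrolling factor is 12 / 3 = 4; four copies of the three-statement
   group exactly fill three four-element vectors.  When GROUP_SIZE is a
   multiple of NUNITS the factor is 1 and no unrolling is needed.  */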
3463 /* Helper that checks to see if a node is a load node. */
3465 static inline bool
3466 vect_is_slp_load_node (slp_tree root)
3468 return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
3469 && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3470 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3471 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3475 /* Helper function of optimize_load_redistribution that performs the operation
3476 recursively. */
3478 static slp_tree
3479 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3480 vec_info *vinfo, unsigned int group_size,
3481 hash_map<slp_tree, slp_tree> *load_map,
3482 slp_tree root)
3484 if (slp_tree *leader = load_map->get (root))
3485 return *leader;
3487 slp_tree node;
3488 unsigned i;
3490 /* For now, we don't know anything about externals so do not do anything. */
3491 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3492 return NULL;
3493 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3495 /* First convert this node into a load node and add it to the leaves
3496 list, flattening the permute from a lane permutation to a load
3497 permutation. If it's unneeded it will be elided later. */
3498 vec<stmt_vec_info> stmts;
3499 stmts.create (SLP_TREE_LANES (root));
3500 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3501 for (unsigned j = 0; j < lane_perm.length (); j++)
3503 std::pair<unsigned, unsigned> perm = lane_perm[j];
3504 node = SLP_TREE_CHILDREN (root)[perm.first];
3506 if (!vect_is_slp_load_node (node)
3507 || SLP_TREE_CHILDREN (node).exists ())
3509 stmts.release ();
3510 goto next;
3513 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3516 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE, vect_location,
3518 "converting stmts on permute node %p\n",
3519 (void *) root);
3521 bool *matches = XALLOCAVEC (bool, group_size);
3522 poly_uint64 max_nunits = 1;
3523 unsigned tree_size = 0, limit = 1;
3524 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3525 matches, &limit, &tree_size, bst_map);
3526 if (!node)
3527 stmts.release ();
3529 load_map->put (root, node);
3530 return node;
3533 next:
3534 load_map->put (root, NULL);
3536 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3538 slp_tree value
3539 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3540 node);
3541 if (value)
3543 SLP_TREE_REF_COUNT (value)++;
3544 SLP_TREE_CHILDREN (root)[i] = value;
3545 /* ??? We know the original leaves of the replaced nodes will
3546 be referenced by bst_map, only the permutes created by
3547 pattern matching are not. */
3548 if (SLP_TREE_REF_COUNT (node) == 1)
3549 load_map->remove (node);
3550 vect_free_slp_tree (node);
3554 return NULL;
3557 /* Temporary workaround for loads not being CSEd during SLP build. This
3558 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3559 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3560 same DR such that the final operation is equal to a permuted load. Such
3561 NODES are then directly converted into LOADS themselves. The nodes are
3562 CSEd using BST_MAP. */
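/* Illustrative sketch (an assumed scenario added for exposition, not part
   of the original source): if a VEC_PERM node selects lanes
   { 0[0], 1[0], 0[1], 1[1] } from two load nodes reading { a[0], a[2] }
   and { a[1], a[3] } of the same DR group, the routines above rebuild it
   as a single load node for { a[0], a[1], a[2], a[3] }, which can then be
   CSEd via BST_MAP.  */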
3564 static void
3565 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3566 vec_info *vinfo, unsigned int group_size,
3567 hash_map<slp_tree, slp_tree> *load_map,
3568 slp_tree root)
3570 slp_tree node;
3571 unsigned i;
3573 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3575 slp_tree value
3576 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3577 node);
3578 if (value)
3580 SLP_TREE_REF_COUNT (value)++;
3581 SLP_TREE_CHILDREN (root)[i] = value;
3582 /* ??? We know the original leaves of the replaced nodes will
3583 be referenced by bst_map, only the permutes created by
3584 pattern matching are not. */
3585 if (SLP_TREE_REF_COUNT (node) == 1)
3586 load_map->remove (node);
3587 vect_free_slp_tree (node);
3592 /* Helper function of vect_match_slp_patterns.
3594 Attempts to match patterns against the slp tree rooted in REF_NODE using
3595 VINFO. Patterns are matched in post-order traversal.
3597 If matching is successful the value in REF_NODE is updated in place;
3598 the return value indicates whether any pattern matched. */
3600 static bool
3601 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3602 slp_tree_to_load_perm_map_t *perm_cache,
3603 slp_compat_nodes_map_t *compat_cache,
3604 hash_set<slp_tree> *visited)
3606 unsigned i;
3607 slp_tree node = *ref_node;
3608 bool found_p = false;
3609 if (!node || visited->add (node))
3610 return false;
3612 slp_tree child;
3613 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3614 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3615 vinfo, perm_cache, compat_cache,
3616 visited);
3618 for (unsigned x = 0; x < num__slp_patterns; x++)
3620 vect_pattern *pattern
3621 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3622 if (pattern)
3624 pattern->build (vinfo);
3625 delete pattern;
3626 found_p = true;
3630 return found_p;
3633 /* Applies pattern matching to the SLP tree of INSTANCE using
3634 vec_info VINFO.
3636 The tree is modified in place and true is returned if any pattern
3637 matched. Patterns are tried in order and multiple patterns may match. */
3639 static bool
3640 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3641 hash_set<slp_tree> *visited,
3642 slp_tree_to_load_perm_map_t *perm_cache,
3643 slp_compat_nodes_map_t *compat_cache)
3645 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3646 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_NOTE, vect_location,
3650 "Analyzing SLP tree %p for patterns\n",
3651 (void *) SLP_INSTANCE_TREE (instance));
3653 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3654 visited);
3657 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3658 vectorizing with VECTYPE, which might be NULL. MASKED_P indicates whether
3659 the stores are masked.
3660 Return true if we could use IFN_STORE_LANES instead and if that appears
3661 to be the better approach. */
3663 static bool
3664 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3665 tree vectype, bool masked_p,
3666 unsigned int group_size,
3667 unsigned int new_group_size)
3669 if (!vectype)
3671 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3672 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3674 if (!vectype)
3675 return false;
3676 /* Allow the split if one of the two new groups would operate on full
3677 vectors *within* rather than across one scalar loop iteration.
3678 This is purely a heuristic, but it should work well for group
3679 sizes of 3 and 4, where the possible splits are:
3681 3->2+1: OK if the vector has exactly two elements
3682 4->2+2: Likewise
3683 4->3+1: Less clear-cut. */
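/* Worked example (added for exposition, not part of the original source):
   with a two-element vector type, a 4->2+2 split makes
   GROUP_SIZE - NEW_GROUP_SIZE = 2 a multiple of the vector length, so we
   return false and prefer the split; a 4->3+1 split fails both multiple_p
   checks and the answer then depends on whether IFN_STORE_LANES is
   supported for the group.  */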
3684 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3685 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3686 return false;
3687 return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3690 /* Analyze an SLP instance starting from a group of grouped stores. Call
3691 vect_build_slp_tree to build a tree of packed stmts if possible.
3692 Return FALSE if it's impossible to SLP any stmt in the loop. */
3694 static bool
3695 vect_analyze_slp_instance (vec_info *vinfo,
3696 scalar_stmts_to_slp_tree_map_t *bst_map,
3697 stmt_vec_info stmt_info, slp_instance_kind kind,
3698 unsigned max_tree_size, unsigned *limit,
3699 bool force_single_lane);
3701 /* Build an interleaving scheme for the store sources RHS_NODES from
3702 SCALAR_STMTS. */
3704 static slp_tree
3705 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3706 vec<stmt_vec_info> &scalar_stmts,
3707 poly_uint64 max_nunits)
3709 unsigned int group_size = scalar_stmts.length ();
3710 slp_tree node = vect_create_new_slp_node (scalar_stmts,
3711 SLP_TREE_CHILDREN
3712 (rhs_nodes[0]).length ());
3713 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3714 node->max_nunits = max_nunits;
3715 for (unsigned l = 0;
3716 l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3718 /* And a permute merging all RHS SLP trees. */
3719 slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3720 VEC_PERM_EXPR);
3721 SLP_TREE_CHILDREN (node).quick_push (perm);
3722 SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3723 SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3724 perm->max_nunits = max_nunits;
3725 SLP_TREE_LANES (perm) = group_size;
3726 /* ??? We should set this NULL but that's not expected. */
3727 SLP_TREE_REPRESENTATIVE (perm)
3728 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3729 for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3731 SLP_TREE_CHILDREN (perm)
3732 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3733 SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3734 for (unsigned k = 0;
3735 k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3737 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3738 or SLP_TREE_SCALAR_OPS but then we might have
3739 a mix of both in our children. */
3740 SLP_TREE_LANE_PERMUTATION (perm)
3741 .quick_push (std::make_pair (j, k));
3745 /* Now we have a single permute node but we cannot code-generate
3746 the case with more than two inputs.
3747 Perform pairwise reduction, reducing the two inputs
3748 with the least number of lanes to one and then repeat until
3749 we end up with two inputs. That scheme makes sure we end
3750 up with permutes satisfying the restriction of requiring at
3751 most two vector inputs to produce a single vector output
3752 when the number of lanes is even. */
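     /* For example, with four two-lane inputs { A, B, C, D } the first
        iteration merges A and B into a four-lane intermediate permute AB,
        leaving { AB, C, D }; the next iteration picks C and D as the two
        nodes with the least lanes and merges them, leaving { AB, CD },
        which satisfies the two-input restriction.  */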
3753 while (SLP_TREE_CHILDREN (perm).length () > 2)
3755 /* When we have three equal sized groups left the pairwise
3756 reduction does not result in a scheme that avoids using
3757 three vectors. Instead merge the first two groups
3758 to the final size with do-not-care elements (chosen
3759 from the first group) and then merge with the third.
3760 { A0, B0, x, A1, B1, x, ... }
3761 -> { A0, B0, C0, A1, B1, C1, ... }
3762 This handles group size of three (and at least
3763 power-of-two multiples of that). */
3764 if (SLP_TREE_CHILDREN (perm).length () == 3
3765 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3766 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3767 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3768 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3770 int ai = 0;
3771 int bi = 1;
3772 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3773 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3774 unsigned n = SLP_TREE_LANES (perm);
3776 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3777 SLP_TREE_LANES (permab) = n;
3778 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3779 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3780 permab->max_nunits = max_nunits;
3781 /* ??? Should be NULL but that's not expected. */
3782 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3783 SLP_TREE_CHILDREN (permab).quick_push (a);
3784 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3785 SLP_TREE_LANE_PERMUTATION (permab)
3786 .quick_push (std::make_pair (0, k));
3787 SLP_TREE_CHILDREN (permab).quick_push (b);
3788 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3789 SLP_TREE_LANE_PERMUTATION (permab)
3790 .quick_push (std::make_pair (1, k));
3791 /* Push the do-not-care lanes. */
3792 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3793 SLP_TREE_LANE_PERMUTATION (permab)
3794 .quick_push (std::make_pair (0, k));
3796 /* Put the merged node into 'perm', in place of a. */
3797 SLP_TREE_CHILDREN (perm)[ai] = permab;
3798 /* Adjust the references to b in the permutation
3799 of perm and to the later children which we'll
3800 remove. */
3801 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3803 std::pair<unsigned, unsigned> &p
3804 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3805 if (p.first == (unsigned) bi)
3807 p.first = ai;
3808 p.second += SLP_TREE_LANES (a);
3810 else if (p.first > (unsigned) bi)
3811 p.first--;
3813 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3814 break;
3817 /* Pick the two nodes with the least number of lanes,
3818 prefer the earliest candidate and maintain ai < bi. */
3819 int ai = -1;
3820 int bi = -1;
3821 for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3823 if (ai == -1)
3824 ai = ci;
3825 else if (bi == -1)
3826 bi = ci;
3827 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3828 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3829 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3830 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3832 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3833 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3834 bi = ci;
3835 else
3837 ai = bi;
3838 bi = ci;
3843 /* Produce a merge of nodes ai and bi. */
3844 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3845 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3846 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3847 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3848 SLP_TREE_LANES (permab) = n;
3849 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3850 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3851 permab->max_nunits = max_nunits;
3852 /* ??? Should be NULL but that's not expected. */
3853 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3854 SLP_TREE_CHILDREN (permab).quick_push (a);
3855 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3856 SLP_TREE_LANE_PERMUTATION (permab)
3857 .quick_push (std::make_pair (0, k));
3858 SLP_TREE_CHILDREN (permab).quick_push (b);
3859 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3860 SLP_TREE_LANE_PERMUTATION (permab)
3861 .quick_push (std::make_pair (1, k));
3863 /* Put the merged node into 'perm', in place of a. */
3864 SLP_TREE_CHILDREN (perm)[ai] = permab;
3865 /* Adjust the references to b in the permutation
3866 of perm and to the later children which we'll
3867 remove. */
3868 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3870 std::pair<unsigned, unsigned> &p
3871 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3872 if (p.first == (unsigned) bi)
3874 p.first = ai;
3875 p.second += SLP_TREE_LANES (a);
3877 else if (p.first > (unsigned) bi)
3878 p.first--;
3880 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3884 return node;
3887 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3888 of KIND. Return true if successful. */
3890 static bool
3891 vect_build_slp_instance (vec_info *vinfo,
3892 slp_instance_kind kind,
3893 vec<stmt_vec_info> &scalar_stmts,
3894 vec<stmt_vec_info> &root_stmt_infos,
3895 vec<tree> &remain,
3896 unsigned max_tree_size, unsigned *limit,
3897 scalar_stmts_to_slp_tree_map_t *bst_map,
3898 /* ??? We need stmt_info for group splitting. */
3899 stmt_vec_info stmt_info_,
3900 bool force_single_lane)
3902 /* If there's no budget left bail out early. */
3903 if (*limit == 0)
3904 return false;
3906 if (kind == slp_inst_kind_ctor)
3908 if (dump_enabled_p ())
3909 dump_printf_loc (MSG_NOTE, vect_location,
3910 "Analyzing vectorizable constructor: %G\n",
3911 root_stmt_infos[0]->stmt);
3913 else if (kind == slp_inst_kind_gcond)
3915 if (dump_enabled_p ())
3916 dump_printf_loc (MSG_NOTE, vect_location,
3917 "Analyzing vectorizable control flow: %G",
3918 root_stmt_infos[0]->stmt);
3921 if (dump_enabled_p ())
3923 dump_printf_loc (MSG_NOTE, vect_location,
3924 "Starting SLP discovery for\n");
3925 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3926 dump_printf_loc (MSG_NOTE, vect_location,
3927 " %G", scalar_stmts[i]->stmt);
3930 /* Build the tree for the SLP instance. */
3931 unsigned int group_size = scalar_stmts.length ();
3932 bool *matches = XALLOCAVEC (bool, group_size);
3933 poly_uint64 max_nunits = 1;
3934 unsigned tree_size = 0;
3935 unsigned i;
3937 slp_tree node = NULL;
3938 if (group_size > 1 && force_single_lane)
3940 matches[0] = true;
3941 matches[1] = false;
3943 else
3944 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3945 &max_nunits, matches, limit,
3946 &tree_size, bst_map);
3947 if (node != NULL)
3949 /* Calculate the unrolling factor based on the smallest type. */
3950 poly_uint64 unrolling_factor
3951 = calculate_unrolling_factor (max_nunits, group_size);
3953 if (maybe_ne (unrolling_factor, 1U)
3954 && is_a <bb_vec_info> (vinfo))
3956 unsigned HOST_WIDE_INT const_max_nunits;
3957 if (!max_nunits.is_constant (&const_max_nunits)
3958 || const_max_nunits > group_size)
3960 if (dump_enabled_p ())
3961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3962 "Build SLP failed: store group "
3963 "size not a multiple of the vector size "
3964 "in basic block SLP\n");
3965 vect_free_slp_tree (node);
3966 return false;
3968 /* Fatal mismatch. */
3969 if (dump_enabled_p ())
3970 dump_printf_loc (MSG_NOTE, vect_location,
3971 "SLP discovery succeeded but node needs "
3972 "splitting\n");
3973 memset (matches, true, group_size);
3974 matches[group_size / const_max_nunits * const_max_nunits] = false;
3975 vect_free_slp_tree (node);
3977 else
3979 /* Create a new SLP instance. */
3980 slp_instance new_instance = XNEW (class _slp_instance);
3981 SLP_INSTANCE_TREE (new_instance) = node;
3982 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3983 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3984 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3985 SLP_INSTANCE_KIND (new_instance) = kind;
3986 new_instance->reduc_phis = NULL;
3987 new_instance->cost_vec = vNULL;
3988 new_instance->subgraph_entries = vNULL;
3990 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_NOTE, vect_location,
3992 "SLP size %u vs. limit %u.\n",
3993 tree_size, max_tree_size);
3995 /* Fixup SLP reduction chains. */
3996 if (kind == slp_inst_kind_reduc_chain)
3998 /* If this is a reduction chain with a conversion in front
3999 amend the SLP tree with a node for that. */
4000 gimple *scalar_def
4001 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
4002 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
4004 /* Get at the conversion stmt - we know it's the single use
4005 of the last stmt of the reduction chain. */
4006 use_operand_p use_p;
4007 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
4008 &use_p, &scalar_def);
4009 gcc_assert (r);
4010 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
4011 next_info = vect_stmt_to_vectorize (next_info);
4012 scalar_stmts = vNULL;
4013 scalar_stmts.create (group_size);
4014 for (unsigned i = 0; i < group_size; ++i)
4015 scalar_stmts.quick_push (next_info);
4016 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4017 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
4018 SLP_TREE_CHILDREN (conv).quick_push (node);
4019 SLP_INSTANCE_TREE (new_instance) = conv;
4020 /* We also have to fake this conversion stmt as SLP reduction
4021 group so we don't have to mess with too much code
4022 elsewhere. */
4023 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
4024 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
4026 /* Fill the backedge child of the PHI SLP node. The
4027 general matching code cannot find it because the
4028 scalar code does not reflect how we vectorize the
4029 reduction. */
4030 use_operand_p use_p;
4031 imm_use_iterator imm_iter;
4032 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
4033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4034 gimple_get_lhs (scalar_def))
4035 /* There are exactly two non-debug uses, the reduction
4036 PHI and the loop-closed PHI node. */
4037 if (!is_gimple_debug (USE_STMT (use_p))
4038 && gimple_bb (USE_STMT (use_p)) == loop->header)
4040 auto_vec<stmt_vec_info, 64> phis (group_size);
4041 stmt_vec_info phi_info
4042 = vinfo->lookup_stmt (USE_STMT (use_p));
4043 for (unsigned i = 0; i < group_size; ++i)
4044 phis.quick_push (phi_info);
4045 slp_tree *phi_node = bst_map->get (phis);
4046 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4047 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4048 = SLP_INSTANCE_TREE (new_instance);
4049 SLP_INSTANCE_TREE (new_instance)->refcnt++;
4053 vinfo->slp_instances.safe_push (new_instance);
4055 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4056 the number of scalar stmts in the root in a few places.
4057 Verify that assumption holds. */
4058 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4059 .length () == group_size);
4061 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 "Final SLP tree for instance %p:\n",
4065 (void *) new_instance);
4066 vect_print_slp_graph (MSG_NOTE, vect_location,
4067 SLP_INSTANCE_TREE (new_instance));
4070 return true;
4073 /* Failed to SLP. */
4075 stmt_vec_info stmt_info = stmt_info_;
4076 /* Try to break the group up into pieces. */
4077 if (*limit > 0 && kind == slp_inst_kind_store)
4079 /* ??? We could delay all the actual splitting of store-groups
4080 until after SLP discovery of the original group completed.
4081 Then we can recurse to vect_build_slp_instance directly. */
4082 for (i = 0; i < group_size; i++)
4083 if (!matches[i])
4084 break;
4086 /* For basic block SLP, try to break the group up into multiples of
4087 a vector size. */
4088 if (is_a <bb_vec_info> (vinfo)
4089 && (i > 1 && i < group_size))
4091 /* Free the allocated memory. */
4092 scalar_stmts.release ();
4094 tree scalar_type
4095 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
4096 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
4097 1 << floor_log2 (i));
4098 unsigned HOST_WIDE_INT const_nunits;
4099 if (vectype
4100 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
4102 /* Split into two groups at the first vector boundary. */
4103 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
4104 unsigned group1_size = i & ~(const_nunits - 1);
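   /* E.g. a mismatch at i == 6 with const_nunits == 4 yields
      group1_size == 4: the first four stores form one group and
      the remainder is split off and re-analyzed below.  */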
4106 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_NOTE, vect_location,
4108 "Splitting SLP group at stmt %u\n", i);
4109 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
4110 group1_size);
4111 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
4112 kind, max_tree_size,
4113 limit, false);
4114 /* Split the rest at the failure point and possibly
4115 re-analyze the remaining matching part if it has
4116 at least two lanes. */
4117 if (group1_size < i
4118 && (i + 1 < group_size
4119 || i - group1_size > 1))
4121 stmt_vec_info rest2 = rest;
4122 rest = vect_split_slp_store_group (rest, i - group1_size);
4123 if (i - group1_size > 1)
4124 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
4125 kind, max_tree_size,
4126 limit, false);
4128 /* Re-analyze the non-matching tail if it has at least
4129 two lanes. */
4130 if (i + 1 < group_size)
4131 res |= vect_analyze_slp_instance (vinfo, bst_map,
4132 rest, kind, max_tree_size,
4133 limit, false);
4134 return res;
4138 /* For loop vectorization split the RHS into arbitrary pieces of
4139 size >= 1. */
4140 else if (is_a <loop_vec_info> (vinfo)
4141 && (group_size != 1 && i < group_size))
4143 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4144 bool masked_p = call
4145 && gimple_call_internal_p (call)
4146 && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
4147 /* There are targets that cannot do even/odd interleaving schemes
4148 so they absolutely need to use load/store-lanes. For now
4149 force single-lane SLP for them - they would be happy with
4150 uniform power-of-two lanes (but depending on element size),
4151 but even if we can use 'i' as indicator we would need to
4152 backtrack when later lanes fail to discover with the same
4153 granularity. We cannot turn strided or scatter stores
4154 into store-lanes. */
4155 /* ??? If this is not in sync with what get_load_store_type
4156 later decides the SLP representation is not good for other
4157 store vectorization methods. */
4158 bool want_store_lanes
4159 = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4160 && ! STMT_VINFO_STRIDED_P (stmt_info)
4161 && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4162 && compare_step_with_zero (vinfo, stmt_info) > 0
4163 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
4164 masked_p, group_size, 1));
4165 if (want_store_lanes || force_single_lane)
4166 i = 1;
4168 /* A fatal discovery fail doesn't always mean single-lane SLP
4169 isn't a possibility, so try. */
4170 if (i == 0)
4171 i = 1;
4173 if (dump_enabled_p ())
4174 dump_printf_loc (MSG_NOTE, vect_location,
4175 "Splitting SLP group at stmt %u\n", i);
4177 /* Analyze the stored values and pinch them together with
4178 a permute node so we can preserve the whole store group. */
4179 auto_vec<slp_tree> rhs_nodes;
4180 poly_uint64 max_nunits = 1;
4182 unsigned int rhs_common_nlanes = 0;
4183 unsigned int start = 0, end = i;
4184 while (start < group_size)
4186 gcc_assert (end - start >= 1);
4187 vec<stmt_vec_info> substmts;
4188 substmts.create (end - start);
4189 for (unsigned j = start; j < end; ++j)
4190 substmts.quick_push (scalar_stmts[j]);
4191 max_nunits = 1;
4192 node = vect_build_slp_tree (vinfo, substmts, end - start,
4193 &max_nunits,
4194 matches, limit, &tree_size, bst_map);
4195 if (node)
4197 rhs_nodes.safe_push (node);
4198 vect_update_max_nunits (&max_nunits, node->max_nunits);
4199 if (start == 0)
4200 rhs_common_nlanes = SLP_TREE_LANES (node);
4201 else if (rhs_common_nlanes != SLP_TREE_LANES (node))
4202 rhs_common_nlanes = 0;
4203 start = end;
4204 if (want_store_lanes || force_single_lane)
4205 end = start + 1;
4206 else
4207 end = group_size;
4209 else
4211 substmts.release ();
4212 if (end - start == 1)
4214 /* Single-lane discovery failed. Free resources. */
4215 for (auto node : rhs_nodes)
4216 vect_free_slp_tree (node);
4217 scalar_stmts.release ();
4218 if (dump_enabled_p ())
4219 dump_printf_loc (MSG_NOTE, vect_location,
4220 "SLP discovery failed\n");
4221 return false;
4224 /* ??? It really happens that we soft-fail SLP
4225 build at a mismatch but the matching part hard-fails
4226 later. As we know we arrived here with a group
4227 larger than one, try a group of size one! */
4228 if (!matches[0])
4229 end = start + 1;
4230 else
4231 for (unsigned j = start; j < end; j++)
4232 if (!matches[j - start])
4234 end = j;
4235 break;
4240 /* Now re-assess whether we want store lanes in case the
4241 discovery ended up producing all single-lane RHSs. */
4242 if (! want_store_lanes
4243 && rhs_common_nlanes == 1
4244 && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4245 && ! STMT_VINFO_STRIDED_P (stmt_info)
4246 && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4247 && compare_step_with_zero (vinfo, stmt_info) > 0
4248 && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
4249 group_size, masked_p)
4250 != IFN_LAST))
4251 want_store_lanes = true;
4253 /* Now we assume we can build the root SLP node from all stores. */
4254 if (want_store_lanes)
4256 /* For store-lanes feed the store node with all RHS nodes
4257 in order. */
4258 node = vect_create_new_slp_node (scalar_stmts,
4259 SLP_TREE_CHILDREN
4260 (rhs_nodes[0]).length ());
4261 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4262 node->max_nunits = max_nunits;
4263 node->ldst_lanes = true;
4264 SLP_TREE_CHILDREN (node)
4265 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4266 + rhs_nodes.length () - 1);
4267 /* First store value and possibly mask. */
4268 SLP_TREE_CHILDREN (node)
4269 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4270 /* Rest of the store values. All mask nodes are the same,
4271 this should be guaranteed by dataref group discovery. */
4272 for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4273 SLP_TREE_CHILDREN (node)
4274 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4275 for (slp_tree child : SLP_TREE_CHILDREN (node))
4276 child->refcnt++;
4278 else
4279 node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
4280 max_nunits);
4282 while (!rhs_nodes.is_empty ())
4283 vect_free_slp_tree (rhs_nodes.pop ());
4285 /* Create a new SLP instance. */
4286 slp_instance new_instance = XNEW (class _slp_instance);
4287 SLP_INSTANCE_TREE (new_instance) = node;
4288 SLP_INSTANCE_LOADS (new_instance) = vNULL;
4289 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4290 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4291 SLP_INSTANCE_KIND (new_instance) = kind;
4292 new_instance->reduc_phis = NULL;
4293 new_instance->cost_vec = vNULL;
4294 new_instance->subgraph_entries = vNULL;
4296 if (dump_enabled_p ())
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "SLP size %u vs. limit %u.\n",
4299 tree_size, max_tree_size);
4301 vinfo->slp_instances.safe_push (new_instance);
4303 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4304 the number of scalar stmts in the root in a few places.
4305 Verify that assumption holds. */
4306 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4307 .length () == group_size);
4309 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_NOTE, vect_location,
4312 "Final SLP tree for instance %p:\n",
4313 (void *) new_instance);
4314 vect_print_slp_graph (MSG_NOTE, vect_location,
4315 SLP_INSTANCE_TREE (new_instance));
4317 return true;
4319 else
4320 /* Free the allocated memory. */
4321 scalar_stmts.release ();
4323 /* Even though the first vector did not all match, we might be able to SLP
4324 (some) of the remainder. FORNOW ignore this possibility. */
4326 else
4327 /* Free the allocated memory. */
4328 scalar_stmts.release ();
4330 /* Failed to SLP. */
4331 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4333 return false;
4337 /* Analyze an SLP instance starting from a group of grouped stores. Call
4338 vect_build_slp_tree to build a tree of packed stmts if possible.
4339 Return FALSE if it's impossible to SLP any stmt in the loop. */
4341 static bool
4342 vect_analyze_slp_instance (vec_info *vinfo,
4343 scalar_stmts_to_slp_tree_map_t *bst_map,
4344 stmt_vec_info stmt_info,
4345 slp_instance_kind kind,
4346 unsigned max_tree_size, unsigned *limit,
4347 bool force_single_lane)
4349 vec<stmt_vec_info> scalar_stmts;
4351 if (is_a <bb_vec_info> (vinfo))
4352 vect_location = stmt_info->stmt;
4354 stmt_vec_info next_info = stmt_info;
4355 if (kind == slp_inst_kind_store)
4357 /* Collect the stores and store them in scalar_stmts. */
4358 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4359 while (next_info)
4361 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4362 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4365 else if (kind == slp_inst_kind_reduc_chain)
4367 /* Collect the reduction stmts and store them in scalar_stmts. */
4368 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4369 while (next_info)
4371 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4372 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4374 /* Mark the first element of the reduction chain as reduction to properly
4375 transform the node. In the reduction analysis phase only the last
4376 element of the chain is marked as reduction. */
4377 STMT_VINFO_DEF_TYPE (stmt_info)
4378 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4379 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4380 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4382 else
4383 gcc_unreachable ();
4385 vec<stmt_vec_info> roots = vNULL;
4386 vec<tree> remain = vNULL;
4387 /* Build the tree for the SLP instance. */
4388 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4389 roots, remain,
4390 max_tree_size, limit, bst_map,
4391 kind == slp_inst_kind_store
4392 ? stmt_info : NULL, force_single_lane);
4394 /* ??? If this is slp_inst_kind_store and the above succeeded here's
4395 where we should do store group splitting. */
4397 return res;
4400 /* qsort comparator ordering SLP load nodes. */
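 /* Nodes from the same dataref group sort adjacent, with larger lane counts
    first and ties broken by the load permutation; different groups sort by
    the UID of their first group element.  */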
4402 static int
4403 vllp_cmp (const void *a_, const void *b_)
4405 const slp_tree a = *(const slp_tree *)a_;
4406 const slp_tree b = *(const slp_tree *)b_;
4407 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4408 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4409 if (STMT_VINFO_GROUPED_ACCESS (a0)
4410 && STMT_VINFO_GROUPED_ACCESS (b0)
4411 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4413 /* Same group, order by the number of lanes used. */
4414 if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4415 return 1;
4416 else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4417 return -1;
4418 else
4420 /* Try to order loads using the same lanes together, breaking
4421 the tie with the lane number that first differs. */
4422 if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4423 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4424 return 0;
4425 else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4426 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4427 return 1;
4428 else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4429 && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4430 return -1;
4431 else
4433 for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4434 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4435 != SLP_TREE_LOAD_PERMUTATION (b)[i])
4437 /* In-order lane first, that's what the above case for
4438 no permutation does. */
4439 if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4440 return -1;
4441 else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4442 return 1;
4443 else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4444 < SLP_TREE_LOAD_PERMUTATION (b)[i])
4445 return -1;
4446 else
4447 return 1;
4449 return 0;
4453 else /* Different groups or non-groups. */
4455 /* Order groups as their first element to keep them together. */
4456 if (STMT_VINFO_GROUPED_ACCESS (a0))
4457 a0 = DR_GROUP_FIRST_ELEMENT (a0);
4458 if (STMT_VINFO_GROUPED_ACCESS (b0))
4459 b0 = DR_GROUP_FIRST_ELEMENT (b0);
4460 if (a0 == b0)
4461 return 0;
4462 /* Tie using UID. */
4463 else if (gimple_uid (STMT_VINFO_STMT (a0))
4464 < gimple_uid (STMT_VINFO_STMT (b0)))
4465 return -1;
4466 else
4468 gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4469 != gimple_uid (STMT_VINFO_STMT (b0)));
4470 return 1;
4475 /* Process the set of LOADS that are all from the same dataref group. */
4477 static void
4478 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4479 scalar_stmts_to_slp_tree_map_t *bst_map,
4480 const array_slice<slp_tree> &loads,
4481 bool force_single_lane)
4483 /* At this point we want to lower without a fixed VF or vector
4484 size in mind, which means we cannot yet compute whether we
4485 need three or more vectors for a load permutation. So always
4486 lower. */
4487 stmt_vec_info first
4488 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4489 unsigned group_lanes = DR_GROUP_SIZE (first);
4491 /* Verify if all load permutations can be implemented with a suitably
4492 large element load-lanes operation. */
4493 unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4494 if (STMT_VINFO_STRIDED_P (first)
4495 || compare_step_with_zero (loop_vinfo, first) <= 0
4496 || exact_log2 (ld_lanes_lanes) == -1
4497 /* ??? For now only support the single-lane case as there is
4498 missing support on the store-lane side and code generation
4499 isn't up to the task yet. */
4500 || ld_lanes_lanes != 1
4501 || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4502 group_lanes / ld_lanes_lanes,
4503 false) == IFN_LAST)
4504 ld_lanes_lanes = 0;
4505 else
4506 /* Verify the loads access the same number of lanes aligned to
4507 ld_lanes_lanes. */
4508 for (slp_tree load : loads)
4510 if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4512 ld_lanes_lanes = 0;
4513 break;
4515 unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4516 if (first % ld_lanes_lanes != 0)
4518 ld_lanes_lanes = 0;
4519 break;
4521 for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4522 if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4524 ld_lanes_lanes = 0;
4525 break;
4529 /* Only a power-of-two number of lanes matches interleaving with N levels.
4530 ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4531 at each step. */
4532 if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4533 return;
4535 for (slp_tree load : loads)
4537 /* Leave masked or gather loads alone for now. */
4538 if (!SLP_TREE_CHILDREN (load).is_empty ())
4539 continue;
4541 /* We want to pattern-match special cases here and keep those
4542 alone. Candidates are splats and load-lane. */
4544 /* We need to lower only loads of less than half of the group's
4545 lanes, including duplicate lanes. Note this leaves nodes
4546 with a non-1:1 load permutation around instead of canonicalizing
4547 those into a load and a permute node. Removing this early
4548 check would do such canonicalization. */
4549 if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4550 && ld_lanes_lanes == 0)
4551 continue;
4553 /* Build the permute to get the original load permutation order. */
4554 bool contiguous = true;
4555 lane_permutation_t final_perm;
4556 final_perm.create (SLP_TREE_LANES (load));
4557 for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4559 final_perm.quick_push
4560 (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4561 if (i != 0
4562 && (SLP_TREE_LOAD_PERMUTATION (load)[i]
4563 != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
4564 contiguous = false;
4567 /* When the load permutation accesses a contiguous unpermuted,
4568 power-of-two aligned and sized chunk leave the load alone.
4569 We can likely (re-)load it more efficiently rather than
4570 extracting it from the larger load.
4571 ??? Long-term some of the lowering should move to where
4572 the vector types involved are fixed. */
4573 if (!force_single_lane
4574 && ld_lanes_lanes == 0
4575 && contiguous
4576 && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
4577 && pow2p_hwi (SLP_TREE_LANES (load))
4578 && pow2p_hwi (group_lanes)
4579 && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
4580 && group_lanes % SLP_TREE_LANES (load) == 0)
4582 final_perm.release ();
4583 continue;
4586 /* First build (and possibly re-use) a load node for the
4587 unpermuted group. Gaps in the middle and on the end are
4588 represented with NULL stmts. */
4589 vec<stmt_vec_info> stmts;
4590 stmts.create (group_lanes);
4591 for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4593 if (s != first)
4594 for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4595 stmts.quick_push (NULL);
4596 stmts.quick_push (s);
4598 for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4599 stmts.quick_push (NULL);
4600 poly_uint64 max_nunits = 1;
4601 bool *matches = XALLOCAVEC (bool, group_lanes);
4602 unsigned limit = 1;
4603 unsigned tree_size = 0;
4604 slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4605 group_lanes,
4606 &max_nunits, matches, &limit,
4607 &tree_size, bst_map);
4608 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
4610 if (ld_lanes_lanes != 0)
4612 /* ??? If this is not in sync with what get_load_store_type
4613 later decides the SLP representation is not good for other
4614 store vectorization methods. */
4615 l0->ldst_lanes = true;
4616 load->ldst_lanes = true;
4619 while (1)
4621 unsigned group_lanes = SLP_TREE_LANES (l0);
4622 if (ld_lanes_lanes != 0
4623 || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4624 break;
4626 /* Try to lower by reducing the group to half its size using an
4627 interleaving scheme. For this try to compute whether all
4628 elements needed for this load are in even or odd elements of
4629 an even/odd decomposition with N consecutive elements.
4630 Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4631 with N == 2. */
4632 /* ??? Only an even number of lanes can be handled this way, but the
4633 fallback below could work for any number. We have to make sure
4634 to round up in that case. */
4635 gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4636 unsigned even = 0, odd = 0;
4637 if ((group_lanes & 1) == 0)
4639 even = (1 << ceil_log2 (group_lanes)) - 1;
4640 odd = even;
4641 for (auto l : final_perm)
4643 even &= ~l.second;
4644 odd &= l.second;
4648 /* Now build an even or odd extraction from the unpermuted load. */
4649 lane_permutation_t perm;
4650 perm.create ((group_lanes + 1) / 2);
4651 unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
4652 unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
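     /* For example, with group_lanes == 8 a load of lanes { 0, 2, 4, 6 }
        leaves even == 1 and odd == 0 above, so even_level == 1 and the
        first branch below builds the classic even extract { 0, 2, 4, 6 };
        a load of lanes { 0, 1, 4, 5 } leaves even == 2, so even_level == 2
        and we extract pairs, again { 0, 1, 4, 5 }.  */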
4653 if (even_level
4654 && group_lanes % (2 * even_level) == 0
4655 /* ??? When code generating permutes we do not try to pun
4656 to larger component modes so level != 1 isn't a natural
4657 even/odd extract. Prefer one if possible. */
4658 && (even_level == 1 || !odd_level || odd_level != 1))
4660 /* { 0, 1, ... 4, 5 ..., } */
4661 for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
4662 for (unsigned j = 0; j < even_level; ++j)
4663 perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
4665 else if (odd_level)
4667 /* { ..., 2, 3, ... 6, 7 } */
4668 gcc_assert (group_lanes % (2 * odd_level) == 0);
4669 for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
4670 for (unsigned j = 0; j < odd_level; ++j)
4671 perm.quick_push
4672 (std::make_pair (0, (2 * i + 1) * odd_level + j));
4674 else
4676 /* As a fallback extract all used lanes and fill to half the
4677 group size by repeating the last element.
4678 ??? This is quite a bad strategy for re-use - we could
4679 brute force our way to find more optimal filling lanes to
4680 maximize re-use when looking at all loads from the group. */
4681 auto_bitmap l;
4682 for (auto p : final_perm)
4683 bitmap_set_bit (l, p.second);
4684 unsigned i = 0;
4685 bitmap_iterator bi;
4686 EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4687 perm.quick_push (std::make_pair (0, i));
4688 while (perm.length () < (group_lanes + 1) / 2)
4689 perm.quick_push (perm.last ());
4692 /* Update final_perm with the intermediate permute. */
4693 for (unsigned i = 0; i < final_perm.length (); ++i)
4695 unsigned l = final_perm[i].second;
4696 unsigned j;
4697 for (j = 0; j < perm.length (); ++j)
4698 if (perm[j].second == l)
4700 final_perm[i].second = j;
4701 break;
4703 gcc_assert (j < perm.length ());
4706 /* And create scalar stmts. */
4707 vec<stmt_vec_info> perm_stmts;
4708 perm_stmts.create (perm.length ());
4709 for (unsigned i = 0; i < perm.length (); ++i)
4710 perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4712 slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4713 SLP_TREE_CHILDREN (p).quick_push (l0);
4714 SLP_TREE_LANE_PERMUTATION (p) = perm;
4715 SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4716 SLP_TREE_LANES (p) = perm.length ();
4717 SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4718 /* ??? As we have scalar stmts for this intermediate permute we
4719 could CSE it via bst_map but we do not want to pick up
4720 another SLP node with a load permutation. We instead should
4721 have a "local" CSE map here. */
4722 SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4724 /* We now have a node for (group_lanes + 1) / 2 lanes. */
4725 l0 = p;
4728 /* And finally from the ordered reduction node create the
4729 permute to shuffle the lanes into the original load-permutation
4730 order. We replace the original load node with this. */
4731 SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4732 SLP_TREE_LOAD_PERMUTATION (load).release ();
4733 SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4734 SLP_TREE_CHILDREN (load).create (1);
4735 SLP_TREE_CHILDREN (load).quick_push (l0);
4739 /* Transform SLP loads in the SLP graph created by SLP discovery to
4740 group loads from the same group and lower load permutations that
4741 are unlikely to be supported into a series of permutes.
4742 In the degenerate case of having only single-lane SLP instances
4743 this should result in a series of permute nodes emulating an
4744 interleaving scheme. */
4746 static void
4747 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4748 scalar_stmts_to_slp_tree_map_t *bst_map,
4749 bool force_single_lane)
4751 /* Gather and sort loads across all instances. */
4752 hash_set<slp_tree> visited;
4753 auto_vec<slp_tree> loads;
4754 for (auto inst : loop_vinfo->slp_instances)
4755 vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4756 if (loads.is_empty ())
4757 return;
4758 loads.qsort (vllp_cmp);
4760 /* Now process each dataref group separately. */
4761 unsigned firsti = 0;
4762 for (unsigned i = 1; i < loads.length (); ++i)
4764 slp_tree first = loads[firsti];
4765 slp_tree next = loads[i];
4766 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4767 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4768 if (STMT_VINFO_GROUPED_ACCESS (a0)
4769 && STMT_VINFO_GROUPED_ACCESS (b0)
4770 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4771 continue;
4772 /* Now we have one or multiple SLP loads of the same group from
4773 firsti to i - 1. */
4774 if (STMT_VINFO_GROUPED_ACCESS (a0))
4775 vect_lower_load_permutations (loop_vinfo, bst_map,
4776 make_array_slice (&loads[firsti],
4777 i - firsti),
4778 force_single_lane);
4779 firsti = i;
4781 if (firsti < loads.length ()
4782 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
4783 vect_lower_load_permutations (loop_vinfo, bst_map,
4784 make_array_slice (&loads[firsti],
4785 loads.length () - firsti),
4786 force_single_lane);
4789 /* Check if there are stmts in the loop that can be vectorized using SLP.
4790 Build SLP trees of packed scalar stmts if SLP is possible. */
4792 opt_result
4793 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
4794 bool force_single_lane)
4796 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4797 unsigned int i;
4798 stmt_vec_info first_element;
4799 slp_instance instance;
4801 DUMP_VECT_SCOPE ("vect_analyze_slp");
4803 unsigned limit = max_tree_size;
4805 scalar_stmts_to_slp_tree_map_t *bst_map
4806 = new scalar_stmts_to_slp_tree_map_t ();
4808 /* Find SLP sequences starting from groups of grouped stores. */
4809 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4810 vect_analyze_slp_instance (vinfo, bst_map, first_element,
4811 slp_inst_kind_store, max_tree_size, &limit,
4812 force_single_lane);
4814 /* For loops also start SLP discovery from non-grouped stores. */
4815 if (loop_vinfo)
4817 data_reference_p dr;
4818 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
4819 if (DR_IS_WRITE (dr))
4821 stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
4822 /* Grouped stores are already handled above. */
4823 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
4824 continue;
4825 vec<stmt_vec_info> stmts;
4826 vec<stmt_vec_info> roots = vNULL;
4827 vec<tree> remain = vNULL;
4828 stmts.create (1);
4829 stmts.quick_push (stmt_info);
4830 vect_build_slp_instance (vinfo, slp_inst_kind_store,
4831 stmts, roots, remain, max_tree_size,
4832 &limit, bst_map, NULL, force_single_lane);
4836 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4838 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4840 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4841 /* Apply patterns. */
4842 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4843 bb_vinfo->roots[i].stmts[j]
4844 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4845 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4846 bb_vinfo->roots[i].stmts,
4847 bb_vinfo->roots[i].roots,
4848 bb_vinfo->roots[i].remain,
4849 max_tree_size, &limit, bst_map, NULL,
4850 false))
4852 bb_vinfo->roots[i].stmts = vNULL;
4853 bb_vinfo->roots[i].roots = vNULL;
4854 bb_vinfo->roots[i].remain = vNULL;
4859 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4861 /* Find SLP sequences starting from reduction chains. */
4862 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4863 if (! STMT_VINFO_RELEVANT_P (first_element)
4864 && ! STMT_VINFO_LIVE_P (first_element))
4866 else if (force_single_lane
4867 || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4868 slp_inst_kind_reduc_chain,
4869 max_tree_size, &limit,
4870 force_single_lane))
4872 /* Dissolve reduction chain group. */
4873 stmt_vec_info vinfo = first_element;
4874 stmt_vec_info last = NULL;
4875 while (vinfo)
4877 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4878 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4879 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4880 last = vinfo;
4881 vinfo = next;
4883 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4884 /* It can be still vectorized as part of an SLP reduction. */
4885 loop_vinfo->reductions.safe_push (last);
4888 /* Find SLP sequences starting from groups of reductions. */
4889 if (loop_vinfo->reductions.length () > 0)
4891 /* Collect reduction statements we can combine into
4892 a SLP reduction. */
4893 vec<stmt_vec_info> scalar_stmts;
4894 scalar_stmts.create (loop_vinfo->reductions.length ());
4895 for (auto next_info : loop_vinfo->reductions)
4897 next_info = vect_stmt_to_vectorize (next_info);
4898 if ((STMT_VINFO_RELEVANT_P (next_info)
4899 || STMT_VINFO_LIVE_P (next_info))
4900 /* ??? Make sure we didn't skip a conversion around a
4901 reduction path. In that case we'd have to reverse
4902 engineer that conversion stmt following the chain using
4903 reduc_idx and from the PHI using reduc_def. */
4904 && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4905 || (STMT_VINFO_DEF_TYPE (next_info)
4906 == vect_double_reduction_def)))
4908 /* Do not discover SLP reductions combining lane-reducing
4909 ops, that will fail later. */
4910 if (!force_single_lane
4911 && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4912 scalar_stmts.quick_push (next_info);
4913 else
4915 /* Do SLP discovery for single-lane reductions. */
4916 vec<stmt_vec_info> stmts;
4917 vec<stmt_vec_info> roots = vNULL;
4918 vec<tree> remain = vNULL;
4919 stmts.create (1);
4920 stmts.quick_push (next_info);
4921 vect_build_slp_instance (vinfo,
4922 slp_inst_kind_reduc_group,
4923 stmts, roots, remain,
4924 max_tree_size, &limit,
4925 bst_map, NULL,
4926 force_single_lane);
4930 /* Save for re-processing on failure. */
4931 vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4932 vec<stmt_vec_info> roots = vNULL;
4933 vec<tree> remain = vNULL;
4934 if (scalar_stmts.length () <= 1
4935 || !vect_build_slp_instance (loop_vinfo,
4936 slp_inst_kind_reduc_group,
4937 scalar_stmts, roots, remain,
4938 max_tree_size, &limit, bst_map,
4939 NULL, force_single_lane))
4941 if (scalar_stmts.length () <= 1)
4942 scalar_stmts.release ();
4943 /* Do SLP discovery for single-lane reductions. */
4944 for (auto stmt_info : saved_stmts)
4946 vec<stmt_vec_info> stmts;
4947 vec<stmt_vec_info> roots = vNULL;
4948 vec<tree> remain = vNULL;
4949 stmts.create (1);
4950 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4951 vect_build_slp_instance (vinfo,
4952 slp_inst_kind_reduc_group,
4953 stmts, roots, remain,
4954 max_tree_size, &limit,
4955 bst_map, NULL, force_single_lane);
4958 saved_stmts.release ();
4961 /* Make sure to vectorize only-live stmts, usually inductions. */
4962 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
4963 for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
4964 gsi_next (&gsi))
4966 gphi *lc_phi = *gsi;
4967 tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
4968 stmt_vec_info stmt_info;
4969 if (TREE_CODE (def) == SSA_NAME
4970 && !virtual_operand_p (def)
4971 && (stmt_info = loop_vinfo->lookup_def (def))
4972 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
4973 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
4974 && STMT_VINFO_LIVE_P (stmt_info)
4975 && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
4976 || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
4977 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
4979 vec<stmt_vec_info> stmts;
4980 vec<stmt_vec_info> roots = vNULL;
4981 vec<tree> remain = vNULL;
4982 stmts.create (1);
4983 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4984 vect_build_slp_instance (vinfo,
4985 slp_inst_kind_reduc_group,
4986 stmts, roots, remain,
4987 max_tree_size, &limit,
4988 bst_map, NULL, force_single_lane);
4992 /* Find SLP sequences starting from gconds. */
4993 for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
4995 auto cond_info = loop_vinfo->lookup_stmt (cond);
4997 cond_info = vect_stmt_to_vectorize (cond_info);
4998 vec<stmt_vec_info> roots = vNULL;
4999 roots.safe_push (cond_info);
5000 gimple *stmt = STMT_VINFO_STMT (cond_info);
5001 tree args0 = gimple_cond_lhs (stmt);
5002 tree args1 = gimple_cond_rhs (stmt);
5004 /* These should be enforced by cond lowering. */
5005 gcc_assert (gimple_cond_code (stmt) == NE_EXPR);
5006 gcc_assert (zerop (args1));
5008 /* An argument without a loop def will be codegened from vectorizing the
5009 root gcond itself. As such we don't need to try to build an SLP tree
5010 from it. If both arguments have a def the resulting SLP tree here is
5011 highly likely to be incompatible, but we rely on it being split
5012 later on. */
5013 auto varg = loop_vinfo->lookup_def (args0);
5014 vec<stmt_vec_info> stmts;
5015 vec<tree> remain = vNULL;
5016 stmts.create (1);
5017 stmts.quick_push (vect_stmt_to_vectorize (varg));
5019 if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5020 stmts, roots, remain,
5021 max_tree_size, &limit,
5022 bst_map, NULL, force_single_lane))
5023 roots.release ();
5026 /* Find and create SLP instances for inductions that have been forced
5027 live due to early break. */
5028 edge latch_e = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
5029 for (auto stmt_info : LOOP_VINFO_EARLY_BREAKS_LIVE_IVS (loop_vinfo))
5031 vec<stmt_vec_info> stmts;
5032 vec<stmt_vec_info> roots = vNULL;
5033 vec<tree> remain = vNULL;
5034 gphi *lc_phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
5035 tree def = gimple_phi_arg_def_from_edge (lc_phi, latch_e);
5036 stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
5037 stmts.create (1);
5038 stmts.quick_push (vect_stmt_to_vectorize (lc_info));
5039 vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5040 stmts, roots, remain,
5041 max_tree_size, &limit,
5042 bst_map, NULL, force_single_lane);
5046 hash_set<slp_tree> visited_patterns;
5047 slp_tree_to_load_perm_map_t perm_cache;
5048 slp_compat_nodes_map_t compat_cache;
5050 /* See if any patterns can be found in the SLP tree. */
5051 bool pattern_found = false;
5052 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5053 pattern_found |= vect_match_slp_patterns (instance, vinfo,
5054 &visited_patterns, &perm_cache,
5055 &compat_cache);
5057 /* If any were found optimize permutations of loads. */
5058 if (pattern_found)
5060 hash_map<slp_tree, slp_tree> load_map;
5061 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5063 slp_tree root = SLP_INSTANCE_TREE (instance);
5064 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5065 &load_map, root);
5069 /* Check whether we should force some SLP instances to use load/store-lanes
5070 and do so by forcing SLP re-discovery with single lanes. We used
5071 to cancel SLP when this applied to all instances in a loop but now
5072 we decide this per SLP instance. It's important to do this only
5073 after SLP pattern recognition. */
5074 if (is_a <loop_vec_info> (vinfo))
5075 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5076 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5077 && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5079 slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5080 int group_size = SLP_TREE_LANES (slp_root);
5081 tree vectype = SLP_TREE_VECTYPE (slp_root);
5083 stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5084 gimple *rep = STMT_VINFO_STMT (rep_info);
5085 bool masked = (is_gimple_call (rep)
5086 && gimple_call_internal_p (rep)
5087 && internal_fn_mask_index
5088 (gimple_call_internal_fn (rep)) != -1);
5089 if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5090 || slp_root->ldst_lanes
5091 || (vect_store_lanes_supported (vectype, group_size, masked)
5092 == IFN_LAST))
5093 continue;
5095 auto_vec<slp_tree> loads;
5096 hash_set<slp_tree> visited;
5097 vect_gather_slp_loads (loads, slp_root, visited);
5099 /* Check whether any load in the SLP instance is possibly
5100 permuted. */
5101 bool loads_permuted = false;
5102 slp_tree load_node;
5103 unsigned j;
5104 FOR_EACH_VEC_ELT (loads, j, load_node)
5106 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5107 continue;
5108 unsigned k;
5109 stmt_vec_info load_info;
5110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5111 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5113 loads_permuted = true;
5114 break;
5118 /* If the loads and stores can use load/store-lanes force re-discovery
5119 with single lanes. */
5120 if (loads_permuted)
5122 bool can_use_lanes = true;
5123 FOR_EACH_VEC_ELT (loads, j, load_node)
5124 if (STMT_VINFO_GROUPED_ACCESS
5125 (SLP_TREE_REPRESENTATIVE (load_node)))
5127 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
5128 (SLP_TREE_REPRESENTATIVE (load_node));
5129 rep = STMT_VINFO_STMT (stmt_vinfo);
5130 masked = (is_gimple_call (rep)
5131 && gimple_call_internal_p (rep)
5132 && internal_fn_mask_index
5133 (gimple_call_internal_fn (rep)));
5134 /* Use SLP for strided accesses (or if we can't
5135 load-lanes). */
5136 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
5137 || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
5138 || vect_load_lanes_supported
5139 (STMT_VINFO_VECTYPE (stmt_vinfo),
5140 DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
5141 /* ??? During SLP re-discovery with a single lane
5142 a masked grouped load will appear permuted and
5143 discovery will fail. We have to rework this
5144 on the discovery side - for now avoid ICEing. */
5145 || masked)
5147 can_use_lanes = false;
5148 break;
5152 if (can_use_lanes)
5154 if (dump_enabled_p ())
5155 dump_printf_loc (MSG_NOTE, vect_location,
5156 "SLP instance %p can use load/store-lanes,"
5157 " re-discovering with single-lanes\n",
5158 (void *) instance);
5160 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
5162 vect_free_slp_instance (instance);
5163 limit = max_tree_size;
5164 bool res = vect_analyze_slp_instance (vinfo, bst_map,
5165 stmt_info,
5166 slp_inst_kind_store,
5167 max_tree_size, &limit,
5168 true);
5169 gcc_assert (res);
5170 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
5171 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
5176 /* When we end up with load permutations that we cannot possibly handle,
5177 like those requiring three vector inputs, lower them using
5178 interleaving-like schemes. */
5179 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5181 vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
5182 if (dump_enabled_p ())
5184 dump_printf_loc (MSG_NOTE, vect_location,
5185 "SLP graph after lowering permutations:\n");
5186 hash_set<slp_tree> visited;
5187 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5188 vect_print_slp_graph (MSG_NOTE, vect_location,
5189 SLP_INSTANCE_TREE (instance), visited);
5193 release_scalar_stmts_to_slp_tree_map (bst_map);
5195 if (pattern_found && dump_enabled_p ())
5197 dump_printf_loc (MSG_NOTE, vect_location,
5198 "Pattern matched SLP tree\n");
5199 hash_set<slp_tree> visited;
5200 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5201 vect_print_slp_graph (MSG_NOTE, vect_location,
5202 SLP_INSTANCE_TREE (instance), visited);
5205 return opt_result::success ();
5208 /* Estimates the cost of inserting layout changes into the SLP graph.
5209 It can also say that the insertion is impossible. */
5211 struct slpg_layout_cost
5213 slpg_layout_cost () = default;
5214 slpg_layout_cost (sreal, bool);
5216 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
5217 bool is_possible () const { return depth != sreal::max (); }
5219 bool operator== (const slpg_layout_cost &) const;
5220 bool operator!= (const slpg_layout_cost &) const;
5222 bool is_better_than (const slpg_layout_cost &, bool) const;
5224 void add_parallel_cost (const slpg_layout_cost &);
5225 void add_serial_cost (const slpg_layout_cost &);
5226 void split (unsigned int);
5228 /* The longest sequence of layout changes needed during any traversal
5229 of the partition dag, weighted by execution frequency.
5231 This is the most important metric when optimizing for speed, since
5232 it helps to ensure that we keep the number of operations on
5233 critical paths to a minimum. */
5234 sreal depth = 0;
5236 /* An estimate of the total number of operations needed. It is weighted by
5237 execution frequency when optimizing for speed but not when optimizing for
5238 size. In order to avoid double-counting, a node with a fanout of N will
5239 distribute 1/N of its total cost to each successor.
5241 This is the most important metric when optimizing for size, since
5242 it helps to keep the total number of operations to a minimum. */
5243 sreal total = 0;
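   /* For example, combining { depth 2, total 3 } with { depth 1, total 2 }
      gives { depth 2, total 5 } when the two happen in parallel
      (add_parallel_cost) and { depth 3, total 5 } when they happen in
      series (add_serial_cost).  */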
5246 /* Construct costs for a node with weight WEIGHT. A higher weight
5247 indicates more frequent execution. IS_FOR_SIZE is true if we are
5248 optimizing for size rather than speed. */
5250 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
5251 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
5255 bool
5256 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
5258 return depth == other.depth && total == other.total;
5261 bool
5262 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
5264 return !operator== (other);
5267 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
5268 true if we are optimizing for size rather than speed. */
5270 bool
5271 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
5272 bool is_for_size) const
5274 if (is_for_size)
5276 if (total != other.total)
5277 return total < other.total;
5278 return depth < other.depth;
5280 else
5282 if (depth != other.depth)
5283 return depth < other.depth;
5284 return total < other.total;
5288 /* Increase the costs to account for something with cost INPUT_COST
5289 happening in parallel with the current costs. */
5291 void
5292 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
5294 depth = std::max (depth, input_cost.depth);
5295 total += input_cost.total;
5298 /* Increase the costs to account for something with cost INPUT_COST
5299 happening in series with the current costs. */
5301 void
5302 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
5304 depth += other.depth;
5305 total += other.total;
5308 /* Split the total cost among TIMES successors or predecessors. */
5310 void
5311 slpg_layout_cost::split (unsigned int times)
5313 if (times > 1)
5314 total /= times;
5317 /* Information about one node in the SLP graph, for use during
5318 vect_optimize_slp_pass. */
5320 struct slpg_vertex
5322 slpg_vertex (slp_tree node_) : node (node_) {}
5324 /* The node itself. */
5325 slp_tree node;
5327 /* Which partition the node belongs to, or -1 if none. Nodes outside of
5328 partitions are flexible; they can have whichever layout consumers
5329 want them to have. */
5330 int partition = -1;
5332 /* The number of nodes that directly use the result of this one
5333 (i.e. the number of nodes that count this one as a child). */
5334 unsigned int out_degree = 0;
5336 /* The execution frequency of the node. */
5337 sreal weight = 0;
5339 /* The total execution frequency of all nodes that directly use the
5340 result of this one. */
5341 sreal out_weight = 0;
5344 /* Information about one partition of the SLP graph, for use during
5345 vect_optimize_slp_pass. */
5347 struct slpg_partition_info
5349 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
5350 of m_partitioned_nodes. */
5351 unsigned int node_begin = 0;
5352 unsigned int node_end = 0;
5354 /* Which layout we've chosen to use for this partition, or -1 if
5355 we haven't picked one yet. */
5356 int layout = -1;
5358 /* The number of predecessors and successors in the partition dag.
5359 The predecessors always have lower partition numbers and the
5360 successors always have higher partition numbers.
5362 Note that the directions of these edges are not necessarily the
5363 same as in the data flow graph. For example, if an SCC has separate
5364 partitions for an inner loop and an outer loop, the inner loop's
5365 partition will have at least two incoming edges from the outer loop's
5366 partition: one for a live-in value and one for a live-out value.
5367 In data flow terms, one of these edges would also be from the outer loop
5368 to the inner loop, but the other would be in the opposite direction. */
5369 unsigned int in_degree = 0;
5370 unsigned int out_degree = 0;
5373 /* Information about the costs of using a particular layout for a
5374 particular partition. It can also say that the combination is
5375 impossible. */
5377 struct slpg_partition_layout_costs
5379 bool is_possible () const { return internal_cost.is_possible (); }
5380 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
5382 /* The costs inherited from predecessor partitions. */
5383 slpg_layout_cost in_cost;
5385 /* The inherent cost of the layout within the node itself. For example,
5386 this is nonzero for a load if choosing a particular layout would require
5387 the load to permute the loaded elements. It is nonzero for a
5388 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5389 to full-vector moves. */
5390 slpg_layout_cost internal_cost;
5392 /* The costs inherited from successor partitions. */
5393 slpg_layout_cost out_cost;
5396 /* This class tries to optimize the layout of vectors in order to avoid
5397 unnecessary shuffling. At the moment, the set of possible layouts is
5398 restricted to bijective permutations.
5400 The goal of the pass depends on whether we're optimizing for size or
5401 for speed. When optimizing for size, the goal is to reduce the overall
5402 number of layout changes (including layout changes implied by things
5403 like load permutations). When optimizing for speed, the goal is to
5404 reduce the maximum latency attributable to layout changes on any
5405 non-cyclical path through the data flow graph.
5407 For example, when optimizing a loop nest for speed, we will prefer
5408 to make layout changes outside of a loop rather than inside of a loop,
5409 and will prefer to make layout changes in parallel rather than serially,
5410 even if that increases the overall number of layout changes.
5412 The high-level procedure is:
5414 (1) Build a graph in which edges go from uses (parents) to definitions
5415 (children).
5417 (2) Divide the graph into a dag of strongly-connected components (SCCs).
5419 (3) When optimizing for speed, partition the nodes in each SCC based
5420 on their containing cfg loop. When optimizing for size, treat
5421 each SCC as a single partition.
5423 This gives us a dag of partitions. The goal is now to assign a
5424 layout to each partition.
5426 (4) Construct a set of vector layouts that are worth considering.
5427 Record which nodes must keep their current layout.
5429 (5) Perform a forward walk over the partition dag (from loads to stores)
5430 accumulating the "forward" cost of using each layout. When visiting
5431 each partition, assign a tentative choice of layout to the partition
5432 and use that choice when calculating the cost of using a different
5433 layout in successor partitions.
5435 (6) Perform a backward walk over the partition dag (from stores to loads),
5436 accumulating the "backward" cost of using each layout. When visiting
5437 each partition, make a final choice of layout for that partition based
5438 on the accumulated forward costs (from (5)) and backward costs
5439 (from (6)).
5441 (7) Apply the chosen layouts to the SLP graph.
5443 For example, consider the SLP statements:
5445 S1: a_1 = load
5446 loop:
5447 S2: a_2 = PHI<a_1, a_3>
5448 S3: b_1 = load
5449 S4: a_3 = a_2 + b_1
5450 exit:
5451 S5: a_4 = PHI<a_3>
5452 S6: store a_4
5454 S2 and S4 form an SCC and are part of the same loop. Every other
5455 statement is in a singleton SCC. In this example there is a one-to-one
5456 mapping between SCCs and partitions and the partition dag looks like this:
5458    S1     S3
5459      \   /
5460      S2+S4
5461        |
5462       S5
5463        |
5464       S6
5466 S2, S3 and S4 will have a higher execution frequency than the other
5467 statements, so when optimizing for speed, the goal is to avoid any
5468 layout changes:
5470 - within S3
5471 - within S2+S4
5472 - on the S3->S2+S4 edge
5474 For example, if S3 was originally a reversing load, the goal of the
5475 pass is to make it an unreversed load and change the layout on the
5476 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5477 on S1->S2+S4 and S5->S6 would also be acceptable.)
5479 The difference between SCCs and partitions becomes important if we
5480 add an outer loop:
5482 S1: a_1 = ...
5483 loop1:
5484 S2: a_2 = PHI<a_1, a_6>
5485 S3: b_1 = load
5486 S4: a_3 = a_2 + b_1
5487 loop2:
5488 S5: a_4 = PHI<a_3, a_5>
5489 S6: c_1 = load
5490 S7: a_5 = a_4 + c_1
5491 exit2:
5492 S8: a_6 = PHI<a_5>
5493 S9: store a_6
5494 exit1:
5496 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5497 for speed, we usually do not want restrictions in the outer loop to "infect"
5498 the decision for the inner loop. For example, if an outer-loop node
5499 in the SCC contains a statement with a fixed layout, that should not
5500 prevent the inner loop from using a different layout. Conversely,
5501 the inner loop should not dictate a layout to the outer loop: if the
5502 outer loop does a lot of computation, then it may not be efficient to
5503 do all of that computation in the inner loop's preferred layout.
5505 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5506 and S5+S7 (inner). We also try to arrange partitions so that:
5508 - the partition for an outer loop comes before the partition for
5509 an inner loop
5511 - if a sibling loop A dominates a sibling loop B, A's partition
5512 comes before B's
5514 This gives the following partition dag for the example above:
5516    S1        S3
5517      \      /
5518     S2+S4+S8   S6
5519      |   \\    /
5520      |    S5+S7
5521      |
5522      S9
5524 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5525 one for a reversal of the edge S7->S8.
5527 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5528 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5529 preferred layout against the cost of changing the layout on entry to the
5530 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5532 Although this works well when optimizing for speed, it has the downside
5533 when optimizing for size that the choice of layout for S5+S7 is completely
5534 independent of S9, which lessens the chance of reducing the overall number
5535 of permutations. We therefore do not partition SCCs when optimizing
5536 for size.
5538 To give a concrete example of the difference between optimizing
5539 for size and speed, consider:
5541 a[0] = (b[1] << c[3]) - d[1];
5542 a[1] = (b[0] << c[2]) - d[0];
5543 a[2] = (b[3] << c[1]) - d[3];
5544 a[3] = (b[2] << c[0]) - d[2];
5546 There are three different layouts here: one for a, one for b and d,
5547 and one for c. When optimizing for speed it is better to permute each
5548 of b, c and d into the order required by a, since those permutations
5549 happen in parallel. But when optimizing for size, it is better to:
5551 - permute c into the same order as b
5552 - do the arithmetic
5553 - permute the result into the order required by a
5555 This gives 2 permutations rather than 3. */
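/* To relate the example above to the two metrics used below (purely
   illustrative): permuting b, c and d each into a's order gives three
   permutations that can execute in parallel, so the depth added to any
   path is one permutation while the total grows by roughly three.
   Permuting c into b's order and then permuting the result into a's
   order gives only two permutations in total, but a path from c to a
   passes through both of them, so the added depth is two.  The speed
   heuristic therefore prefers the first arrangement and the size
   heuristic the second.  */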
5557 class vect_optimize_slp_pass
5559 public:
5560 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5561 void run ();
5563 private:
5564 /* Graph building. */
5565 struct loop *containing_loop (slp_tree);
5566 bool is_cfg_latch_edge (graph_edge *);
5567 void build_vertices (hash_set<slp_tree> &, slp_tree);
5568 void build_vertices ();
5569 void build_graph ();
5571 /* Partitioning. */
5572 void create_partitions ();
5573 template<typename T> void for_each_partition_edge (unsigned int, T);
5575 /* Layout selection. */
5576 bool is_compatible_layout (slp_tree, unsigned int);
5577 int change_layout_cost (slp_tree, unsigned int, unsigned int);
5578 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5579 unsigned int);
5580 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5581 int, unsigned int);
5582 int internal_node_cost (slp_tree, int, unsigned int);
5583 void start_choosing_layouts ();
5585 /* Cost propagation. */
5586 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5587 unsigned int, unsigned int);
5588 slpg_layout_cost total_in_cost (unsigned int);
5589 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5590 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5591 void forward_pass ();
5592 void backward_pass ();
5594 /* Rematerialization. */
5595 slp_tree get_result_with_layout (slp_tree, unsigned int);
5596 void materialize ();
5598 /* Clean-up. */
5599 void remove_redundant_permutations ();
5601 /* Masked load lanes discovery. */
5602 void decide_masked_load_lanes ();
5604 void dump ();
5606 vec_info *m_vinfo;
5608 /* True if we should optimize the graph for size, false if we should
5609 optimize it for speed. (It wouldn't be easy to make this decision
5610 more locally.) */
5611 bool m_optimize_size;
5613 /* A graph of all SLP nodes, with edges leading from uses to definitions.
5614 In other words, a node's predecessors are its slp_tree parents and
5615 a node's successors are its slp_tree children. */
5616 graph *m_slpg = nullptr;
5618 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5619 auto_vec<slpg_vertex> m_vertices;
5621 /* The list of all leaves of M_SLPG, such as external definitions, constants,
5622 and loads. */
5623 auto_vec<int> m_leafs;
5625 /* This array has one entry for every vector layout that we're considering.
5626 Element 0 is null and indicates "no change". Other entries describe
5627 permutations that are inherent in the current graph and that we would
5628 like to reverse if possible.
5630 For example, a permutation { 1, 2, 3, 0 } means that something has
5631 effectively been permuted in that way, such as a load group
5632 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5633 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5634 in order to put things "back" in order. */
5635 auto_vec<vec<unsigned> > m_perms;
5637 /* A partitioning of the nodes for which a layout must be chosen.
5638 Each partition represents an <SCC, cfg loop> pair; that is,
5639 nodes in different SCCs belong to different partitions, and nodes
5640 within an SCC can be further partitioned according to a containing
5641 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5643 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5644 from leaves (such as loads) to roots (such as stores).
5646 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5647 auto_vec<slpg_partition_info> m_partitions;
5649 /* The list of all nodes for which a layout must be chosen. Nodes for
5650 partition P come before the nodes for partition P+1. Nodes within a
5651 partition are in reverse postorder. */
5652 auto_vec<unsigned int> m_partitioned_nodes;
5654 /* Index P * num-layouts + L contains the cost of using layout L
5655 for partition P. */
5656 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5658 /* Index N * num-layouts + L, if nonnull, is a node that provides the
5659 original output of node N adjusted to have layout L. */
5660 auto_vec<slp_tree> m_node_layouts;
5663 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
5664 Also record whether we should optimize anything for speed rather
5665 than size. */
5667 void
5668 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5669 slp_tree node)
5671 unsigned i;
5672 slp_tree child;
5674 if (visited.add (node))
5675 return;
5677 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5679 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5680 if (optimize_bb_for_speed_p (bb))
5681 m_optimize_size = false;
5684 node->vertex = m_vertices.length ();
5685 m_vertices.safe_push (slpg_vertex (node));
5687 bool leaf = true;
5688 bool force_leaf = false;
5689 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5690 if (child)
5692 leaf = false;
5693 build_vertices (visited, child);
5695 else
5696 force_leaf = true;
5697 /* Since SLP discovery works along use-def edges, all cycles have an
5698 entry - but there's the exception of cycles where we do not handle
5699 the entry explicitly (but with a NULL SLP node), like some reductions
5700 and inductions. Force those SLP PHIs to act as leaves to make them
5701 backwards reachable. */
5702 if (leaf || force_leaf)
5703 m_leafs.safe_push (node->vertex);
5706 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
5708 void
5709 vect_optimize_slp_pass::build_vertices ()
5711 hash_set<slp_tree> visited;
5712 unsigned i;
5713 slp_instance instance;
5714 m_vertices.truncate (0);
5715 m_leafs.truncate (0);
5716 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5717 build_vertices (visited, SLP_INSTANCE_TREE (instance));
5720 /* Apply the bijective permutation PERM to VEC, or its inverse if REVERSE. */
5722 template <class T>
5723 static void
5724 vect_slp_permute (vec<unsigned> perm,
5725 vec<T> &vec, bool reverse)
5727 auto_vec<T, 64> saved;
5728 saved.create (vec.length ());
5729 for (unsigned i = 0; i < vec.length (); ++i)
5730 saved.quick_push (vec[i]);
5732 if (reverse)
5734 for (unsigned i = 0; i < vec.length (); ++i)
5735 vec[perm[i]] = saved[i];
5736 for (unsigned i = 0; i < vec.length (); ++i)
5737 gcc_assert (vec[perm[i]] == saved[i]);
5739 else
5741 for (unsigned i = 0; i < vec.length (); ++i)
5742 vec[i] = saved[perm[i]];
5743 for (unsigned i = 0; i < vec.length (); ++i)
5744 gcc_assert (vec[i] == saved[perm[i]]);
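/* For example, with PERM == { 1, 2, 3, 0 } and VEC == { a, b, c, d }:
   the forward direction (REVERSE == false) gathers elements through PERM
   and yields { b, c, d, a }, while the reverse direction (REVERSE == true)
   scatters elements through PERM, i.e. applies the inverse permutation,
   and yields { d, a, b, c }.  */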
5748 /* Return the cfg loop that contains NODE. */
5750 struct loop *
5751 vect_optimize_slp_pass::containing_loop (slp_tree node)
5753 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5754 if (!rep)
5755 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5756 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5759 /* Return true if UD (an edge from a use to a definition) is associated
5760 with a loop latch edge in the cfg. */
5762 bool
5763 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5765 slp_tree use = m_vertices[ud->src].node;
5766 slp_tree def = m_vertices[ud->dest].node;
5767 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5768 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5769 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5770 return false;
5772 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5773 return (is_a<gphi *> (use_rep->stmt)
5774 && bb_loop_header_p (gimple_bb (use_rep->stmt))
5775 && containing_loop (def) == containing_loop (use));
5778 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5779 a nonnull data field. */
5781 void
5782 vect_optimize_slp_pass::build_graph ()
5784 m_optimize_size = true;
5785 build_vertices ();
5787 m_slpg = new_graph (m_vertices.length ());
5788 for (slpg_vertex &v : m_vertices)
5789 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5790 if (child)
5792 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5793 if (is_cfg_latch_edge (ud))
5794 ud->data = this;
5798 /* Return true if E corresponds to a loop latch edge in the cfg. */
5800 static bool
5801 skip_cfg_latch_edges (graph_edge *e)
5803 return e->data;
5806 /* Create the node partitions. */
5808 void
5809 vect_optimize_slp_pass::create_partitions ()
5811 /* Calculate a postorder of the graph, ignoring edges that correspond
5812 to natural latch edges in the cfg. Reading the vector from the end
5813 to the beginning gives the reverse postorder. */
5814 auto_vec<int> initial_rpo;
5815 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5816 false, NULL, skip_cfg_latch_edges);
5817 gcc_assert (initial_rpo.length () == m_vertices.length ());
5819 /* Calculate the strongly connected components of the graph. */
5820 auto_vec<int> scc_grouping;
5821 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5823 /* Create a new index order in which all nodes from the same SCC are
5824 consecutive. Use scc_pos to record the index of the first node in
5825 each SCC. */
5826 auto_vec<unsigned int> scc_pos (num_sccs);
5827 int last_component = -1;
5828 unsigned int node_count = 0;
5829 for (unsigned int node_i : scc_grouping)
5831 if (last_component != m_slpg->vertices[node_i].component)
5833 last_component = m_slpg->vertices[node_i].component;
5834 gcc_assert (last_component == int (scc_pos.length ()));
5835 scc_pos.quick_push (node_count);
5837 node_count += 1;
5839 gcc_assert (node_count == initial_rpo.length ()
5840 && last_component + 1 == int (num_sccs));
5842 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5843 inside each SCC following the RPO we calculated above. The fact that
5844 we ignored natural latch edges when calculating the RPO should ensure
5845 that, for natural loop nests:
5847 - the first node that we encounter in a cfg loop is the loop header phi
5848 - the loop header phis are in dominance order
5850 Arranging for this is an optimization (see below) rather than a
5851 correctness issue. Unnatural loops with a tangled mess of backedges
5852 will still work correctly, but might give poorer results.
5854 Also update scc_pos so that it gives 1 + the index of the last node
5855 in the SCC. */
5856 m_partitioned_nodes.safe_grow (node_count);
5857 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5859 unsigned int node_i = initial_rpo[old_i];
5860 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5861 m_partitioned_nodes[new_i] = node_i;
5864 /* When optimizing for speed, partition each SCC based on the containing
5865 cfg loop. The order we constructed above should ensure that, for natural
5866 cfg loops, we'll create sub-SCC partitions for outer loops before
5867 the corresponding sub-SCC partitions for inner loops. Similarly,
5868 when one sibling loop A dominates another sibling loop B, we should
5869 create a sub-SCC partition for A before a sub-SCC partition for B.
5871 As above, nothing depends for correctness on whether this achieves
5872 a natural nesting, but we should get better results when it does. */
5873 m_partitions.reserve (m_vertices.length ());
5874 unsigned int next_partition_i = 0;
5875 hash_map<struct loop *, int> loop_partitions;
5876 unsigned int rpo_begin = 0;
5877 unsigned int num_partitioned_nodes = 0;
5878 for (unsigned int rpo_end : scc_pos)
5880 loop_partitions.empty ();
5881 unsigned int partition_i = next_partition_i;
5882 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5884 /* Handle externals and constants optimistically throughout.
5885 But treat existing vectors as fixed since we do not handle
5886 permuting them. */
5887 unsigned int node_i = m_partitioned_nodes[rpo_i];
5888 auto &vertex = m_vertices[node_i];
5889 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5890 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5891 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5892 vertex.partition = -1;
5893 else
5895 bool existed;
5896 if (m_optimize_size)
5897 existed = next_partition_i > partition_i;
5898 else
5900 struct loop *loop = containing_loop (vertex.node);
5901 auto &entry = loop_partitions.get_or_insert (loop, &existed);
5902 if (!existed)
5903 entry = next_partition_i;
5904 partition_i = entry;
5906 if (!existed)
5908 m_partitions.quick_push (slpg_partition_info ());
5909 next_partition_i += 1;
5911 vertex.partition = partition_i;
5912 num_partitioned_nodes += 1;
5913 m_partitions[partition_i].node_end += 1;
5916 rpo_begin = rpo_end;
5919 /* Assign ranges of consecutive node indices to each partition,
5920 in partition order. Start with node_end being the same as
5921 node_begin so that the next loop can use it as a counter. */
5922 unsigned int node_begin = 0;
5923 for (auto &partition : m_partitions)
5925 partition.node_begin = node_begin;
5926 node_begin += partition.node_end;
5927 partition.node_end = partition.node_begin;
5929 gcc_assert (node_begin == num_partitioned_nodes);
5931 /* Finally build the list of nodes in partition order. */
5932 m_partitioned_nodes.truncate (num_partitioned_nodes);
5933 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5935 int partition_i = m_vertices[node_i].partition;
5936 if (partition_i >= 0)
5938 unsigned int order_i = m_partitions[partition_i].node_end++;
5939 m_partitioned_nodes[order_i] = node_i;
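/* A small illustration of the bookkeeping above, with hypothetical sizes:
   if three partitions receive 2, 1 and 3 nodes respectively, the counting
   loop leaves their node_end fields at 2, 1 and 3.  The loop over
   m_partitions then rewrites the <node_begin, node_end> pairs to <0, 0>,
   <2, 2> and <3, 3>, and the final loop bumps node_end again while filling
   m_partitioned_nodes, producing the ranges [0, 2), [2, 3) and [3, 6).  */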
5944 /* Look for edges from earlier partitions into node NODE_I and edges from
5945 node NODE_I into later partitions. Call:
5947 FN (ud, other_node_i)
5949 for each such use-to-def edge ud, where other_node_i is the node at the
5950 other end of the edge. */
5952 template<typename T>
5953 void
5954 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5956 int partition_i = m_vertices[node_i].partition;
5957 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5958 pred; pred = pred->pred_next)
5960 int src_partition_i = m_vertices[pred->src].partition;
5961 if (src_partition_i >= 0 && src_partition_i != partition_i)
5962 fn (pred, pred->src);
5964 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5965 succ; succ = succ->succ_next)
5967 int dest_partition_i = m_vertices[succ->dest].partition;
5968 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
5969 fn (succ, succ->dest);
5973 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
5974 that NODE would operate on. This test is independent of NODE's actual
5975 operation. */
5977 bool
5978 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
5979 unsigned int layout_i)
5981 if (layout_i == 0)
5982 return true;
5984 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
5985 return false;
5987 return true;
5990 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
5991 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
5992 layouts is incompatible with NODE or if the change is not possible for
5993 some other reason.
5995 The properties taken from NODE include the number of lanes and the
5996 vector type. The actual operation doesn't matter. */
5999 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6000 unsigned int from_layout_i,
6001 unsigned int to_layout_i)
6003 if (!is_compatible_layout (node, from_layout_i)
6004 || !is_compatible_layout (node, to_layout_i))
6005 return -1;
6007 if (from_layout_i == to_layout_i)
6008 return 0;
6010 auto_vec<slp_tree, 1> children (1);
6011 children.quick_push (node);
6012 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6013 if (from_layout_i > 0)
6014 for (unsigned int i : m_perms[from_layout_i])
6015 perm.quick_push ({ 0, i });
6016 else
6017 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6018 perm.quick_push ({ 0, i });
6019 if (to_layout_i > 0)
6020 vect_slp_permute (m_perms[to_layout_i], perm, true);
6021 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
6022 children, false);
6023 if (count >= 0)
6024 return MAX (count, 1);
6026 /* ??? In principle we could try changing via layout 0, giving two
6027 layout changes rather than 1. Doing that would require
6028 corresponding support in get_result_with_layout. */
6029 return -1;
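/* For example, if NODE has four lanes, m_perms[FROM_LAYOUT_I] is
   { 1, 2, 3, 0 } and TO_LAYOUT_I is 0, the code above asks the target to
   cost a single-input VEC_PERM_EXPR that selects lanes { 1, 2, 3, 0 } of
   NODE.  Whether that is supported, and how many permute operations it
   needs, is entirely target-dependent; the only guarantee is that a
   supported change between distinct layouts costs at least 1.  */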
6032 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6034 inline slpg_partition_layout_costs &
6035 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6036 unsigned int layout_i)
6038 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6041 /* Change PERM in one of two ways:
6043 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6044 chosen for child I of NODE.
6046 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
6048 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
6050 void
6051 vect_optimize_slp_pass::
6052 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
6053 int in_layout_i, unsigned int out_layout_i)
6055 for (auto &entry : perm)
6057 int this_in_layout_i = in_layout_i;
6058 if (this_in_layout_i < 0)
6060 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
6061 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
6062 if (in_partition_i == -1u)
6063 continue;
6064 this_in_layout_i = m_partitions[in_partition_i].layout;
6066 if (this_in_layout_i > 0)
6067 entry.second = m_perms[this_in_layout_i][entry.second];
6069 if (out_layout_i > 0)
6070 vect_slp_permute (m_perms[out_layout_i], perm, true);
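/* For example, take a two-lane VEC_PERM_EXPR whose PERM is
   { { 0, 0 }, { 1, 1 } } and suppose m_perms[1] == { 1, 0 }.  With
   IN_LAYOUT_I == 1 and OUT_LAYOUT_I == 0, each selected lane is remapped
   through m_perms[1], giving { { 0, 1 }, { 1, 0 } }, and no further
   output adjustment is needed.  If OUT_LAYOUT_I were instead 1, the whole
   permutation vector would additionally be reordered by the inverse of
   m_perms[1].  */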
6073 /* Check whether the target allows NODE to be rearranged so that the node's
6074 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6075 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6077 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6078 NODE can adapt to the layout changes that have (perhaps provisionally)
6079 been chosen for NODE's children, so that no extra permutations are
6080 needed on either the input or the output of NODE.
6082 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
6083 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
6085 IN_LAYOUT_I has no meaning for other types of node.
6087 Keeping the node as-is is always valid. If the target doesn't appear
6088 to support the node as-is, but might realistically support other layouts,
6089 then layout 0 instead has the cost of a worst-case permutation. On the
6090 one hand, this ensures that every node has at least one valid layout,
6091 avoiding what would otherwise be an awkward special case. On the other,
6092 it still encourages the pass to change an invalid pre-existing layout
6093 choice into a valid one. */
6096 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
6097 unsigned int out_layout_i)
6099 const int fallback_cost = 1;
6101 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6103 auto_lane_permutation_t tmp_perm;
6104 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6106 /* Check that the child nodes support the chosen layout. Checking
6107 the first child is enough, since any second child would have the
6108 same shape. */
6109 auto first_child = SLP_TREE_CHILDREN (node)[0];
6110 if (in_layout_i > 0
6111 && !is_compatible_layout (first_child, in_layout_i))
6112 return -1;
6114 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
6115 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
6116 node, tmp_perm,
6117 SLP_TREE_CHILDREN (node),
6118 false);
6119 if (count < 0)
6121 if (in_layout_i == 0 && out_layout_i == 0)
6123 /* Use the fallback cost if the node could in principle support
6124 some nonzero layout for both the inputs and the outputs.
6125 Otherwise assume that the node will be rejected later
6126 and rebuilt from scalars. */
6127 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
6128 return fallback_cost;
6129 return 0;
6131 return -1;
6134 /* We currently have no way of telling whether the new layout is cheaper
6135 or more expensive than the old one. But at least in principle,
6136 it should be worth making zero permutations (whole-vector shuffles)
6137 cheaper than real permutations, in case the pass is able to remove
6138 the latter. */
6139 return count == 0 ? 0 : 1;
6142 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6143 if (rep
6144 && STMT_VINFO_DATA_REF (rep)
6145 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
6146 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
6148 auto_load_permutation_t tmp_perm;
6149 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6150 if (out_layout_i > 0)
6151 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
6153 poly_uint64 vf = 1;
6154 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
6155 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6156 unsigned int n_perms;
6157 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
6158 nullptr, vf, true, false, &n_perms))
6160 auto rep = SLP_TREE_REPRESENTATIVE (node);
6161 if (out_layout_i == 0)
6163 /* Use the fallback cost if the load is an N-to-N permutation.
6164 Otherwise assume that the node will be rejected later
6165 and rebuilt from scalars. */
6166 if (STMT_VINFO_GROUPED_ACCESS (rep)
6167 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
6168 == SLP_TREE_LANES (node)))
6169 return fallback_cost;
6170 return 0;
6172 return -1;
6175 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
6176 return n_perms == 0 ? 0 : 1;
6179 return 0;
6182 /* Decide which element layouts we should consider using. Calculate the
6183 weights associated with inserting layout changes on partition edges.
6184 Also mark partitions that cannot change layout, by setting their
6185 layout to zero. */
6187 void
6188 vect_optimize_slp_pass::start_choosing_layouts ()
6190 /* Used to assign unique permutation indices. */
6191 using perm_hash = unbounded_hashmap_traits<
6192 vec_free_hash_base<int_hash_base<unsigned>>,
6193 int_hash<int, -1, -2>
6195 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
6197 /* Layout 0 is "no change". */
6198 m_perms.safe_push (vNULL);
6200 /* Create layouts from existing permutations. */
6201 auto_load_permutation_t tmp_perm;
6202 for (unsigned int node_i : m_partitioned_nodes)
6204 /* Leafs also double as entries to the reverse graph. Allow the
6205 layout of those to be changed. */
6206 auto &vertex = m_vertices[node_i];
6207 auto &partition = m_partitions[vertex.partition];
6208 if (!m_slpg->vertices[node_i].succ)
6209 partition.layout = 0;
6211 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
6212 slp_tree node = vertex.node;
6213 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
6214 slp_tree child;
6215 unsigned HOST_WIDE_INT imin, imax = 0;
6216 bool any_permute = false;
6217 tmp_perm.truncate (0);
6218 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
6220 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
6221 unpermuted, record a layout that reverses this permutation.
6223 We would need more work to cope with loads that are internally
6224 permuted and also have inputs (such as masks for
6225 IFN_MASK_LOADs). */
6226 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
6227 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
6229 partition.layout = -1;
6230 continue;
6232 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
6233 imin = DR_GROUP_SIZE (dr_stmt) + 1;
6234 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6236 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
6237 && SLP_TREE_CHILDREN (node).length () == 1
6238 && (child = SLP_TREE_CHILDREN (node)[0])
6239 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
6240 .is_constant (&imin)))
6242 /* If the child has the same vector size as this node,
6243 reversing the permutation can make the permutation a no-op.
6244 In other cases it can change a true permutation into a
6245 full-vector extract. */
6246 tmp_perm.reserve (SLP_TREE_LANES (node));
6247 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6248 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
6250 else
6251 continue;
6253 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6255 unsigned idx = tmp_perm[j];
6256 imin = MIN (imin, idx);
6257 imax = MAX (imax, idx);
6258 if (idx - tmp_perm[0] != j)
6259 any_permute = true;
6261 /* If the span doesn't match, we'd disrupt VF computation; avoid
6262 that for now. */
6263 if (imax - imin + 1 != SLP_TREE_LANES (node))
6264 continue;
6265 /* If there's no permute, there's no need to split one out. In this case
6266 we can consider turning a load into a permuted load, if that
6267 turns out to be cheaper than alternatives. */
6268 if (!any_permute)
6270 partition.layout = -1;
6271 continue;
6274 /* For now only handle true permutes, like
6275 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
6276 when permuting constants and invariants, keeping the permute
6277 bijective. */
6278 auto_sbitmap load_index (SLP_TREE_LANES (node));
6279 bitmap_clear (load_index);
6280 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6281 bitmap_set_bit (load_index, tmp_perm[j] - imin);
6282 unsigned j;
6283 for (j = 0; j < SLP_TREE_LANES (node); ++j)
6284 if (!bitmap_bit_p (load_index, j))
6285 break;
6286 if (j != SLP_TREE_LANES (node))
6287 continue;
6289 vec<unsigned> perm = vNULL;
6290 perm.safe_grow (SLP_TREE_LANES (node), true);
6291 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6292 perm[j] = tmp_perm[j] - imin;
6294 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
6296 /* Continue to use existing layouts, but don't add any more. */
6297 int *entry = layout_ids.get (perm);
6298 partition.layout = entry ? *entry : 0;
6299 perm.release ();
6301 else
6303 bool existed;
6304 int &layout_i = layout_ids.get_or_insert (perm, &existed);
6305 if (existed)
6306 perm.release ();
6307 else
6309 layout_i = m_perms.length ();
6310 m_perms.safe_push (perm);
6312 partition.layout = layout_i;
6316 /* Initially assume that every layout is possible and has zero cost
6317 in every partition. */
6318 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
6319 * m_perms.length ());
6321 /* We have to mark outgoing permutations facing non-associating-reduction
6322 graph entries (which are not themselves represented in the graph) as needing
6323 to be materialized. slp_inst_kind_bb_reduc currently only covers associatable reductions. */
6324 for (slp_instance instance : m_vinfo->slp_instances)
6325 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
6327 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6328 m_partitions[m_vertices[node_i].partition].layout = 0;
6330 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
6332 stmt_vec_info stmt_info
6333 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
6334 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
6335 if (needs_fold_left_reduction_p (TREE_TYPE
6336 (gimple_get_lhs (stmt_info->stmt)),
6337 STMT_VINFO_REDUC_CODE (reduc_info)))
6339 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6340 m_partitions[m_vertices[node_i].partition].layout = 0;
6344 /* Check which layouts each node and partition can handle. Calculate the
6345 weights associated with inserting layout changes on edges. */
6346 for (unsigned int node_i : m_partitioned_nodes)
6348 auto &vertex = m_vertices[node_i];
6349 auto &partition = m_partitions[vertex.partition];
6350 slp_tree node = vertex.node;
6352 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6354 vertex.weight = vect_slp_node_weight (node);
6356 /* We do not handle stores with a permutation, so all
6357 incoming permutations must have been materialized.
6359 We also don't handle masked grouped loads, which lack a
6360 permutation vector. In this case the memory locations
6361 form an implicit second input to the loads, on top of the
6362 explicit mask input, and the memory input's layout cannot
6363 be changed.
6365 On the other hand, we do support permuting gather loads and
6366 masked gather loads, where each scalar load is independent
6367 of the others. This can be useful if the address/index input
6368 benefits from permutation. */
6369 if (STMT_VINFO_DATA_REF (rep)
6370 && STMT_VINFO_GROUPED_ACCESS (rep)
6371 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
6372 partition.layout = 0;
6374 /* We cannot change the layout of an operation that is
6375 not independent of lanes. Note this is an explicit
6376 negative list since that's much shorter than the respective
6377 positive one, but it's critical to keep maintaining it. */
6378 if (is_gimple_call (STMT_VINFO_STMT (rep)))
6379 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
6381 case CFN_COMPLEX_ADD_ROT90:
6382 case CFN_COMPLEX_ADD_ROT270:
6383 case CFN_COMPLEX_MUL:
6384 case CFN_COMPLEX_MUL_CONJ:
6385 case CFN_VEC_ADDSUB:
6386 case CFN_VEC_FMADDSUB:
6387 case CFN_VEC_FMSUBADD:
6388 partition.layout = 0;
6389 default:;
6393 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6395 auto &other_vertex = m_vertices[other_node_i];
6397 /* Count the number of edges from earlier partitions and the number
6398 of edges to later partitions. */
6399 if (other_vertex.partition < vertex.partition)
6400 partition.in_degree += 1;
6401 else
6402 partition.out_degree += 1;
6404 /* If the current node uses the result of OTHER_NODE_I, accumulate
6405 the effects of that. */
6406 if (ud->src == int (node_i))
6408 other_vertex.out_weight += vertex.weight;
6409 other_vertex.out_degree += 1;
6412 for_each_partition_edge (node_i, process_edge);
6416 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6417 its current (provisional) choice of layout. The inputs do not necessarily
6418 have the same layout as each other. */
6420 slpg_layout_cost
6421 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6423 auto &vertex = m_vertices[node_i];
6424 slpg_layout_cost cost;
6425 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6427 auto &other_vertex = m_vertices[other_node_i];
6428 if (other_vertex.partition < vertex.partition)
6430 auto &other_partition = m_partitions[other_vertex.partition];
6431 auto &other_costs = partition_layout_costs (other_vertex.partition,
6432 other_partition.layout);
6433 slpg_layout_cost this_cost = other_costs.in_cost;
6434 this_cost.add_serial_cost (other_costs.internal_cost);
6435 this_cost.split (other_partition.out_degree);
6436 cost.add_parallel_cost (this_cost);
6439 for_each_partition_edge (node_i, add_cost);
6440 return cost;
6443 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6444 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6445 slpg_layout_cost::impossible () if the change isn't possible. */
6447 slpg_layout_cost
6448 vect_optimize_slp_pass::
6449 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6450 unsigned int layout2_i)
6452 auto &def_vertex = m_vertices[ud->dest];
6453 auto &use_vertex = m_vertices[ud->src];
6454 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6455 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6456 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6457 use_layout_i);
6458 if (factor < 0)
6459 return slpg_layout_cost::impossible ();
6461 /* We have a choice of putting the layout change at the site of the
6462 definition or at the site of the use. Prefer the former when
6463 optimizing for size or when the execution frequency of the
6464 definition is no greater than the combined execution frequencies of
6465 the uses. When putting the layout change at the site of the definition,
6466 divvy up the cost among all consumers. */
6467 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6469 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6470 cost.split (def_vertex.out_degree);
6471 return cost;
6473 return { use_vertex.weight * factor, m_optimize_size };
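/* An illustration with made-up weights: if FACTOR == 1, the definition
   has weight 2 and out_degree 4, and its uses have a combined out_weight
   of 10, the change is placed at the definition, giving depth 2 and
   total 2 before the split, i.e. a total of 0.5 attributed to each
   consumer.  If the definition instead had weight 20 (greater than the
   combined use weight) and we were optimizing for speed, the change
   would be placed at the use and would cost that use's weight times
   FACTOR.  */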
6476 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6477 partition; FROM_NODE_I could be the definition node or the use node.
6478 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6479 Return the cost of any necessary fix-ups on edge UD, or return
6480 slpg_layout_cost::impossible () if the change isn't possible.
6482 At this point, FROM_NODE_I's partition has chosen the cheapest
6483 layout based on the information available so far, but this choice
6484 is only provisional. */
6486 slpg_layout_cost
6487 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6488 unsigned int to_layout_i)
6490 auto &from_vertex = m_vertices[from_node_i];
6491 unsigned int from_partition_i = from_vertex.partition;
6492 slpg_partition_info &from_partition = m_partitions[from_partition_i];
6493 gcc_assert (from_partition.layout >= 0);
6495 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6496 with its current layout preference. */
6497 slpg_layout_cost cost = slpg_layout_cost::impossible ();
6498 auto edge_cost = edge_layout_cost (ud, from_node_i,
6499 from_partition.layout, to_layout_i);
6500 if (edge_cost.is_possible ())
6502 auto &from_costs = partition_layout_costs (from_partition_i,
6503 from_partition.layout);
6504 cost = from_costs.in_cost;
6505 cost.add_serial_cost (from_costs.internal_cost);
6506 cost.split (from_partition.out_degree);
6507 cost.add_serial_cost (edge_cost);
6509 else if (from_partition.layout == 0)
6510 /* We must allow the source partition to have layout 0 as a fallback,
6511 in case all other options turn out to be impossible. */
6512 return cost;
6514 /* Take the minimum of that cost and the cost that applies if
6515 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6516 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6517 to_layout_i);
6518 if (direct_layout_costs.is_possible ())
6520 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6521 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6522 direct_cost.split (from_partition.out_degree);
6523 if (!cost.is_possible ()
6524 || direct_cost.is_better_than (cost, m_optimize_size))
6525 cost = direct_cost;
6528 return cost;
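/* An illustration with made-up numbers: suppose FROM_PARTITION
   provisionally chose layout 1, its in_cost plus internal_cost for that
   layout is depth 3 / total 3, its out_degree is 1, and fixing up this
   edge from layout 1 to TO_LAYOUT_I costs depth 1 / total 1.  Keeping
   layout 1 then costs depth 4 / total 4.  If the costs recorded for
   FROM_PARTITION using TO_LAYOUT_I directly amount to only
   depth 2 / total 2, the cheaper direct cost is returned instead.  */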
6531 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6532 partition; TO_NODE_I could be the definition node or the use node.
6533 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6534 return the cost of any necessary fix-ups on edge UD, or
6535 slpg_layout_cost::impossible () if the choice cannot be made.
6537 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6539 slpg_layout_cost
6540 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6541 unsigned int from_layout_i)
6543 auto &to_vertex = m_vertices[to_node_i];
6544 unsigned int to_partition_i = to_vertex.partition;
6545 slpg_partition_info &to_partition = m_partitions[to_partition_i];
6546 gcc_assert (to_partition.layout >= 0);
6548 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6549 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6550 any other inputs keep their current choice of layout. */
6551 auto &to_costs = partition_layout_costs (to_partition_i,
6552 to_partition.layout);
6553 if (ud->src == int (to_node_i)
6554 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6556 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6557 auto old_layout = from_partition.layout;
6558 from_partition.layout = from_layout_i;
6559 int factor = internal_node_cost (to_vertex.node, -1,
6560 to_partition.layout);
6561 from_partition.layout = old_layout;
6562 if (factor >= 0)
6564 slpg_layout_cost cost = to_costs.out_cost;
6565 cost.add_serial_cost ({ to_vertex.weight * factor,
6566 m_optimize_size });
6567 cost.split (to_partition.in_degree);
6568 return cost;
6572 /* Compute the cost if we insert any necessary layout change on edge UD. */
6573 auto edge_cost = edge_layout_cost (ud, to_node_i,
6574 to_partition.layout, from_layout_i);
6575 if (edge_cost.is_possible ())
6577 slpg_layout_cost cost = to_costs.out_cost;
6578 cost.add_serial_cost (to_costs.internal_cost);
6579 cost.split (to_partition.in_degree);
6580 cost.add_serial_cost (edge_cost);
6581 return cost;
6584 return slpg_layout_cost::impossible ();
6587 /* Make a forward pass through the partitions, accumulating input costs.
6588 Make a tentative (provisional) choice of layout for each partition,
6589 ensuring that this choice still allows later partitions to keep
6590 their original layout. */
6592 void
6593 vect_optimize_slp_pass::forward_pass ()
6595 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6596 ++partition_i)
6598 auto &partition = m_partitions[partition_i];
6600 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6601 the incoming cost that would apply if every predecessor partition
6602 keeps its current layout. This is used within the loop below. */
6603 slpg_layout_cost in_cost;
6604 slp_tree single_node = nullptr;
6605 if (partition.node_end == partition.node_begin + 1)
6607 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6608 single_node = m_vertices[node_i].node;
6609 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6610 in_cost = total_in_cost (node_i);
6613 /* Go through the possible layouts. Decide which ones are valid
6614 for this partition and record which of the valid layouts has
6615 the lowest cost. */
6616 unsigned int min_layout_i = 0;
6617 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6618 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6620 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6621 if (!layout_costs.is_possible ())
6622 continue;
6624 /* If the recorded layout is already 0 then the layout cannot
6625 change. */
6626 if (partition.layout == 0 && layout_i != 0)
6628 layout_costs.mark_impossible ();
6629 continue;
6632 bool is_possible = true;
6633 for (unsigned int order_i = partition.node_begin;
6634 order_i < partition.node_end; ++order_i)
6636 unsigned int node_i = m_partitioned_nodes[order_i];
6637 auto &vertex = m_vertices[node_i];
6639 /* Reject the layout if it is individually incompatible
6640 with any node in the partition. */
6641 if (!is_compatible_layout (vertex.node, layout_i))
6643 is_possible = false;
6644 break;
6647 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6649 auto &other_vertex = m_vertices[other_node_i];
6650 if (other_vertex.partition < vertex.partition)
6652 /* Accumulate the incoming costs from earlier
6653 partitions, plus the cost of any layout changes
6654 on UD itself. */
6655 auto cost = forward_cost (ud, other_node_i, layout_i);
6656 if (!cost.is_possible ())
6657 is_possible = false;
6658 else
6659 layout_costs.in_cost.add_parallel_cost (cost);
6661 else
6662 /* Reject the layout if it would make layout 0 impossible
6663 for later partitions. This amounts to testing that the
6664 target supports reversing the layout change on edges
6665 to later partitions.
6667 In principle, it might be possible to push a layout
6668 change all the way down a graph, so that it never
6669 needs to be reversed and so that the target doesn't
6670 need to support the reverse operation. But it would
6671 be awkward to bail out if we hit a partition that
6672 does not support the new layout, especially since
6673 we are not dealing with a lattice. */
6674 is_possible &= edge_layout_cost (ud, other_node_i, 0,
6675 layout_i).is_possible ();
6677 for_each_partition_edge (node_i, add_cost);
6679 /* Accumulate the cost of using LAYOUT_I within NODE,
6680 both for the inputs and the outputs. */
6681 int factor = internal_node_cost (vertex.node, layout_i,
6682 layout_i);
6683 if (factor < 0)
6685 is_possible = false;
6686 break;
6688 else if (factor)
6689 layout_costs.internal_cost.add_serial_cost
6690 ({ vertex.weight * factor, m_optimize_size });
6692 if (!is_possible)
6694 layout_costs.mark_impossible ();
6695 continue;
6698 /* Combine the incoming and partition-internal costs. */
6699 slpg_layout_cost combined_cost = layout_costs.in_cost;
6700 combined_cost.add_serial_cost (layout_costs.internal_cost);
6702 /* If this partition consists of a single VEC_PERM_EXPR, see
6703 if the VEC_PERM_EXPR can be changed to support output layout
6704 LAYOUT_I while keeping all the provisional choices of input
6705 layout. */
6706 if (single_node
6707 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6709 int factor = internal_node_cost (single_node, -1, layout_i);
6710 if (factor >= 0)
6712 auto weight = m_vertices[single_node->vertex].weight;
6713 slpg_layout_cost internal_cost
6714 = { weight * factor, m_optimize_size };
6716 slpg_layout_cost alt_cost = in_cost;
6717 alt_cost.add_serial_cost (internal_cost);
6718 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6720 combined_cost = alt_cost;
6721 layout_costs.in_cost = in_cost;
6722 layout_costs.internal_cost = internal_cost;
6727 /* Record the layout with the lowest cost. Prefer layout 0 in
6728 the event of a tie between it and another layout. */
6729 if (!min_layout_cost.is_possible ()
6730 || combined_cost.is_better_than (min_layout_cost,
6731 m_optimize_size))
6733 min_layout_i = layout_i;
6734 min_layout_cost = combined_cost;
6738 /* This loop's handling of earlier partitions should ensure that
6739 choosing the original layout for the current partition is no
6740 less valid than it was in the original graph, even with the
6741 provisional layout choices for those earlier partitions. */
6742 gcc_assert (min_layout_cost.is_possible ());
6743 partition.layout = min_layout_i;
6747 /* Make a backward pass through the partitions, accumulating output costs.
6748 Make a final choice of layout for each partition. */
6750 void
6751 vect_optimize_slp_pass::backward_pass ()
6753 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6755 auto &partition = m_partitions[partition_i];
6757 unsigned int min_layout_i = 0;
6758 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6759 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6761 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6762 if (!layout_costs.is_possible ())
6763 continue;
6765 /* Accumulate the costs from successor partitions. */
6766 bool is_possible = true;
6767 for (unsigned int order_i = partition.node_begin;
6768 order_i < partition.node_end; ++order_i)
6770 unsigned int node_i = m_partitioned_nodes[order_i];
6771 auto &vertex = m_vertices[node_i];
6772 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6774 auto &other_vertex = m_vertices[other_node_i];
6775 auto &other_partition = m_partitions[other_vertex.partition];
6776 if (other_vertex.partition > vertex.partition)
6778 /* Accumulate the outgoing costs from later
6779 partitions, plus the cost of any layout changes
6780 on UD itself. */
6781 auto cost = backward_cost (ud, other_node_i, layout_i);
6782 if (!cost.is_possible ())
6783 is_possible = false;
6784 else
6785 layout_costs.out_cost.add_parallel_cost (cost);
6787 else
6788 /* Make sure that earlier partitions can (if necessary
6789 or beneficial) keep the layout that they chose in
6790 the forward pass. This ensures that there is at
6791 least one valid choice of layout. */
6792 is_possible &= edge_layout_cost (ud, other_node_i,
6793 other_partition.layout,
6794 layout_i).is_possible ();
6796 for_each_partition_edge (node_i, add_cost);
6798 if (!is_possible)
6800 layout_costs.mark_impossible ();
6801 continue;
6804 /* Locally combine the costs from the forward and backward passes.
6805 (This combined cost is not passed on, since that would lead
6806 to double counting.) */
6807 slpg_layout_cost combined_cost = layout_costs.in_cost;
6808 combined_cost.add_serial_cost (layout_costs.internal_cost);
6809 combined_cost.add_serial_cost (layout_costs.out_cost);
6811 /* Record the layout with the lowest cost. Prefer layout 0 in
6812 the event of a tie between it and another layout. */
6813 if (!min_layout_cost.is_possible ()
6814 || combined_cost.is_better_than (min_layout_cost,
6815 m_optimize_size))
6817 min_layout_i = layout_i;
6818 min_layout_cost = combined_cost;
6822 gcc_assert (min_layout_cost.is_possible ());
6823 partition.layout = min_layout_i;
6827 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6828 NODE already has the layout that was selected for its partition. */
6830 slp_tree
6831 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6832 unsigned int to_layout_i)
6834 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6835 slp_tree result = m_node_layouts[result_i];
6836 if (result)
6837 return result;
6839 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6840 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6841 /* We can't permute vector defs in place. */
6842 && SLP_TREE_VEC_DEFS (node).is_empty ()))
6844 /* If the vector is uniform or unchanged, there's nothing to do. */
6845 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6846 result = node;
6847 else
6849 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6850 result = vect_create_new_slp_node (scalar_ops);
6851 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6854 else
6856 unsigned int partition_i = m_vertices[node->vertex].partition;
6857 unsigned int from_layout_i = m_partitions[partition_i].layout;
6858 if (from_layout_i == to_layout_i)
6859 return node;
6861 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6862 permutation instead of a serial one. Leave the new permutation
6863 in TMP_PERM on success. */
6864 auto_lane_permutation_t tmp_perm;
6865 unsigned int num_inputs = 1;
6866 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6868 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6869 if (from_layout_i != 0)
6870 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6871 if (to_layout_i != 0)
6872 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6873 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6874 tmp_perm,
6875 SLP_TREE_CHILDREN (node),
6876 false) >= 0)
6877 num_inputs = SLP_TREE_CHILDREN (node).length ();
6878 else
6879 tmp_perm.truncate (0);
6882 if (dump_enabled_p ())
6884 if (tmp_perm.length () > 0)
6885 dump_printf_loc (MSG_NOTE, vect_location,
6886 "duplicating permutation node %p with"
6887 " layout %d\n",
6888 (void *) node, to_layout_i);
6889 else
6890 dump_printf_loc (MSG_NOTE, vect_location,
6891 "inserting permutation node in place of %p\n",
6892 (void *) node);
6895 unsigned int num_lanes = SLP_TREE_LANES (node);
6896 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6897 if (SLP_TREE_SCALAR_STMTS (node).length ())
6899 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6900 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6901 if (from_layout_i != 0)
6902 vect_slp_permute (m_perms[from_layout_i], stmts, false);
6903 if (to_layout_i != 0)
6904 vect_slp_permute (m_perms[to_layout_i], stmts, true);
6906 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6907 SLP_TREE_LANES (result) = num_lanes;
6908 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6909 result->vertex = -1;
6911 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6912 if (tmp_perm.length ())
6914 lane_perm.safe_splice (tmp_perm);
6915 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6917 else
6919 lane_perm.create (num_lanes);
6920 for (unsigned j = 0; j < num_lanes; ++j)
6921 lane_perm.quick_push ({ 0, j });
6922 if (from_layout_i != 0)
6923 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6924 if (to_layout_i != 0)
6925 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6926 SLP_TREE_CHILDREN (result).safe_push (node);
6928 for (slp_tree child : SLP_TREE_CHILDREN (result))
6929 child->refcnt++;
6931 m_node_layouts[result_i] = result;
6932 return result;
6935 /* Apply the chosen vector layouts to the SLP graph. */
6937 void
6938 vect_optimize_slp_pass::materialize ()
6940 /* We no longer need the costs, so avoid having two O(N * P) arrays
6941 live at the same time. */
6942 m_partition_layout_costs.release ();
6943 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6945 auto_sbitmap fully_folded (m_vertices.length ());
6946 bitmap_clear (fully_folded);
6947 for (unsigned int node_i : m_partitioned_nodes)
6949 auto &vertex = m_vertices[node_i];
6950 slp_tree node = vertex.node;
6951 int layout_i = m_partitions[vertex.partition].layout;
6952 gcc_assert (layout_i >= 0);
6954 /* Rearrange the scalar statements to match the chosen layout. */
6955 if (layout_i > 0)
6956 vect_slp_permute (m_perms[layout_i],
6957 SLP_TREE_SCALAR_STMTS (node), true);
6959 /* Update load and lane permutations. */
6960 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6962 /* First try to absorb the input vector layouts. If that fails,
6963 force the inputs to have layout LAYOUT_I too. We checked that
6964 that was possible before deciding to use nonzero output layouts.
6965 (Note that at this stage we don't really have any guarantee that
6966 the target supports the original VEC_PERM_EXPR.) */
6967 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6968 auto_lane_permutation_t tmp_perm;
6969 tmp_perm.safe_splice (perm);
6970 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
6971 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6972 tmp_perm,
6973 SLP_TREE_CHILDREN (node),
6974 false) >= 0)
6976 if (dump_enabled_p ()
6977 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
6978 perm.begin ()))
6979 dump_printf_loc (MSG_NOTE, vect_location,
6980 "absorbing input layouts into %p\n",
6981 (void *) node);
6982 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
6983 bitmap_set_bit (fully_folded, node_i);
6985 else
6987 /* Not MSG_MISSED because it would make no sense to users. */
6988 if (dump_enabled_p ())
6989 dump_printf_loc (MSG_NOTE, vect_location,
6990 "failed to absorb input layouts into %p\n",
6991 (void *) node);
6992 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
6995 else
6997 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
6998 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
6999 if (layout_i > 0)
7000 /* ??? When we handle non-bijective permutes the idea
7001 is that we can force the load-permutation to be
7002 { min, min + 1, min + 2, ... max }. But then the
7003 scalar defs might no longer match the lane content
7004 which means wrong-code with live lane vectorization.
7005 So we possibly have to have NULL entries for those. */
7006 vect_slp_permute (m_perms[layout_i], load_perm, true);
7010 /* Do this before any nodes disappear, since it involves a walk
7011 over the leaves. */
7012 remove_redundant_permutations ();
7014 /* Replace each child with a correctly laid-out version. */
7015 for (unsigned int node_i : m_partitioned_nodes)
7017 /* Skip nodes that have already been handled above. */
7018 if (bitmap_bit_p (fully_folded, node_i))
7019 continue;
7021 auto &vertex = m_vertices[node_i];
7022 int in_layout_i = m_partitions[vertex.partition].layout;
7023 gcc_assert (in_layout_i >= 0);
7025 unsigned j;
7026 slp_tree child;
7027 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
7029 if (!child)
7030 continue;
7032 slp_tree new_child = get_result_with_layout (child, in_layout_i);
7033 if (new_child != child)
7035 vect_free_slp_tree (child);
7036 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
7037 new_child->refcnt += 1;
7043 /* Elide load permutations that are not necessary. Such permutations might
7044 be pre-existing, rather than created by the layout optimizations. */
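/* For example, a load permutation that is the identity { 0, 1, 2, 3 }
   performs no reordering and can usually be dropped, while something
   like { 2, 3, 0, 1 } has to be kept.  The exact conditions differ for
   basic-block and loop vectorization below.  */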
7046 void
7047 vect_optimize_slp_pass::remove_redundant_permutations ()
7049 for (unsigned int node_i : m_leafs)
7051 slp_tree node = m_vertices[node_i].node;
7052 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
7053 continue;
7055 /* In basic block vectorization we allow any subchain of an interleaving
7056 chain.
7057 FORNOW: not in loop SLP because of realignment complications. */
7058 if (is_a <bb_vec_info> (m_vinfo))
7060 bool subchain_p = true;
7061 stmt_vec_info next_load_info = NULL;
7062 stmt_vec_info load_info;
7063 unsigned j;
7064 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7066 if (j != 0
7067 && (next_load_info != load_info
7068 || ! load_info
7069 || DR_GROUP_GAP (load_info) != 1))
7071 subchain_p = false;
7072 break;
7074 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
7076 if (subchain_p)
7078 SLP_TREE_LOAD_PERMUTATION (node).release ();
7079 continue;
7082 else
7084 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
7085 stmt_vec_info load_info;
7086 bool this_load_permuted = false;
7087 unsigned j;
7088 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7089 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
7091 this_load_permuted = true;
7092 break;
7094 /* When this isn't a grouped access we know it's a single element
7095 and contiguous. */
7096 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
7098 if (!this_load_permuted
7099 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7100 || SLP_TREE_LANES (node) == 1))
7101 SLP_TREE_LOAD_PERMUTATION (node).release ();
7102 continue;
7104 stmt_vec_info first_stmt_info
7105 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
7106 if (!this_load_permuted
7107 /* The load requires permutation when unrolling exposes
7108 a gap either because the group is larger than the SLP
7109 group-size or because there is a gap between the groups. */
7110 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7111 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
7112 && DR_GROUP_GAP (first_stmt_info) == 0)))
7114 SLP_TREE_LOAD_PERMUTATION (node).release ();
7115 continue;
7121 /* Print the partition graph and layout information to the dump file. */
7123 void
7124 vect_optimize_slp_pass::dump ()
7126 dump_printf_loc (MSG_NOTE, vect_location,
7127 "SLP optimize permutations:\n");
7128 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
7130 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
7131 const char *sep = "";
7132 for (unsigned int idx : m_perms[layout_i])
7134 dump_printf (MSG_NOTE, "%s%d", sep, idx);
7135 sep = ", ";
7137 dump_printf (MSG_NOTE, " }\n");
7139 dump_printf_loc (MSG_NOTE, vect_location,
7140 "SLP optimize partitions:\n");
7141 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7142 ++partition_i)
7144 auto &partition = m_partitions[partition_i];
7145 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 " partition %d (layout %d):\n",
7148 partition_i, partition.layout);
7149 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
7150 for (unsigned int order_i = partition.node_begin;
7151 order_i < partition.node_end; ++order_i)
7153 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
7154 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
7155 (void *) vertex.node);
7156 dump_printf_loc (MSG_NOTE, vect_location,
7157 " weight: %f\n",
7158 vertex.weight.to_double ());
7159 if (vertex.out_degree)
7160 dump_printf_loc (MSG_NOTE, vect_location,
7161 " out weight: %f (degree %d)\n",
7162 vertex.out_weight.to_double (),
7163 vertex.out_degree);
7164 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
7165 dump_printf_loc (MSG_NOTE, vect_location,
7166 " op: VEC_PERM_EXPR\n");
7167 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
7168 dump_printf_loc (MSG_NOTE, vect_location,
7169 " op template: %G", rep->stmt);
7171 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
7172 for (unsigned int order_i = partition.node_begin;
7173 order_i < partition.node_end; ++order_i)
7175 unsigned int node_i = m_partitioned_nodes[order_i];
7176 auto &vertex = m_vertices[node_i];
7177 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
7179 auto &other_vertex = m_vertices[other_node_i];
7180 if (other_vertex.partition < vertex.partition)
7181 dump_printf_loc (MSG_NOTE, vect_location,
7182 " - %p [%d] --> %p\n",
7183 (void *) other_vertex.node,
7184 other_vertex.partition,
7185 (void *) vertex.node);
7186 else
7187 dump_printf_loc (MSG_NOTE, vect_location,
7188 " - %p --> [%d] %p\n",
7189 (void *) vertex.node,
7190 other_vertex.partition,
7191 (void *) other_vertex.node);
7193 for_each_partition_edge (node_i, print_edge);
7196 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7198 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7199 if (layout_costs.is_possible ())
7201 dump_printf_loc (MSG_NOTE, vect_location,
7202 " layout %d:%s\n", layout_i,
7203 partition.layout == int (layout_i)
7204 ? " (*)" : "");
7205 slpg_layout_cost combined_cost = layout_costs.in_cost;
7206 combined_cost.add_serial_cost (layout_costs.internal_cost);
7207 combined_cost.add_serial_cost (layout_costs.out_cost);
7208 #define TEMPLATE "{depth: %f, total: %f}"
7209 dump_printf_loc (MSG_NOTE, vect_location,
7210 " " TEMPLATE "\n",
7211 layout_costs.in_cost.depth.to_double (),
7212 layout_costs.in_cost.total.to_double ());
7213 dump_printf_loc (MSG_NOTE, vect_location,
7214 " + " TEMPLATE "\n",
7215 layout_costs.internal_cost.depth.to_double (),
7216 layout_costs.internal_cost.total.to_double ());
7217 dump_printf_loc (MSG_NOTE, vect_location,
7218 " + " TEMPLATE "\n",
7219 layout_costs.out_cost.depth.to_double (),
7220 layout_costs.out_cost.total.to_double ());
7221 dump_printf_loc (MSG_NOTE, vect_location,
7222 " = " TEMPLATE "\n",
7223 combined_cost.depth.to_double (),
7224 combined_cost.total.to_double ());
7225 #undef TEMPLATE
7227 else
7228 dump_printf_loc (MSG_NOTE, vect_location,
7229 " layout %d: rejected\n", layout_i);
7234 /* Masked load lanes discovery. */
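/* In essence: look for grouped IFN_MASK_LOAD calls whose mask is a
   uniform splat (represented as a single-input VEC_PERM_EXPR) and whose
   consumers are all single-lane permutes.  For those a load-lanes style
   access can be used, so mark the nodes with ldst_lanes and elide the
   now redundant mask splat.  */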
7236 void
7237 vect_optimize_slp_pass::decide_masked_load_lanes ()
7239 for (auto v : m_vertices)
7241 slp_tree node = v.node;
7242 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7243 || SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7244 continue;
7245 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7246 if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
7247 /* The mask has to be uniform. */
7248 || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
7249 || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
7250 || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
7251 IFN_MASK_LOAD))
7252 continue;
7253 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7254 if (STMT_VINFO_STRIDED_P (stmt_info)
7255 || compare_step_with_zero (m_vinfo, stmt_info) <= 0
7256 || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
7257 DR_GROUP_SIZE (stmt_info),
7258 true) == IFN_LAST)
7259 continue;
7261 /* Uniform masks need to be suitably represented. */
7262 slp_tree mask = SLP_TREE_CHILDREN (node)[0];
7263 if (SLP_TREE_CODE (mask) != VEC_PERM_EXPR
7264 || SLP_TREE_CHILDREN (mask).length () != 1)
7265 continue;
7266 bool match = true;
7267 for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
7268 if (perm.first != 0 || perm.second != 0)
7270 match = false;
7271 break;
7273 if (!match)
7274 continue;
7276 /* Now see if the consumer side matches. */
7277 for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7278 pred; pred = pred->pred_next)
7280 slp_tree pred_node = m_vertices[pred->src].node;
7281 /* All consumers should be permutes with a single outgoing lane. */
7282 if (SLP_TREE_CODE (pred_node) != VEC_PERM_EXPR
7283 || SLP_TREE_LANES (pred_node) != 1)
7285 match = false;
7286 break;
7288 gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
7290 if (!match)
7291 continue;
7292 /* Now we can mark the nodes as to use load lanes. */
7293 node->ldst_lanes = true;
7294 for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7295 pred; pred = pred->pred_next)
7296 m_vertices[pred->src].node->ldst_lanes = true;
7297 /* The catch is we have to massage the mask. During analysis we
7298 arranged for uniform masks to be represented by a splat VEC_PERM
7299 which we can now simply elide as we cannot easily re-do SLP
7300 discovery here. */
7301 slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
7302 SLP_TREE_REF_COUNT (new_mask)++;
7303 SLP_TREE_CHILDREN (node)[0] = new_mask;
7304 vect_free_slp_tree (mask);
7308 /* Main entry point for the SLP graph optimization pass. */
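/* Roughly: build the SLP graph and its partitions, compute candidate
   layouts, propagate layout costs forward and backward over the
   partitions, materialize the chosen layouts (or, when only the
   identity layout exists, just remove redundant permutations), and
   finally decide about masked load-lanes on a rebuilt graph.  */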
7310 void
7311 vect_optimize_slp_pass::run ()
7313 build_graph ();
7314 create_partitions ();
7315 start_choosing_layouts ();
7316 if (m_perms.length () > 1)
7318 forward_pass ();
7319 backward_pass ();
7320 if (dump_enabled_p ())
7321 dump ();
7322 materialize ();
7323 while (!m_perms.is_empty ())
7324 m_perms.pop ().release ();
7326 else
7327 remove_redundant_permutations ();
7328 free_graph (m_slpg);
7329 build_graph ();
7330 decide_masked_load_lanes ();
7331 free_graph (m_slpg);
7334 /* Apply CSE to NODE and its children using BST_MAP. */
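/* Nodes with an identical set of scalar stmts are merged onto a single
   leader recorded in BST_MAP.  A nullptr placeholder is entered before
   recursing into the children so that cycles in the graph are detected
   instead of recursing forever.  */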
7336 static void
7337 vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
7339 bool put_p = false;
7340 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
7341 /* Besides some VEC_PERM_EXPR, two-operator nodes also
7342 lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
7343 we'd have sth that works for all internal and external nodes. */
7344 && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
7346 slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
7347 if (leader)
7349 /* We've visited this node already. */
7350 if (!*leader || *leader == node)
7351 return;
7353 if (dump_enabled_p ())
7354 dump_printf_loc (MSG_NOTE, vect_location,
7355 "re-using SLP tree %p for %p\n",
7356 (void *)*leader, (void *)node);
7357 vect_free_slp_tree (node);
7358 (*leader)->refcnt += 1;
7359 node = *leader;
7360 return;
7363 /* Avoid creating a cycle by populating the map only after recursion. */
7364 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
7365 node->refcnt += 1;
7366 put_p = true;
7367 /* And recurse. */
7370 for (slp_tree &child : SLP_TREE_CHILDREN (node))
7371 if (child)
7372 vect_cse_slp_nodes (bst_map, child);
7374 /* Now record the node for CSE in other siblings. */
7375 if (put_p)
7376 *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
7379 /* Optimize the SLP graph of VINFO. */
7381 void
7382 vect_optimize_slp (vec_info *vinfo)
7384 if (vinfo->slp_instances.is_empty ())
7385 return;
7386 vect_optimize_slp_pass (vinfo).run ();
7388 /* Apply CSE again to nodes after permute optimization. */
7389 scalar_stmts_to_slp_tree_map_t *bst_map
7390 = new scalar_stmts_to_slp_tree_map_t ();
7392 for (auto inst : vinfo->slp_instances)
7393 vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
7395 release_scalar_stmts_to_slp_tree_map (bst_map);
7398 /* Gather loads reachable from the individual SLP graph entries. */
7400 void
7401 vect_gather_slp_loads (vec_info *vinfo)
7403 unsigned i;
7404 slp_instance instance;
7405 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
7407 hash_set<slp_tree> visited;
7408 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
7409 SLP_INSTANCE_TREE (instance), visited);
7413 /* For NODE update VF based on the number of lanes and the vector types
7414 used. */
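/* A rough example (assuming fixed-size vectors): a node with max_nunits
   8 but only 2 lanes needs the loop unrolled 4 times to fill a vector,
   so it contributes 4 to VF; the resulting VF is a common multiple of
   all such contributions.  */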
7416 static void
7417 vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
7418 hash_set<slp_tree> &visited)
7420 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7421 return;
7422 if (visited.add (node))
7423 return;
7425 for (slp_tree child : SLP_TREE_CHILDREN (node))
7426 vect_update_slp_vf_for_node (child, vf, visited);
7428 /* We do not visit SLP nodes for constants or externals - those neither
7429 have a vector type set yet (vectorizable_* does this) nor do they
7430 have max_nunits set. Instead we rely on the internal nodes' max_nunits
7431 to cover constant/external operands.
7432 Note that when we stop using fixed-size vectors, externs and constants
7433 shouldn't influence the (minimum) vectorization factor, instead
7434 vectorizable_* should honor the vectorization factor when trying to
7435 assign vector types to constants and externals and cause iteration
7436 to a higher vectorization factor when required. */
7437 poly_uint64 node_vf
7438 = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
7439 vf = force_common_multiple (vf, node_vf);
7441 /* For permute nodes that are fed from externs or constants we have to
7442 consider their number of lanes as well. Likewise for store-lanes. */
7443 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
7444 || node->ldst_lanes)
7445 for (slp_tree child : SLP_TREE_CHILDREN (node))
7446 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7448 poly_uint64 child_vf
7449 = calculate_unrolling_factor (node->max_nunits,
7450 SLP_TREE_LANES (child));
7451 vf = force_common_multiple (vf, child_vf);
7455 /* For each possible SLP instance decide whether to SLP it and calculate the
7456 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
7457 to SLP at least one instance. */
7459 bool
7460 vect_make_slp_decision (loop_vec_info loop_vinfo)
7462 unsigned int i;
7463 poly_uint64 unrolling_factor = 1;
7464 const vec<slp_instance> &slp_instances
7465 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7466 slp_instance instance;
7467 int decided_to_slp = 0;
7469 DUMP_VECT_SCOPE ("vect_make_slp_decision");
7471 hash_set<slp_tree> visited;
7472 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7474 /* FORNOW: SLP if you can. */
7475 /* All unroll factors have the form:
7477 GET_MODE_SIZE (vinfo->vector_mode) * X
7479 for some rational X, so they must have a common multiple. */
7480 vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
7481 unrolling_factor, visited);
7483 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
7484 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
7485 loop-based vectorization. Such stmts will be marked as HYBRID. */
7486 vect_mark_slp_stmts (loop_vinfo, SLP_INSTANCE_TREE (instance));
7487 decided_to_slp++;
7490 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
7492 if (decided_to_slp && dump_enabled_p ())
7494 dump_printf_loc (MSG_NOTE, vect_location,
7495 "Decided to SLP %d instances. Unrolling factor ",
7496 decided_to_slp);
7497 dump_dec (MSG_NOTE, unrolling_factor);
7498 dump_printf (MSG_NOTE, "\n");
7501 return (decided_to_slp > 0);
7504 /* Private data for vect_detect_hybrid_slp. */
7505 struct vdhs_data
7507 loop_vec_info loop_vinfo;
7508 vec<stmt_vec_info> *worklist;
7511 /* Walker for walk_gimple_op. */
7513 static tree
7514 vect_detect_hybrid_slp (tree *tp, int *, void *data)
7516 walk_stmt_info *wi = (walk_stmt_info *)data;
7517 vdhs_data *dat = (vdhs_data *)wi->info;
7519 if (wi->is_lhs)
7520 return NULL_TREE;
7522 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7523 if (!def_stmt_info)
7524 return NULL_TREE;
7525 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
7526 if (PURE_SLP_STMT (def_stmt_info))
7528 if (dump_enabled_p ())
7529 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7530 def_stmt_info->stmt);
7531 STMT_SLP_TYPE (def_stmt_info) = hybrid;
7532 dat->worklist->safe_push (def_stmt_info);
7535 return NULL_TREE;
7538 /* Check whether STMT_INFO is (indirectly) consumed only by SLP and mark it
7539 pure_slp if so; otherwise push it to WORKLIST. */
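/* A stmt remains a hybrid candidate (and is pushed to WORKLIST) when any
   def it produces has a use outside of the loop or a use in a stmt that
   is not SLP vectorized; only when every use is covered by SLP can it be
   marked pure_slp.  */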
7541 static void
7542 maybe_push_to_hybrid_worklist (vec_info *vinfo,
7543 vec<stmt_vec_info> &worklist,
7544 stmt_vec_info stmt_info)
7546 if (dump_enabled_p ())
7547 dump_printf_loc (MSG_NOTE, vect_location,
7548 "Processing hybrid candidate : %G", stmt_info->stmt);
7549 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7550 imm_use_iterator iter2;
7551 ssa_op_iter iter1;
7552 use_operand_p use_p;
7553 def_operand_p def_p;
7554 bool any_def = false;
7555 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7557 any_def = true;
7558 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7560 if (is_gimple_debug (USE_STMT (use_p)))
7561 continue;
7562 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
7563 /* An out-of-loop use means this is a loop_vect sink. */
7564 if (!use_info)
7566 if (dump_enabled_p ())
7567 dump_printf_loc (MSG_NOTE, vect_location,
7568 "Found loop_vect sink: %G", stmt_info->stmt);
7569 worklist.safe_push (stmt_info);
7570 return;
7572 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_NOTE, vect_location,
7576 "Found loop_vect use: %G", use_info->stmt);
7577 worklist.safe_push (stmt_info);
7578 return;
7582 /* No def means this is a loop_vect sink. Gimple conditionals also don't have a
7583 def but shouldn't be considered sinks. */
7584 if (!any_def && STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
7586 if (dump_enabled_p ())
7587 dump_printf_loc (MSG_NOTE, vect_location,
7588 "Found loop_vect sink: %G", stmt_info->stmt);
7589 worklist.safe_push (stmt_info);
7590 return;
7592 if (dump_enabled_p ())
7593 dump_printf_loc (MSG_NOTE, vect_location,
7594 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7595 STMT_SLP_TYPE (stmt_info) = pure_slp;
7598 /* Find stmts that must be both vectorized and SLPed. */
7600 void
7601 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7603 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7605 /* All stmts participating in SLP are marked pure_slp, all other
7606 stmts are loop_vect.
7607 First collect all loop_vect stmts into a worklist.
7608 SLP patterns cause not all original scalar stmts to appear in
7609 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
7610 Rectify this here and do a backward walk over the IL only considering
7611 stmts as loop_vect when they are used by a loop_vect stmt, otherwise
7612 marking them as pure_slp. */
7613 auto_vec<stmt_vec_info> worklist;
7614 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7616 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7617 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7618 gsi_next (&gsi))
7620 gphi *phi = gsi.phi ();
7621 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7622 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7623 maybe_push_to_hybrid_worklist (loop_vinfo,
7624 worklist, stmt_info);
7626 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7627 gsi_prev (&gsi))
7629 gimple *stmt = gsi_stmt (gsi);
7630 if (is_gimple_debug (stmt))
7631 continue;
7632 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7633 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7635 for (gimple_stmt_iterator gsi2
7636 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7637 !gsi_end_p (gsi2); gsi_next (&gsi2))
7639 stmt_vec_info patt_info
7640 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7641 if (!STMT_SLP_TYPE (patt_info)
7642 && STMT_VINFO_RELEVANT (patt_info))
7643 maybe_push_to_hybrid_worklist (loop_vinfo,
7644 worklist, patt_info);
7646 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7648 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7649 maybe_push_to_hybrid_worklist (loop_vinfo,
7650 worklist, stmt_info);
7654 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
7655 mark any SLP vectorized stmt as hybrid.
7656 ??? We're visiting def stmts N times (once for each non-SLP and
7657 once for each hybrid-SLP use). */
7658 walk_stmt_info wi;
7659 vdhs_data dat;
7660 dat.worklist = &worklist;
7661 dat.loop_vinfo = loop_vinfo;
7662 memset (&wi, 0, sizeof (wi));
7663 wi.info = (void *)&dat;
7664 while (!worklist.is_empty ())
7666 stmt_vec_info stmt_info = worklist.pop ();
7667 /* Since SSA operands are not set up for pattern stmts we need
7668 to use walk_gimple_op. */
7669 wi.is_lhs = 0;
7670 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
7671 /* For gather/scatter make sure to walk the offset operand, which
7672 can be a scaling and conversion away. */
7673 gather_scatter_info gs_info;
7674 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7675 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7677 int dummy;
7678 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7684 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
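/* Statements inside the region get their UID set to 0 here so that
   stmt_vec_info lookup can recognize them as part of the region; the
   destructor below resets the UIDs to -1 again.  */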
7686 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7687 : vec_info (vec_info::bb, shared),
7688 roots (vNULL)
7690 /* The region we are operating on. bbs[0] is the entry, excluding
7691 its PHI nodes. In the future we might want to track an explicit
7692 entry edge to cover bbs[0] PHI nodes and have a region entry
7693 insert location. */
7694 bbs = _bbs.address ();
7695 nbbs = _bbs.length ();
7697 for (unsigned i = 0; i < nbbs; ++i)
7699 if (i != 0)
7700 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7701 gsi_next (&si))
7703 gphi *phi = si.phi ();
7704 gimple_set_uid (phi, 0);
7705 add_stmt (phi);
7707 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7708 !gsi_end_p (gsi); gsi_next (&gsi))
7710 gimple *stmt = gsi_stmt (gsi);
7711 gimple_set_uid (stmt, 0);
7712 if (is_gimple_debug (stmt))
7713 continue;
7714 add_stmt (stmt);
7720 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7721 stmts in the basic blocks. */
7723 _bb_vec_info::~_bb_vec_info ()
7725 /* Reset region marker. */
7726 for (unsigned i = 0; i < nbbs; ++i)
7728 if (i != 0)
7729 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7730 gsi_next (&si))
7732 gphi *phi = si.phi ();
7733 gimple_set_uid (phi, -1);
7735 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7736 !gsi_end_p (gsi); gsi_next (&gsi))
7738 gimple *stmt = gsi_stmt (gsi);
7739 gimple_set_uid (stmt, -1);
7743 for (unsigned i = 0; i < roots.length (); ++i)
7745 roots[i].stmts.release ();
7746 roots[i].roots.release ();
7747 roots[i].remain.release ();
7749 roots.release ();
7752 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
7753 given that its child nodes have already been processed, and that
7754 their def types currently match their SLP node's def type. */
7756 static bool
7757 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7758 slp_instance node_instance,
7759 stmt_vector_for_cost *cost_vec)
7761 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7763 /* Calculate the number of vector statements to be created for the scalar
7764 stmts in this node. It is the number of scalar elements in one scalar
7765 iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
7766 elements in a vector. For a single-defuse-cycle, a lane-reducing op, or a
7767 PHI statement that starts a reduction comprised of only lane-reducing ops,
7768 the number is larger than the number of vector statements actually required. */
7769 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7771 /* Handle purely internal nodes. */
7772 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7774 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7775 return false;
7777 stmt_vec_info slp_stmt_info;
7778 unsigned int i;
7779 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7781 if (slp_stmt_info
7782 && STMT_VINFO_LIVE_P (slp_stmt_info)
7783 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7784 node_instance, i,
7785 false, cost_vec))
7786 return false;
7788 return true;
7791 bool dummy;
7792 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7793 node, node_instance, cost_vec);
7796 /* Verify if we can externalize a set of internal defs. */
7798 static bool
7799 vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
7801 basic_block bb = NULL;
7802 for (stmt_vec_info stmt : stmts)
7803 if (!stmt)
7804 return false;
7805 /* Constant generation uses get_later_stmt which can only handle
7806 defs from the same BB. */
7807 else if (!bb)
7808 bb = gimple_bb (stmt->stmt);
7809 else if (gimple_bb (stmt->stmt) != bb)
7810 return false;
7811 return true;
7814 /* Try to build NODE from scalars, returning true on success.
7815 NODE_INSTANCE is the SLP instance that contains NODE. */
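/* "Building from scalars" means turning NODE into a vect_external_def
   whose SLP_TREE_SCALAR_OPS are the scalar lhs values of its stmts; the
   referring node then constructs the vector from those scalar defs at
   code-generation time.  */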
7817 static bool
7818 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7819 slp_instance node_instance)
7821 stmt_vec_info stmt_info;
7822 unsigned int i;
7824 if (!is_a <bb_vec_info> (vinfo)
7825 || node == SLP_INSTANCE_TREE (node_instance)
7826 || !SLP_TREE_SCALAR_STMTS (node).exists ()
7827 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7828 /* Force the mask use to be built from scalars instead. */
7829 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
7830 || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
7831 return false;
7833 if (dump_enabled_p ())
7834 dump_printf_loc (MSG_NOTE, vect_location,
7835 "Building vector operands of %p from scalars instead\n",
7836 (void *) node);
7838 /* Don't remove and free the child nodes here, since they could be
7839 referenced by other structures. The analysis and scheduling phases
7840 (need to) ignore child nodes of anything that isn't vect_internal_def. */
7841 unsigned int group_size = SLP_TREE_LANES (node);
7842 SLP_TREE_DEF_TYPE (node) = vect_external_def;
7843 /* Invariants get their vector type from the uses. */
7844 SLP_TREE_VECTYPE (node) = NULL_TREE;
7845 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7846 SLP_TREE_LOAD_PERMUTATION (node).release ();
7847 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7849 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7850 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7852 return true;
7855 /* Return true if all elements of the slice are the same. */
7856 bool
7857 vect_scalar_ops_slice::all_same_p () const
7859 for (unsigned int i = 1; i < length; ++i)
7860 if (!operand_equal_p (op (0), op (i)))
7861 return false;
7862 return true;
7865 hashval_t
7866 vect_scalar_ops_slice_hash::hash (const value_type &s)
7868 hashval_t hash = 0;
7869 for (unsigned i = 0; i < s.length; ++i)
7870 hash = iterative_hash_expr (s.op (i), hash);
7871 return hash;
7874 bool
7875 vect_scalar_ops_slice_hash::equal (const value_type &s1,
7876 const compare_type &s2)
7878 if (s1.length != s2.length)
7879 return false;
7880 for (unsigned i = 0; i < s1.length; ++i)
7881 if (!operand_equal_p (s1.op (i), s2.op (i)))
7882 return false;
7883 return true;
7886 /* Compute the prologue cost for invariant or constant operands represented
7887 by NODE. */
7889 static void
7890 vect_prologue_cost_for_slp (slp_tree node,
7891 stmt_vector_for_cost *cost_vec)
7893 /* There's a special case of an existing vector, which costs nothing. */
7894 if (SLP_TREE_SCALAR_OPS (node).length () == 0
7895 && !SLP_TREE_VEC_DEFS (node).is_empty ())
7896 return;
7897 /* Without looking at the actual initializer a vector of
7898 constants can be implemented as a load from the constant pool.
7899 When all elements are the same we can use a splat. */
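/* So a constant node like { 1, 2, 3, 4 } is costed as a vector_load,
   an external node with identical elements { x, x, x, x } as a
   scalar_to_vec splat, and a mixed external node { a, b, c, d } as a
   vec_construct.  */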
7900 tree vectype = SLP_TREE_VECTYPE (node);
7901 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7902 unsigned HOST_WIDE_INT const_nunits;
7903 unsigned nelt_limit;
7904 auto ops = &SLP_TREE_SCALAR_OPS (node);
7905 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7906 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7907 && ! multiple_p (const_nunits, group_size))
7909 nelt_limit = const_nunits;
7910 hash_set<vect_scalar_ops_slice_hash> vector_ops;
7911 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7912 if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
7913 starts.quick_push (i * nelt_limit);
7915 else
7917 /* If either the vector has variable length or the vectors
7918 are composed of repeated whole groups we only need to
7919 cost construction once. All vectors will be the same. */
7920 nelt_limit = group_size;
7921 starts.quick_push (0);
7923 /* ??? We're just tracking whether vectors in a single node are the same.
7924 Ideally we'd do something more global. */
7925 bool passed = false;
7926 for (unsigned int start : starts)
7928 vect_cost_for_stmt kind;
7929 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7930 kind = vector_load;
7931 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7932 kind = scalar_to_vec;
7933 else
7934 kind = vec_construct;
7935 /* The target cost hook has no idea which part of the SLP node
7936 we are costing so avoid passing it down more than once. Pass
7937 it to the first vec_construct or scalar_to_vec part since for those
7938 the x86 backend tries to account for GPR to XMM register moves. */
7939 record_stmt_cost (cost_vec, 1, kind,
7940 (kind != vector_load && !passed) ? node : nullptr,
7941 vectype, 0, vect_prologue);
7942 if (kind != vector_load)
7943 passed = true;
7947 /* Analyze statements contained in SLP tree NODE after recursively analyzing
7948 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7950 Return true if the operations are supported. */
7952 static bool
7953 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7954 slp_instance node_instance,
7955 hash_set<slp_tree> &visited_set,
7956 vec<slp_tree> &visited_vec,
7957 stmt_vector_for_cost *cost_vec)
7959 int i, j;
7960 slp_tree child;
7962 /* Assume we can code-generate all invariants. */
7963 if (!node
7964 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7965 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7966 return true;
7968 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
7970 if (dump_enabled_p ())
7971 dump_printf_loc (MSG_NOTE, vect_location,
7972 "Failed cyclic SLP reference in %p\n", (void *) node);
7973 return false;
7975 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
7977 /* If we already analyzed the exact same set of scalar stmts we're done.
7978 We share the generated vector stmts for those. */
7979 if (visited_set.add (node))
7980 return true;
7981 visited_vec.safe_push (node);
7983 bool res = true;
7984 unsigned visited_rec_start = visited_vec.length ();
7985 unsigned cost_vec_rec_start = cost_vec->length ();
7986 bool seen_non_constant_child = false;
7987 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7989 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
7990 visited_set, visited_vec,
7991 cost_vec);
7992 if (!res)
7993 break;
7994 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
7995 seen_non_constant_child = true;
7997 /* We're having difficulties scheduling nodes with just constant
7998 operands and no scalar stmts since we then cannot compute a stmt
7999 insertion place. */
8000 if (res
8001 && !seen_non_constant_child
8002 && SLP_TREE_SCALAR_STMTS (node).is_empty ())
8004 if (dump_enabled_p ())
8005 dump_printf_loc (MSG_NOTE, vect_location,
8006 "Cannot vectorize all-constant op node %p\n",
8007 (void *) node);
8008 res = false;
8011 if (res)
8012 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
8013 cost_vec);
8014 /* If analysis failed we have to pop all recursively visited nodes
8015 plus ourselves. */
8016 if (!res)
8018 while (visited_vec.length () >= visited_rec_start)
8019 visited_set.remove (visited_vec.pop ());
8020 cost_vec->truncate (cost_vec_rec_start);
8023 /* When the node can be vectorized cost invariant nodes it references.
8024 This is not done in DFS order to allow the referring node's
8025 vectorizable_* calls to nail down the invariant nodes' vector type
8026 and possibly unshare it if it needs a different vector type than
8027 other referrers. */
8028 if (res)
8029 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
8030 if (child
8031 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
8032 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
8033 /* Perform usual caching, note code-generation still
8034 code-gens these nodes multiple times but we expect
8035 to CSE them later. */
8036 && !visited_set.add (child))
8038 visited_vec.safe_push (child);
8039 /* ??? After auditing more code paths make a "default"
8040 and push the vector type from NODE to all children
8041 if it is not already set. */
8042 /* Compute the number of vectors to be generated. */
8043 tree vector_type = SLP_TREE_VECTYPE (child);
8044 if (!vector_type)
8046 /* Masked loads can have an undefined (default SSA definition)
8047 else operand. We do not need to cost it. */
8048 vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
8049 if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
8050 == load_vec_info_type)
8051 && ((ops.length ()
8052 && TREE_CODE (ops[0]) == SSA_NAME
8053 && SSA_NAME_IS_DEFAULT_DEF (ops[0])
8054 && VAR_P (SSA_NAME_VAR (ops[0])))
8055 || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
8056 continue;
8058 /* For shifts with a scalar argument we don't need
8059 to cost or code-generate anything.
8060 ??? Represent this more explicitly. */
8061 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
8062 == shift_vec_info_type)
8063 && j == 1);
8064 continue;
8067 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
8068 = vect_get_num_copies (vinfo, child);
8069 /* And cost them. */
8070 vect_prologue_cost_for_slp (child, cost_vec);
8073 /* If this node or any of its children can't be vectorized, try pruning
8074 the tree here rather than felling the whole thing. */
8075 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
8077 /* We'll need to revisit this for invariant costing and for setting
8078 the number of vectorized stmts. */
8079 res = true;
8082 return res;
8085 /* Given a definition DEF, analyze if it will have any live scalar use after
8086 performing SLP vectorization whose information is represented by BB_VINFO,
8087 and record the result into the hash map SCALAR_USE_MAP as a cache for
8088 later fast checks. If the recursion DEPTH exceeds a limit, stop the
8089 analysis and make a conservative assumption. Return 0 if there is no
8090 scalar use, 1 if there is, and -1 if the recursion was limited. */
8092 static int
8093 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
8094 hash_map<tree, int> &scalar_use_map,
8095 int depth = 0)
8097 const int depth_limit = 2;
8098 imm_use_iterator use_iter;
8099 gimple *use_stmt;
8101 if (int *res = scalar_use_map.get (def))
8102 return *res;
8104 int scalar_use = 1;
8106 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
8108 if (is_gimple_debug (use_stmt))
8109 continue;
8111 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
8113 if (!use_stmt_info)
8114 break;
8116 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8117 continue;
8119 /* Do not step forward when encountering a PHI statement, since it may
8120 involve a cyclic reference and cause infinite recursion. */
8121 if (gimple_code (use_stmt) == GIMPLE_PHI)
8122 break;
8124 /* When pattern recognition is involved, a statement whose definition is
8125 consumed in some pattern may not be included in the final replacement
8126 pattern statements, and so would be skipped when building the SLP graph.
8128 * Original
8129 char a_c = *(char *) a;
8130 char b_c = *(char *) b;
8131 unsigned short a_s = (unsigned short) a_c;
8132 int a_i = (int) a_s;
8133 int b_i = (int) b_c;
8134 int r_i = a_i - b_i;
8136 * After pattern replacement
8137 a_s = (unsigned short) a_c;
8138 a_i = (int) a_s;
8140 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
8141 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
8143 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
8144 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
8146 The definitions of a_i (original statement) and b_i (pattern statement)
8147 are related to, but not actually part of, the widen_minus pattern.
8148 Vectorizing the pattern does not cause these definition statements to
8149 be marked as PURE_SLP. For this case, we need to recursively check
8150 whether their uses are all absorbed into vectorized code. But there
8151 is an exception: some use may participate in a vectorized
8152 operation via an external SLP node containing that use as an element.
8153 The parameter "scalar_use_map" tags such SSA names as having a scalar
8154 use in advance. */
8155 tree lhs = gimple_get_lhs (use_stmt);
8157 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
8158 break;
8160 if (depth_limit && depth >= depth_limit)
8161 return -1;
8163 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
8164 depth + 1)))
8165 break;
8168 if (end_imm_use_stmt_p (&use_iter))
8169 scalar_use = 0;
8171 /* If recursion is limited, do not cache result for non-root defs. */
8172 if (!depth || scalar_use >= 0)
8174 bool added = scalar_use_map.put (def, scalar_use);
8175 gcc_assert (!added);
8178 return scalar_use;
8181 /* Mark lanes of NODE that are live outside of the basic-block vectorized
8182 region and that can be vectorized using vectorizable_live_operation
8183 with STMT_VINFO_LIVE_P. Live operations that are not handled will
8184 cause the scalar code computing them to be retained. */
8186 static void
8187 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
8188 slp_instance instance,
8189 stmt_vector_for_cost *cost_vec,
8190 hash_map<tree, int> &scalar_use_map,
8191 hash_set<stmt_vec_info> &svisited,
8192 hash_set<slp_tree> &visited)
8194 if (visited.add (node))
8195 return;
8197 unsigned i;
8198 stmt_vec_info stmt_info;
8199 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
8200 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8202 if (!stmt_info || svisited.contains (stmt_info))
8203 continue;
8204 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8205 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
8206 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
8207 /* Only the pattern root stmt computes the original scalar value. */
8208 continue;
8209 bool mark_visited = true;
8210 gimple *orig_stmt = orig_stmt_info->stmt;
8211 ssa_op_iter op_iter;
8212 def_operand_p def_p;
8213 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
8215 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
8216 scalar_use_map))
8218 STMT_VINFO_LIVE_P (stmt_info) = true;
8219 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
8220 instance, i, false, cost_vec))
8221 /* ??? So we know we can vectorize the live stmt from one SLP
8222 node. If we cannot do so from all or none consistently
8223 we'd have to record which SLP node (and lane) we want to
8224 use for the live operation. So make sure we can
8225 code-generate from all nodes. */
8226 mark_visited = false;
8227 else
8228 STMT_VINFO_LIVE_P (stmt_info) = false;
8231 /* We have to verify whether we can insert the lane extract
8232 before all uses. The following is a conservative approximation.
8233 We cannot put this into vectorizable_live_operation because
8234 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
8235 doesn't work.
8236 Note that while the fact that we emit code for loads at the
8237 first load should make this a non-problem, leaves we construct
8238 from scalars are vectorized after the last scalar def.
8239 ??? If we'd actually compute the insert location during
8240 analysis we could use sth less conservative than the last
8241 scalar stmt in the node for the dominance check. */
8242 /* ??? What remains is "live" uses in vector CTORs in the same
8243 SLP graph which is where those uses can end up code-generated
8244 right after their definition instead of close to their original
8245 use. But that would restrict us to code-generate lane-extracts
8246 from the latest stmt in a node. So we compensate for this
8247 during code-generation, simply not replacing uses for those
8248 hopefully rare cases. */
8249 imm_use_iterator use_iter;
8250 gimple *use_stmt;
8251 stmt_vec_info use_stmt_info;
8253 if (STMT_VINFO_LIVE_P (stmt_info))
8254 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
8255 if (!is_gimple_debug (use_stmt)
8256 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
8257 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8258 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
8260 if (dump_enabled_p ())
8261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8262 "Cannot determine insertion place for "
8263 "lane extract\n");
8264 STMT_VINFO_LIVE_P (stmt_info) = false;
8265 mark_visited = true;
8268 if (mark_visited)
8269 svisited.add (stmt_info);
8272 slp_tree child;
8273 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8274 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8275 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
8276 scalar_use_map, svisited, visited);
8279 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
8280 are live outside of the basic-block vectorized region and that can be
8281 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
8283 static void
8284 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
8286 if (bb_vinfo->slp_instances.is_empty ())
8287 return;
8289 hash_set<stmt_vec_info> svisited;
8290 hash_set<slp_tree> visited;
8291 hash_map<tree, int> scalar_use_map;
8292 auto_vec<slp_tree> worklist;
8294 for (slp_instance instance : bb_vinfo->slp_instances)
8296 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
8297 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
8298 if (TREE_CODE (op) == SSA_NAME)
8299 scalar_use_map.put (op, 1);
8300 if (!visited.add (SLP_INSTANCE_TREE (instance)))
8301 worklist.safe_push (SLP_INSTANCE_TREE (instance));
8306 slp_tree node = worklist.pop ();
8308 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
8310 for (tree op : SLP_TREE_SCALAR_OPS (node))
8311 if (TREE_CODE (op) == SSA_NAME)
8312 scalar_use_map.put (op, 1);
8314 else
8316 for (slp_tree child : SLP_TREE_CHILDREN (node))
8317 if (child && !visited.add (child))
8318 worklist.safe_push (child);
8321 while (!worklist.is_empty ());
8323 visited.empty ();
8325 for (slp_instance instance : bb_vinfo->slp_instances)
8327 vect_location = instance->location ();
8328 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
8329 instance, &instance->cost_vec,
8330 scalar_use_map, svisited, visited);
8334 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
8336 static bool
8337 vectorizable_bb_reduc_epilogue (slp_instance instance,
8338 stmt_vector_for_cost *cost_vec)
8340 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
8341 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
8342 if (reduc_code == MINUS_EXPR)
8343 reduc_code = PLUS_EXPR;
8344 internal_fn reduc_fn;
8345 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
8346 if (!vectype
8347 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
8348 || reduc_fn == IFN_LAST
8349 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
8350 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
8351 TREE_TYPE (vectype)))
8353 if (dump_enabled_p ())
8354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8355 "not vectorized: basic block reduction epilogue "
8356 "operation unsupported.\n");
8357 return false;
8360 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
8361 cost log2 vector operations plus shuffles and one extraction. */
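/* E.g. for a reduction over 8 lanes this records 3 vector_stmt, 3
   vec_perm and one vec_to_scalar cost (floor_log2 (8) == 3).  */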
8362 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
8363 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
8364 vectype, 0, vect_body);
8365 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
8366 vectype, 0, vect_body);
8367 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
8368 vectype, 0, vect_body);
8370 /* Since we replace all stmts of a possibly longer scalar reduction
8371 chain, account for the extra scalar stmts for that. */
8372 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
8373 instance->root_stmts[0], 0, vect_body);
8374 return true;
8377 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
8378 and recurse to children. */
8380 static void
8381 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
8382 hash_set<slp_tree> &visited)
8384 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
8385 || visited.add (node))
8386 return;
8388 stmt_vec_info stmt;
8389 unsigned i;
8390 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
8391 if (stmt)
8392 roots.remove (vect_orig_stmt (stmt));
8394 slp_tree child;
8395 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8396 if (child)
8397 vect_slp_prune_covered_roots (child, roots, visited);
8400 /* Analyze statements in SLP instances of VINFO. Return true if the
8401 operations are supported. */
8403 bool
8404 vect_slp_analyze_operations (vec_info *vinfo)
8406 slp_instance instance;
8407 int i;
8409 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
8411 hash_set<slp_tree> visited;
8412 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8414 auto_vec<slp_tree> visited_vec;
8415 stmt_vector_for_cost cost_vec;
8416 cost_vec.create (2);
8417 if (is_a <bb_vec_info> (vinfo))
8418 vect_location = instance->location ();
8419 if (!vect_slp_analyze_node_operations (vinfo,
8420 SLP_INSTANCE_TREE (instance),
8421 instance, visited, visited_vec,
8422 &cost_vec)
8423 /* CTOR instances require vectorized defs for the SLP tree root. */
8424 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
8425 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
8426 != vect_internal_def
8427 /* Make sure we vectorized with the expected type. */
8428 || !useless_type_conversion_p
8429 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
8430 (instance->root_stmts[0]->stmt))),
8431 TREE_TYPE (SLP_TREE_VECTYPE
8432 (SLP_INSTANCE_TREE (instance))))))
8433 /* Check we can vectorize the reduction. */
8434 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
8435 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
8436 /* Check we can vectorize the gcond. */
8437 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
8438 && !vectorizable_early_exit (vinfo,
8439 SLP_INSTANCE_ROOT_STMTS (instance)[0],
8440 NULL, NULL,
8441 SLP_INSTANCE_TREE (instance),
8442 &cost_vec)))
8444 cost_vec.release ();
8445 slp_tree node = SLP_INSTANCE_TREE (instance);
8446 stmt_vec_info stmt_info;
8447 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8448 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8449 else
8450 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8451 if (is_a <loop_vec_info> (vinfo))
8453 if (dump_enabled_p ())
8454 dump_printf_loc (MSG_NOTE, vect_location,
8455 "unsupported SLP instance starting from: %G",
8456 stmt_info->stmt);
8457 return false;
8459 if (dump_enabled_p ())
8460 dump_printf_loc (MSG_NOTE, vect_location,
8461 "removing SLP instance operations starting from: %G",
8462 stmt_info->stmt);
8463 vect_free_slp_instance (instance);
8464 vinfo->slp_instances.ordered_remove (i);
8465 while (!visited_vec.is_empty ())
8466 visited.remove (visited_vec.pop ());
8468 else
8470 i++;
8471 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
8473 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
8474 cost_vec.release ();
8476 else
8477 /* For BB vectorization remember the SLP graph entry
8478 cost for later. */
8479 instance->cost_vec = cost_vec;
8483 /* Now look for SLP instances with a root that are covered by other
8484 instances and remove them. */
8485 hash_set<stmt_vec_info> roots;
8486 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8487 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8488 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
8489 if (!roots.is_empty ())
8491 visited.empty ();
8492 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8493 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
8494 visited);
8495 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8496 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
8497 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
8499 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8500 if (dump_enabled_p ())
8501 dump_printf_loc (MSG_NOTE, vect_location,
8502 "removing SLP instance operations starting "
8503 "from: %G", root->stmt);
8504 vect_free_slp_instance (instance);
8505 vinfo->slp_instances.ordered_remove (i);
8507 else
8508 ++i;
8511 /* Compute vectorizable live stmts. */
8512 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
8513 vect_bb_slp_mark_live_stmts (bb_vinfo);
8515 return !vinfo->slp_instances.is_empty ();
8518 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
8519 transitively compressing any chain of leaders along the way. */
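/* This is essentially union-find with path compression: follow the
   leader chain to its end and rewrite every entry visited on the way to
   point directly at the ultimate leader.  */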
8521 static slp_instance
8522 get_ultimate_leader (slp_instance instance,
8523 hash_map<slp_instance, slp_instance> &instance_leader)
8525 auto_vec<slp_instance *, 8> chain;
8526 slp_instance *tem;
8527 while (*(tem = instance_leader.get (instance)) != instance)
8529 chain.safe_push (tem);
8530 instance = *tem;
8532 while (!chain.is_empty ())
8533 *chain.pop () = instance;
8534 return instance;
8537 namespace {
8538 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
8539 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
8540 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
8542 INSTANCE_LEADER is as for get_ultimate_leader. */
8544 template<typename T>
8545 bool
8546 vect_map_to_instance (slp_instance instance, T key,
8547 hash_map<T, slp_instance> &key_to_instance,
8548 hash_map<slp_instance, slp_instance> &instance_leader)
8550 bool existed_p;
8551 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
8552 if (!existed_p)
8554 else if (key_instance != instance)
8556 /* If we're running into a previously marked key make us the
8557 leader of the current ultimate leader. This keeps the
8558 leader chain acyclic and works even when the current instance
8559 connects two previously independent graph parts. */
8560 slp_instance key_leader
8561 = get_ultimate_leader (key_instance, instance_leader);
8562 if (key_leader != instance)
8563 instance_leader.put (key_leader, instance);
8565 key_instance = instance;
8566 return existed_p;
8570 /* Worker of vect_bb_partition_graph, recurse on NODE. */
8572 static void
8573 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8574 slp_instance instance, slp_tree node,
8575 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8576 hash_map<slp_tree, slp_instance> &node_to_instance,
8577 hash_map<slp_instance, slp_instance> &instance_leader)
8579 stmt_vec_info stmt_info;
8580 unsigned i;
8582 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8583 if (stmt_info)
8584 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8585 instance_leader);
8587 if (vect_map_to_instance (instance, node, node_to_instance,
8588 instance_leader))
8589 return;
8591 slp_tree child;
8592 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8593 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8594 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8595 node_to_instance, instance_leader);
8598 /* Partition the SLP graph into pieces that can be costed independently. */
8600 static void
8601 vect_bb_partition_graph (bb_vec_info bb_vinfo)
8603 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8605 /* First walk the SLP graph assigning each involved scalar stmt a
8606 corresponding SLP graph entry and upon visiting a previously
8607 marked stmt, make the stmt's leader the current SLP graph entry. */
8608 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8609 hash_map<slp_tree, slp_instance> node_to_instance;
8610 hash_map<slp_instance, slp_instance> instance_leader;
8611 slp_instance instance;
8612 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8614 instance_leader.put (instance, instance);
8615 vect_bb_partition_graph_r (bb_vinfo,
8616 instance, SLP_INSTANCE_TREE (instance),
8617 stmt_to_instance, node_to_instance,
8618 instance_leader);
8621 /* Then collect entries to each independent subgraph. */
8622 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8624 slp_instance leader = get_ultimate_leader (instance, instance_leader);
8625 leader->subgraph_entries.safe_push (instance);
8626 if (dump_enabled_p ()
8627 && leader != instance)
8628 dump_printf_loc (MSG_NOTE, vect_location,
8629 "instance %p is leader of %p\n",
8630 (void *) leader, (void *) instance);
8634 /* Compute the set of scalar stmts participating in internal and external
8635 nodes. */
8637 static void
8638 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8639 hash_set<slp_tree> &visited,
8640 hash_set<stmt_vec_info> &vstmts,
8641 hash_set<stmt_vec_info> &estmts)
8643 int i;
8644 stmt_vec_info stmt_info;
8645 slp_tree child;
8647 if (visited.add (node))
8648 return;
8650 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8652 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8653 if (stmt_info)
8654 vstmts.add (stmt_info);
8656 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8657 if (child)
8658 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8659 vstmts, estmts);
8661 else
8662 for (tree def : SLP_TREE_SCALAR_OPS (node))
8664 stmt_vec_info def_stmt = vinfo->lookup_def (def);
8665 if (def_stmt)
8666 estmts.add (def_stmt);
8671 /* Compute the scalar cost of the SLP node NODE and its children
8672 and record it in COST_VEC. Do not account defs that are marked in LIFE and
8673 update LIFE according to uses of NODE. */
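/* A set LIFE[i] means the scalar stmt for lane i has to be kept live
   anyway, so neither it nor the defs it requires from the SLP children
   are counted towards the scalar cost.  */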
8675 static void
8676 vect_bb_slp_scalar_cost (vec_info *vinfo,
8677 slp_tree node, vec<bool, va_heap> *life,
8678 stmt_vector_for_cost *cost_vec,
8679 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8680 hash_set<stmt_vec_info> &scalar_stmts_in_externs,
8681 hash_set<slp_tree> &visited)
8683 unsigned i;
8684 stmt_vec_info stmt_info;
8685 slp_tree child;
8687 if (visited.add (node))
8688 return;
8690 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8692 ssa_op_iter op_iter;
8693 def_operand_p def_p;
8695 if (!stmt_info
8696 || (*life)[i]
8697 /* Defs also used in external nodes are not in the
8698 vectorized_scalar_stmts set as they need to be preserved.
8699 Honor that. */
8700 || scalar_stmts_in_externs.contains (stmt_info))
8701 continue;
8703 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8704 gimple *orig_stmt = orig_stmt_info->stmt;
8706 /* If there is a non-vectorized use of the defs then the scalar
8707 stmt is kept live in which case we do not account it or any
8708 required defs in the SLP children in the scalar cost. This
8709 way we make the vectorization more costly when compared to
8710 the scalar cost. */
8711 if (!STMT_VINFO_LIVE_P (stmt_info))
8713 auto_vec<gimple *, 8> worklist;
8714 hash_set<gimple *> *worklist_visited = NULL;
8715 worklist.quick_push (orig_stmt);
8718 gimple *work_stmt = worklist.pop ();
8719 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8721 imm_use_iterator use_iter;
8722 gimple *use_stmt;
8723 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8724 DEF_FROM_PTR (def_p))
8725 if (!is_gimple_debug (use_stmt))
8727 stmt_vec_info use_stmt_info
8728 = vinfo->lookup_stmt (use_stmt);
8729 if (!use_stmt_info
8730 || !vectorized_scalar_stmts.contains (use_stmt_info))
8732 if (use_stmt_info
8733 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
8735 /* For stmts participating in patterns we have
8736 to check their uses recursively.
8737 if (!worklist_visited)
8738 worklist_visited = new hash_set<gimple *> ();
8739 if (!worklist_visited->add (use_stmt))
8740 worklist.safe_push (use_stmt);
8741 continue;
8743 (*life)[i] = true;
8744 goto next_lane;
8749 while (!worklist.is_empty ());
8750 next_lane:
8751 if (worklist_visited)
8752 delete worklist_visited;
8753 if ((*life)[i])
8754 continue;
8757 /* Count scalar stmts only once. */
8758 if (gimple_visited_p (orig_stmt))
8759 continue;
8760 gimple_set_visited (orig_stmt, true);
8762 vect_cost_for_stmt kind;
8763 if (STMT_VINFO_DATA_REF (orig_stmt_info))
8765 data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8766 tree base = get_base_address (DR_REF (dr));
8767 /* When the scalar access is to a non-global, not address-taken
8768 decl that is not BLKmode, assume we can access it with a single
8769 non-load/store instruction. */
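/* E.g. (illustrative) a read from a local, not address-taken 'int' variable
will likely end up in a register, so it is costed as a plain scalar_stmt
rather than as a scalar_load.  */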
8770 if (DECL_P (base)
8771 && !is_global_var (base)
8772 && !TREE_ADDRESSABLE (base)
8773 && DECL_MODE (base) != BLKmode)
8774 kind = scalar_stmt;
8775 else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8776 kind = scalar_load;
8777 else
8778 kind = scalar_store;
8780 else if (vect_nop_conversion_p (orig_stmt_info))
8781 continue;
8782 /* For single-argument PHIs assume coalescing which means zero cost
8783 for the scalar and the vector PHIs. This avoids artificially
8784 favoring the vector path (but may pessimize it in some cases). */
8785 else if (is_a <gphi *> (orig_stmt_info->stmt)
8786 && gimple_phi_num_args
8787 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8788 continue;
8789 else
8790 kind = scalar_stmt;
8791 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8792 SLP_TREE_VECTYPE (node), 0, vect_body);
8795 auto_vec<bool, 20> subtree_life;
8796 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8798 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8800 /* Do not directly pass LIFE to the recursive call, copy it to
8801 confine changes in the callee to the current child/subtree. */
8802 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8804 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8805 for (unsigned j = 0;
8806 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8808 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8809 if (perm.first == i)
8810 subtree_life[perm.second] = (*life)[j];
8813 else
8815 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8816 subtree_life.safe_splice (*life);
8818 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8819 vectorized_scalar_stmts,
8820 scalar_stmts_in_externs, visited);
8821 subtree_life.truncate (0);
8826 /* Comparator for the loop-index sorted cost vectors. */
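/* The comparator only looks at the loop number, so after sorting all cost
entries belonging to the same loop are contiguous and can be fed to a
per-loop cost model instance below.  */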
8828 static int
8829 li_cost_vec_cmp (const void *a_, const void *b_)
8831 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8832 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8833 if (a->first < b->first)
8834 return -1;
8835 else if (a->first == b->first)
8836 return 0;
8837 return 1;
8840 /* Check if vectorization of the basic block is profitable for the
8841 subgraph denoted by SLP_INSTANCES. */
8843 static bool
8844 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8845 vec<slp_instance> slp_instances,
8846 loop_p orig_loop)
8848 slp_instance instance;
8849 int i;
8850 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8851 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8853 if (dump_enabled_p ())
8855 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8856 hash_set<slp_tree> visited;
8857 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8858 vect_print_slp_graph (MSG_NOTE, vect_location,
8859 SLP_INSTANCE_TREE (instance), visited);
8862 /* Compute the set of scalar stmts we know will go away 'locally' when
8863 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8864 not accurate for nodes promoted extern late or for scalar stmts that
8865 are used both in extern defs and in vectorized defs. */
8866 hash_set<stmt_vec_info> vectorized_scalar_stmts;
8867 hash_set<stmt_vec_info> scalar_stmts_in_externs;
8868 hash_set<slp_tree> visited;
8869 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8871 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8872 SLP_INSTANCE_TREE (instance),
8873 visited,
8874 vectorized_scalar_stmts,
8875 scalar_stmts_in_externs);
8876 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8877 vectorized_scalar_stmts.add (rstmt);
8879 /* Scalar stmts used as defs in external nodes need to be preserved, so
8880 remove them from vectorized_scalar_stmts. */
8881 for (stmt_vec_info stmt : scalar_stmts_in_externs)
8882 vectorized_scalar_stmts.remove (stmt);
8884 /* Calculate scalar cost and sum the cost for the vector stmts
8885 previously collected. */
8886 stmt_vector_for_cost scalar_costs = vNULL;
8887 stmt_vector_for_cost vector_costs = vNULL;
8888 visited.empty ();
8889 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8891 auto_vec<bool, 20> life;
8892 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8893 true);
8894 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8895 record_stmt_cost (&scalar_costs,
8896 SLP_INSTANCE_ROOT_STMTS (instance).length (),
8897 scalar_stmt,
8898 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8899 vect_bb_slp_scalar_cost (bb_vinfo,
8900 SLP_INSTANCE_TREE (instance),
8901 &life, &scalar_costs, vectorized_scalar_stmts,
8902 scalar_stmts_in_externs, visited);
8903 vector_costs.safe_splice (instance->cost_vec);
8904 instance->cost_vec.release ();
8907 if (dump_enabled_p ())
8908 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
8910 /* When costing non-loop vectorization we need to consider each covered
8911 loop independently and make sure vectorization is profitable. For
8912 now we assume a loop may not be entered or may be executed an arbitrary
8913 number of iterations (??? static information can provide more
8914 precise info here), which means we can simply cost each containing
8915 loop's stmts separately. */
8917 /* First produce cost vectors sorted by loop index. */
8918 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8919 li_scalar_costs (scalar_costs.length ());
8920 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8921 li_vector_costs (vector_costs.length ());
8922 stmt_info_for_cost *cost;
8923 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8925 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8926 li_scalar_costs.quick_push (std::make_pair (l, cost));
8928 /* Use a random used loop as fallback in case the first vector_costs
8929 entry does not have a stmt_info associated with it. */
8930 unsigned l = li_scalar_costs[0].first;
8931 FOR_EACH_VEC_ELT (vector_costs, i, cost)
8933 /* We inherit the loop number from the previous COST; invariants, externals
8934 and extracts immediately follow the cost for the related stmt. */
8935 if (cost->stmt_info)
8936 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8937 li_vector_costs.quick_push (std::make_pair (l, cost));
8939 li_scalar_costs.qsort (li_cost_vec_cmp);
8940 li_vector_costs.qsort (li_cost_vec_cmp);
8942 /* Now cost the portions individually. */
8943 unsigned vi = 0;
8944 unsigned si = 0;
8945 bool profitable = true;
8946 while (si < li_scalar_costs.length ()
8947 && vi < li_vector_costs.length ())
8949 unsigned sl = li_scalar_costs[si].first;
8950 unsigned vl = li_vector_costs[vi].first;
8951 if (sl != vl)
8953 if (dump_enabled_p ())
8954 dump_printf_loc (MSG_NOTE, vect_location,
8955 "Scalar %d and vector %d loop part do not "
8956 "match up, skipping scalar part\n", sl, vl);
8957 /* Skip the scalar part, assuming zero cost on the vector side. */
8960 si++;
8962 while (si < li_scalar_costs.length ()
8963 && li_scalar_costs[si].first == sl);
8964 continue;
8967 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8970 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
8971 si++;
8973 while (si < li_scalar_costs.length ()
8974 && li_scalar_costs[si].first == sl);
8975 scalar_target_cost_data->finish_cost (nullptr);
8976 scalar_cost = scalar_target_cost_data->body_cost ();
8978 /* Complete the target-specific vector cost calculation. */
8979 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
8982 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
8983 vi++;
8985 while (vi < li_vector_costs.length ()
8986 && li_vector_costs[vi].first == vl);
8987 vect_target_cost_data->finish_cost (scalar_target_cost_data);
8988 vec_prologue_cost = vect_target_cost_data->prologue_cost ();
8989 vec_inside_cost = vect_target_cost_data->body_cost ();
8990 vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
8991 delete scalar_target_cost_data;
8992 delete vect_target_cost_data;
8994 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
8996 if (dump_enabled_p ())
8998 dump_printf_loc (MSG_NOTE, vect_location,
8999 "Cost model analysis for part in loop %d:\n", sl);
9000 dump_printf (MSG_NOTE, " Vector cost: %d\n",
9001 vec_inside_cost + vec_outside_cost);
9002 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9005 /* Vectorization is profitable if its cost is not more than the cost of the
9006 scalar version.  Note that we err on the vector side for equal cost because
9007 the cost estimate is otherwise quite pessimistic (constant uses are
9008 free on the scalar side but cost a load on the vector side for
9009 example). */
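/* E.g. (illustrative) with scalar_cost == 10, vec_inside_cost == 10 and
vec_outside_cost == 0 the subgraph is still considered profitable.  */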
9010 if (vec_outside_cost + vec_inside_cost > scalar_cost)
9012 profitable = false;
9013 break;
9016 if (profitable && vi < li_vector_costs.length ())
9018 if (dump_enabled_p ())
9019 dump_printf_loc (MSG_NOTE, vect_location,
9020 "Excess vector cost for part in loop %d:\n",
9021 li_vector_costs[vi].first);
9022 profitable = false;
9025 /* Unset visited flag. This is delayed when the subgraph is profitable
9026 and we process the loop for remaining unvectorized if-converted code. */
9027 if (!orig_loop || !profitable)
9028 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9029 gimple_set_visited (cost->stmt_info->stmt, false);
9031 scalar_costs.release ();
9032 vector_costs.release ();
9034 return profitable;
9037 /* qsort comparator for lane defs. */
9039 static int
9040 vld_cmp (const void *a_, const void *b_)
9042 auto *a = (const std::pair<unsigned, tree> *)a_;
9043 auto *b = (const std::pair<unsigned, tree> *)b_;
9044 return a->first - b->first;
9047 /* Return true if USE_STMT is a vector lane insert into VEC and set
9048 *THIS_LANE to the lane number that is set. */
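/* For example (a sketch, SSA names illustrative):
     vec_3 = BIT_INSERT_EXPR <vec_2, val_7, 64>;
   with 32-bit vector elements inserts val_7 at bit offset 64, i.e. lane 2,
   so *THIS_LANE is set to 2.  */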
9050 static bool
9051 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9053 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9054 if (!use_ass
9055 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9056 || (vec
9057 ? gimple_assign_rhs1 (use_ass) != vec
9058 : ((vec = gimple_assign_rhs1 (use_ass)), false))
9059 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9060 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9061 || !constant_multiple_p
9062 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9063 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9064 this_lane))
9065 return false;
9066 return true;
9069 /* Find any vectorizable constructors, lane-insert chains and reduction
9070 chains and record them as SLP graph roots in BB_VINFO. */
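/* For example (illustrative), a vector CONSTRUCTOR such as
     x_5 = {a_1, b_2, c_3, d_4};
   where every element is an SSA def inside the region is recorded as an
   slp_inst_kind_ctor root seeded with the four defining stmts.  */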
9072 static void
9073 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9075 for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9076 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9077 !gsi_end_p (gsi); gsi_next (&gsi))
9079 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9080 /* This can be used to start SLP discovery for BB early breaks
9081 when we get that far. */
9082 if (!assign)
9083 continue;
9085 tree rhs = gimple_assign_rhs1 (assign);
9086 enum tree_code code = gimple_assign_rhs_code (assign);
9087 use_operand_p use_p;
9088 gimple *use_stmt;
9089 if (code == CONSTRUCTOR)
9091 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9092 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9093 CONSTRUCTOR_NELTS (rhs))
9094 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9095 || uniform_vector_p (rhs))
9096 continue;
9098 unsigned j;
9099 tree val;
9100 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9101 if (TREE_CODE (val) != SSA_NAME
9102 || !bb_vinfo->lookup_def (val))
9103 break;
9104 if (j != CONSTRUCTOR_NELTS (rhs))
9105 continue;
9107 vec<stmt_vec_info> roots = vNULL;
9108 roots.safe_push (bb_vinfo->lookup_stmt (assign));
9109 vec<stmt_vec_info> stmts;
9110 stmts.create (CONSTRUCTOR_NELTS (rhs));
9111 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9112 stmts.quick_push
9113 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9114 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9115 stmts, roots));
9117 else if (code == BIT_INSERT_EXPR
9118 && VECTOR_TYPE_P (TREE_TYPE (rhs))
9119 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9120 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9121 && integer_zerop (gimple_assign_rhs3 (assign))
9122 && useless_type_conversion_p
9123 (TREE_TYPE (TREE_TYPE (rhs)),
9124 TREE_TYPE (gimple_assign_rhs2 (assign)))
9125 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9127 /* We start matching at an insert to lane zero, but since the
9128 inserts need not be ordered we have to search both
9129 the def and the use chains. */
9130 tree vectype = TREE_TYPE (rhs);
9131 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9132 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9133 auto_sbitmap lanes (nlanes);
9134 bitmap_clear (lanes);
9135 bitmap_set_bit (lanes, 0);
9136 tree def = gimple_assign_lhs (assign);
9137 lane_defs.quick_push
9138 (std::make_pair (0, gimple_assign_rhs2 (assign)));
9139 unsigned lanes_found = 1;
9140 /* Start with the use chains, the last stmt will be the root. */
9141 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9142 vec<stmt_vec_info> roots = vNULL;
9143 roots.safe_push (last);
9146 use_operand_p use_p;
9147 gimple *use_stmt;
9148 if (!single_imm_use (def, &use_p, &use_stmt))
9149 break;
9150 unsigned this_lane;
9151 if (!bb_vinfo->lookup_stmt (use_stmt)
9152 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9153 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9154 break;
9155 if (bitmap_bit_p (lanes, this_lane))
9156 break;
9157 lanes_found++;
9158 bitmap_set_bit (lanes, this_lane);
9159 gassign *use_ass = as_a <gassign *> (use_stmt);
9160 lane_defs.quick_push (std::make_pair
9161 (this_lane, gimple_assign_rhs2 (use_ass)));
9162 last = bb_vinfo->lookup_stmt (use_ass);
9163 roots.safe_push (last);
9164 def = gimple_assign_lhs (use_ass);
9166 while (lanes_found < nlanes);
9167 if (roots.length () > 1)
9168 std::swap(roots[0], roots[roots.length () - 1]);
9169 if (lanes_found < nlanes)
9171 /* Now search the def chain. */
9172 def = gimple_assign_rhs1 (assign);
9175 if (TREE_CODE (def) != SSA_NAME
9176 || !has_single_use (def))
9177 break;
9178 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9179 unsigned this_lane;
9180 if (!bb_vinfo->lookup_stmt (def_stmt)
9181 || !vect_slp_is_lane_insert (def_stmt,
9182 NULL_TREE, &this_lane)
9183 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9184 break;
9185 if (bitmap_bit_p (lanes, this_lane))
9186 break;
9187 lanes_found++;
9188 bitmap_set_bit (lanes, this_lane);
9189 lane_defs.quick_push (std::make_pair
9190 (this_lane,
9191 gimple_assign_rhs2 (def_stmt)));
9192 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9193 def = gimple_assign_rhs1 (def_stmt);
9195 while (lanes_found < nlanes);
9197 if (lanes_found == nlanes)
9199 /* Sort lane_defs by the lane index and register the root. */
9200 lane_defs.qsort (vld_cmp);
9201 vec<stmt_vec_info> stmts;
9202 stmts.create (nlanes);
9203 for (unsigned i = 0; i < nlanes; ++i)
9204 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9205 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9206 stmts, roots));
9208 else
9209 roots.release ();
9211 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9212 && (associative_tree_code (code) || code == MINUS_EXPR)
9213 /* ??? This pessimizes a two-element reduction. PR54400.
9214 ??? In-order reduction could be handled if we only
9215 traverse one operand chain in vect_slp_linearize_chain. */
9216 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9217 /* Ops with constants at the tail can be stripped here. */
9218 && TREE_CODE (rhs) == SSA_NAME
9219 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9220 /* Should be the chain end. */
9221 && (!single_imm_use (gimple_assign_lhs (assign),
9222 &use_p, &use_stmt)
9223 || !is_gimple_assign (use_stmt)
9224 || (gimple_assign_rhs_code (use_stmt) != code
9225 && ((code != PLUS_EXPR && code != MINUS_EXPR)
9226 || (gimple_assign_rhs_code (use_stmt)
9227 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9229 /* We start the match at the end of a possible association
9230 chain. */
9231 auto_vec<chain_op_t> chain;
9232 auto_vec<std::pair<tree_code, gimple *> > worklist;
9233 auto_vec<gimple *> chain_stmts;
9234 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9235 if (code == MINUS_EXPR)
9236 code = PLUS_EXPR;
9237 internal_fn reduc_fn;
9238 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9239 || reduc_fn == IFN_LAST)
9240 continue;
9241 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9242 /* ??? */
9243 code_stmt, alt_code_stmt, &chain_stmts);
9244 if (chain.length () > 1)
9246 /* Sort the chain according to def_type and operation. */
9247 chain.sort (dt_sort_cmp, bb_vinfo);
9248 /* ??? Now we'd want to strip externals and constants
9249 but record those to be handled in the epilogue. */
9250 /* ??? For now do not allow mixing ops or externs/constants. */
9251 bool invalid = false;
9252 unsigned remain_cnt = 0;
9253 unsigned last_idx = 0;
9254 for (unsigned i = 0; i < chain.length (); ++i)
9256 if (chain[i].code != code)
9258 invalid = true;
9259 break;
9261 if (chain[i].dt != vect_internal_def
9262 /* Avoid stmts where the def is not the LHS, like
9263 ASMs. */
9264 || (gimple_get_lhs (bb_vinfo->lookup_def
9265 (chain[i].op)->stmt)
9266 != chain[i].op))
9267 remain_cnt++;
9268 else
9269 last_idx = i;
9271 /* Make sure to have an even number of lanes as we later do
9272 all-or-nothing discovery, not trying to split further. */
9273 if ((chain.length () - remain_cnt) & 1)
9274 remain_cnt++;
9275 if (!invalid && chain.length () - remain_cnt > 1)
9277 vec<stmt_vec_info> stmts;
9278 vec<tree> remain = vNULL;
9279 stmts.create (chain.length ());
9280 if (remain_cnt > 0)
9281 remain.create (remain_cnt);
9282 for (unsigned i = 0; i < chain.length (); ++i)
9284 stmt_vec_info stmt_info;
9285 if (chain[i].dt == vect_internal_def
9286 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
9287 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
9288 && (i != last_idx
9289 || (stmts.length () & 1)))
9290 stmts.quick_push (stmt_info);
9291 else
9292 remain.quick_push (chain[i].op);
9294 vec<stmt_vec_info> roots;
9295 roots.create (chain_stmts.length ());
9296 for (unsigned i = 0; i < chain_stmts.length (); ++i)
9297 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
9298 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
9299 stmts, roots, remain));
9306 /* Walk the grouped store chains and replace entries with their
9307 pattern variant if any. */
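/* E.g. (illustrative) if the first store of a group was replaced by a
   pattern stmt, the DR_GROUP_* fields still live on the original stmt;
   the walk below moves first/next element, size and gap over to the
   pattern stmts so later group walks see a consistent chain.  */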
9309 static void
9310 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
9312 stmt_vec_info first_element;
9313 unsigned i;
9315 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
9317 /* We also have CTORs in this array. */
9318 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
9319 continue;
9320 if (STMT_VINFO_IN_PATTERN_P (first_element))
9322 stmt_vec_info orig = first_element;
9323 first_element = STMT_VINFO_RELATED_STMT (first_element);
9324 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
9325 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
9326 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
9327 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
9328 vinfo->grouped_stores[i] = first_element;
9330 stmt_vec_info prev = first_element;
9331 while (DR_GROUP_NEXT_ELEMENT (prev))
9333 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
9334 if (STMT_VINFO_IN_PATTERN_P (elt))
9336 stmt_vec_info orig = elt;
9337 elt = STMT_VINFO_RELATED_STMT (elt);
9338 DR_GROUP_NEXT_ELEMENT (prev) = elt;
9339 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
9340 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
9342 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
9343 prev = elt;
9348 /* Check if the region described by BB_VINFO can be vectorized, returning
9349 true if so. When returning false, set FATAL to true if the same failure
9350 would prevent vectorization at other vector sizes, false if it is still
9351 worth trying other sizes. N_STMTS is the number of statements in the
9352 region. */
9354 static bool
9355 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
9356 vec<int> *dataref_groups)
9358 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
9360 slp_instance instance;
9361 int i;
9362 poly_uint64 min_vf = 2;
9364 /* The first group of checks is independent of the vector size. */
9365 fatal = true;
9367 /* Analyze the data references. */
9369 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
9371 if (dump_enabled_p ())
9372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9373 "not vectorized: unhandled data-ref in basic "
9374 "block.\n");
9375 return false;
9378 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
9380 if (dump_enabled_p ())
9381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9382 "not vectorized: unhandled data access in "
9383 "basic block.\n");
9384 return false;
9387 vect_slp_check_for_roots (bb_vinfo);
9389 /* If there are no grouped stores and no constructors in the region
9390 there is no need to continue with pattern recog as vect_analyze_slp
9391 will fail anyway. */
9392 if (bb_vinfo->grouped_stores.is_empty ()
9393 && bb_vinfo->roots.is_empty ())
9395 if (dump_enabled_p ())
9396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9397 "not vectorized: no grouped stores in "
9398 "basic block.\n");
9399 return false;
9402 /* The rest of the analysis below depends on the chosen vector size in some way, so from here on a failure is not necessarily fatal. */
9403 fatal = false;
9405 vect_pattern_recog (bb_vinfo);
9407 /* Update store groups from pattern processing. */
9408 vect_fixup_store_groups_with_patterns (bb_vinfo);
9410 /* Check the SLP opportunities in the basic block, analyze and build SLP
9411 trees. */
9412 if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
9414 if (dump_enabled_p ())
9416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9417 "Failed to SLP the basic block.\n");
9418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9419 "not vectorized: failed to find SLP opportunities "
9420 "in basic block.\n");
9422 return false;
9425 /* Optimize permutations. */
9426 vect_optimize_slp (bb_vinfo);
9428 /* Gather the loads reachable from the SLP graph entries. */
9429 vect_gather_slp_loads (bb_vinfo);
9431 vect_record_base_alignments (bb_vinfo);
9433 /* Analyze and verify the alignment of data references and the
9434 dependence in the SLP instances. */
9435 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
9437 vect_location = instance->location ();
9438 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
9439 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
9441 slp_tree node = SLP_INSTANCE_TREE (instance);
9442 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9443 if (dump_enabled_p ())
9444 dump_printf_loc (MSG_NOTE, vect_location,
9445 "removing SLP instance operations starting from: %G",
9446 stmt_info->stmt);
9447 vect_free_slp_instance (instance);
9448 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
9449 continue;
9452 /* Mark all the statements that we want to vectorize as pure SLP and
9453 relevant. */
9454 vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
9455 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
9456 unsigned j;
9457 stmt_vec_info root;
9458 /* Likewise consider instance root stmts as vectorized. */
9459 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
9460 STMT_SLP_TYPE (root) = pure_slp;
9462 i++;
9464 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
9465 return false;
9467 if (!vect_slp_analyze_operations (bb_vinfo))
9469 if (dump_enabled_p ())
9470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9471 "not vectorized: bad operation in basic block.\n");
9472 return false;
9475 vect_bb_partition_graph (bb_vinfo);
9477 return true;
9480 /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
9481 basic blocks in BBS, returning true on success.
9482 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
9484 static bool
9485 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
9486 vec<int> *dataref_groups, unsigned int n_stmts,
9487 loop_p orig_loop)
9489 bb_vec_info bb_vinfo;
9490 auto_vector_modes vector_modes;
9492 /* Autodetect the first vector mode we try. */
9493 machine_mode next_vector_mode = VOIDmode;
9494 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
9495 unsigned int mode_i = 0;
9497 vec_info_shared shared;
9499 machine_mode autodetected_vector_mode = VOIDmode;
9500 while (1)
9502 bool vectorized = false;
9503 bool fatal = false;
9504 bb_vinfo = new _bb_vec_info (bbs, &shared);
9506 bool first_time_p = shared.datarefs.is_empty ();
9507 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
9508 if (first_time_p)
9509 bb_vinfo->shared->save_datarefs ();
9510 else
9511 bb_vinfo->shared->check_datarefs ();
9512 bb_vinfo->vector_mode = next_vector_mode;
9514 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
9516 if (dump_enabled_p ())
9518 dump_printf_loc (MSG_NOTE, vect_location,
9519 "***** Analysis succeeded with vector mode"
9520 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
9521 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
9524 bb_vinfo->shared->check_datarefs ();
9526 bool force_clear = false;
9527 auto_vec<slp_instance> profitable_subgraphs;
9528 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
9530 if (instance->subgraph_entries.is_empty ())
9531 continue;
9533 dump_user_location_t saved_vect_location = vect_location;
9534 vect_location = instance->location ();
9535 if (!unlimited_cost_model (NULL)
9536 && !vect_bb_vectorization_profitable_p
9537 (bb_vinfo, instance->subgraph_entries, orig_loop))
9539 if (dump_enabled_p ())
9540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9541 "not vectorized: vectorization is not "
9542 "profitable.\n");
9543 vect_location = saved_vect_location;
9544 continue;
9547 vect_location = saved_vect_location;
9548 if (!dbg_cnt (vect_slp))
9550 force_clear = true;
9551 continue;
9554 profitable_subgraphs.safe_push (instance);
9557 /* When we're vectorizing an if-converted loop body make sure
9558 we vectorized all if-converted code. */
9559 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
9561 gcc_assert (bb_vinfo->nbbs == 1);
9562 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
9563 !gsi_end_p (gsi); gsi_next (&gsi))
9565 /* The costing above left us with DCEable vectorized scalar
9566 stmts having the visited flag set on profitable
9567 subgraphs. Do the delayed clearing of the flag here. */
9568 if (gimple_visited_p (gsi_stmt (gsi)))
9570 gimple_set_visited (gsi_stmt (gsi), false);
9571 continue;
9573 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9574 continue;
9576 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9577 if (gimple_assign_rhs_code (ass) == COND_EXPR)
9579 if (!profitable_subgraphs.is_empty ()
9580 && dump_enabled_p ())
9581 dump_printf_loc (MSG_NOTE, vect_location,
9582 "not profitable because of "
9583 "unprofitable if-converted scalar "
9584 "code\n");
9585 profitable_subgraphs.truncate (0);
9590 /* Finally schedule the profitable subgraphs. */
9591 for (slp_instance instance : profitable_subgraphs)
9593 if (!vectorized && dump_enabled_p ())
9594 dump_printf_loc (MSG_NOTE, vect_location,
9595 "Basic block will be vectorized "
9596 "using SLP\n");
9597 vectorized = true;
9599 /* Dump before scheduling as store vectorization will remove
9600 the original stores and mess with the instance tree
9601 so querying its location will eventually ICE. */
9602 if (flag_checking)
9603 for (slp_instance sub : instance->subgraph_entries)
9604 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9605 unsigned HOST_WIDE_INT bytes;
9606 if (dump_enabled_p ())
9607 for (slp_instance sub : instance->subgraph_entries)
9609 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9610 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9611 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9612 sub->location (),
9613 "basic block part vectorized using %wu "
9614 "byte vectors\n", bytes);
9615 else
9616 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9617 sub->location (),
9618 "basic block part vectorized using "
9619 "variable length vectors\n");
9622 dump_user_location_t saved_vect_location = vect_location;
9623 vect_location = instance->location ();
9625 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9627 vect_location = saved_vect_location;
9631 /* Generate the invariant statements. */
9632 if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
9634 if (dump_enabled_p ())
9635 dump_printf_loc (MSG_NOTE, vect_location,
9636 "------>generating invariant statements\n");
9638 bb_vinfo->insert_seq_on_entry (NULL,
9639 bb_vinfo->inv_pattern_def_seq);
9642 else
9644 if (dump_enabled_p ())
9645 dump_printf_loc (MSG_NOTE, vect_location,
9646 "***** Analysis failed with vector mode %s\n",
9647 GET_MODE_NAME (bb_vinfo->vector_mode));
9650 if (mode_i == 0)
9651 autodetected_vector_mode = bb_vinfo->vector_mode;
9653 if (!fatal)
9654 while (mode_i < vector_modes.length ()
9655 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9657 if (dump_enabled_p ())
9658 dump_printf_loc (MSG_NOTE, vect_location,
9659 "***** The result for vector mode %s would"
9660 " be the same\n",
9661 GET_MODE_NAME (vector_modes[mode_i]));
9662 mode_i += 1;
9665 delete bb_vinfo;
9667 if (mode_i < vector_modes.length ()
9668 && VECTOR_MODE_P (autodetected_vector_mode)
9669 && (related_vector_mode (vector_modes[mode_i],
9670 GET_MODE_INNER (autodetected_vector_mode))
9671 == autodetected_vector_mode)
9672 && (related_vector_mode (autodetected_vector_mode,
9673 GET_MODE_INNER (vector_modes[mode_i]))
9674 == vector_modes[mode_i]))
9676 if (dump_enabled_p ())
9677 dump_printf_loc (MSG_NOTE, vect_location,
9678 "***** Skipping vector mode %s, which would"
9679 " repeat the analysis for %s\n",
9680 GET_MODE_NAME (vector_modes[mode_i]),
9681 GET_MODE_NAME (autodetected_vector_mode));
9682 mode_i += 1;
9685 if (vectorized
9686 || mode_i == vector_modes.length ()
9687 || autodetected_vector_mode == VOIDmode
9688 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9689 vector sizes will fail do not bother iterating. */
9690 || fatal)
9691 return vectorized;
9693 /* Try the next biggest vector size. */
9694 next_vector_mode = vector_modes[mode_i++];
9695 if (dump_enabled_p ())
9696 dump_printf_loc (MSG_NOTE, vect_location,
9697 "***** Re-trying analysis with vector mode %s\n",
9698 GET_MODE_NAME (next_vector_mode));
9703 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
9704 true if anything in the basic-block was vectorized. */
9706 static bool
9707 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9709 vec<data_reference_p> datarefs = vNULL;
9710 auto_vec<int> dataref_groups;
9711 int insns = 0;
9712 int current_group = 0;
9714 for (unsigned i = 0; i < bbs.length (); i++)
9716 basic_block bb = bbs[i];
9717 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9718 gsi_next (&gsi))
9720 gimple *stmt = gsi_stmt (gsi);
9721 if (is_gimple_debug (stmt))
9722 continue;
9724 insns++;
9726 if (gimple_location (stmt) != UNKNOWN_LOCATION)
9727 vect_location = stmt;
9729 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9730 &dataref_groups, current_group))
9731 ++current_group;
9733 /* New BBs always start a new DR group. */
9734 ++current_group;
9737 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9740 /* Special entry for the BB vectorizer. Analyze and transform a single
9741 if-converted BB with ORIG_LOOP's body being the non-if-converted
9742 representation. Returns true if anything in the basic-block was
9743 vectorized. */
9745 bool
9746 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9748 auto_vec<basic_block> bbs;
9749 bbs.safe_push (bb);
9750 return vect_slp_bbs (bbs, orig_loop);
9753 /* Main entry for the BB vectorizer.  Analyze and transform the basic
9754 blocks of FUN, returning true if anything was vectorized. */
9756 bool
9757 vect_slp_function (function *fun)
9759 bool r = false;
9760 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9761 auto_bitmap exit_bbs;
9762 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9763 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9764 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9765 true, rpo, NULL);
9767 /* For the moment split the function into pieces to avoid making
9768 the iteration on the vector modes moot.  Split at points we know
9769 we do not handle well, which are CFG merges (SLP discovery doesn't
9770 handle non-loop-header PHIs) and loop exits.  Since pattern
9771 recog requires reverse iteration to visit uses before defs,
9772 simply chop the RPO into pieces. */
9773 auto_vec<basic_block> bbs;
9774 for (unsigned i = 0; i < n; i++)
9776 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9777 bool split = false;
9779 /* Split when a BB is not dominated by the first block. */
9780 if (!bbs.is_empty ()
9781 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9783 if (dump_enabled_p ())
9784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9785 "splitting region at dominance boundary bb%d\n",
9786 bb->index);
9787 split = true;
9789 /* Split when the loop determined by the first block
9790 is exited. This is because we eventually insert
9791 invariants at region begin. */
9792 else if (!bbs.is_empty ()
9793 && bbs[0]->loop_father != bb->loop_father
9794 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9796 if (dump_enabled_p ())
9797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9798 "splitting region at loop %d exit at bb%d\n",
9799 bbs[0]->loop_father->num, bb->index);
9800 split = true;
9802 else if (!bbs.is_empty ()
9803 && bb->loop_father->header == bb
9804 && bb->loop_father->dont_vectorize)
9806 if (dump_enabled_p ())
9807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9808 "splitting region at dont-vectorize loop %d "
9809 "entry at bb%d\n",
9810 bb->loop_father->num, bb->index);
9811 split = true;
9814 if (split && !bbs.is_empty ())
9816 r |= vect_slp_bbs (bbs, NULL);
9817 bbs.truncate (0);
9820 if (bbs.is_empty ())
9822 /* We need to be able to insert at the head of the region, which
9823 we cannot do for a region starting with a returns-twice call. */
9824 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9825 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9827 if (dump_enabled_p ())
9828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9829 "skipping bb%d as start of region as it "
9830 "starts with returns-twice call\n",
9831 bb->index);
9832 continue;
9834 /* If the loop this BB belongs to is marked as not to be vectorized
9835 honor that also for BB vectorization. */
9836 if (bb->loop_father->dont_vectorize)
9837 continue;
9840 bbs.safe_push (bb);
9842 /* When a stmt ends this block and defines a value, inserting a
9843 vector containing its definition after it would require inserting
9844 on edges.  Avoid this for now. */
9845 if (gimple *last = *gsi_last_bb (bb))
9846 if (gimple_get_lhs (last)
9847 && is_ctrl_altering_stmt (last))
9849 if (dump_enabled_p ())
9850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9851 "splitting region at control altering "
9852 "definition %G", last);
9853 r |= vect_slp_bbs (bbs, NULL);
9854 bbs.truncate (0);
9858 if (!bbs.is_empty ())
9859 r |= vect_slp_bbs (bbs, NULL);
9861 free (rpo);
9863 return r;
9866 /* Build a variable-length vector in which the elements in ELTS are repeated
9867 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9868 RESULTS and add any new instructions to SEQ.
9870 The approach we use is:
9872 (1) Find a vector mode VM with integer elements of mode IM.
9874 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9875 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9876 from small vectors to IM.
9878 (3) Duplicate each ELTS'[I] into a vector of mode VM.
9880 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9881 correct byte contents.
9883 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9885 We try to find the largest IM for which this sequence works, in order
9886 to cut down on the number of interleaves. */
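/* For example (an illustrative sketch, assuming the target supports the
   modes involved): with ELTS = { a, b, c, d } of 32-bit elements, a
   variable-length VNx4SI VECTOR_TYPE and IM = DImode, NELTS' is 2 and
   ELTS' = { ab, cd } (each DI built by view-converting a two-element SI
   vector).  Each ELTS'[I] is duplicated across a VNx2DI vector and a single
   interleaving VEC_PERM_EXPR of the two duplicates yields
   { ab, cd, ab, cd, ... }, which view-converts back to the required
   { a, b, c, d, a, b, c, d, ... }.  */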
9888 void
9889 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9890 const vec<tree> &elts, unsigned int nresults,
9891 vec<tree> &results)
9893 unsigned int nelts = elts.length ();
9894 tree element_type = TREE_TYPE (vector_type);
9896 /* (1) Find a vector mode VM with integer elements of mode IM. */
9897 unsigned int nvectors = 1;
9898 tree new_vector_type;
9899 tree permutes[2];
9900 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9901 &nvectors, &new_vector_type,
9902 permutes))
9903 gcc_unreachable ();
9905 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9906 unsigned int partial_nelts = nelts / nvectors;
9907 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9909 tree_vector_builder partial_elts;
9910 auto_vec<tree, 32> pieces (nvectors * 2);
9911 pieces.quick_grow_cleared (nvectors * 2);
9912 for (unsigned int i = 0; i < nvectors; ++i)
9914 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9915 ELTS' has mode IM. */
9916 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9917 for (unsigned int j = 0; j < partial_nelts; ++j)
9918 partial_elts.quick_push (elts[i * partial_nelts + j]);
9919 tree t = gimple_build_vector (seq, &partial_elts);
9920 t = gimple_build (seq, VIEW_CONVERT_EXPR,
9921 TREE_TYPE (new_vector_type), t);
9923 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9924 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9927 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9928 correct byte contents.
9930 Conceptually, we need to repeat the following operation log2(nvectors)
9931 times, where hi_start = nvectors / 2:
9933 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9934 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9936 However, if each input repeats every N elements and the VF is
9937 a multiple of N * 2, the HI result is the same as the LO result.
9938 This will be true for the first N1 iterations of the outer loop,
9939 followed by N2 iterations for which both the LO and HI results
9940 are needed. I.e.:
9942 N1 + N2 = log2(nvectors)
9944 Each "N1 iteration" doubles the number of redundant vectors and the
9945 effect of the process as a whole is to have a sequence of nvectors/2**N1
9946 vectors that repeats 2**N1 times. Rather than generate these redundant
9947 vectors, we halve the number of vectors for each N1 iteration. */
9948 unsigned int in_start = 0;
9949 unsigned int out_start = nvectors;
9950 unsigned int new_nvectors = nvectors;
9951 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9953 unsigned int hi_start = new_nvectors / 2;
9954 unsigned int out_i = 0;
9955 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9957 if ((in_i & 1) != 0
9958 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9959 2 * in_repeat))
9960 continue;
9962 tree output = make_ssa_name (new_vector_type);
9963 tree input1 = pieces[in_start + (in_i / 2)];
9964 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9965 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9966 input1, input2,
9967 permutes[in_i & 1]);
9968 gimple_seq_add_stmt (seq, stmt);
9969 pieces[out_start + out_i] = output;
9970 out_i += 1;
9972 std::swap (in_start, out_start);
9973 new_nvectors = out_i;
9976 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9977 results.reserve (nresults);
9978 for (unsigned int i = 0; i < nresults; ++i)
9979 if (i < new_nvectors)
9980 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
9981 pieces[in_start + i]));
9982 else
9983 results.quick_push (results[i - new_nvectors]);
9987 /* For constant and loop invariant defs in OP_NODE this function creates
9988 vector defs that will be used in the vectorized stmts and stores them
9989 to SLP_TREE_VEC_DEFS of OP_NODE. */
9991 static void
9992 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
9994 unsigned HOST_WIDE_INT nunits;
9995 tree vec_cst;
9996 unsigned j, number_of_places_left_in_vector;
9997 tree vector_type;
9998 tree vop;
9999 int group_size = op_node->ops.length ();
10000 unsigned int vec_num, i;
10001 unsigned number_of_copies = 1;
10002 bool constant_p;
10003 gimple_seq ctor_seq = NULL;
10004 auto_vec<tree, 16> permute_results;
10006 /* We always want SLP_TREE_VECTYPE (op_node) to be set correctly here. */
10007 vector_type = SLP_TREE_VECTYPE (op_node);
10009 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
10010 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10011 auto_vec<tree> voprnds (number_of_vectors);
10013 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10014 created vectors. It is greater than 1 if unrolling is performed.
10016 For example, we have two scalar operands, s1 and s2 (e.g., group of
10017 strided accesses of size two), while NUNITS is four (i.e., four scalars
10018 of this type can be packed in a vector). The output vector will contain
10019 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10020 will be 2).
10022 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10023 containing the operands.
10025 For example, NUNITS is four as before, and the group size is 8
10026 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10027 {s5, s6, s7, s8}. */
10029 /* When using duplicate_and_interleave, we just need one element for
10030 each scalar statement. */
10031 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10032 nunits = group_size;
10034 number_of_copies = nunits * number_of_vectors / group_size;
10036 number_of_places_left_in_vector = nunits;
10037 constant_p = true;
10038 tree uniform_elt = NULL_TREE;
10039 tree_vector_builder elts (vector_type, nunits, 1);
10040 elts.quick_grow (nunits);
10041 stmt_vec_info insert_after = NULL;
10042 for (j = 0; j < number_of_copies; j++)
10044 tree op;
10045 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10047 /* Create 'vect_ = {op0,op1,...,opn}'. */
10048 tree orig_op = op;
10049 if (number_of_places_left_in_vector == nunits)
10050 uniform_elt = op;
10051 else if (uniform_elt && operand_equal_p (uniform_elt, op))
10052 op = elts[number_of_places_left_in_vector];
10053 else
10054 uniform_elt = NULL_TREE;
10055 number_of_places_left_in_vector--;
10056 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10058 if (CONSTANT_CLASS_P (op))
10060 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10062 /* Can't use VIEW_CONVERT_EXPR for booleans because
10063 of possibly different sizes of scalar value and
10064 vector element. */
10065 if (integer_zerop (op))
10066 op = build_int_cst (TREE_TYPE (vector_type), 0);
10067 else if (integer_onep (op))
10068 op = build_all_ones_cst (TREE_TYPE (vector_type));
10069 else
10070 gcc_unreachable ();
10072 else
10073 op = fold_unary (VIEW_CONVERT_EXPR,
10074 TREE_TYPE (vector_type), op);
10075 gcc_assert (op && CONSTANT_CLASS_P (op));
10077 else
10079 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10080 gimple *init_stmt;
10081 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10083 tree true_val
10084 = build_all_ones_cst (TREE_TYPE (vector_type));
10085 tree false_val
10086 = build_zero_cst (TREE_TYPE (vector_type));
10087 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10088 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10089 op, true_val,
10090 false_val);
10092 else
10094 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10095 op);
10096 init_stmt
10097 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10098 op);
10100 gimple_seq_add_stmt (&ctor_seq, init_stmt);
10101 op = new_temp;
10104 elts[number_of_places_left_in_vector] = op;
10105 if (!CONSTANT_CLASS_P (op))
10106 constant_p = false;
10107 /* For BB vectorization we have to compute an insert location
10108 when a def is inside the analyzed region since we cannot
10109 simply insert at the BB start in this case. */
10110 stmt_vec_info opdef;
10111 if (TREE_CODE (orig_op) == SSA_NAME
10112 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10113 && is_a <bb_vec_info> (vinfo)
10114 && (opdef = vinfo->lookup_def (orig_op)))
10116 if (!insert_after)
10117 insert_after = opdef;
10118 else
10119 insert_after = get_later_stmt (insert_after, opdef);
10122 if (number_of_places_left_in_vector == 0)
10124 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10125 if (uniform_elt)
10126 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10127 elts[0]);
10128 else if (constant_p
10129 ? multiple_p (type_nunits, nunits)
10130 : known_eq (type_nunits, nunits))
10131 vec_cst = gimple_build_vector (&ctor_seq, &elts);
10132 else
10134 if (permute_results.is_empty ())
10135 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10136 elts, number_of_vectors,
10137 permute_results);
10138 vec_cst = permute_results[number_of_vectors - j - 1];
10140 if (!gimple_seq_empty_p (ctor_seq))
10142 if (insert_after)
10144 gimple_stmt_iterator gsi;
10145 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10147 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10148 gsi_insert_seq_before (&gsi, ctor_seq,
10149 GSI_CONTINUE_LINKING);
10151 else if (!stmt_ends_bb_p (insert_after->stmt))
10153 gsi = gsi_for_stmt (insert_after->stmt);
10154 gsi_insert_seq_after (&gsi, ctor_seq,
10155 GSI_CONTINUE_LINKING);
10157 else
10159 /* When we want to insert after a def where the
10160 defining stmt throws then insert on the fallthru
10161 edge. */
10162 edge e = find_fallthru_edge
10163 (gimple_bb (insert_after->stmt)->succs);
10164 basic_block new_bb
10165 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
10166 gcc_assert (!new_bb);
10169 else
10170 vinfo->insert_seq_on_entry (NULL, ctor_seq);
10171 ctor_seq = NULL;
10173 voprnds.quick_push (vec_cst);
10174 insert_after = NULL;
10175 number_of_places_left_in_vector = nunits;
10176 constant_p = true;
10177 elts.new_vector (vector_type, nunits, 1);
10178 elts.quick_grow (nunits);
10183 /* Since the vectors are created in the reverse order, we should invert
10184 them. */
10185 vec_num = voprnds.length ();
10186 for (j = vec_num; j != 0; j--)
10188 vop = voprnds[j - 1];
10189 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10192 /* If VF is greater than the unrolling factor needed for the SLP
10193 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
10194 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10195 to replicate the vectors. */
10196 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
10197 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
10198 i++)
10199 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10202 /* Get the Ith vectorized definition from SLP_NODE. */
10204 tree
10205 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10207 return SLP_TREE_VEC_DEFS (slp_node)[i];
10210 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
10212 void
10213 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10215 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
10216 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10219 /* Get the vectorized definitions of the first N children of SLP_NODE into *VEC_OPRNDS; N of -1U means all children. */
10221 void
10222 vect_get_slp_defs (vec_info *,
10223 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10225 if (n == -1U)
10226 n = SLP_TREE_CHILDREN (slp_node).length ();
10228 for (unsigned i = 0; i < n; ++i)
10230 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10231 vec<tree> vec_defs = vNULL;
10232 vect_get_slp_defs (child, &vec_defs);
10233 vec_oprnds->quick_push (vec_defs);
10237 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
10238 - PERM gives the permutation that the caller wants to use for NODE,
10239 which might be different from SLP_LOAD_PERMUTATION.
10240 - DUMP_P controls whether the function dumps information. */
10242 static bool
10243 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
10244 load_permutation_t &perm,
10245 const vec<tree> &dr_chain,
10246 gimple_stmt_iterator *gsi, poly_uint64 vf,
10247 bool analyze_only, bool dump_p,
10248 unsigned *n_perms, unsigned int *n_loads,
10249 bool dce_chain)
10251 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10252 int vec_index = 0;
10253 tree vectype = SLP_TREE_VECTYPE (node);
10254 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
10255 unsigned int mask_element;
10256 unsigned dr_group_size;
10257 machine_mode mode;
10259 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
10260 dr_group_size = 1;
10261 else
10263 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10264 dr_group_size = DR_GROUP_SIZE (stmt_info);
10267 mode = TYPE_MODE (vectype);
10268 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10269 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10271 /* Initialize the vect stmts of NODE to properly insert the generated
10272 stmts later. */
10273 if (! analyze_only)
10274 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
10275 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
10277 /* Generate permutation masks for every NODE. Number of masks for each NODE
10278 is equal to GROUP_SIZE.
10279 E.g., we have a group of three nodes with three loads from the same
10280 location in each node, and the vector size is 4. I.e., we have an
10281 a0b0c0a1b1c1... sequence and we need to create the following vectors:
10282 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
10283 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
10286 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
10287 The last mask is illegal since we assume two operands for permute
10288 operation, and the mask element values can't be outside that range.
10289 Hence, the last mask must be converted into {2,5,5,5}.
10290 For the first two permutations we need the first and the second input
10291 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
10292 we need the second and the third vectors: {b1,c1,a2,b2} and
10293 {c2,a3,b3,c3}. */
10295 int vect_stmts_counter = 0;
10296 unsigned int index = 0;
10297 int first_vec_index = -1;
10298 int second_vec_index = -1;
10299 bool noop_p = true;
10300 *n_perms = 0;
10302 vec_perm_builder mask;
10303 unsigned int nelts_to_build;
10304 unsigned int nvectors_per_build;
10305 unsigned int in_nlanes;
10306 bool repeating_p = (group_size == dr_group_size
10307 && multiple_p (nunits, group_size));
10308 if (repeating_p)
10310 /* A single vector contains a whole number of copies of the node, so:
10311 (a) all permutes can use the same mask; and
10312 (b) the permutes only need a single vector input. */
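/* E.g. (an illustrative sketch) for a group of two loads that both use DR
   group element 0 the mask is encoded with period GROUP_SIZE == 2 and three
   elements per pattern, giving { 0, 0, 2, 2, 4, 4, ... } however long the
   runtime vector turns out to be.  */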
10313 mask.new_vector (nunits, group_size, 3);
10314 nelts_to_build = mask.encoded_nelts ();
10315 /* It's possible to obtain zero nstmts during analyze_only, so make
10316 it at least one to ensure the later computation for n_perms
10317 proceeds. */
10318 nvectors_per_build = nstmts > 0 ? nstmts : 1;
10319 in_nlanes = dr_group_size * 3;
10321 else
10323 /* We need to construct a separate mask for each vector statement. */
10324 unsigned HOST_WIDE_INT const_nunits, const_vf;
10325 if (!nunits.is_constant (&const_nunits)
10326 || !vf.is_constant (&const_vf))
10327 return false;
10328 mask.new_vector (const_nunits, const_nunits, 1);
10329 nelts_to_build = const_vf * group_size;
10330 nvectors_per_build = 1;
10331 in_nlanes = const_vf * dr_group_size;
10333 auto_sbitmap used_in_lanes (in_nlanes);
10334 bitmap_clear (used_in_lanes);
10335 auto_bitmap used_defs;
10337 unsigned int count = mask.encoded_nelts ();
10338 mask.quick_grow (count);
10339 vec_perm_indices indices;
10341 for (unsigned int j = 0; j < nelts_to_build; j++)
10343 unsigned int iter_num = j / group_size;
10344 unsigned int stmt_num = j % group_size;
10345 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
10346 bitmap_set_bit (used_in_lanes, i);
10347 if (repeating_p)
10349 first_vec_index = 0;
10350 mask_element = i;
10352 else
10354 /* Enforced before the loop when !repeating_p. */
10355 unsigned int const_nunits = nunits.to_constant ();
10356 vec_index = i / const_nunits;
10357 mask_element = i % const_nunits;
10358 if (vec_index == first_vec_index
10359 || first_vec_index == -1)
10361 first_vec_index = vec_index;
10363 else if (vec_index == second_vec_index
10364 || second_vec_index == -1)
10366 second_vec_index = vec_index;
10367 mask_element += const_nunits;
10369 else
10371 if (dump_p)
10372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10373 "permutation requires at "
10374 "least three vectors %G",
10375 stmt_info->stmt);
10376 gcc_assert (analyze_only);
10377 return false;
10380 gcc_assert (mask_element < 2 * const_nunits);
10383 if (mask_element != index)
10384 noop_p = false;
10385 mask[index++] = mask_element;
10387 if (index == count)
10389 if (!noop_p)
10391 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
10392 if (!can_vec_perm_const_p (mode, mode, indices))
10394 if (dump_p)
10396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10397 "unsupported vect permute { ");
10398 for (i = 0; i < count; ++i)
10400 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10401 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10403 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10405 gcc_assert (analyze_only);
10406 return false;
10409 tree mask_vec = NULL_TREE;
10410 if (!analyze_only)
10411 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10413 if (second_vec_index == -1)
10414 second_vec_index = first_vec_index;
10416 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10418 ++*n_perms;
10419 if (analyze_only)
10420 continue;
10421 /* Generate the permute statement if necessary. */
10422 tree first_vec = dr_chain[first_vec_index + ri];
10423 tree second_vec = dr_chain[second_vec_index + ri];
10424 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
10425 tree perm_dest
10426 = vect_create_destination_var (gimple_assign_lhs (stmt),
10427 vectype);
10428 perm_dest = make_ssa_name (perm_dest);
10429 gimple *perm_stmt
10430 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
10431 second_vec, mask_vec);
10432 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
10433 gsi);
10434 if (dce_chain)
10436 bitmap_set_bit (used_defs, first_vec_index + ri);
10437 bitmap_set_bit (used_defs, second_vec_index + ri);
10440 /* Store the vector statement in NODE. */
10441 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
10444 else if (!analyze_only)
10446 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10448 tree first_vec = dr_chain[first_vec_index + ri];
10449 /* If mask was NULL_TREE generate the requested
10450 identity transform. */
10451 if (dce_chain)
10452 bitmap_set_bit (used_defs, first_vec_index + ri);
10454 /* Store the vector statement in NODE. */
10455 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
10459 index = 0;
10460 first_vec_index = -1;
10461 second_vec_index = -1;
10462 noop_p = true;
10466 if (n_loads)
10468 if (repeating_p)
10469 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10470 else
10472 /* Enforced above when !repeating_p. */
10473 unsigned int const_nunits = nunits.to_constant ();
10474 *n_loads = 0;
10475 bool load_seen = false;
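	  /* Scan the used lanes and count how many CONST_NUNITS-sized input
	     vectors contain at least one used lane; only those vector loads
	     are actually needed.  */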
10476 for (unsigned i = 0; i < in_nlanes; ++i)
10478 if (i % const_nunits == 0)
10480 if (load_seen)
10481 *n_loads += 1;
10482 load_seen = false;
10484 if (bitmap_bit_p (used_in_lanes, i))
10485 load_seen = true;
10487 if (load_seen)
10488 *n_loads += 1;
10492 if (dce_chain)
10493 for (unsigned i = 0; i < dr_chain.length (); ++i)
10494 if (!bitmap_bit_p (used_defs, i))
10496 tree def = dr_chain[i];
10499 gimple *stmt = SSA_NAME_DEF_STMT (def);
10500 if (is_gimple_assign (stmt)
10501 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
10502 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
10503 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
10504 else
10505 def = NULL;
10506 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
10507 gsi_remove (&rgsi, true);
10508 release_defs (stmt);
10510 while (def);
10513 return true;
10516 /* Generate vector permute statements from a list of loads in DR_CHAIN.
10517 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
10518 permute statements for the SLP node NODE. Store the number of vector
10519 permute instructions in *N_PERMS and the number of vector load
10520 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
10521 that were not needed. */
10523 bool
10524 vect_transform_slp_perm_load (vec_info *vinfo,
10525 slp_tree node, const vec<tree> &dr_chain,
10526 gimple_stmt_iterator *gsi, poly_uint64 vf,
10527 bool analyze_only, unsigned *n_perms,
10528 unsigned int *n_loads, bool dce_chain)
10530 return vect_transform_slp_perm_load_1 (vinfo, node,
10531 SLP_TREE_LOAD_PERMUTATION (node),
10532 dr_chain, gsi, vf, analyze_only,
10533 dump_enabled_p (), n_perms, n_loads,
10534 dce_chain);
10537 /* Produce the next vector result for SLP permutation NODE by adding a vector
10538 statement at GSI. If MASK_VEC is nonnull, add:
10540 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
10542 otherwise add:
10544 <new SSA name> = FIRST_DEF. */
10546 static void
10547 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10548 slp_tree node, tree first_def, tree second_def,
10549 tree mask_vec, poly_uint64 identity_offset)
10551 tree vectype = SLP_TREE_VECTYPE (node);
10553   /* ??? We SLP match existing vector element extracts but
10554      allow punning, which we need to re-instantiate at uses
10555      since we have no good way of explicitly representing it.  */
10556 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
10557 && !types_compatible_p (TREE_TYPE (first_def), vectype))
10559 gassign *conv_stmt
10560 = gimple_build_assign (make_ssa_name (vectype),
10561 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
10562 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10563 first_def = gimple_assign_lhs (conv_stmt);
10565 gassign *perm_stmt;
10566 tree perm_dest = make_ssa_name (vectype);
10567 if (mask_vec)
10569       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (second_def)),
10570 TYPE_SIZE (vectype))
10571 && !types_compatible_p (TREE_TYPE (second_def), vectype))
10573 gassign *conv_stmt
10574 = gimple_build_assign (make_ssa_name (vectype),
10575 build1 (VIEW_CONVERT_EXPR,
10576 vectype, second_def));
10577 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10578 second_def = gimple_assign_lhs (conv_stmt);
10580 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10581 first_def, second_def,
10582 mask_vec);
10584 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10586 /* For identity permutes we still need to handle the case
10587 of offsetted extracts or concats. */
10588 unsigned HOST_WIDE_INT c;
10589 auto first_def_nunits
10590 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10591 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10593 unsigned HOST_WIDE_INT elsz
10594 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10595 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10596 TYPE_SIZE (vectype),
10597 bitsize_int (identity_offset * elsz));
10598 perm_stmt = gimple_build_assign (perm_dest, lowpart);
10600 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10601 first_def_nunits, &c) && c == 2)
10603 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10604 NULL_TREE, second_def);
10605 perm_stmt = gimple_build_assign (perm_dest, ctor);
10607 else
10608 gcc_unreachable ();
10610 else
10612 /* We need a copy here in case the def was external. */
10613 perm_stmt = gimple_build_assign (perm_dest, first_def);
10615 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10616 /* Store the vector statement in NODE. */
10617 node->push_vec_def (perm_stmt);
10620 /* Subroutine of vectorizable_slp_permutation. Check whether the target
10621 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10622 If GSI is nonnull, emit the permutation there.
10624 When GSI is null, the only purpose of NODE is to give properties
10625 of the result, such as the vector type and number of SLP lanes.
10626 The node does not need to be a VEC_PERM_EXPR.
10628 If the target supports the operation, return the number of individual
10629 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10630 dump file if DUMP_P is true. */
10632 static int
10633 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10634 slp_tree node, lane_permutation_t &perm,
10635 vec<slp_tree> &children, bool dump_p)
10637 tree vectype = SLP_TREE_VECTYPE (node);
10639   /* ??? We currently only support inputs that all have the same vector
10640      type, while the SLP IL should really do a concat + select and thus accept
10641 arbitrary mismatches. */
10642 slp_tree child;
10643 unsigned i;
10644 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10645 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10646 /* True if we're permuting a single input of 2N vectors down
10647 to N vectors. This case doesn't generalize beyond 2 since
10648 VEC_PERM_EXPR only takes 2 inputs. */
10649 bool pack_p = false;
10650 /* If we're permuting inputs of N vectors each into X*N outputs,
10651 this is the value of X, otherwise it is 1. */
10652 unsigned int unpack_factor = 1;
10653 tree op_vectype = NULL_TREE;
10654 FOR_EACH_VEC_ELT (children, i, child)
10655 if (SLP_TREE_VECTYPE (child))
10657 op_vectype = SLP_TREE_VECTYPE (child);
10658 break;
10660 if (!op_vectype)
10661 op_vectype = vectype;
10662 FOR_EACH_VEC_ELT (children, i, child)
10664 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10665 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10666 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10667 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10669 if (dump_p)
10670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10671 "Unsupported vector types in lane permutation\n");
10672 return -1;
10674 auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
10675 unsigned int this_unpack_factor;
10676 /* Detect permutations of external, pre-existing vectors. The external
10677 node's SLP_TREE_LANES stores the total number of units in the vector,
10678 or zero if the vector has variable length.
10680 We are expected to keep the original VEC_PERM_EXPR for such cases.
10681 There is no repetition to model. */
10682 if (SLP_TREE_DEF_TYPE (child) == vect_external_def
10683 && SLP_TREE_SCALAR_OPS (child).is_empty ())
10684 repeating_p = false;
10685 /* Check whether the input has twice as many lanes per vector. */
10686 else if (children.length () == 1
10687 && known_eq (SLP_TREE_LANES (child) * nunits,
10688 SLP_TREE_LANES (node) * op_nunits * 2))
10689 pack_p = true;
10690 /* Check whether the output has N times as many lanes per vector. */
10691 else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
10692 SLP_TREE_LANES (child) * nunits,
10693 &this_unpack_factor)
10694 && (i == 0 || unpack_factor == this_unpack_factor))
10695 unpack_factor = this_unpack_factor;
10696 else
10697 repeating_p = false;
10700 gcc_assert (perm.length () == SLP_TREE_LANES (node));
10702 /* Load-lanes permute. This permute only acts as a forwarder to
10703 select the correct vector def of the load-lanes load which
10704 has the permuted vectors in its vector defs like
10705 { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
10706 accounted for in the costing for the actual load so we
10707 return zero here. */
10708 if (node->ldst_lanes)
10710 gcc_assert (children.length () == 1);
10711 if (!gsi)
10712 /* This is a trivial op always supported. */
10713 return 0;
10714 slp_tree child = children[0];
10715 unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10716 / SLP_TREE_LANES (node));
10717 unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
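      /* The load-lanes child has its vector defs interleaved per group as
	 described above ({ v0, w0, r0, v1, w1, r1, ... } for a ld3); VEC_NUM
	 is the number of vectors per group and VEC_IDX selects the group
	 member this forwarder extracts.  */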
10718 for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10720 tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10721 node->push_vec_def (def);
10723 return 0;
10726   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
10727 and if we can generate the vectors in a vector-length agnostic way.
10728 This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
10729 compile time.
10731 The significance of UNPACK_STEP is that, when PACK_P is false,
10732 output vector I operates on a window of UNPACK_STEP elements from each
10733 input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
10734 when UNPACK_FACTOR is 2, the first output vector operates on lanes
10735 [0, NUNITS / 2 - 1] of each input vector and the second output vector
10736 operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
10738 When REPEATING_P is true, NOUTPUTS holds the total number of outputs
10739 that we actually need to generate. */
10740 uint64_t noutputs = 0;
10741 poly_uint64 unpack_step = 0;
10742 loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
10743 if (!linfo
10744 || !multiple_p (nunits, unpack_factor, &unpack_step)
10745 || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
10746 * SLP_TREE_LANES (node), nunits, &noutputs))
10747 repeating_p = false;
10749 /* We can handle the conditions described for REPEATING_P above for
10750 both variable- and constant-length vectors. The fallback requires
10751 us to generate every element of every permute vector explicitly,
10752 which is only possible for constant-length permute vectors.
10754 Set:
10756 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10757 mask vectors that we want to build.
10759 - NCOPIES to the number of copies of PERM that we need in order
10760 to build the necessary permute mask vectors. */
10761 uint64_t npatterns;
10762 unsigned nelts_per_pattern;
10763 uint64_t ncopies;
10764 if (repeating_p)
10766 /* We need permute mask vectors that have the form:
10768 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10770 In other words, the original n-element permute in PERM is
10771 "unrolled" to fill a full vector. The stepped vector encoding
10772 that we use for permutes requires 3n elements. */
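      /* For example, a two-lane swap of a single child,
	 PERM = [ { 0, 1 }, { 0, 0 } ], is encoded with NPATTERNS == 2 and
	 NELTS_PER_PATTERN == 3 as the series { 1, 0, 3, 2, 5, 4 }.  */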
10773 npatterns = SLP_TREE_LANES (node);
10774 nelts_per_pattern = ncopies = 3;
10776 else
10778 /* Calculate every element of every permute mask vector explicitly,
10779 instead of relying on the pattern described above. */
10780 if (!nunits.is_constant (&npatterns)
10781 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10783 if (dump_p)
10784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10785 "unsupported permutation %p on variable-length"
10786 " vectors\n", (void *) node);
10787 return -1;
10789 nelts_per_pattern = ncopies = 1;
10790 if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10792 if (dump_p)
10793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10794 "unsupported permutation %p for variable VF\n",
10795 (void *) node);
10796 return -1;
10798 pack_p = false;
10799 unpack_factor = 1;
10801 unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
10802 gcc_assert (repeating_p || multiple_p (olanes, nunits));
10804 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10805 from the { SLP operand, scalar lane } permutation as recorded in the
10806 SLP node as intermediate step. This part should already work
10807 with SLP children with arbitrary number of lanes. */
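  /* For example, the four-lane blend [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
     of two four-lane children expands (for the first copy) to
     { {0,0},0 }, { {1,0},1 }, { {0,0},2 }, { {1,0},3 }.  */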
10808 auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
10809 auto_vec<poly_uint64> active_lane;
10810 vperm.create (olanes);
10811 active_lane.safe_grow_cleared (children.length (), true);
10812 for (unsigned int ui = 0; ui < unpack_factor; ++ui)
10814 for (unsigned j = 0; j < children.length (); ++j)
10815 active_lane[j] = ui * unpack_step;
10816 for (unsigned i = 0; i < ncopies; ++i)
10818 for (unsigned pi = 0; pi < perm.length (); ++pi)
10820 std::pair<unsigned, unsigned> p = perm[pi];
10821 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10822 if (repeating_p)
10823 vperm.quick_push ({{p.first, 0},
10824 p.second + active_lane[p.first]});
10825 else
10827 /* We checked above that the vectors are constant-length. */
10828 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
10829 .to_constant ();
10830 unsigned lane = active_lane[p.first].to_constant ();
10831 unsigned vi = (lane + p.second) / vnunits;
10832 unsigned vl = (lane + p.second) % vnunits;
10833 vperm.quick_push ({{p.first, vi}, vl});
10836 /* Advance to the next group. */
10837 for (unsigned j = 0; j < children.length (); ++j)
10838 active_lane[j] += SLP_TREE_LANES (children[j]);
10842 if (dump_p)
10844 dump_printf_loc (MSG_NOTE, vect_location,
10845 "vectorizing permutation %p", (void *)node);
10846 for (unsigned i = 0; i < perm.length (); ++i)
10847 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10848 if (repeating_p)
10849 dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
10850 dump_printf (MSG_NOTE, "\n");
10851 dump_printf_loc (MSG_NOTE, vect_location, "as");
10852 for (unsigned i = 0; i < vperm.length (); ++i)
10854 if (i != 0
10855 && (repeating_p
10856 ? multiple_p (i, npatterns)
10857 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10858 dump_printf (MSG_NOTE, ",");
10859 dump_printf (MSG_NOTE, " vops%u[%u][",
10860 vperm[i].first.first, vperm[i].first.second);
10861 dump_dec (MSG_NOTE, vperm[i].second);
10862 dump_printf (MSG_NOTE, "]");
10864 dump_printf (MSG_NOTE, "\n");
10867   /* We can only handle two-vector permutes; everything else should
10868 be lowered on the SLP level. The following is closely inspired
10869 by vect_transform_slp_perm_load and is supposed to eventually
10870 replace it.
10871 ??? As intermediate step do code-gen in the SLP tree representation
10872 somehow? */
10873 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10874 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10875 unsigned int index = 0;
10876 poly_uint64 mask_element;
10877 vec_perm_builder mask;
10878 mask.new_vector (nunits, npatterns, nelts_per_pattern);
10879 unsigned int count = mask.encoded_nelts ();
10880 mask.quick_grow (count);
10881 vec_perm_indices indices;
10882 unsigned nperms = 0;
10883 /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
10884 vectors to check during analysis, but we need to generate NOUTPUTS
10885 vectors during transformation. */
10886 unsigned total_nelts = olanes;
10887 unsigned process_nelts = olanes;
10888 if (repeating_p)
10890 total_nelts = (total_nelts / unpack_factor) * noutputs;
10891 if (gsi)
10892 process_nelts = total_nelts;
10894 unsigned last_ei = (total_nelts - 1) % process_nelts;
10895 for (unsigned i = 0; i < process_nelts; ++i)
10897 /* VI is the input vector index when generating code for REPEATING_P. */
10898 unsigned vi = i / olanes * (pack_p ? 2 : 1);
10899 unsigned ei = i % olanes;
10900 mask_element = vperm[ei].second;
10901 if (pack_p)
10903 /* In this case, we have N outputs and the single child provides 2N
10904 inputs. Output X permutes inputs 2X and 2X+1.
10906 The mask indices are taken directly from the SLP permutation node.
10907 Index X selects from the first vector if (X / NUNITS) % 2 == 0;
10908 X selects from the second vector otherwise. These conditions
10909 are only known at compile time for constant-length vectors. */
10910 first_vec = std::make_pair (0, 0);
10911 second_vec = std::make_pair (0, 1);
10913 else if (first_vec.first == -1U
10914 || first_vec == vperm[ei].first)
10915 first_vec = vperm[ei].first;
10916 else if (second_vec.first == -1U
10917 || second_vec == vperm[ei].first)
10919 second_vec = vperm[ei].first;
10920 mask_element += nunits;
10922 else
10924 if (dump_p)
10925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10926 "permutation requires at "
10927 "least three vectors\n");
10928 gcc_assert (!gsi);
10929 return -1;
10932 mask[index++] = mask_element;
10934 if (index == count)
10936 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10937 TYPE_VECTOR_SUBPARTS (op_vectype));
10938 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10939 && constant_multiple_p (mask[0], nunits));
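	  /* A consecutive series starting at a multiple of NUNITS simply
	     forwards an aligned part (or concat) of the input unchanged, so
	     it needs no VEC_PERM_EXPR and is not counted in NPERMS.  */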
10940 machine_mode vmode = TYPE_MODE (vectype);
10941 machine_mode op_vmode = TYPE_MODE (op_vectype);
10942 unsigned HOST_WIDE_INT c;
10943 if ((!identity_p
10944 && !can_vec_perm_const_p (vmode, op_vmode, indices))
10945 || (identity_p
10946 && !known_le (nunits,
10947 TYPE_VECTOR_SUBPARTS (op_vectype))
10948 && (!constant_multiple_p (nunits,
10949 TYPE_VECTOR_SUBPARTS (op_vectype),
10950 &c) || c != 2)))
10952 if (dump_p)
10954 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10955 vect_location,
10956 "unsupported vect permute { ");
10957 for (i = 0; i < count; ++i)
10959 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10960 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10962 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10964 gcc_assert (!gsi);
10965 return -1;
10968 if (!identity_p)
10969 nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
10970 if (gsi)
10972 if (second_vec.first == -1U)
10973 second_vec = first_vec;
10975 slp_tree
10976 first_node = children[first_vec.first],
10977 second_node = children[second_vec.first];
10979 tree mask_vec = NULL_TREE;
10980 if (!identity_p)
10981 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10983 tree first_def
10984 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
10985 tree second_def
10986 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
10987 vect_add_slp_permutation (vinfo, gsi, node, first_def,
10988 second_def, mask_vec, mask[0]);
10991 index = 0;
10992 first_vec = std::make_pair (-1U, -1U);
10993 second_vec = std::make_pair (-1U, -1U);
10997 return nperms;
11000 /* Vectorize the SLP permutations in NODE as specified
11001 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11002 child number and lane number.
11003 Interleaving of two two-lane two-child SLP subtrees (not supported):
11004 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11005 A blend of two four-lane two-child SLP subtrees:
11006 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11007 Highpart of a four-lane one-child SLP subtree (not supported):
11008 [ { 0, 2 }, { 0, 3 } ]
11009    Currently only a subset of these is supported by the code generation below.  */
11011 static bool
11012 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11013 slp_tree node, stmt_vector_for_cost *cost_vec)
11015 tree vectype = SLP_TREE_VECTYPE (node);
11016 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11017 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11018 SLP_TREE_CHILDREN (node),
11019 dump_enabled_p ());
11020 if (nperms < 0)
11021 return false;
11023 if (!gsi)
11024 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11026 return true;
11029 /* Vectorize SLP NODE. */
11031 static void
11032 vect_schedule_slp_node (vec_info *vinfo,
11033 slp_tree node, slp_instance instance)
11035 gimple_stmt_iterator si;
11036 int i;
11037 slp_tree child;
11039 /* Vectorize externals and constants. */
11040 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11041 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11043 /* ??? vectorizable_shift can end up using a scalar operand which is
11044 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11045 node in this case. */
11046 if (!SLP_TREE_VECTYPE (node))
11047 return;
11049 /* There are two reasons vector defs might already exist. The first
11050 is that we are vectorizing an existing vector def. The second is
11051 when performing BB vectorization shared constant/external nodes
11052 are not split apart during partitioning so during the code-gen
11053 DFS walk we can end up visiting them twice. */
11054 if (! SLP_TREE_VEC_DEFS (node).exists ())
11055 vect_create_constant_vectors (vinfo, node);
11056 return;
11059 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11061 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11063 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
11064 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
11066 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11067 && STMT_VINFO_DATA_REF (stmt_info))
11069 /* Vectorized loads go before the first scalar load to make it
11070 ready early, vectorized stores go before the last scalar
11071 stmt which is where all uses are ready. */
11072 stmt_vec_info last_stmt_info = NULL;
11073 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11074 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11075 else /* DR_IS_WRITE */
11076 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11077 si = gsi_for_stmt (last_stmt_info->stmt);
11079 else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11080 && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
11081 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
11082 || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
11084 /* For PHI node vectorization we do not use the insertion iterator. */
11085 si = gsi_none ();
11087 else
11089       /* Emit other stmts after the children's vectorized defs, which is
11090 	 the earliest possible place.  */
11091 gimple *last_stmt = NULL;
11092 bool seen_vector_def = false;
11093 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11094 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11096 /* For fold-left reductions we are retaining the scalar
11097 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11098 set so the representation isn't perfect. Resort to the
11099 last scalar def here. */
11100 if (SLP_TREE_VEC_DEFS (child).is_empty ())
11102 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
11103 == cycle_phi_info_type);
11104 gphi *phi = as_a <gphi *>
11105 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11106 if (!last_stmt
11107 || vect_stmt_dominates_stmt_p (last_stmt, phi))
11108 last_stmt = phi;
11110 /* We are emitting all vectorized stmts in the same place and
11111 the last one is the last.
11112 ??? Unless we have a load permutation applied and that
11113 figures to re-use an earlier generated load. */
11114 unsigned j;
11115 tree vdef;
11116 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11118 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11119 if (!last_stmt
11120 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11121 last_stmt = vstmt;
11124 else if (!SLP_TREE_VECTYPE (child))
11126 	    /* For externals that are used unvectorized, look at all scalar defs.  */
11127 unsigned j;
11128 tree def;
11129 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11130 if (TREE_CODE (def) == SSA_NAME
11131 && !SSA_NAME_IS_DEFAULT_DEF (def))
11133 gimple *stmt = SSA_NAME_DEF_STMT (def);
11134 if (!last_stmt
11135 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
11136 last_stmt = stmt;
11139 else
11141 /* For externals we have to look at all defs since their
11142 insertion place is decided per vector. But beware
11143 of pre-existing vectors where we need to make sure
11144 we do not insert before the region boundary. */
11145 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
11146 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
11147 seen_vector_def = true;
11148 else
11150 unsigned j;
11151 tree vdef;
11152 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11153 if (TREE_CODE (vdef) == SSA_NAME
11154 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
11156 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11157 if (!last_stmt
11158 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11159 last_stmt = vstmt;
11163 /* This can happen when all children are pre-existing vectors or
11164 constants. */
11165 if (!last_stmt)
11166 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
11167 if (!last_stmt)
11169 gcc_assert (seen_vector_def);
11170 si = gsi_after_labels (vinfo->bbs[0]);
11172 else if (is_ctrl_altering_stmt (last_stmt))
11174 /* We split regions to vectorize at control altering stmts
11175 with a definition so this must be an external which
11176 we can insert at the start of the region. */
11177 si = gsi_after_labels (vinfo->bbs[0]);
11179 else if (is_a <bb_vec_info> (vinfo)
11180 && SLP_TREE_CODE (node) != VEC_PERM_EXPR
11181 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
11182 && gimple_could_trap_p (stmt_info->stmt))
11184 /* We've constrained possibly trapping operations to all come
11185 from the same basic-block, if vectorized defs would allow earlier
11186 scheduling still force vectorized stmts to the original block.
11187 This is only necessary for BB vectorization since for loop vect
11188 all operations are in a single BB and scalar stmt based
11189 placement doesn't play well with epilogue vectorization. */
11190 gcc_assert (dominated_by_p (CDI_DOMINATORS,
11191 gimple_bb (stmt_info->stmt),
11192 gimple_bb (last_stmt)));
11193 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
11195 else if (is_a <gphi *> (last_stmt))
11196 si = gsi_after_labels (gimple_bb (last_stmt));
11197 else
11199 si = gsi_for_stmt (last_stmt);
11200 gsi_next (&si);
11202 /* Avoid scheduling internal defs outside of the loop when
11203 we might have only implicitly tracked loop mask/len defs. */
11204 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
11205 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11206 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11208 gimple_stmt_iterator si2
11209 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
11210 if ((gsi_end_p (si2)
11211 && (LOOP_VINFO_LOOP (loop_vinfo)->header
11212 != gimple_bb (last_stmt))
11213 && dominated_by_p (CDI_DOMINATORS,
11214 LOOP_VINFO_LOOP (loop_vinfo)->header,
11215 gimple_bb (last_stmt)))
11216 || (!gsi_end_p (si2)
11217 && last_stmt != *si2
11218 && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
11219 si = si2;
11224 /* Handle purely internal nodes. */
11225 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
11227 if (dump_enabled_p ())
11228 dump_printf_loc (MSG_NOTE, vect_location,
11229 "------>vectorizing SLP permutation node\n");
11230       /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
11231 	 be shared between different SLP nodes (but it is usually the same
11232 	 operation apart from the case where the stmt is only there to denote
11233 	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
11234 	 but open-code it here (partly).  */
11235 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
11236 gcc_assert (done);
11237 stmt_vec_info slp_stmt_info;
11238 unsigned int i;
11239 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
11240 if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
11242 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
11243 instance, i, true, NULL);
11244 gcc_assert (done);
11247 else
11249 if (dump_enabled_p ())
11250 dump_printf_loc (MSG_NOTE, vect_location,
11251 "------>vectorizing SLP node starting from: %G",
11252 stmt_info->stmt);
11253 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
11257 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
11258 For loop vectorization this is done in vectorizable_call, but for SLP
11259 it needs to be deferred until end of vect_schedule_slp, because multiple
11260 SLP instances may refer to the same scalar stmt. */
11262 static void
11263 vect_remove_slp_scalar_calls (vec_info *vinfo,
11264 slp_tree node, hash_set<slp_tree> &visited)
11266 gimple *new_stmt;
11267 gimple_stmt_iterator gsi;
11268 int i;
11269 slp_tree child;
11270 tree lhs;
11271 stmt_vec_info stmt_info;
11273 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11274 return;
11276 if (visited.add (node))
11277 return;
11279 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11280 vect_remove_slp_scalar_calls (vinfo, child, visited);
11282 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
11284 if (!stmt_info)
11285 continue;
11286 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
11287 if (!stmt || gimple_bb (stmt) == NULL)
11288 continue;
11289 if (is_pattern_stmt_p (stmt_info)
11290 || !PURE_SLP_STMT (stmt_info))
11291 continue;
11292 lhs = gimple_call_lhs (stmt);
11293 if (lhs)
11294 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
11295 else
11297 new_stmt = gimple_build_nop ();
11298 unlink_stmt_vdef (stmt_info->stmt);
11300 gsi = gsi_for_stmt (stmt);
11301 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
11302 if (lhs)
11303 SSA_NAME_DEF_STMT (lhs) = new_stmt;
11307 static void
11308 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
11310 hash_set<slp_tree> visited;
11311 vect_remove_slp_scalar_calls (vinfo, node, visited);
11314 /* Vectorize the instance root. */
11316 void
11317 vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
11319 gassign *rstmt = NULL;
11321 if (instance->kind == slp_inst_kind_ctor)
11323 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
11325 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
11326 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11327 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
11328 TREE_TYPE (vect_lhs)))
11329 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
11330 vect_lhs);
11331 rstmt = gimple_build_assign (root_lhs, vect_lhs);
11333 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
11335 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
11336 tree child_def;
11337 int j;
11338 vec<constructor_elt, va_gc> *v;
11339 vec_alloc (v, nelts);
11341 /* A CTOR can handle V16HI composition from VNx8HI so we
11342 do not need to convert vector elements if the types
11343 do not match. */
11344 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
11345 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
11346 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11347 tree rtype
11348 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
11349 tree r_constructor = build_constructor (rtype, v);
11350 rstmt = gimple_build_assign (lhs, r_constructor);
11353 else if (instance->kind == slp_inst_kind_bb_reduc)
11355 /* Largely inspired by reduction chain epilogue handling in
11356 vect_create_epilog_for_reduction. */
11357 vec<tree> vec_defs = vNULL;
11358 vect_get_slp_defs (node, &vec_defs);
11359 enum tree_code reduc_code
11360 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
11361 /* ??? We actually have to reflect signs somewhere. */
11362 if (reduc_code == MINUS_EXPR)
11363 reduc_code = PLUS_EXPR;
11364 gimple_seq epilogue = NULL;
11365       /* We may end up with more than one vector result; reduce them
11366 to one vector. */
11367 tree vec_def = vec_defs[0];
11368 tree vectype = TREE_TYPE (vec_def);
11369 tree compute_vectype = vectype;
11370 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
11371 && TYPE_OVERFLOW_UNDEFINED (vectype)
11372 && operation_can_overflow (reduc_code));
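      /* If the reduction operation could overflow and overflow is undefined
	 for VECTYPE, do the accumulation in the corresponding unsigned type
	 to avoid introducing undefined overflow.  */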
11373 if (pun_for_overflow_p)
11375 compute_vectype = unsigned_type_for (vectype);
11376 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11377 compute_vectype, vec_def);
11379 for (unsigned i = 1; i < vec_defs.length (); ++i)
11381 tree def = vec_defs[i];
11382 if (pun_for_overflow_p)
11383 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11384 compute_vectype, def);
11385 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
11386 vec_def, def);
11388 vec_defs.release ();
11389 /* ??? Support other schemes than direct internal fn. */
11390 internal_fn reduc_fn;
11391 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
11392 || reduc_fn == IFN_LAST)
11393 gcc_unreachable ();
11394 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
11395 TREE_TYPE (compute_vectype), vec_def);
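      /* Defs of the reduction that remain unvectorized
	 (SLP_INSTANCE_REMAIN_DEFS) are accumulated separately below and then
	 folded into the scalar result.  */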
11396 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
11398 tree rem_def = NULL_TREE;
11399 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
11401 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
11402 if (!rem_def)
11403 rem_def = def;
11404 else
11405 rem_def = gimple_build (&epilogue, reduc_code,
11406 TREE_TYPE (scalar_def),
11407 rem_def, def);
11409 scalar_def = gimple_build (&epilogue, reduc_code,
11410 TREE_TYPE (scalar_def),
11411 scalar_def, rem_def);
11413 scalar_def = gimple_convert (&epilogue,
11414 TREE_TYPE (vectype), scalar_def);
11415 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11416 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
11417 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
11418 update_stmt (gsi_stmt (rgsi));
11419 return;
11421 else if (instance->kind == slp_inst_kind_gcond)
11423 /* Only support a single root for now as we can't codegen CFG yet and so we
11424 can't support lane > 1 at this time. */
11425 gcc_assert (instance->root_stmts.length () == 1);
11426 auto root_stmt_info = instance->root_stmts[0];
11427 auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
11428 gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
11429 gimple *vec_stmt = NULL;
11430 gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
11431 bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
11432 &vec_stmt, node, NULL);
11433 gcc_assert (res);
11434 return;
11436 else
11437 gcc_unreachable ();
11439 gcc_assert (rstmt);
11441 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11442 gsi_replace (&rgsi, rstmt, true);
11445 struct slp_scc_info
11447 bool on_stack;
11448 int dfs;
11449 int lowlink;
11452 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
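/* The walk below is essentially Tarjan's SCC algorithm: DFS preorder numbers
   and low-links are recorded in SLP_SCC_INFO and the nodes of the open SCC
   are kept on an explicit STACK.  */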
11454 static void
11455 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
11456 hash_map<slp_tree, slp_scc_info> &scc_info,
11457 int &maxdfs, vec<slp_tree> &stack)
11459 bool existed_p;
11460 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
11461 gcc_assert (!existed_p);
11462 info->dfs = maxdfs;
11463 info->lowlink = maxdfs;
11464 maxdfs++;
11466 /* Leaf. */
11467 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11469 info->on_stack = false;
11470 vect_schedule_slp_node (vinfo, node, instance);
11471 return;
11474 info->on_stack = true;
11475 stack.safe_push (node);
11477 unsigned i;
11478 slp_tree child;
11479 /* DFS recurse. */
11480 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11482 if (!child)
11483 continue;
11484 slp_scc_info *child_info = scc_info.get (child);
11485 if (!child_info)
11487 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
11488 /* Recursion might have re-allocated the node. */
11489 info = scc_info.get (node);
11490 child_info = scc_info.get (child);
11491 info->lowlink = MIN (info->lowlink, child_info->lowlink);
11493 else if (child_info->on_stack)
11494 info->lowlink = MIN (info->lowlink, child_info->dfs);
11496 if (info->lowlink != info->dfs)
11497 return;
11499 auto_vec<slp_tree, 4> phis_to_fixup;
11501 /* Singleton. */
11502 if (stack.last () == node)
11504 stack.pop ();
11505 info->on_stack = false;
11506 vect_schedule_slp_node (vinfo, node, instance);
11507 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11508 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
11509 phis_to_fixup.quick_push (node);
11511 else
11513 /* SCC. */
11514 int last_idx = stack.length () - 1;
11515 while (stack[last_idx] != node)
11516 last_idx--;
11517 	 /* We can break the cycle at PHIs that have at least one child
11518 code generated. Then we could re-start the DFS walk until
11519 all nodes in the SCC are covered (we might have new entries
11520 for only back-reachable nodes). But it's simpler to just
11521 iterate and schedule those that are ready. */
11522 unsigned todo = stack.length () - last_idx;
11525 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
11527 slp_tree entry = stack[idx];
11528 if (!entry)
11529 continue;
11530 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
11531 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
11532 bool ready = !phi;
11533 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
11534 if (!child)
11536 gcc_assert (phi);
11537 ready = true;
11538 break;
11540 else if (scc_info.get (child)->on_stack)
11542 if (!phi)
11544 ready = false;
11545 break;
11548 else
11550 if (phi)
11552 ready = true;
11553 break;
11556 if (ready)
11558 vect_schedule_slp_node (vinfo, entry, instance);
11559 scc_info.get (entry)->on_stack = false;
11560 stack[idx] = NULL;
11561 todo--;
11562 if (phi)
11563 phis_to_fixup.safe_push (entry);
11567 while (todo != 0);
11569 /* Pop the SCC. */
11570 stack.truncate (last_idx);
11573 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
11574 slp_tree phi_node;
11575 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
11577 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
11578 edge_iterator ei;
11579 edge e;
11580 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
11582 unsigned dest_idx = e->dest_idx;
11583 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
11584 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
11585 continue;
11586 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
11587 /* Simply fill all args. */
11588 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
11589 != vect_first_order_recurrence)
11590 for (unsigned i = 0; i < n; ++i)
11592 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
11593 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11594 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
11595 e, gimple_phi_arg_location (phi, dest_idx));
11597 else
11599 /* Unless it is a first order recurrence which needs
11600 args filled in for both the PHI node and the permutes. */
11601 gimple *perm
11602 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
11603 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
11604 add_phi_arg (as_a <gphi *> (rphi),
11605 vect_get_slp_vect_def (child, n - 1),
11606 e, gimple_phi_arg_location (phi, dest_idx));
11607 for (unsigned i = 0; i < n; ++i)
11609 gimple *perm
11610 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
11611 if (i > 0)
11612 gimple_assign_set_rhs1 (perm,
11613 vect_get_slp_vect_def (child, i - 1));
11614 gimple_assign_set_rhs2 (perm,
11615 vect_get_slp_vect_def (child, i));
11616 update_stmt (perm);
11623 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
11625 void
11626 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
11628 slp_instance instance;
11629 unsigned int i;
11631 hash_map<slp_tree, slp_scc_info> scc_info;
11632 int maxdfs = 0;
11633 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11635 slp_tree node = SLP_INSTANCE_TREE (instance);
11636 if (dump_enabled_p ())
11638 dump_printf_loc (MSG_NOTE, vect_location,
11639 "Vectorizing SLP tree:\n");
11640 /* ??? Dump all? */
11641 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11642 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
11643 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
11644 vect_print_slp_graph (MSG_NOTE, vect_location,
11645 SLP_INSTANCE_TREE (instance));
11647       /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
11648 	 makes a PHI the node breaking the cycle.  */
11649 auto_vec<slp_tree> stack;
11650 if (!scc_info.get (node))
11651 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
11653 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11654 vectorize_slp_instance_root_stmt (vinfo, node, instance);
11656 if (dump_enabled_p ())
11657 dump_printf_loc (MSG_NOTE, vect_location,
11658 "vectorizing stmts using SLP.\n");
11661 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11663 slp_tree root = SLP_INSTANCE_TREE (instance);
11664 stmt_vec_info store_info;
11665 unsigned int j;
11667 /* Remove scalar call stmts. Do not do this for basic-block
11668 vectorization as not all uses may be vectorized.
11669 ??? Why should this be necessary? DCE should be able to
11670 remove the stmts itself.
11671 ??? For BB vectorization we can as well remove scalar
11672 stmts starting from the SLP tree root if they have no
11673 uses. */
11674 if (is_a <loop_vec_info> (vinfo))
11675 vect_remove_slp_scalar_calls (vinfo, root);
11677       /* Remove the original scalar stmts of vectorized stores.  */
11678 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
11680 if (!STMT_VINFO_DATA_REF (store_info)
11681 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
11682 break;
11684 store_info = vect_orig_stmt (store_info);
11685 /* Free the attached stmt_vec_info and remove the stmt. */
11686 vinfo->remove_stmt (store_info);
11688 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
11689 to not crash in vect_free_slp_tree later. */
11690 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11691 SLP_TREE_REPRESENTATIVE (root) = NULL;