gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #define INCLUDE_MEMORY
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "gimple.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "insn-config.h"
36 #include "recog.h" /* FIXME: for insn_data */
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "cfgloop.h"
41 #include "tree-vectorizer.h"
42 #include "langhooks.h"
43 #include "gimple-walk.h"
44 #include "dbgcnt.h"
45 #include "tree-vector-builder.h"
46 #include "vec-perm-indices.h"
47 #include "gimple-fold.h"
48 #include "internal-fn.h"
49 #include "dump-context.h"
50 #include "cfganal.h"
51 #include "tree-eh.h"
52 #include "tree-cfg.h"
53 #include "alloc-pool.h"
54 #include "sreal.h"
55 #include "predict.h"
57 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
58 load_permutation_t &,
59 const vec<tree> &,
60 gimple_stmt_iterator *,
61 poly_uint64, bool, bool,
62 unsigned *,
63 unsigned * = nullptr,
64 bool = false);
65 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
66 slp_tree, lane_permutation_t &,
67 vec<slp_tree> &, bool);
68 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
69 slp_tree, stmt_vector_for_cost *);
70 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
72 static object_allocator<_slp_tree> *slp_tree_pool;
73 static slp_tree slp_first_node;
75 void
76 vect_slp_init (void)
78 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
81 void
82 vect_slp_fini (void)
84 while (slp_first_node)
85 delete slp_first_node;
86 delete slp_tree_pool;
87 slp_tree_pool = NULL;
90 void *
91 _slp_tree::operator new (size_t n)
93 gcc_assert (n == sizeof (_slp_tree));
94 return slp_tree_pool->allocate_raw ();
97 void
98 _slp_tree::operator delete (void *node, size_t n)
100 gcc_assert (n == sizeof (_slp_tree));
101 slp_tree_pool->remove_raw (node);
105 /* Initialize a SLP node. */
107 _slp_tree::_slp_tree ()
109 this->prev_node = NULL;
110 if (slp_first_node)
111 slp_first_node->prev_node = this;
112 this->next_node = slp_first_node;
113 slp_first_node = this;
114 SLP_TREE_SCALAR_STMTS (this) = vNULL;
115 SLP_TREE_SCALAR_OPS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
122 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 SLP_TREE_CODE (this) = ERROR_MARK;
124 this->ldst_lanes = false;
125 SLP_TREE_VECTYPE (this) = NULL_TREE;
126 SLP_TREE_REPRESENTATIVE (this) = NULL;
127 SLP_TREE_REF_COUNT (this) = 1;
128 this->failed = NULL;
129 this->max_nunits = 1;
130 this->lanes = 0;
133 /* Tear down a SLP node. */
135 _slp_tree::~_slp_tree ()
137 if (this->prev_node)
138 this->prev_node->next_node = this->next_node;
139 else
140 slp_first_node = this->next_node;
141 if (this->next_node)
142 this->next_node->prev_node = this->prev_node;
143 SLP_TREE_CHILDREN (this).release ();
144 SLP_TREE_SCALAR_STMTS (this).release ();
145 SLP_TREE_SCALAR_OPS (this).release ();
146 SLP_TREE_VEC_DEFS (this).release ();
147 SLP_TREE_LOAD_PERMUTATION (this).release ();
148 SLP_TREE_LANE_PERMUTATION (this).release ();
149 SLP_TREE_SIMD_CLONE_INFO (this).release ();
150 if (this->failed)
151 free (failed);
154 /* Push the single SSA definition in DEF to the vector of vector defs. */
156 void
157 _slp_tree::push_vec_def (gimple *def)
159 if (gphi *phi = dyn_cast <gphi *> (def))
160 vec_defs.quick_push (gimple_phi_result (phi));
161 else
163 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
164 vec_defs.quick_push (get_def_from_ptr (defop));
168 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
170 void
171 vect_free_slp_tree (slp_tree node)
173 int i;
174 slp_tree child;
176 if (--SLP_TREE_REF_COUNT (node) != 0)
177 return;
179 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
180 if (child)
181 vect_free_slp_tree (child);
183 /* If the node defines any SLP only patterns then those patterns are no
184 longer valid and should be removed. */
185 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
186 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
188 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
189 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
190 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
193 delete node;
196 /* Return a location suitable for dumps related to the SLP instance. */
198 dump_user_location_t
199 _slp_instance::location () const
201 if (!root_stmts.is_empty ())
202 return root_stmts[0]->stmt;
203 else
204 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
208 /* Free the memory allocated for the SLP instance. */
210 void
211 vect_free_slp_instance (slp_instance instance)
213 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
214 SLP_INSTANCE_LOADS (instance).release ();
215 SLP_INSTANCE_ROOT_STMTS (instance).release ();
216 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
217 instance->subgraph_entries.release ();
218 instance->cost_vec.release ();
219 free (instance);
223 /* Create an SLP node with NOPS children and operation CODE. */
225 slp_tree
226 vect_create_new_slp_node (unsigned nops, tree_code code)
228 slp_tree node = new _slp_tree;
229 SLP_TREE_SCALAR_STMTS (node) = vNULL;
230 SLP_TREE_CHILDREN (node).create (nops);
231 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
232 SLP_TREE_CODE (node) = code;
233 return node;
235 /* Create an SLP node for SCALAR_STMTS. */
237 static slp_tree
238 vect_create_new_slp_node (slp_tree node,
239 vec<stmt_vec_info> scalar_stmts, unsigned nops)
241 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
242 SLP_TREE_CHILDREN (node).create (nops);
243 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
244 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
245 SLP_TREE_LANES (node) = scalar_stmts.length ();
246 return node;
249 /* Create an SLP node for SCALAR_STMTS. */
251 static slp_tree
252 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
254 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
257 /* Create an SLP node for OPS. */
259 static slp_tree
260 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
262 SLP_TREE_SCALAR_OPS (node) = ops;
263 SLP_TREE_DEF_TYPE (node) = vect_external_def;
264 SLP_TREE_LANES (node) = ops.length ();
265 return node;
268 /* Create an SLP node for OPS. */
270 static slp_tree
271 vect_create_new_slp_node (vec<tree> ops)
273 return vect_create_new_slp_node (new _slp_tree, ops);
277 /* This structure is used in creation of an SLP tree. Each instance
278 corresponds to the same operand in a group of scalar stmts in an SLP
279 node. */
280 typedef struct _slp_oprnd_info
282 /* Def-stmts for the operands. */
283 vec<stmt_vec_info> def_stmts;
284 /* Operands. */
285 vec<tree> ops;
286 /* Information about the first statement, its vector def-type, type, the
287 operand itself in case it's constant, and an indication if it's a pattern
288 stmt and gather/scatter info. */
289 tree first_op_type;
290 enum vect_def_type first_dt;
291 bool any_pattern;
292 bool first_gs_p;
293 gather_scatter_info first_gs_info;
294 } *slp_oprnd_info;
297 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
298 operand. */
299 static vec<slp_oprnd_info>
300 vect_create_oprnd_info (int nops, int group_size)
302 int i;
303 slp_oprnd_info oprnd_info;
304 vec<slp_oprnd_info> oprnds_info;
306 oprnds_info.create (nops);
307 for (i = 0; i < nops; i++)
309 oprnd_info = XNEW (struct _slp_oprnd_info);
310 oprnd_info->def_stmts.create (group_size);
311 oprnd_info->ops.create (group_size);
312 oprnd_info->first_dt = vect_uninitialized_def;
313 oprnd_info->first_op_type = NULL_TREE;
314 oprnd_info->any_pattern = false;
315 oprnd_info->first_gs_p = false;
316 oprnds_info.quick_push (oprnd_info);
319 return oprnds_info;
323 /* Free operands info. */
325 static void
326 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
328 int i;
329 slp_oprnd_info oprnd_info;
331 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
333 oprnd_info->def_stmts.release ();
334 oprnd_info->ops.release ();
335 XDELETE (oprnd_info);
338 oprnds_info.release ();
341 /* Return the execution frequency of NODE (so that a higher value indicates
342 a "more important" node when optimizing for speed). */
344 static sreal
345 vect_slp_node_weight (slp_tree node)
347 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
348 basic_block bb = gimple_bb (stmt_info->stmt);
349 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
352 /* Return true if STMTS contains a pattern statement. */
354 static bool
355 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
357 stmt_vec_info stmt_info;
358 unsigned int i;
359 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
360 if (stmt_info && is_pattern_stmt_p (stmt_info))
361 return true;
362 return false;
365 /* Return true when all lanes in the external or constant NODE have
366 the same value. */
368 static bool
369 vect_slp_tree_uniform_p (slp_tree node)
371 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
372 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
374 /* Pre-existing vectors. */
375 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
376 return false;
378 unsigned i;
379 tree op, first = NULL_TREE;
380 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
381 if (!first)
382 first = op;
383 else if (!operand_equal_p (first, op, 0))
384 return false;
386 return true;
389 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
390 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
391 of the chain. */
394 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
395 stmt_vec_info first_stmt_info)
397 stmt_vec_info next_stmt_info = first_stmt_info;
398 int result = 0;
400 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
401 return -1;
405 if (next_stmt_info == stmt_info)
406 return result;
407 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
408 if (next_stmt_info)
409 result += DR_GROUP_GAP (next_stmt_info);
411 while (next_stmt_info);
413 return -1;
416 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
417 using the method implemented by duplicate_and_interleave. Return true
418 if so, returning the number of intermediate vectors in *NVECTORS_OUT
419 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
420 (if nonnull). */
422 bool
423 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
424 tree elt_type, unsigned int *nvectors_out,
425 tree *vector_type_out,
426 tree *permutes)
428 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
429 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
430 return false;
432 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
433 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
434 unsigned int nvectors = 1;
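/* The loop below first tries to fuse all COUNT elements into a single
   integer element (NVECTORS == 1); each failed attempt halves ELT_BYTES
   and doubles NVECTORS so that progressively shorter runs of
   COUNT / NVECTORS elements are fused, giving up once ELT_BYTES can no
   longer be halved.  */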
435 for (;;)
437 scalar_int_mode int_mode;
438 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
439 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
441 /* Get the natural vector type for this SLP group size. */
442 tree int_type = build_nonstandard_integer_type
443 (GET_MODE_BITSIZE (int_mode), 1);
444 tree vector_type
445 = get_vectype_for_scalar_type (vinfo, int_type, count);
446 poly_int64 half_nelts;
447 if (vector_type
448 && VECTOR_MODE_P (TYPE_MODE (vector_type))
449 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
450 GET_MODE_SIZE (base_vector_mode))
451 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
452 2, &half_nelts))
454 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
455 together into elements of type INT_TYPE and using the result
456 to build NVECTORS vectors. */
457 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
458 vec_perm_builder sel1 (nelts, 2, 3);
459 vec_perm_builder sel2 (nelts, 2, 3);
461 for (unsigned int i = 0; i < 3; ++i)
463 sel1.quick_push (i);
464 sel1.quick_push (i + nelts);
465 sel2.quick_push (half_nelts + i);
466 sel2.quick_push (half_nelts + i + nelts);
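/* At this point SEL1 selects { 0, NELTS, 1, NELTS + 1, ... }, interleaving
   the low halves of the two permute inputs, and SEL2 interleaves the high
   halves starting at HALF_NELTS.  */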
468 vec_perm_indices indices1 (sel1, 2, nelts);
469 vec_perm_indices indices2 (sel2, 2, nelts);
470 machine_mode vmode = TYPE_MODE (vector_type);
471 if (can_vec_perm_const_p (vmode, vmode, indices1)
472 && can_vec_perm_const_p (vmode, vmode, indices2))
474 if (nvectors_out)
475 *nvectors_out = nvectors;
476 if (vector_type_out)
477 *vector_type_out = vector_type;
478 if (permutes)
480 permutes[0] = vect_gen_perm_mask_checked (vector_type,
481 indices1);
482 permutes[1] = vect_gen_perm_mask_checked (vector_type,
483 indices2);
485 return true;
489 if (!multiple_p (elt_bytes, 2, &elt_bytes))
490 return false;
491 nvectors *= 2;
495 /* Return true if DTA and DTB match. */
497 static bool
498 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
500 return (dta == dtb
501 || ((dta == vect_external_def || dta == vect_constant_def)
502 && (dtb == vect_external_def || dtb == vect_constant_def)));
505 static const int cond_expr_maps[3][5] = {
506 { 4, -1, -2, 1, 2 },
507 { 4, -2, -1, 1, 2 },
508 { 4, -1, -2, 2, 1 }
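/* The three rows above correspond to SWAP values 0, 1 and 2: row 0 maps the
   children to the embedded comparison operands and the two arms unchanged,
   row 1 swaps the comparison operands, and row 2 swaps the THEN and ELSE
   arms (used when the comparison code is inverted).  */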
510 static const int arg0_map[] = { 1, 0 };
511 static const int arg1_map[] = { 1, 1 };
512 static const int arg2_map[] = { 1, 2 };
513 static const int arg1_arg4_map[] = { 2, 1, 4 };
514 static const int arg3_arg2_map[] = { 2, 3, 2 };
515 static const int op1_op0_map[] = { 2, 1, 0 };
516 static const int off_map[] = { 1, -3 };
517 static const int off_op0_map[] = { 2, -3, 0 };
518 static const int off_arg2_map[] = { 2, -3, 2 };
519 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
520 static const int mask_call_maps[6][7] = {
521 { 1, 1, },
522 { 2, 1, 2, },
523 { 3, 1, 2, 3, },
524 { 4, 1, 2, 3, 4, },
525 { 5, 1, 2, 3, 4, 5, },
526 { 6, 1, 2, 3, 4, 5, 6 },
529 /* For most SLP statements, there is a one-to-one mapping between
530 gimple arguments and child nodes. If that is not true for STMT,
531 return an array that contains:
533 - the number of child nodes, followed by
534 - for each child node, the index of the argument associated with that node.
535 The special index -1 is the first operand of an embedded comparison and
536 the special index -2 is the second operand of an embedded comparison.
537 The special index -3 is the offset of a gather as analyzed by
538 vect_check_gather_scatter.
540 SWAP is as for vect_get_and_check_slp_defs. */
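/* For example, arg1_arg4_map above is { 2, 1, 4 }: the statement gets two
   SLP children, mapped to call arguments 1 and 4; it is used below for
   IFN_MASK_GATHER_LOAD and IFN_MASK_LEN_GATHER_LOAD, where those arguments
   are the gather offset and the mask.  */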
542 static const int *
543 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
544 unsigned char swap = 0)
546 if (auto assign = dyn_cast<const gassign *> (stmt))
548 if (gimple_assign_rhs_code (assign) == COND_EXPR
549 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
550 return cond_expr_maps[swap];
551 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
552 && swap)
553 return op1_op0_map;
554 if (gather_scatter_p)
555 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
556 ? off_op0_map : off_map);
558 gcc_assert (!swap);
559 if (auto call = dyn_cast<const gcall *> (stmt))
561 if (gimple_call_internal_p (call))
562 switch (gimple_call_internal_fn (call))
564 case IFN_MASK_LOAD:
565 return gather_scatter_p ? off_arg2_map : arg2_map;
567 case IFN_GATHER_LOAD:
568 return arg1_map;
570 case IFN_MASK_GATHER_LOAD:
571 case IFN_MASK_LEN_GATHER_LOAD:
572 return arg1_arg4_map;
574 case IFN_MASK_STORE:
575 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
577 case IFN_MASK_CALL:
579 unsigned nargs = gimple_call_num_args (call);
580 if (nargs >= 2 && nargs <= 7)
581 return mask_call_maps[nargs-2];
582 else
583 return nullptr;
586 case IFN_CLZ:
587 case IFN_CTZ:
588 return arg0_map;
590 default:
591 break;
594 return nullptr;
597 /* Return the SLP node child index for operand OP of STMT. */
600 vect_slp_child_index_for_operand (const gimple *stmt, int op,
601 bool gather_scatter_p)
603 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
604 if (!opmap)
605 return op;
606 for (int i = 1; i < 1 + opmap[0]; ++i)
607 if (opmap[i] == op)
608 return i - 1;
609 gcc_unreachable ();
612 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
613 they are of a valid type and that they match the defs of the first stmt of
614 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
615 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
616 indicates swap is required for cond_expr stmts. Specifically, SWAP
617 is 1 if STMT is cond and operands of comparison need to be swapped;
618 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
620 If there was a fatal error return -1; if the error could be corrected by
621 swapping operands of the father node of this one, return 1; if everything is
622 ok return 0. */
623 static int
624 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
625 bool *skip_args,
626 vec<stmt_vec_info> stmts, unsigned stmt_num,
627 vec<slp_oprnd_info> *oprnds_info)
629 stmt_vec_info stmt_info = stmts[stmt_num];
630 tree oprnd;
631 unsigned int i, number_of_oprnds;
632 enum vect_def_type dt = vect_uninitialized_def;
633 slp_oprnd_info oprnd_info;
634 gather_scatter_info gs_info;
635 unsigned int gs_op = -1u;
636 unsigned int commutative_op = -1U;
637 bool first = stmt_num == 0;
639 if (!is_a<gcall *> (stmt_info->stmt)
640 && !is_a<gassign *> (stmt_info->stmt)
641 && !is_a<gphi *> (stmt_info->stmt))
642 return -1;
644 number_of_oprnds = gimple_num_args (stmt_info->stmt);
645 const int *map
646 = vect_get_operand_map (stmt_info->stmt,
647 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
648 if (map)
649 number_of_oprnds = *map++;
650 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
652 if (gimple_call_internal_p (stmt))
654 internal_fn ifn = gimple_call_internal_fn (stmt);
655 commutative_op = first_commutative_argument (ifn);
658 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
660 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
661 commutative_op = 0;
664 bool swapped = (swap != 0);
665 bool backedge = false;
666 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
667 for (i = 0; i < number_of_oprnds; i++)
669 oprnd_info = (*oprnds_info)[i];
670 int opno = map ? map[i] : int (i);
671 if (opno == -3)
673 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
674 if (!is_a <loop_vec_info> (vinfo)
675 || !vect_check_gather_scatter (stmt_info,
676 as_a <loop_vec_info> (vinfo),
677 first ? &oprnd_info->first_gs_info
678 : &gs_info))
679 return -1;
681 if (first)
683 oprnd_info->first_gs_p = true;
684 oprnd = oprnd_info->first_gs_info.offset;
686 else
688 gs_op = i;
689 oprnd = gs_info.offset;
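/* A negative OPNO (-1 or -2, see cond_expr_maps above) selects the first
   or second operand of the embedded comparison of a COND_EXPR.  */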
692 else if (opno < 0)
693 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
694 else
696 oprnd = gimple_arg (stmt_info->stmt, opno);
697 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
699 edge e = gimple_phi_arg_edge (stmt, opno);
700 backedge = (is_a <bb_vec_info> (vinfo)
701 ? e->flags & EDGE_DFS_BACK
702 : dominated_by_p (CDI_DOMINATORS, e->src,
703 gimple_bb (stmt_info->stmt)));
706 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
707 oprnd = TREE_OPERAND (oprnd, 0);
709 stmt_vec_info def_stmt_info;
710 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Build SLP failed: can't analyze def for %T\n",
715 oprnd);
717 return -1;
720 if (skip_args[i])
722 oprnd_info->def_stmts.quick_push (NULL);
723 oprnd_info->ops.quick_push (NULL_TREE);
724 oprnd_info->first_dt = vect_uninitialized_def;
725 continue;
728 oprnd_info->def_stmts.quick_push (def_stmt_info);
729 oprnd_info->ops.quick_push (oprnd);
731 if (def_stmt_info
732 && is_pattern_stmt_p (def_stmt_info))
734 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
735 != def_stmt_info)
736 oprnd_info->any_pattern = true;
737 else
738 /* If we promote this to external use the original stmt def. */
739 oprnd_info->ops.last ()
740 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
743 /* If there's an extern def on a backedge make sure we can
744 code-generate at the region start.
745 ??? This is another case that could be fixed by adjusting
746 how we split the function but at the moment we'd have conflicting
747 goals there. */
748 if (backedge
749 && dts[i] == vect_external_def
750 && is_a <bb_vec_info> (vinfo)
751 && TREE_CODE (oprnd) == SSA_NAME
752 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
753 && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
754 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: extern def %T only defined "
759 "on backedge\n", oprnd);
760 return -1;
763 if (first)
765 tree type = TREE_TYPE (oprnd);
766 dt = dts[i];
768 /* For the swapping logic below force vect_reduction_def
769 for the reduction op in a SLP reduction group. */
770 if (!STMT_VINFO_DATA_REF (stmt_info)
771 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
772 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
773 && def_stmt_info)
774 dts[i] = dt = vect_reduction_def;
776 /* Check the types of the definition. */
777 switch (dt)
779 case vect_external_def:
780 case vect_constant_def:
781 case vect_internal_def:
782 case vect_reduction_def:
783 case vect_double_reduction_def:
784 case vect_induction_def:
785 case vect_nested_cycle:
786 case vect_first_order_recurrence:
787 break;
789 default:
790 /* FORNOW: Not supported. */
791 if (dump_enabled_p ())
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "Build SLP failed: illegal type of def %T\n",
794 oprnd);
795 return -1;
798 oprnd_info->first_dt = dt;
799 oprnd_info->first_op_type = type;
802 if (first)
803 return 0;
805 /* Now match the operand definition types to that of the first stmt. */
806 for (i = 0; i < number_of_oprnds;)
808 if (skip_args[i])
810 ++i;
811 continue;
814 oprnd_info = (*oprnds_info)[i];
815 dt = dts[i];
816 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
817 oprnd = oprnd_info->ops[stmt_num];
818 tree type = TREE_TYPE (oprnd);
820 if (!types_compatible_p (oprnd_info->first_op_type, type))
822 if (dump_enabled_p ())
823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
824 "Build SLP failed: different operand types\n");
825 return 1;
828 if ((gs_op == i) != oprnd_info->first_gs_p)
830 if (dump_enabled_p ())
831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
832 "Build SLP failed: mixed gather and non-gather\n");
833 return 1;
835 else if (gs_op == i)
837 if (!operand_equal_p (oprnd_info->first_gs_info.base,
838 gs_info.base))
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
842 "Build SLP failed: different gather base\n");
843 return 1;
845 if (oprnd_info->first_gs_info.scale != gs_info.scale)
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
849 "Build SLP failed: different gather scale\n");
850 return 1;
854 /* Not first stmt of the group, check that the def-stmt/s match
855 the def-stmt/s of the first stmt. Allow different definition
856 types for reduction chains: the first stmt must be a
857 vect_reduction_def (a phi node), and the rest
858 end in the reduction chain. */
859 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
860 && !(oprnd_info->first_dt == vect_reduction_def
861 && !STMT_VINFO_DATA_REF (stmt_info)
862 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
863 && def_stmt_info
864 && !STMT_VINFO_DATA_REF (def_stmt_info)
865 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
866 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
867 || (!STMT_VINFO_DATA_REF (stmt_info)
868 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
869 && ((!def_stmt_info
870 || STMT_VINFO_DATA_REF (def_stmt_info)
871 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
872 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
873 != (oprnd_info->first_dt != vect_reduction_def))))
875 /* Try swapping operands if we got a mismatch. For BB
876 vectorization only in case it will clearly improve things. */
877 if (i == commutative_op && !swapped
878 && (!is_a <bb_vec_info> (vinfo)
879 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
880 dts[i+1])
881 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
882 || vect_def_types_match
883 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "trying swapped operands\n");
888 std::swap (dts[i], dts[i+1]);
889 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
890 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
891 std::swap ((*oprnds_info)[i]->ops[stmt_num],
892 (*oprnds_info)[i+1]->ops[stmt_num]);
893 /* After swapping some operands we lost track of whether an
894 operand has any pattern defs so be conservative here. */
895 if ((*oprnds_info)[i]->any_pattern
896 || (*oprnds_info)[i+1]->any_pattern)
897 (*oprnds_info)[i]->any_pattern
898 = (*oprnds_info)[i+1]->any_pattern = true;
899 swapped = true;
900 continue;
903 if (is_a <bb_vec_info> (vinfo)
904 && !oprnd_info->any_pattern)
906 /* Now for commutative ops we should see whether we can
907 make the other operand matching. */
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "treating operand as external\n");
911 oprnd_info->first_dt = dt = vect_external_def;
913 else
915 if (dump_enabled_p ())
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "Build SLP failed: different types\n");
918 return 1;
922 /* Make sure to demote the overall operand to external. */
923 if (dt == vect_external_def)
924 oprnd_info->first_dt = vect_external_def;
925 /* For a SLP reduction chain we want to duplicate the reduction to
926 each of the chain members. That gets us a sane SLP graph (still
927 the stmts are not 100% correct wrt the initial values). */
928 else if ((dt == vect_internal_def
929 || dt == vect_reduction_def)
930 && oprnd_info->first_dt == vect_reduction_def
931 && !STMT_VINFO_DATA_REF (stmt_info)
932 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
933 && !STMT_VINFO_DATA_REF (def_stmt_info)
934 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
935 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
937 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
938 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
941 ++i;
944 /* Swap operands. */
945 if (swapped)
947 if (dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "swapped operands to match def types in %G",
950 stmt_info->stmt);
953 return 0;
956 /* Return true if call statements CALL1 and CALL2 are similar enough
957 to be combined into the same SLP group. */
959 bool
960 compatible_calls_p (gcall *call1, gcall *call2)
962 unsigned int nargs = gimple_call_num_args (call1);
963 if (nargs != gimple_call_num_args (call2))
964 return false;
966 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
967 return false;
969 if (gimple_call_internal_p (call1))
971 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
972 TREE_TYPE (gimple_call_lhs (call2))))
973 return false;
974 for (unsigned int i = 0; i < nargs; ++i)
975 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
976 TREE_TYPE (gimple_call_arg (call2, i))))
977 return false;
979 else
981 if (!operand_equal_p (gimple_call_fn (call1),
982 gimple_call_fn (call2), 0))
983 return false;
985 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
986 return false;
989 /* Check that any unvectorized arguments are equal. */
990 if (const int *map = vect_get_operand_map (call1))
992 unsigned int nkept = *map++;
993 unsigned int mapi = 0;
994 for (unsigned int i = 0; i < nargs; ++i)
995 if (mapi < nkept && map[mapi] == int (i))
996 mapi += 1;
997 else if (!operand_equal_p (gimple_call_arg (call1, i),
998 gimple_call_arg (call2, i)))
999 return false;
1002 return true;
1005 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1006 caller's attempt to find the vector type in STMT_INFO with the narrowest
1007 element type. Return true if VECTYPE is nonnull and if it is valid
1008 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1009 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1010 vect_build_slp_tree. */
1012 static bool
1013 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1014 unsigned int group_size,
1015 tree vectype, poly_uint64 *max_nunits)
1017 if (!vectype)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Build SLP failed: unsupported data-type in %G\n",
1022 stmt_info->stmt);
1023 /* Fatal mismatch. */
1024 return false;
1027 /* If populating the vector type requires unrolling then fail
1028 before adjusting *max_nunits for basic-block vectorization. */
1029 if (is_a <bb_vec_info> (vinfo)
1030 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Build SLP failed: unrolling required "
1035 "in basic block SLP\n");
1036 /* Fatal mismatch. */
1037 return false;
1040 /* In case of multiple types we need to detect the smallest type. */
1041 vect_update_max_nunits (max_nunits, vectype);
1042 return true;
1045 /* Verify that the scalar stmts STMTS are isomorphic, do not require a data
1046 permutation and are not of unsupported types of operation. Return
1047 true if so, otherwise return false and indicate in *MATCHES
1048 which stmts are not isomorphic to the first one. If MATCHES[0]
1049 is false then this indicates the comparison could not be
1050 carried out or the stmts will never be vectorized by SLP.
1052 Note COND_EXPR is possibly isomorphic to another one after swapping its
1053 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1054 the first stmt by swapping the two operands of comparison; set SWAP[i]
1055 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1056 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1057 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1059 static bool
1060 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1061 vec<stmt_vec_info> stmts, unsigned int group_size,
1062 poly_uint64 *max_nunits, bool *matches,
1063 bool *two_operators, tree *node_vectype)
1065 unsigned int i;
1066 stmt_vec_info first_stmt_info = stmts[0];
1067 code_helper first_stmt_code = ERROR_MARK;
1068 code_helper alt_stmt_code = ERROR_MARK;
1069 code_helper rhs_code = ERROR_MARK;
1070 code_helper first_cond_code = ERROR_MARK;
1071 tree lhs;
1072 bool need_same_oprnds = false;
1073 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1074 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1075 bool first_stmt_ldst_p = false, ldst_p = false;
1076 bool first_stmt_phi_p = false, phi_p = false;
1077 int first_reduc_idx = -1;
1078 bool maybe_soft_fail = false;
1079 tree soft_fail_nunits_vectype = NULL_TREE;
1081 /* For every stmt in NODE find its def stmt/s. */
1082 stmt_vec_info stmt_info;
1083 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1085 swap[i] = 0;
1086 matches[i] = false;
1087 if (!stmt_info)
1089 matches[i] = true;
1090 continue;
1093 gimple *stmt = stmt_info->stmt;
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1097 /* Fail to vectorize statements marked as unvectorizable, statements
1098 that can throw, or statements with volatile operands. */
1099 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1100 || stmt_can_throw_internal (cfun, stmt)
1101 || gimple_has_volatile_ops (stmt))
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1105 "Build SLP failed: unvectorizable statement %G",
1106 stmt);
1107 /* ??? For BB vectorization we want to commute operands in a way
1108 that shuffles all unvectorizable defs into one operand and keeps
1109 the other vectorizable. The following doesn't reliably
1110 work for this, but it's the easiest we can do here. */
1111 if (is_a <bb_vec_info> (vinfo) && i != 0)
1112 continue;
1113 /* Fatal mismatch. */
1114 matches[0] = false;
1115 return false;
1118 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1119 lhs = gimple_get_lhs (stmt);
1120 if (lhs == NULL_TREE
1121 && (!call_stmt
1122 || !gimple_call_internal_p (stmt)
1123 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Build SLP failed: not GIMPLE_ASSIGN nor "
1128 "GIMPLE_CALL %G", stmt);
1129 if (is_a <bb_vec_info> (vinfo) && i != 0)
1130 continue;
1131 /* Fatal mismatch. */
1132 matches[0] = false;
1133 return false;
1136 tree nunits_vectype;
1137 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1138 &nunits_vectype, group_size))
1140 if (is_a <bb_vec_info> (vinfo) && i != 0)
1141 continue;
1142 /* Fatal mismatch. */
1143 matches[0] = false;
1144 return false;
1146 /* Record nunits required but continue analysis, producing matches[]
1147 as if nunits was not an issue. This allows splitting of groups
1148 to happen. */
1149 if (nunits_vectype
1150 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1151 nunits_vectype, max_nunits))
1153 gcc_assert (is_a <bb_vec_info> (vinfo));
1154 maybe_soft_fail = true;
1155 soft_fail_nunits_vectype = nunits_vectype;
1158 gcc_assert (vectype);
1160 if (call_stmt)
1162 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1163 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1164 rhs_code = cfn;
1165 else
1166 rhs_code = CALL_EXPR;
1168 if (cfn == CFN_MASK_LOAD
1169 || cfn == CFN_GATHER_LOAD
1170 || cfn == CFN_MASK_GATHER_LOAD
1171 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1172 ldst_p = true;
1173 else if (cfn == CFN_MASK_STORE)
1175 ldst_p = true;
1176 rhs_code = CFN_MASK_STORE;
1178 else if ((cfn != CFN_LAST
1179 && cfn != CFN_MASK_CALL
1180 && internal_fn_p (cfn)
1181 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1182 || gimple_call_tail_p (call_stmt)
1183 || gimple_call_noreturn_p (call_stmt)
1184 || gimple_call_chain (call_stmt))
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "Build SLP failed: unsupported call type %G",
1189 (gimple *) call_stmt);
1190 if (is_a <bb_vec_info> (vinfo) && i != 0)
1191 continue;
1192 /* Fatal mismatch. */
1193 matches[0] = false;
1194 return false;
1197 else if (gimple_code (stmt) == GIMPLE_PHI)
1199 rhs_code = ERROR_MARK;
1200 phi_p = true;
1202 else
1204 rhs_code = gimple_assign_rhs_code (stmt);
1205 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1208 /* Check the operation. */
1209 if (i == 0)
1211 *node_vectype = vectype;
1212 first_stmt_code = rhs_code;
1213 first_stmt_ldst_p = ldst_p;
1214 first_stmt_phi_p = phi_p;
1215 first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1217 /* Shift arguments should be equal in all the packed stmts for a
1218 vector shift with scalar shift operand. */
1219 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1220 || rhs_code == LROTATE_EXPR
1221 || rhs_code == RROTATE_EXPR)
1223 /* First see if we have a vector/vector shift. */
1224 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1226 /* No vector/vector shift, try for a vector/scalar shift. */
1227 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 "Build SLP failed: "
1232 "op not supported by target.\n");
1233 if (is_a <bb_vec_info> (vinfo) && i != 0)
1234 continue;
1235 /* Fatal mismatch. */
1236 matches[0] = false;
1237 return false;
1239 need_same_oprnds = true;
1240 first_op1 = gimple_assign_rhs2 (stmt);
1243 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1245 need_same_oprnds = true;
1246 first_op1 = gimple_assign_rhs2 (stmt);
1248 else if (!ldst_p
1249 && rhs_code == BIT_FIELD_REF)
1251 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1252 if (!is_a <bb_vec_info> (vinfo)
1253 || TREE_CODE (vec) != SSA_NAME
1254 /* When the element types are not compatible we pun the
1255 source to the target vectype which requires equal size. */
1256 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1257 || !types_compatible_p (TREE_TYPE (vectype),
1258 TREE_TYPE (TREE_TYPE (vec))))
1259 && !operand_equal_p (TYPE_SIZE (vectype),
1260 TYPE_SIZE (TREE_TYPE (vec)))))
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "Build SLP failed: "
1265 "BIT_FIELD_REF not supported\n");
1266 /* Fatal mismatch. */
1267 matches[0] = false;
1268 return false;
1271 else if (rhs_code == CFN_DIV_POW2)
1273 need_same_oprnds = true;
1274 first_op1 = gimple_call_arg (call_stmt, 1);
1277 else
1279 if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1280 /* For SLP reduction groups the index isn't necessarily
1281 uniform but only that of the first stmt matters. */
1282 && !(first_reduc_idx != -1
1283 && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1284 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1286 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "Build SLP failed: different reduc_idx "
1290 "%d instead of %d in %G",
1291 STMT_VINFO_REDUC_IDX (stmt_info),
1292 first_reduc_idx, stmt);
1294 /* Mismatch. */
1295 continue;
1297 if (first_stmt_code != rhs_code
1298 && alt_stmt_code == ERROR_MARK)
1299 alt_stmt_code = rhs_code;
1300 if ((first_stmt_code != rhs_code
1301 && (first_stmt_code != IMAGPART_EXPR
1302 || rhs_code != REALPART_EXPR)
1303 && (first_stmt_code != REALPART_EXPR
1304 || rhs_code != IMAGPART_EXPR)
1305 /* Handle mismatches in plus/minus by computing both
1306 and merging the results. */
1307 && !((first_stmt_code == PLUS_EXPR
1308 || first_stmt_code == MINUS_EXPR)
1309 && (alt_stmt_code == PLUS_EXPR
1310 || alt_stmt_code == MINUS_EXPR)
1311 && rhs_code == alt_stmt_code)
1312 && !(first_stmt_code.is_tree_code ()
1313 && rhs_code.is_tree_code ()
1314 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1315 == tcc_comparison)
1316 && (swap_tree_comparison (tree_code (first_stmt_code))
1317 == tree_code (rhs_code)))
1318 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1319 && (first_stmt_code == ARRAY_REF
1320 || first_stmt_code == BIT_FIELD_REF
1321 || first_stmt_code == COMPONENT_REF
1322 || first_stmt_code == REALPART_EXPR
1323 || first_stmt_code == IMAGPART_EXPR
1324 || first_stmt_code == MEM_REF)
1325 && (rhs_code == ARRAY_REF
1326 || rhs_code == BIT_FIELD_REF
1327 || rhs_code == COMPONENT_REF
1328 || rhs_code == REALPART_EXPR
1329 || rhs_code == IMAGPART_EXPR
1330 || rhs_code == MEM_REF)))
1331 || (ldst_p
1332 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1333 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1334 || (ldst_p
1335 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1336 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1337 || first_stmt_ldst_p != ldst_p
1338 || first_stmt_phi_p != phi_p)
1340 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "Build SLP failed: different operation "
1344 "in stmt %G", stmt);
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "original stmt %G", first_stmt_info->stmt);
1348 /* Mismatch. */
1349 continue;
1352 if (!ldst_p
1353 && first_stmt_code == BIT_FIELD_REF
1354 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1355 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1359 "Build SLP failed: different BIT_FIELD_REF "
1360 "arguments in %G", stmt);
1361 /* Mismatch. */
1362 continue;
1365 if (call_stmt
1366 && first_stmt_code != CFN_MASK_LOAD
1367 && first_stmt_code != CFN_MASK_STORE)
1369 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1370 call_stmt))
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "Build SLP failed: different calls in %G",
1375 stmt);
1376 /* Mismatch. */
1377 continue;
1381 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1382 && (gimple_bb (first_stmt_info->stmt)
1383 != gimple_bb (stmt_info->stmt)))
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "Build SLP failed: different BB for PHI "
1388 "or possibly trapping operation in %G", stmt);
1389 /* Mismatch. */
1390 continue;
1393 if (need_same_oprnds)
1395 tree other_op1 = gimple_arg (stmt, 1);
1396 if (!operand_equal_p (first_op1, other_op1, 0))
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "Build SLP failed: different shift "
1401 "arguments in %G", stmt);
1402 /* Mismatch. */
1403 continue;
1407 if (!types_compatible_p (vectype, *node_vectype))
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 "Build SLP failed: different vector type "
1412 "in %G", stmt);
1413 /* Mismatch. */
1414 continue;
1418 /* Grouped store or load. */
1419 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1421 gcc_assert (ldst_p);
1422 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1424 /* Store. */
1425 gcc_assert (rhs_code == CFN_MASK_STORE
1426 || REFERENCE_CLASS_P (lhs)
1427 || DECL_P (lhs));
1429 else
1431 /* Load. */
1432 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1433 if (prev_first_load)
1435 /* Check that there are no loads from different interleaving
1436 chains in the same node. */
1437 if (prev_first_load != first_load)
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1441 vect_location,
1442 "Build SLP failed: different "
1443 "interleaving chains in one node %G",
1444 stmt);
1445 /* Mismatch. */
1446 continue;
1449 else
1450 prev_first_load = first_load;
1453 /* Non-grouped store or load. */
1454 else if (ldst_p)
1456 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1457 && rhs_code != CFN_GATHER_LOAD
1458 && rhs_code != CFN_MASK_GATHER_LOAD
1459 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1460 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1461 /* Non-grouped loads are handled as externals for BB
1462 vectorization. For loop vectorization we can handle
1463 splats the same way we handle single-element interleaving. */
1464 && (is_a <bb_vec_info> (vinfo)
1465 || stmt_info != first_stmt_info))
1467 /* Non-grouped load. */
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "Build SLP failed: not grouped load %G", stmt);
1472 if (i != 0)
1473 continue;
1474 /* Fatal mismatch. */
1475 matches[0] = false;
1476 return false;
1479 /* Not a memory operation. */
1480 else
1482 if (!phi_p
1483 && rhs_code.is_tree_code ()
1484 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1485 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1486 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1487 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1488 && rhs_code != VIEW_CONVERT_EXPR
1489 && rhs_code != CALL_EXPR
1490 && rhs_code != BIT_FIELD_REF)
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "Build SLP failed: operation unsupported %G",
1495 stmt);
1496 if (is_a <bb_vec_info> (vinfo) && i != 0)
1497 continue;
1498 /* Fatal mismatch. */
1499 matches[0] = false;
1500 return false;
1503 if (rhs_code == COND_EXPR)
1505 tree cond_expr = gimple_assign_rhs1 (stmt);
1506 enum tree_code cond_code = TREE_CODE (cond_expr);
1507 enum tree_code swap_code = ERROR_MARK;
1508 enum tree_code invert_code = ERROR_MARK;
1510 if (i == 0)
1511 first_cond_code = TREE_CODE (cond_expr);
1512 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1514 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1515 swap_code = swap_tree_comparison (cond_code);
1516 invert_code = invert_tree_comparison (cond_code, honor_nans);
1519 if (first_cond_code == cond_code)
1521 /* Isomorphism can be achieved by swapping. */
1522 else if (first_cond_code == swap_code)
1523 swap[i] = 1;
1524 /* Isomorphism can be achieved by inverting. */
1525 else if (first_cond_code == invert_code)
1526 swap[i] = 2;
1527 else
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531 "Build SLP failed: different"
1532 " operation %G", stmt);
1533 /* Mismatch. */
1534 continue;
1538 if (rhs_code.is_tree_code ()
1539 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1540 && (swap_tree_comparison ((tree_code)first_stmt_code)
1541 == (tree_code)rhs_code))
1542 swap[i] = 1;
1545 matches[i] = true;
1548 for (i = 0; i < group_size; ++i)
1549 if (!matches[i])
1550 return false;
1552 /* If we allowed a two-operation SLP node verify the target can cope
1553 with the permute we are going to use. */
1554 if (alt_stmt_code != ERROR_MARK
1555 && (!alt_stmt_code.is_tree_code ()
1556 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1557 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1559 *two_operators = true;
1562 if (maybe_soft_fail)
1564 unsigned HOST_WIDE_INT const_nunits;
1565 if (!TYPE_VECTOR_SUBPARTS
1566 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1567 || const_nunits > group_size)
1568 matches[0] = false;
1569 else
1571 /* With a constant number of vector elements, simulate a mismatch at the
1572 point we need to split. */
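/* GROUP_SIZE & (CONST_NUNITS - 1) is GROUP_SIZE % CONST_NUNITS since the
   number of vector elements is a power of two; clearing the matches for
   the trailing partial group makes the caller split the group there.  */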
1573 unsigned tail = group_size & (const_nunits - 1);
1574 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1576 return false;
1579 return true;
1582 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1583 Note we never remove apart from at destruction time so we do not
1584 need a special value for deleted that differs from empty. */
1585 struct bst_traits
1587 typedef vec <stmt_vec_info> value_type;
1588 typedef vec <stmt_vec_info> compare_type;
1589 static inline hashval_t hash (value_type);
1590 static inline bool equal (value_type existing, value_type candidate);
1591 static inline bool is_empty (value_type x) { return !x.exists (); }
1592 static inline bool is_deleted (value_type x) { return !x.exists (); }
1593 static const bool empty_zero_p = true;
1594 static inline void mark_empty (value_type &x) { x.release (); }
1595 static inline void mark_deleted (value_type &x) { x.release (); }
1596 static inline void remove (value_type &x) { x.release (); }
1598 inline hashval_t
1599 bst_traits::hash (value_type x)
1601 inchash::hash h;
1602 for (unsigned i = 0; i < x.length (); ++i)
1603 h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1604 return h.end ();
1606 inline bool
1607 bst_traits::equal (value_type existing, value_type candidate)
1609 if (existing.length () != candidate.length ())
1610 return false;
1611 for (unsigned i = 0; i < existing.length (); ++i)
1612 if (existing[i] != candidate[i])
1613 return false;
1614 return true;
1617 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1618 simple_hashmap_traits <bst_traits, slp_tree> >
1619 scalar_stmts_to_slp_tree_map_t;
1621 /* Release BST_MAP. */
1623 static void
1624 release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1626 /* The map keeps a reference on SLP nodes built, release that. */
1627 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1628 it != bst_map->end (); ++it)
1629 if ((*it).second)
1630 vect_free_slp_tree ((*it).second);
1631 delete bst_map;
1634 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1635 but then vec::insert does memmove and that's not compatible with
1636 std::pair. */
1637 struct chain_op_t
1639 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1640 : code (code_), dt (dt_), op (op_) {}
1641 tree_code code;
1642 vect_def_type dt;
1643 tree op;
1646 /* Comparator for sorting associatable chains. */
1648 static int
1649 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1651 auto *op1 = (const chain_op_t *) op1_;
1652 auto *op2 = (const chain_op_t *) op2_;
1653 if (op1->dt != op2->dt)
1654 return (int)op1->dt - (int)op2->dt;
1655 return (int)op1->code - (int)op2->code;
1658 /* Linearize the associatable expression chain at START with the
1659 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1660 filling CHAIN with the result and using WORKLIST as intermediate storage.
1661 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1662 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1663 stmts, starting with START. */
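/* For example, for a lane computing x = (a + b) - c and CODE == PLUS_EXPR
   the chain ends up containing { +a, +b, -c } (in some order), with
   CODE_STMT pointing at the PLUS_EXPR statement and ALT_CODE_STMT at the
   MINUS_EXPR one.  */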
1665 static void
1666 vect_slp_linearize_chain (vec_info *vinfo,
1667 vec<std::pair<tree_code, gimple *> > &worklist,
1668 vec<chain_op_t> &chain,
1669 enum tree_code code, gimple *start,
1670 gimple *&code_stmt, gimple *&alt_code_stmt,
1671 vec<gimple *> *chain_stmts)
1673 /* For each lane linearize the addition/subtraction (or other
1674 uniform associatable operation) expression tree. */
1675 worklist.safe_push (std::make_pair (code, start));
1676 while (!worklist.is_empty ())
1678 auto entry = worklist.pop ();
1679 gassign *stmt = as_a <gassign *> (entry.second);
1680 enum tree_code in_code = entry.first;
1681 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1682 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1683 if (!code_stmt
1684 && gimple_assign_rhs_code (stmt) == code)
1685 code_stmt = stmt;
1686 else if (!alt_code_stmt
1687 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1688 alt_code_stmt = stmt;
1689 if (chain_stmts)
1690 chain_stmts->safe_push (stmt);
1691 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1693 tree op = gimple_op (stmt, opnum);
1694 vect_def_type dt;
1695 stmt_vec_info def_stmt_info;
1696 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1697 gcc_assert (res);
1698 if (dt == vect_internal_def
1699 && is_pattern_stmt_p (def_stmt_info))
1700 op = gimple_get_lhs (def_stmt_info->stmt);
1701 gimple *use_stmt;
1702 use_operand_p use_p;
1703 if (dt == vect_internal_def
1704 && single_imm_use (op, &use_p, &use_stmt)
1705 && is_gimple_assign (def_stmt_info->stmt)
1706 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1707 || (code == PLUS_EXPR
1708 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1709 == MINUS_EXPR))))
1711 tree_code op_def_code = this_code;
1712 if (op_def_code == MINUS_EXPR && opnum == 1)
1713 op_def_code = PLUS_EXPR;
1714 if (in_code == MINUS_EXPR)
1715 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1716 worklist.safe_push (std::make_pair (op_def_code,
1717 def_stmt_info->stmt));
1719 else
1721 tree_code op_def_code = this_code;
1722 if (op_def_code == MINUS_EXPR && opnum == 1)
1723 op_def_code = PLUS_EXPR;
1724 if (in_code == MINUS_EXPR)
1725 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1726 chain.safe_push (chain_op_t (op_def_code, dt, op));
1732 static slp_tree
1733 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1734 vec<stmt_vec_info> stmts, unsigned int group_size,
1735 poly_uint64 *max_nunits,
1736 bool *matches, unsigned *limit, unsigned *tree_size,
1737 scalar_stmts_to_slp_tree_map_t *bst_map);
1739 static slp_tree
1740 vect_build_slp_tree (vec_info *vinfo,
1741 vec<stmt_vec_info> stmts, unsigned int group_size,
1742 poly_uint64 *max_nunits,
1743 bool *matches, unsigned *limit, unsigned *tree_size,
1744 scalar_stmts_to_slp_tree_map_t *bst_map)
1746 if (slp_tree *leader = bst_map->get (stmts))
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1750 !(*leader)->failed ? "" : "failed ",
1751 (void *) *leader);
1752 if (!(*leader)->failed)
1754 SLP_TREE_REF_COUNT (*leader)++;
1755 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1756 stmts.release ();
1757 return *leader;
1759 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1760 return NULL;
1763 /* Single-lane SLP discovery cannot run away, so do not count it
1764 against the limit. */
1765 if (stmts.length () > 1)
1767 if (*limit == 0)
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "SLP discovery limit exceeded\n");
1772 memset (matches, 0, sizeof (bool) * group_size);
1773 return NULL;
1775 --*limit;
1778 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1779 so we can pick up backedge destinations during discovery. */
1780 slp_tree res = new _slp_tree;
1781 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1782 SLP_TREE_SCALAR_STMTS (res) = stmts;
1783 bst_map->put (stmts.copy (), res);
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_NOTE, vect_location,
1787 "starting SLP discovery for node %p\n", (void *) res);
1789 poly_uint64 this_max_nunits = 1;
1790 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1791 &this_max_nunits,
1792 matches, limit, tree_size, bst_map);
1793 if (!res_)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "SLP discovery for node %p failed\n", (void *) res);
1798 /* Mark the node invalid so we can detect those when still in use
1799 as backedge destinations. */
1800 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1801 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1802 res->failed = XNEWVEC (bool, group_size);
1803 if (flag_checking)
1805 unsigned i;
1806 for (i = 0; i < group_size; ++i)
1807 if (!matches[i])
1808 break;
1809 gcc_assert (i < group_size);
1811 memcpy (res->failed, matches, sizeof (bool) * group_size);
1813 else
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_NOTE, vect_location,
1817 "SLP discovery for node %p succeeded\n",
1818 (void *) res);
1819 gcc_assert (res_ == res);
1820 res->max_nunits = this_max_nunits;
1821 vect_update_max_nunits (max_nunits, this_max_nunits);
1822 /* Keep a reference for the bst_map use. */
1823 SLP_TREE_REF_COUNT (res)++;
1825 return res_;
1828 /* Helper for building an associated SLP node chain. */
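/* The built PERM becomes a VEC_PERM_EXPR node over two internal nodes that
   both have OP0 and OP1 as children, one represented by OPER1 and the
   other by OPER2; LPERM then selects, lane by lane, whether a lane comes
   from the OPER1 or the OPER2 computation.  */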
1830 static void
1831 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1832 slp_tree op0, slp_tree op1,
1833 stmt_vec_info oper1, stmt_vec_info oper2,
1834 vec<std::pair<unsigned, unsigned> > lperm)
1836 unsigned group_size = SLP_TREE_LANES (op1);
1838 slp_tree child1 = new _slp_tree;
1839 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1840 SLP_TREE_VECTYPE (child1) = vectype;
1841 SLP_TREE_LANES (child1) = group_size;
1842 SLP_TREE_CHILDREN (child1).create (2);
1843 SLP_TREE_CHILDREN (child1).quick_push (op0);
1844 SLP_TREE_CHILDREN (child1).quick_push (op1);
1845 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1847 slp_tree child2 = new _slp_tree;
1848 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1849 SLP_TREE_VECTYPE (child2) = vectype;
1850 SLP_TREE_LANES (child2) = group_size;
1851 SLP_TREE_CHILDREN (child2).create (2);
1852 SLP_TREE_CHILDREN (child2).quick_push (op0);
1853 SLP_TREE_REF_COUNT (op0)++;
1854 SLP_TREE_CHILDREN (child2).quick_push (op1);
1855 SLP_TREE_REF_COUNT (op1)++;
1856 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1858 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1859 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1860 SLP_TREE_VECTYPE (perm) = vectype;
1861 SLP_TREE_LANES (perm) = group_size;
1862 /* ??? We should set this NULL but that's not expected. */
1863 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1864 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1865 SLP_TREE_CHILDREN (perm).quick_push (child1);
1866 SLP_TREE_CHILDREN (perm).quick_push (child2);
1869 /* Recursively build an SLP tree starting from NODE.
1870 Fail (and return NULL) if def-stmts are not
1871 isomorphic, require data permutation or are of unsupported types of
1872 operation. Otherwise, return the built SLP node.
1873 On failure MATCHES records which lanes could not be matched
1874 to the first one. */
1876 static slp_tree
1877 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1878 vec<stmt_vec_info> stmts, unsigned int group_size,
1879 poly_uint64 *max_nunits,
1880 bool *matches, unsigned *limit, unsigned *tree_size,
1881 scalar_stmts_to_slp_tree_map_t *bst_map)
1883 unsigned nops, i, this_tree_size = 0;
1884 poly_uint64 this_max_nunits = *max_nunits;
1886 matches[0] = false;
1888 stmt_vec_info stmt_info = stmts[0];
1889 if (!is_a<gcall *> (stmt_info->stmt)
1890 && !is_a<gassign *> (stmt_info->stmt)
1891 && !is_a<gphi *> (stmt_info->stmt))
1892 return NULL;
1894 nops = gimple_num_args (stmt_info->stmt);
1895 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1896 STMT_VINFO_GATHER_SCATTER_P
1897 (stmt_info)))
1898 nops = map[0];
1900 /* If the SLP node is a PHI (induction or reduction), terminate
1901 the recursion. */
1902 bool *skip_args = XALLOCAVEC (bool, nops);
1903 memset (skip_args, 0, sizeof (bool) * nops);
1904 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1905 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1907 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1908 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1909 group_size);
1910 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1911 max_nunits))
1912 return NULL;
1914 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1915 if (def_type == vect_induction_def)
 1917 /* Induction PHIs are not cycles but walk the initial
 1918    value.  Only for inner loops though; for outer loops
 1919    we need to pick up the value from the actual PHIs
 1920    to more easily support peeling and epilogue vectorization. */
1921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922 if (!nested_in_vect_loop_p (loop, stmt_info))
1923 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1924 else
1925 loop = loop->inner;
1926 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1928 else if (def_type == vect_reduction_def
1929 || def_type == vect_double_reduction_def
1930 || def_type == vect_nested_cycle
1931 || def_type == vect_first_order_recurrence)
1933 /* Else def types have to match. */
1934 stmt_vec_info other_info;
1935 bool all_same = true;
1936 FOR_EACH_VEC_ELT (stmts, i, other_info)
1938 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1939 return NULL;
1940 if (other_info != stmt_info)
1941 all_same = false;
1943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1944 /* Reduction initial values are not explicitly represented. */
1945 if (def_type != vect_first_order_recurrence
1946 && gimple_bb (stmt_info->stmt) == loop->header)
1947 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1948 /* Reduction chain backedge defs are filled manually.
 1949 ??? Need a better way to identify an SLP reduction chain PHI.
1950 Or a better overall way to SLP match those. */
1951 if (stmts.length () > 1
1952 && all_same && def_type == vect_reduction_def)
1953 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1955 else if (def_type != vect_internal_def)
1956 return NULL;
1960 bool two_operators = false;
1961 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1962 tree vectype = NULL_TREE;
1963 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1964 &this_max_nunits, matches, &two_operators,
1965 &vectype))
1966 return NULL;
1968 /* If the SLP node is a load, terminate the recursion unless masked. */
1969 if (STMT_VINFO_DATA_REF (stmt_info)
1970 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1972 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1973 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1974 else
1976 *max_nunits = this_max_nunits;
1977 (*tree_size)++;
1978 node = vect_create_new_slp_node (node, stmts, 0);
1979 SLP_TREE_VECTYPE (node) = vectype;
1980 /* And compute the load permutation. Whether it is actually
1981 a permutation depends on the unrolling factor which is
1982 decided later. */
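	  /* E.g. for an interleaving group a[0], a[1], a[2], a[3] whose SLP
	     lanes reference a[2], a[0], a[3], a[1] this records the load
	     permutation { 2, 0, 3, 1 }; the identity { 0, 1, 2, 3 } means no
	     permutation is required once the unrolling factor is known.  */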
1983 vec<unsigned> load_permutation;
1984 int j;
1985 stmt_vec_info load_info;
1986 load_permutation.create (group_size);
1987 stmt_vec_info first_stmt_info
1988 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1989 bool any_permute = false;
1990 bool any_null = false;
1991 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1993 int load_place;
1994 if (! load_info)
1996 load_place = j;
1997 any_null = true;
1999 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000 load_place = vect_get_place_in_interleaving_chain
2001 (load_info, first_stmt_info);
2002 else
2003 load_place = 0;
2004 gcc_assert (load_place != -1);
2005 any_permute |= load_place != j;
2006 load_permutation.quick_push (load_place);
2008 if (any_null)
2010 gcc_assert (!any_permute);
2011 load_permutation.release ();
2014 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2016 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
2017 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
2018 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
2019 || gimple_call_internal_p (stmt,
2020 IFN_MASK_LEN_GATHER_LOAD));
2021 load_permutation.release ();
2022 /* We cannot handle permuted masked loads, see PR114375. */
2023 if (any_permute
2024 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 && DR_GROUP_SIZE (first_stmt_info) != group_size)
2026 || STMT_VINFO_STRIDED_P (stmt_info))
2028 matches[0] = false;
2029 return NULL;
2032 else
2034 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2035 return node;
2039 else if (gimple_assign_single_p (stmt_info->stmt)
2040 && !gimple_vuse (stmt_info->stmt)
2041 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2043 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
 2044 the same SSA name vector of a type compatible with vectype. */
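      /* E.g. a BIT_FIELD_REF <v, 32, 64> extracting 32 bits at bit offset 64
	 from a vector of 32-bit elements selects lane 64/32 == 2 and is
	 recorded as the lane permutation entry (0, 2) below.  */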
2045 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2046 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2047 stmt_vec_info estmt_info;
2048 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2050 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2051 tree bfref = gimple_assign_rhs1 (estmt);
2052 HOST_WIDE_INT lane;
2053 if (!known_eq (bit_field_size (bfref),
2054 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2055 || !constant_multiple_p (bit_field_offset (bfref),
2056 bit_field_size (bfref), &lane))
2058 lperm.release ();
2059 matches[0] = false;
2060 return NULL;
2062 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2064 slp_tree vnode = vect_create_new_slp_node (vNULL);
2065 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2066 /* ??? We record vectype here but we hide eventually necessary
2067 punning and instead rely on code generation to materialize
2068 VIEW_CONVERT_EXPRs as necessary. We instead should make
2069 this explicit somehow. */
2070 SLP_TREE_VECTYPE (vnode) = vectype;
2071 else
2073 /* For different size but compatible elements we can still
2074 use VEC_PERM_EXPR without punning. */
2075 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2076 && types_compatible_p (TREE_TYPE (vectype),
2077 TREE_TYPE (TREE_TYPE (vec))));
2078 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2080 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2081 unsigned HOST_WIDE_INT const_nunits;
2082 if (nunits.is_constant (&const_nunits))
2083 SLP_TREE_LANES (vnode) = const_nunits;
2084 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2085 /* We are always building a permutation node even if it is an identity
2086 permute to shield the rest of the vectorizer from the odd node
2087 representing an actual vector without any scalar ops.
 2088 ??? We could hide it completely by making the permute node
2089 external? */
2090 node = vect_create_new_slp_node (node, stmts, 1);
2091 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2092 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2093 SLP_TREE_VECTYPE (node) = vectype;
2094 SLP_TREE_CHILDREN (node).quick_push (vnode);
2095 return node;
2097 /* When discovery reaches an associatable operation see whether we can
2098 improve that to match up lanes in a way superior to the operand
2099 swapping code which at most looks at two defs.
2100 ??? For BB vectorization we cannot do the brute-force search
2101 for matching as we can succeed by means of builds from scalars
2102 and have no good way to "cost" one build against another. */
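      /* For instance, lane 0 computing a0 + b0 + c0 and lane 1 computing
	 c1 + a1 + b1 both linearize to three-element chains below; after
	 sorting and the per-lane swapping, the children built for the chain
	 positions can end up as { a0, a1 }, { b0, b1 } and { c0, c1 } even
	 though plain operand swapping could not match them up.  */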
2103 else if (is_a <loop_vec_info> (vinfo)
2104 /* Do not bother for single-lane SLP. */
2105 && group_size > 1
2106 /* ??? We don't handle !vect_internal_def defs below. */
2107 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2108 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2109 mapping as long as that exists on the stmt_info level. */
2110 && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2111 && is_gimple_assign (stmt_info->stmt)
2112 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2113 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2114 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2115 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2116 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2118 /* See if we have a chain of (mixed) adds or subtracts or other
2119 associatable ops. */
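      /* E.g. the lane stmt x = ((a - b) + c) - d linearizes with CODE
	 PLUS_EXPR to the chain { (PLUS, a), (MINUS, b), (PLUS, c),
	 (MINUS, d) }, recording a per-element operation code.  */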
2120 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2121 if (code == MINUS_EXPR)
2122 code = PLUS_EXPR;
2123 stmt_vec_info other_op_stmt_info = NULL;
2124 stmt_vec_info op_stmt_info = NULL;
2125 unsigned chain_len = 0;
2126 auto_vec<chain_op_t> chain;
2127 auto_vec<std::pair<tree_code, gimple *> > worklist;
2128 auto_vec<vec<chain_op_t> > chains (group_size);
2129 auto_vec<slp_tree, 4> children;
2130 bool hard_fail = true;
2131 for (unsigned lane = 0; lane < group_size; ++lane)
2133 /* For each lane linearize the addition/subtraction (or other
2134 uniform associatable operation) expression tree. */
2135 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2136 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2137 stmts[lane]->stmt, op_stmt, other_op_stmt,
2138 NULL);
2139 if (!op_stmt_info && op_stmt)
2140 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2141 if (!other_op_stmt_info && other_op_stmt)
2142 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2143 if (chain.length () == 2)
 2145 /* In a chain of just two elements resort to the regular
 2146    operand swapping scheme.  Likewise, if we run into a
 2147    length mismatch, process regularly; since we did not
 2148    process the other lanes we cannot report a good hint
 2149    as to which lanes to try swapping in the parent. */
2150 hard_fail = false;
2151 break;
2153 else if (chain_len == 0)
2154 chain_len = chain.length ();
2155 else if (chain.length () != chain_len)
2157 /* ??? Here we could slip in magic to compensate with
2158 neutral operands. */
2159 matches[lane] = false;
2160 if (lane != group_size - 1)
2161 matches[0] = false;
2162 break;
2164 chains.quick_push (chain.copy ());
2165 chain.truncate (0);
2167 if (chains.length () == group_size)
2169 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2170 if (!op_stmt_info)
2172 hard_fail = false;
2173 goto out;
2175 /* Now we have a set of chains with the same length. */
2176 /* 1. pre-sort according to def_type and operation. */
2177 for (unsigned lane = 0; lane < group_size; ++lane)
2178 chains[lane].stablesort (dt_sort_cmp, vinfo);
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_NOTE, vect_location,
2182 "pre-sorted chains of %s\n",
2183 get_tree_code_name (code));
2184 for (unsigned lane = 0; lane < group_size; ++lane)
2186 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2187 dump_printf (MSG_NOTE, "%s %T ",
2188 get_tree_code_name (chains[lane][opnum].code),
2189 chains[lane][opnum].op);
2190 dump_printf (MSG_NOTE, "\n");
2193 /* 2. try to build children nodes, associating as necessary. */
2194 for (unsigned n = 0; n < chain_len; ++n)
2196 vect_def_type dt = chains[0][n].dt;
2197 unsigned lane;
2198 for (lane = 0; lane < group_size; ++lane)
2199 if (chains[lane][n].dt != dt)
2201 if (dt == vect_constant_def
2202 && chains[lane][n].dt == vect_external_def)
2203 dt = vect_external_def;
2204 else if (dt == vect_external_def
2205 && chains[lane][n].dt == vect_constant_def)
2207 else
2208 break;
2210 if (lane != group_size)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "giving up on chain due to mismatched "
2215 "def types\n");
2216 matches[lane] = false;
2217 if (lane != group_size - 1)
2218 matches[0] = false;
2219 goto out;
2221 if (dt == vect_constant_def
2222 || dt == vect_external_def)
2224 /* Check whether we can build the invariant. If we can't
2225 we never will be able to. */
2226 tree type = TREE_TYPE (chains[0][n].op);
2227 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2228 && (TREE_CODE (type) == BOOLEAN_TYPE
2229 || !can_duplicate_and_interleave_p (vinfo, group_size,
2230 type)))
2232 matches[0] = false;
2233 goto out;
2235 vec<tree> ops;
2236 ops.create (group_size);
2237 for (lane = 0; lane < group_size; ++lane)
2238 ops.quick_push (chains[lane][n].op);
2239 slp_tree child = vect_create_new_slp_node (ops);
2240 SLP_TREE_DEF_TYPE (child) = dt;
2241 children.safe_push (child);
2243 else if (dt != vect_internal_def)
2245 /* Not sure, we might need sth special.
2246 gcc.dg/vect/pr96854.c,
2247 gfortran.dg/vect/fast-math-pr37021.f90
2248 and gfortran.dg/vect/pr61171.f trigger. */
2249 /* Soft-fail for now. */
2250 hard_fail = false;
2251 goto out;
2253 else
2255 vec<stmt_vec_info> op_stmts;
2256 op_stmts.create (group_size);
2257 slp_tree child = NULL;
2258 /* Brute-force our way. We have to consider a lane
2259 failing after fixing an earlier fail up in the
2260 SLP discovery recursion. So track the current
2261 permute per lane. */
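	      /* For a mismatch of lane L at position N we first try swapping
		 chains[L][N] with chains[L][N + 1], then with chains[L][N + 2]
		 and so on (PERMS[L] counts the attempts); once a lane runs out
		 of candidates the whole search terminates.  */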
2262 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2263 memset (perms, 0, sizeof (unsigned) * group_size);
2266 op_stmts.truncate (0);
2267 for (lane = 0; lane < group_size; ++lane)
2268 op_stmts.quick_push
2269 (vinfo->lookup_def (chains[lane][n].op));
2270 child = vect_build_slp_tree (vinfo, op_stmts,
2271 group_size, &this_max_nunits,
2272 matches, limit,
2273 &this_tree_size, bst_map);
2274 /* ??? We're likely getting too many fatal mismatches
2275 here so maybe we want to ignore them (but then we
2276 have no idea which lanes fatally mismatched). */
2277 if (child || !matches[0])
2278 break;
2279 /* Swap another lane we have not yet matched up into
2280 lanes that did not match. If we run out of
2281 permute possibilities for a lane terminate the
2282 search. */
2283 bool term = false;
2284 for (lane = 1; lane < group_size; ++lane)
2285 if (!matches[lane])
2287 if (n + perms[lane] + 1 == chain_len)
2289 term = true;
2290 break;
2292 std::swap (chains[lane][n],
2293 chains[lane][n + perms[lane] + 1]);
2294 perms[lane]++;
2296 if (term)
2297 break;
2299 while (1);
2300 if (!child)
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "failed to match up op %d\n", n);
2305 op_stmts.release ();
2306 if (lane != group_size - 1)
2307 matches[0] = false;
2308 else
2309 matches[lane] = false;
2310 goto out;
2312 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "matched up op %d to\n", n);
2316 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2318 children.safe_push (child);
2321 /* 3. build SLP nodes to combine the chain. */
2322 for (unsigned lane = 0; lane < group_size; ++lane)
2323 if (chains[lane][0].code != code)
2325 /* See if there's any alternate all-PLUS entry. */
2326 unsigned n;
2327 for (n = 1; n < chain_len; ++n)
2329 for (lane = 0; lane < group_size; ++lane)
2330 if (chains[lane][n].code != code)
2331 break;
2332 if (lane == group_size)
2333 break;
2335 if (n != chain_len)
2337 /* Swap that in at first position. */
2338 std::swap (children[0], children[n]);
2339 for (lane = 0; lane < group_size; ++lane)
2340 std::swap (chains[lane][0], chains[lane][n]);
2342 else
2344 /* ??? When this triggers and we end up with two
 2345 vect_constant/external_def up-front, things break (ICE)
 2346 spectacularly when finding an insertion place for the
2347 all-constant op. We should have a fully
2348 vect_internal_def operand though(?) so we can swap
2349 that into first place and then prepend the all-zero
2350 constant. */
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "inserting constant zero to compensate "
2354 "for (partially) negated first "
2355 "operand\n");
2356 chain_len++;
2357 for (lane = 0; lane < group_size; ++lane)
2358 chains[lane].safe_insert
2359 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2360 vec<tree> zero_ops;
2361 zero_ops.create (group_size);
2362 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2363 for (lane = 1; lane < group_size; ++lane)
2364 zero_ops.quick_push (zero_ops[0]);
2365 slp_tree zero = vect_create_new_slp_node (zero_ops);
2366 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2367 children.safe_insert (0, zero);
2369 break;
2371 for (unsigned i = 1; i < children.length (); ++i)
2373 slp_tree op0 = children[i - 1];
2374 slp_tree op1 = children[i];
2375 bool this_two_op = false;
2376 for (unsigned lane = 0; lane < group_size; ++lane)
2377 if (chains[lane][i].code != chains[0][i].code)
2379 this_two_op = true;
2380 break;
2382 slp_tree child;
2383 if (i == children.length () - 1)
2384 child = vect_create_new_slp_node (node, stmts, 2);
2385 else
2386 child = vect_create_new_slp_node (2, ERROR_MARK);
2387 if (this_two_op)
2389 vec<std::pair<unsigned, unsigned> > lperm;
2390 lperm.create (group_size);
2391 for (unsigned lane = 0; lane < group_size; ++lane)
2392 lperm.quick_push (std::make_pair
2393 (chains[lane][i].code != chains[0][i].code, lane));
2394 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2395 (chains[0][i].code == code
2396 ? op_stmt_info
2397 : other_op_stmt_info),
2398 (chains[0][i].code == code
2399 ? other_op_stmt_info
2400 : op_stmt_info),
2401 lperm);
2403 else
2405 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2406 SLP_TREE_VECTYPE (child) = vectype;
2407 SLP_TREE_LANES (child) = group_size;
2408 SLP_TREE_CHILDREN (child).quick_push (op0);
2409 SLP_TREE_CHILDREN (child).quick_push (op1);
2410 SLP_TREE_REPRESENTATIVE (child)
2411 = (chains[0][i].code == code
2412 ? op_stmt_info : other_op_stmt_info);
2414 children[i] = child;
2416 *tree_size += this_tree_size + 1;
2417 *max_nunits = this_max_nunits;
2418 while (!chains.is_empty ())
2419 chains.pop ().release ();
2420 return node;
2422 out:
2423 if (dump_enabled_p ())
2424 dump_printf_loc (MSG_NOTE, vect_location,
2425 "failed to line up SLP graph by re-associating "
2426 "operations in lanes%s\n",
2427 !hard_fail ? " trying regular discovery" : "");
2428 while (!children.is_empty ())
2429 vect_free_slp_tree (children.pop ());
2430 while (!chains.is_empty ())
2431 chains.pop ().release ();
2432 /* Hard-fail, otherwise we might run into quadratic processing of the
2433 chains starting one stmt into the chain again. */
2434 if (hard_fail)
2435 return NULL;
2436 /* Fall thru to normal processing. */
2439 /* Get at the operands, verifying they are compatible. */
2440 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2441 slp_oprnd_info oprnd_info;
2442 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2444 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2445 stmts, i, &oprnds_info);
2446 if (res != 0)
2447 matches[(res == -1) ? 0 : i] = false;
2448 if (!matches[0])
2449 break;
2451 for (i = 0; i < group_size; ++i)
2452 if (!matches[i])
2454 vect_free_oprnd_info (oprnds_info);
2455 return NULL;
2457 swap = NULL;
2459 bool has_two_operators_perm = false;
2460 auto_vec<unsigned> two_op_perm_indices[2];
2461 vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2463 if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2465 unsigned idx = 0;
2466 hash_map<gimple *, unsigned> seen;
2467 vec<slp_oprnd_info> new_oprnds_info
2468 = vect_create_oprnd_info (1, group_size);
2469 bool success = true;
2471 enum tree_code code = ERROR_MARK;
2472 if (oprnds_info[0]->def_stmts[0]
2473 && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2474 code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2476 for (unsigned j = 0; j < group_size; ++j)
2478 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2480 stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2481 if (!stmt_info || !stmt_info->stmt
2482 || !is_a<gassign *> (stmt_info->stmt)
2483 || gimple_assign_rhs_code (stmt_info->stmt) != code
2484 || skip_args[i])
2486 success = false;
2487 break;
2490 bool exists;
2491 unsigned &stmt_idx
2492 = seen.get_or_insert (stmt_info->stmt, &exists);
2494 if (!exists)
2496 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2497 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2498 stmt_idx = idx;
2499 idx++;
2502 two_op_perm_indices[i].safe_push (stmt_idx);
2505 if (!success)
2506 break;
2509 if (success && idx == group_size)
2511 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "Replace two_operators operands:\n");
2516 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "Operand %u:\n", i);
2520 for (unsigned j = 0; j < group_size; j++)
2521 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2522 j, oprnd_info->def_stmts[j]->stmt);
2525 dump_printf_loc (MSG_NOTE, vect_location,
2526 "With a single operand:\n");
2527 for (unsigned j = 0; j < group_size; j++)
2528 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2529 j, new_oprnds_info[0]->def_stmts[j]->stmt);
2532 two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2533 two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2535 new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2536 new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2537 new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2538 new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2539 new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2541 vect_free_oprnd_info (oprnds_info);
2542 oprnds_info = new_oprnds_info;
2543 nops = 1;
2544 has_two_operators_perm = true;
2548 auto_vec<slp_tree, 4> children;
2550 stmt_info = stmts[0];
2552 /* Create SLP_TREE nodes for the definition node/s. */
2553 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2555 slp_tree child = nullptr;
2556 unsigned int j;
2558 /* We're skipping certain operands from processing, for example
2559 outer loop reduction initial defs. */
2560 if (skip_args[i])
2562 children.safe_push (NULL);
2563 continue;
2566 if (oprnd_info->first_dt == vect_uninitialized_def)
 2568 /* COND_EXPRs eventually have one operand too many if the
 2569 condition is an SSA name. */
2570 gcc_assert (i == 3 && nops == 4);
2571 continue;
2574 if (is_a <bb_vec_info> (vinfo)
2575 && oprnd_info->first_dt == vect_internal_def
2576 && !oprnd_info->any_pattern)
2578 /* For BB vectorization, if all defs are the same do not
2579 bother to continue the build along the single-lane
2580 graph but use a splat of the scalar value. */
2581 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2582 for (j = 1; j < group_size; ++j)
2583 if (oprnd_info->def_stmts[j] != first_def)
2584 break;
2585 if (j == group_size
2586 /* But avoid doing this for loads where we may be
2587 able to CSE things, unless the stmt is not
2588 vectorizable. */
2589 && (!STMT_VINFO_VECTORIZABLE (first_def)
2590 || !gimple_vuse (first_def->stmt)))
2592 if (dump_enabled_p ())
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "Using a splat of the uniform operand %G",
2595 first_def->stmt);
2596 oprnd_info->first_dt = vect_external_def;
2600 if (oprnd_info->first_dt == vect_external_def
2601 || oprnd_info->first_dt == vect_constant_def)
2603 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2605 tree op0;
2606 tree uniform_val = op0 = oprnd_info->ops[0];
2607 for (j = 1; j < oprnd_info->ops.length (); ++j)
2608 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2610 uniform_val = NULL_TREE;
2611 break;
2613 if (!uniform_val
2614 && !can_duplicate_and_interleave_p (vinfo,
2615 oprnd_info->ops.length (),
2616 TREE_TYPE (op0)))
2618 matches[j] = false;
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2621 "Build SLP failed: invalid type of def "
2622 "for variable-length SLP %T\n", op0);
2623 goto fail;
2626 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2627 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2628 oprnd_info->ops = vNULL;
2629 children.safe_push (invnode);
2630 continue;
2633 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2634 group_size, &this_max_nunits,
2635 matches, limit,
2636 &this_tree_size, bst_map)) != NULL)
2638 oprnd_info->def_stmts = vNULL;
2639 children.safe_push (child);
2640 continue;
 2643 /* If the SLP build for operand zero failed and operands zero
 2644 and one can be commuted, try that for the scalar stmts
 2645 that failed the match. */
2646 if (i == 0
2647 /* A first scalar stmt mismatch signals a fatal mismatch. */
2648 && matches[0]
2649 /* ??? For COND_EXPRs we can swap the comparison operands
2650 as well as the arms under some constraints. */
2651 && nops == 2
2652 && oprnds_info[1]->first_dt == vect_internal_def
2653 && is_gimple_assign (stmt_info->stmt)
2654 /* Swapping operands for reductions breaks assumptions later on. */
2655 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2657 /* See whether we can swap the matching or the non-matching
2658 stmt operands. */
2659 bool swap_not_matching = true;
2662 for (j = 0; j < group_size; ++j)
2664 if (matches[j] != !swap_not_matching)
2665 continue;
2666 stmt_vec_info stmt_info = stmts[j];
2667 /* Verify if we can swap operands of this stmt. */
2668 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2669 if (!stmt
2670 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2672 if (!swap_not_matching)
2673 goto fail;
2674 swap_not_matching = false;
2675 break;
2679 while (j != group_size);
2681 /* Swap mismatched definition stmts. */
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_NOTE, vect_location,
2684 "Re-trying with swapped operands of stmts ");
2685 for (j = 0; j < group_size; ++j)
2686 if (matches[j] == !swap_not_matching)
2688 std::swap (oprnds_info[0]->def_stmts[j],
2689 oprnds_info[1]->def_stmts[j]);
2690 std::swap (oprnds_info[0]->ops[j],
2691 oprnds_info[1]->ops[j]);
2692 if (dump_enabled_p ())
2693 dump_printf (MSG_NOTE, "%d ", j);
2695 if (dump_enabled_p ())
2696 dump_printf (MSG_NOTE, "\n");
2697 /* After swapping some operands we lost track whether an
2698 operand has any pattern defs so be conservative here. */
2699 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2700 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2701 /* And try again with scratch 'matches' ... */
2702 bool *tem = XALLOCAVEC (bool, group_size);
2703 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2704 group_size, &this_max_nunits,
2705 tem, limit,
2706 &this_tree_size, bst_map)) != NULL)
2708 oprnd_info->def_stmts = vNULL;
2709 children.safe_push (child);
2710 continue;
2713 fail:
2715 /* If the SLP build failed and we analyze a basic-block
2716 simply treat nodes we fail to build as externally defined
2717 (and thus build vectors from the scalar defs).
2718 The cost model will reject outright expensive cases.
 2719 ??? This doesn't treat cases where permutation ultimately
2720 fails (or we don't try permutation below). Ideally we'd
2721 even compute a permutation that will end up with the maximum
2722 SLP tree size... */
2723 if (is_a <bb_vec_info> (vinfo)
2724 /* ??? Rejecting patterns this way doesn't work. We'd have to
2725 do extra work to cancel the pattern so the uses see the
2726 scalar version. */
2727 && !is_pattern_stmt_p (stmt_info)
2728 && !oprnd_info->any_pattern)
2730 /* But if there's a leading vector sized set of matching stmts
2731 fail here so we can split the group. This matches the condition
2732 vect_analyze_slp_instance uses. */
2733 /* ??? We might want to split here and combine the results to support
2734 multiple vector sizes better. */
2735 for (j = 0; j < group_size; ++j)
2736 if (!matches[j])
2737 break;
2738 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_NOTE, vect_location,
2742 "Building vector operands from scalars\n");
2743 this_tree_size++;
2744 child = vect_create_new_slp_node (oprnd_info->ops);
2745 children.safe_push (child);
2746 oprnd_info->ops = vNULL;
2747 continue;
2751 gcc_assert (child == NULL);
2752 FOR_EACH_VEC_ELT (children, j, child)
2753 if (child)
2754 vect_free_slp_tree (child);
2755 vect_free_oprnd_info (oprnds_info);
2756 return NULL;
2759 vect_free_oprnd_info (oprnds_info);
 2761 /* If all children of a node are built up from uniform scalars, or
 2762 the node does more than one possibly expensive vector construction,
 2763 then just throw it away, causing it to be built up from scalars.
 2764 The exception is the SLP node for the vector store. */
2765 if (is_a <bb_vec_info> (vinfo)
2766 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2767 /* ??? Rejecting patterns this way doesn't work. We'd have to
2768 do extra work to cancel the pattern so the uses see the
2769 scalar version. */
2770 && !is_pattern_stmt_p (stmt_info))
2772 slp_tree child;
2773 unsigned j;
2774 bool all_uniform_p = true;
2775 unsigned n_vector_builds = 0;
2776 FOR_EACH_VEC_ELT (children, j, child)
2778 if (!child)
2780 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2781 all_uniform_p = false;
2782 else if (!vect_slp_tree_uniform_p (child))
2784 all_uniform_p = false;
2785 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2786 n_vector_builds++;
2789 if (all_uniform_p
2790 || n_vector_builds > 1
2791 || (n_vector_builds == children.length ()
2792 && is_a <gphi *> (stmt_info->stmt)))
2794 /* Roll back. */
2795 matches[0] = false;
2796 FOR_EACH_VEC_ELT (children, j, child)
2797 if (child)
2798 vect_free_slp_tree (child);
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_NOTE, vect_location,
2802 "Building parent vector operands from "
2803 "scalars instead\n");
2804 return NULL;
2808 *tree_size += this_tree_size + 1;
2809 *max_nunits = this_max_nunits;
2811 if (two_operators)
2813 /* ??? We'd likely want to either cache in bst_map sth like
2814 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2815 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2816 explicit stmts to put in so the keying on 'stmts' doesn't
2817 work (but we have the same issue with nodes that use 'ops'). */
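      /* E.g. for the group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the code
	 below builds node ONE computing all-PLUS, node TWO computing
	 all-MINUS and blends them with the lane permutation
	 { (0,0), (1,1), (0,2), (1,3) }.  */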
2819 if (has_two_operators_perm)
2821 slp_tree child = children[0];
2822 children.truncate (0);
2823 for (i = 0; i < 2; i++)
2825 slp_tree pnode
2826 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
2827 SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
2828 SLP_TREE_VECTYPE (pnode) = vectype;
2829 SLP_TREE_CHILDREN (pnode).quick_push (child);
2830 SLP_TREE_CHILDREN (pnode).quick_push (child);
2831 lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
2832 children.safe_push (pnode);
2834 for (unsigned j = 0; j < stmts.length (); j++)
2835 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
2838 SLP_TREE_REF_COUNT (child) += 4;
2841 slp_tree one = new _slp_tree;
2842 slp_tree two = new _slp_tree;
2843 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2844 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2845 SLP_TREE_VECTYPE (one) = vectype;
2846 SLP_TREE_VECTYPE (two) = vectype;
2847 SLP_TREE_CHILDREN (one).safe_splice (children);
2848 SLP_TREE_CHILDREN (two).safe_splice (children);
2849 slp_tree child;
2850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2851 SLP_TREE_REF_COUNT (child)++;
2853 /* Here we record the original defs since this
2854 node represents the final lane configuration. */
2855 node = vect_create_new_slp_node (node, stmts, 2);
2856 SLP_TREE_VECTYPE (node) = vectype;
2857 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2858 SLP_TREE_CHILDREN (node).quick_push (one);
2859 SLP_TREE_CHILDREN (node).quick_push (two);
2860 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2861 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2862 enum tree_code ocode = ERROR_MARK;
2863 stmt_vec_info ostmt_info;
2864 unsigned j = 0;
2865 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2867 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2868 if (gimple_assign_rhs_code (ostmt) != code0)
2870 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2871 ocode = gimple_assign_rhs_code (ostmt);
2872 j = i;
2874 else
2875 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2878 SLP_TREE_CODE (one) = code0;
2879 SLP_TREE_CODE (two) = ocode;
2880 SLP_TREE_LANES (one) = stmts.length ();
2881 SLP_TREE_LANES (two) = stmts.length ();
2882 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2883 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2885 return node;
2888 node = vect_create_new_slp_node (node, stmts, nops);
2889 SLP_TREE_VECTYPE (node) = vectype;
2890 SLP_TREE_CHILDREN (node).splice (children);
2891 return node;
2894 /* Dump a single SLP tree NODE. */
2896 static void
2897 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2898 slp_tree node)
2900 unsigned i, j;
2901 slp_tree child;
2902 stmt_vec_info stmt_info;
2903 tree op;
2905 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2906 dump_user_location_t user_loc = loc.get_user_location ();
2907 dump_printf_loc (metadata, user_loc,
2908 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2909 ", refcnt=%u)",
2910 SLP_TREE_DEF_TYPE (node) == vect_external_def
2911 ? " (external)"
2912 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2913 ? " (constant)"
2914 : ""), (void *) node,
2915 estimated_poly_value (node->max_nunits),
2916 SLP_TREE_REF_COUNT (node));
2917 if (SLP_TREE_VECTYPE (node))
2918 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2919 dump_printf (metadata, "\n");
2920 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2922 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2923 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2924 else
2925 dump_printf_loc (metadata, user_loc, "op template: %G",
2926 SLP_TREE_REPRESENTATIVE (node)->stmt);
2928 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2929 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2930 if (stmt_info)
2931 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
2932 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
2933 i, stmt_info->stmt);
2934 else
2935 dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
2936 else
2938 dump_printf_loc (metadata, user_loc, "\t{ ");
2939 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2940 dump_printf (metadata, "%T%s ", op,
2941 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2942 dump_printf (metadata, "}\n");
2944 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2946 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2947 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2948 dump_printf (dump_kind, " %u", j);
2949 dump_printf (dump_kind, " }\n");
2951 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2953 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2954 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2955 dump_printf (dump_kind, " %u[%u]",
2956 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2957 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2958 dump_printf (dump_kind, " }%s\n",
2959 node->ldst_lanes ? " (load-lanes)" : "");
2961 if (SLP_TREE_CHILDREN (node).is_empty ())
2962 return;
2963 dump_printf_loc (metadata, user_loc, "\tchildren");
2964 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2965 dump_printf (dump_kind, " %p", (void *)child);
2966 dump_printf (dump_kind, "%s\n",
2967 node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
2968 ? " (store-lanes)" : "");
2971 DEBUG_FUNCTION void
2972 debug (slp_tree node)
2974 debug_dump_context ctx;
2975 vect_print_slp_tree (MSG_NOTE,
2976 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2977 node);
2980 /* Recursive helper for the dot producer below. */
2982 static void
2983 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2985 if (visited.add (node))
2986 return;
2988 fprintf (f, "\"%p\" [label=\"", (void *)node);
2989 vect_print_slp_tree (MSG_NOTE,
2990 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2991 node);
2992 fprintf (f, "\"];\n");
2995 for (slp_tree child : SLP_TREE_CHILDREN (node))
2996 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2998 for (slp_tree child : SLP_TREE_CHILDREN (node))
2999 if (child)
3000 dot_slp_tree (f, child, visited);
3003 DEBUG_FUNCTION void
3004 dot_slp_tree (const char *fname, slp_tree node)
3006 FILE *f = fopen (fname, "w");
3007 fprintf (f, "digraph {\n");
3008 fflush (f);
3010 debug_dump_context ctx (f);
3011 hash_set<slp_tree> visited;
3012 dot_slp_tree (f, node, visited);
3014 fflush (f);
3015 fprintf (f, "}\n");
3016 fclose (f);
3019 DEBUG_FUNCTION void
3020 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3022 FILE *f = fopen (fname, "w");
3023 fprintf (f, "digraph {\n");
3024 fflush (f);
3026 debug_dump_context ctx (f);
3027 hash_set<slp_tree> visited;
3028 for (auto inst : slp_instances)
3029 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3031 fflush (f);
3032 fprintf (f, "}\n");
3033 fclose (f);
 3036 /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */
3038 static void
3039 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3040 slp_tree node, hash_set<slp_tree> &visited)
3042 unsigned i;
3043 slp_tree child;
3045 if (visited.add (node))
3046 return;
3048 vect_print_slp_tree (dump_kind, loc, node);
3050 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3051 if (child)
3052 vect_print_slp_graph (dump_kind, loc, child, visited);
3055 static void
3056 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3057 slp_tree entry)
3059 hash_set<slp_tree> visited;
3060 vect_print_slp_graph (dump_kind, loc, entry, visited);
3063 DEBUG_FUNCTION void
3064 debug (slp_instance instance)
3066 debug_dump_context ctx;
3067 vect_print_slp_graph (MSG_NOTE,
3068 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3069 SLP_INSTANCE_TREE (instance));
3072 /* Mark the tree rooted at NODE with PURE_SLP. */
3074 static void
3075 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
3077 int i;
3078 stmt_vec_info stmt_info;
3079 slp_tree child;
3081 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3082 return;
3084 if (visited.add (node))
3085 return;
3087 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3088 if (stmt_info)
3089 STMT_SLP_TYPE (stmt_info) = pure_slp;
3091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3092 if (child)
3093 vect_mark_slp_stmts (child, visited);
3096 static void
3097 vect_mark_slp_stmts (slp_tree node)
3099 hash_set<slp_tree> visited;
3100 vect_mark_slp_stmts (node, visited);
3103 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3105 static void
3106 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3108 int i;
3109 stmt_vec_info stmt_info;
3110 slp_tree child;
3112 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3113 return;
3115 if (visited.add (node))
3116 return;
3118 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3119 if (stmt_info)
3121 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3122 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3123 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3126 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3127 if (child)
3128 vect_mark_slp_stmts_relevant (child, visited);
3131 static void
3132 vect_mark_slp_stmts_relevant (slp_tree node)
3134 hash_set<slp_tree> visited;
3135 vect_mark_slp_stmts_relevant (node, visited);
 3139 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
3141 static void
3142 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3143 hash_set<slp_tree> &visited)
3145 if (!node || visited.add (node))
3146 return;
3148 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3149 return;
3151 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3153 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3154 if (STMT_VINFO_DATA_REF (stmt_info)
3155 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3156 loads.safe_push (node);
3159 unsigned i;
3160 slp_tree child;
3161 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3162 vect_gather_slp_loads (loads, child, visited);
 3166 /* Find the last scalar stmt in NODE. */
3168 stmt_vec_info
3169 vect_find_last_scalar_stmt_in_slp (slp_tree node)
3171 stmt_vec_info last = NULL;
3172 stmt_vec_info stmt_vinfo;
3174 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3175 if (stmt_vinfo)
3177 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3178 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3181 return last;
3184 /* Find the first stmt in NODE. */
3186 stmt_vec_info
3187 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3189 stmt_vec_info first = NULL;
3190 stmt_vec_info stmt_vinfo;
3192 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3193 if (stmt_vinfo)
3195 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3196 if (!first
3197 || get_later_stmt (stmt_vinfo, first) == first)
3198 first = stmt_vinfo;
3201 return first;
3204 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3205 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3206 (also containing the first GROUP1_SIZE stmts, since stores are
3207 consecutive), the second containing the remainder.
3208 Return the first stmt in the second group. */
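/* E.g. splitting a group of seven stores with GROUP1_SIZE == 4 leaves the
   first group with elements 0..3 and DR_GROUP_GAP increased by 3, and the
   second group with elements 4..6 and DR_GROUP_GAP equal to the original
   gap plus 4.  */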
3210 static stmt_vec_info
3211 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3213 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3214 gcc_assert (group1_size > 0);
3215 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3216 gcc_assert (group2_size > 0);
3217 DR_GROUP_SIZE (first_vinfo) = group1_size;
3219 stmt_vec_info stmt_info = first_vinfo;
3220 for (unsigned i = group1_size; i > 1; i--)
3222 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3223 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3225 /* STMT is now the last element of the first group. */
3226 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3227 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3229 DR_GROUP_SIZE (group2) = group2_size;
3230 for (stmt_info = group2; stmt_info;
3231 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3233 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3234 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3237 /* For the second group, the DR_GROUP_GAP is that before the original group,
3238 plus skipping over the first vector. */
3239 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3241 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3242 DR_GROUP_GAP (first_vinfo) += group2_size;
3244 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3246 group1_size, group2_size);
3248 return group2;
3251 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3252 statements and a vector of NUNITS elements. */
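/* E.g. NUNITS == 4 and GROUP_SIZE == 6 gives common_multiple == 12 and an
   unrolling factor of 2, while GROUP_SIZE == 8 with NUNITS == 4 gives a
   factor of 1.  */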
3254 static poly_uint64
3255 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3257 return exact_div (common_multiple (nunits, group_size), group_size);
3260 /* Helper that checks to see if a node is a load node. */
3262 static inline bool
3263 vect_is_slp_load_node (slp_tree root)
3265 return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
3266 && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3267 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3268 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3272 /* Helper function of optimize_load_redistribution that performs the operation
3273 recursively. */
3275 static slp_tree
3276 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3277 vec_info *vinfo, unsigned int group_size,
3278 hash_map<slp_tree, slp_tree> *load_map,
3279 slp_tree root)
3281 if (slp_tree *leader = load_map->get (root))
3282 return *leader;
3284 slp_tree node;
3285 unsigned i;
3287 /* For now, we don't know anything about externals so do not do anything. */
3288 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3289 return NULL;
3290 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
 3292 /* First convert this node into a load node and add it to the leaves
 3293 list, flattening the lane permutation into a load permutation. If it's
 3294 unneeded it will be elided later. */
3295 vec<stmt_vec_info> stmts;
3296 stmts.create (SLP_TREE_LANES (root));
3297 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3298 for (unsigned j = 0; j < lane_perm.length (); j++)
3300 std::pair<unsigned, unsigned> perm = lane_perm[j];
3301 node = SLP_TREE_CHILDREN (root)[perm.first];
3303 if (!vect_is_slp_load_node (node)
3304 || SLP_TREE_CHILDREN (node).exists ())
3306 stmts.release ();
3307 goto next;
3310 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3313 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_NOTE, vect_location,
3315 "converting stmts on permute node %p\n",
3316 (void *) root);
3318 bool *matches = XALLOCAVEC (bool, group_size);
3319 poly_uint64 max_nunits = 1;
3320 unsigned tree_size = 0, limit = 1;
3321 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3322 matches, &limit, &tree_size, bst_map);
3323 if (!node)
3324 stmts.release ();
3326 load_map->put (root, node);
3327 return node;
3330 next:
3331 load_map->put (root, NULL);
3333 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3335 slp_tree value
3336 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3337 node);
3338 if (value)
3340 SLP_TREE_REF_COUNT (value)++;
3341 SLP_TREE_CHILDREN (root)[i] = value;
 3342 /* ??? We know the original leaves of the replaced nodes will
3343 be referenced by bst_map, only the permutes created by
3344 pattern matching are not. */
3345 if (SLP_TREE_REF_COUNT (node) == 1)
3346 load_map->remove (node);
3347 vect_free_slp_tree (node);
3351 return NULL;
 3354 /* Temporary workaround for loads not being CSEd during SLP build. This
 3355 function will traverse the SLP tree rooted in ROOT and find
 3356 VEC_PERM nodes that blend vectors from multiple nodes that all read from
 3357 the same DR such that the final operation is equal to a permuted load.
 3358 Such nodes are then directly converted into loads themselves. The nodes
 3359 are CSEd using BST_MAP. */
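/* E.g. a VEC_PERM node whose lanes each select element PERM.SECOND from a
   plain load child is re-discovered via vect_build_slp_tree over the
   gathered scalar stmts, turning the blend of loads into a single
   (possibly permuted) load node that is CSEd through BST_MAP.  */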
3361 static void
3362 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3363 vec_info *vinfo, unsigned int group_size,
3364 hash_map<slp_tree, slp_tree> *load_map,
3365 slp_tree root)
3367 slp_tree node;
3368 unsigned i;
3370 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3372 slp_tree value
3373 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3374 node);
3375 if (value)
3377 SLP_TREE_REF_COUNT (value)++;
3378 SLP_TREE_CHILDREN (root)[i] = value;
 3379 /* ??? We know the original leaves of the replaced nodes will
3380 be referenced by bst_map, only the permutes created by
3381 pattern matching are not. */
3382 if (SLP_TREE_REF_COUNT (node) == 1)
3383 load_map->remove (node);
3384 vect_free_slp_tree (node);
 3389 /* Helper function of vect_match_slp_patterns.
 3391 Attempts to match patterns against the SLP tree rooted in REF_NODE using
 3392 VINFO. Patterns are matched in post-order traversal.
 3394 If matching is successful the tree in REF_NODE is updated in place and
 3395 true is returned, otherwise it is left unchanged and false is returned. */
3397 static bool
3398 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3399 slp_tree_to_load_perm_map_t *perm_cache,
3400 slp_compat_nodes_map_t *compat_cache,
3401 hash_set<slp_tree> *visited)
3403 unsigned i;
3404 slp_tree node = *ref_node;
3405 bool found_p = false;
3406 if (!node || visited->add (node))
3407 return false;
3409 slp_tree child;
3410 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3411 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3412 vinfo, perm_cache, compat_cache,
3413 visited);
3415 for (unsigned x = 0; x < num__slp_patterns; x++)
3417 vect_pattern *pattern
3418 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3419 if (pattern)
3421 pattern->build (vinfo);
3422 delete pattern;
3423 found_p = true;
3427 return found_p;
 3430 /* Applies pattern matching to the SLP tree of INSTANCE using
 3431 vec_info VINFO.
 3433 The tree is modified in place and true is returned if any pattern
 3434 matched. Patterns are tried in order and multiple patterns may match. */
3436 static bool
3437 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3438 hash_set<slp_tree> *visited,
3439 slp_tree_to_load_perm_map_t *perm_cache,
3440 slp_compat_nodes_map_t *compat_cache)
3442 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3443 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_NOTE, vect_location,
3447 "Analyzing SLP tree %p for patterns\n",
3448 (void *) SLP_INSTANCE_TREE (instance));
3450 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3451 visited);
3454 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3455 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3456 Return true if we could use IFN_STORE_LANES instead and if that appears
3457 to be the better approach. */
3459 static bool
3460 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3461 unsigned int group_size,
3462 unsigned int new_group_size)
3464 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3465 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3466 if (!vectype)
3467 return false;
3468 /* Allow the split if one of the two new groups would operate on full
3469 vectors *within* rather than across one scalar loop iteration.
3470 This is purely a heuristic, but it should work well for group
3471 sizes of 3 and 4, where the possible splits are:
3473 3->2+1: OK if the vector has exactly two elements
3474 4->2+2: Likewise
3475 4->3+1: Less clear-cut. */
3476 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3477 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3478 return false;
3479 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3482 /* Analyze an SLP instance starting from a group of grouped stores. Call
3483 vect_build_slp_tree to build a tree of packed stmts if possible.
3484 Return FALSE if it's impossible to SLP any stmt in the loop. */
3486 static bool
3487 vect_analyze_slp_instance (vec_info *vinfo,
3488 scalar_stmts_to_slp_tree_map_t *bst_map,
3489 stmt_vec_info stmt_info, slp_instance_kind kind,
3490 unsigned max_tree_size, unsigned *limit,
3491 bool force_single_lane = false);
3493 /* Build an interleaving scheme for the store sources RHS_NODES from
3494 SCALAR_STMTS. */
3496 static slp_tree
3497 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3498 vec<stmt_vec_info> &scalar_stmts)
3500 unsigned int group_size = scalar_stmts.length ();
3501 slp_tree node = vect_create_new_slp_node (scalar_stmts,
3502 SLP_TREE_CHILDREN
3503 (rhs_nodes[0]).length ());
3504 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3505 for (unsigned l = 0;
3506 l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3508 /* And a permute merging all RHS SLP trees. */
3509 slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3510 VEC_PERM_EXPR);
3511 SLP_TREE_CHILDREN (node).quick_push (perm);
3512 SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3513 SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3514 SLP_TREE_LANES (perm) = group_size;
3515 /* ??? We should set this NULL but that's not expected. */
3516 SLP_TREE_REPRESENTATIVE (perm)
3517 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3518 for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3520 SLP_TREE_CHILDREN (perm)
3521 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3522 SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3523 for (unsigned k = 0;
3524 k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3526 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3527 or SLP_TREE_SCALAR_OPS but then we might have
3528 a mix of both in our children. */
3529 SLP_TREE_LANE_PERMUTATION (perm)
3530 .quick_push (std::make_pair (j, k));
3534 /* Now we have a single permute node but we cannot code-generate
3535 the case with more than two inputs.
3536 Perform pairwise reduction, reducing the two inputs
3537 with the least number of lanes to one and then repeat until
3538 we end up with two inputs. That scheme makes sure we end
3539 up with permutes satisfying the restriction of requiring at
3540 most two vector inputs to produce a single vector output
3541 when the number of lanes is even. */
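      /* E.g. with four RHS groups A, B, C, D of two lanes each this first
	 merges A and B into a four-lane permute, then C and D, leaving the
	 final permute with the required two inputs.  */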
3542 while (SLP_TREE_CHILDREN (perm).length () > 2)
3544 /* When we have three equal sized groups left the pairwise
3545 reduction does not result in a scheme that avoids using
3546 three vectors. Instead merge the first two groups
3547 to the final size with do-not-care elements (chosen
3548 from the first group) and then merge with the third.
3549 { A0, B0, x, A1, B1, x, ... }
3550 -> { A0, B0, C0, A1, B1, C1, ... }
3551 This handles group size of three (and at least
3552 power-of-two multiples of that). */
3553 if (SLP_TREE_CHILDREN (perm).length () == 3
3554 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3555 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3556 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3557 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3559 int ai = 0;
3560 int bi = 1;
3561 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3562 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3563 unsigned n = SLP_TREE_LANES (perm);
3565 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3566 SLP_TREE_LANES (permab) = n;
3567 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3568 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3569 /* ??? Should be NULL but that's not expected. */
3570 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3571 SLP_TREE_CHILDREN (permab).quick_push (a);
3572 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3573 SLP_TREE_LANE_PERMUTATION (permab)
3574 .quick_push (std::make_pair (0, k));
3575 SLP_TREE_CHILDREN (permab).quick_push (b);
3576 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3577 SLP_TREE_LANE_PERMUTATION (permab)
3578 .quick_push (std::make_pair (1, k));
3579 /* Push the do-not-care lanes. */
3580 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3581 SLP_TREE_LANE_PERMUTATION (permab)
3582 .quick_push (std::make_pair (0, k));
3584 /* Put the merged node into 'perm', in place of a. */
3585 SLP_TREE_CHILDREN (perm)[ai] = permab;
3586 /* Adjust the references to b in the permutation
3587 of perm and to the later children which we'll
3588 remove. */
3589 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3591 std::pair<unsigned, unsigned> &p
3592 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3593 if (p.first == (unsigned) bi)
3595 p.first = ai;
3596 p.second += SLP_TREE_LANES (a);
3598 else if (p.first > (unsigned) bi)
3599 p.first--;
3601 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3602 break;
3605 /* Pick the two nodes with the least number of lanes,
3606 prefer the earliest candidate and maintain ai < bi. */
3607 int ai = -1;
3608 int bi = -1;
3609 for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3611 if (ai == -1)
3612 ai = ci;
3613 else if (bi == -1)
3614 bi = ci;
3615 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3616 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3617 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3618 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3620 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3621 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3622 bi = ci;
3623 else
3625 ai = bi;
3626 bi = ci;
3631 /* Produce a merge of nodes ai and bi. */
3632 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3633 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3634 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3635 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3636 SLP_TREE_LANES (permab) = n;
3637 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3638 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3639 /* ??? Should be NULL but that's not expected. */
3640 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3641 SLP_TREE_CHILDREN (permab).quick_push (a);
3642 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3643 SLP_TREE_LANE_PERMUTATION (permab)
3644 .quick_push (std::make_pair (0, k));
3645 SLP_TREE_CHILDREN (permab).quick_push (b);
3646 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3647 SLP_TREE_LANE_PERMUTATION (permab)
3648 .quick_push (std::make_pair (1, k));
3650 /* Put the merged node into 'perm', in place of a. */
3651 SLP_TREE_CHILDREN (perm)[ai] = permab;
3652 /* Adjust the references to b in the permutation
3653 of perm and to the later children which we'll
3654 remove. */
3655 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3657 std::pair<unsigned, unsigned> &p
3658 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3659 if (p.first == (unsigned) bi)
3661 p.first = ai;
3662 p.second += SLP_TREE_LANES (a);
3664 else if (p.first > (unsigned) bi)
3665 p.first--;
3667 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3671 return node;
3674 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3675 of KIND. Return true if successful. */
3677 static bool
3678 vect_build_slp_instance (vec_info *vinfo,
3679 slp_instance_kind kind,
3680 vec<stmt_vec_info> &scalar_stmts,
3681 vec<stmt_vec_info> &root_stmt_infos,
3682 vec<tree> &remain,
3683 unsigned max_tree_size, unsigned *limit,
3684 scalar_stmts_to_slp_tree_map_t *bst_map,
3685 /* ??? We need stmt_info for group splitting. */
3686 stmt_vec_info stmt_info_,
3687 bool force_single_lane = false)
3689 /* If there's no budget left bail out early. */
3690 if (*limit == 0)
3691 return false;
3693 if (kind == slp_inst_kind_ctor)
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "Analyzing vectorizable constructor: %G\n",
3698 root_stmt_infos[0]->stmt);
3701 if (dump_enabled_p ())
3703 dump_printf_loc (MSG_NOTE, vect_location,
3704 "Starting SLP discovery for\n");
3705 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3706 dump_printf_loc (MSG_NOTE, vect_location,
3707 " %G", scalar_stmts[i]->stmt);
3710 /* Build the tree for the SLP instance. */
3711 unsigned int group_size = scalar_stmts.length ();
3712 bool *matches = XALLOCAVEC (bool, group_size);
3713 poly_uint64 max_nunits = 1;
3714 unsigned tree_size = 0;
3715 unsigned i;
3717 slp_tree node = NULL;
3718 if (force_single_lane)
3720 matches[0] = true;
3721 matches[1] = false;
3723 else
3724 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3725 &max_nunits, matches, limit,
3726 &tree_size, bst_map);
3727 if (node != NULL)
3729 /* Calculate the unrolling factor based on the smallest type. */
3730 poly_uint64 unrolling_factor
3731 = calculate_unrolling_factor (max_nunits, group_size);
3733 if (maybe_ne (unrolling_factor, 1U)
3734 && is_a <bb_vec_info> (vinfo))
3736 unsigned HOST_WIDE_INT const_max_nunits;
3737 if (!max_nunits.is_constant (&const_max_nunits)
3738 || const_max_nunits > group_size)
3740 if (dump_enabled_p ())
3741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3742 "Build SLP failed: store group "
3743 "size not a multiple of the vector size "
3744 "in basic block SLP\n");
3745 vect_free_slp_tree (node);
3746 return false;
3748 /* Fatal mismatch. */
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "SLP discovery succeeded but node needs "
3752 "splitting\n");
3753 memset (matches, true, group_size);
3754 matches[group_size / const_max_nunits * const_max_nunits] = false;
3755 vect_free_slp_tree (node);
3757 else
3759 /* Create a new SLP instance. */
3760 slp_instance new_instance = XNEW (class _slp_instance);
3761 SLP_INSTANCE_TREE (new_instance) = node;
3762 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3763 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3764 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3765 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3766 SLP_INSTANCE_KIND (new_instance) = kind;
3767 new_instance->reduc_phis = NULL;
3768 new_instance->cost_vec = vNULL;
3769 new_instance->subgraph_entries = vNULL;
3771 if (dump_enabled_p ())
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "SLP size %u vs. limit %u.\n",
3774 tree_size, max_tree_size);
3776 /* Fixup SLP reduction chains. */
3777 if (kind == slp_inst_kind_reduc_chain)
3779 /* If this is a reduction chain with a conversion in front
3780 amend the SLP tree with a node for that. */
3781 gimple *scalar_def
3782 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3783 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3785 /* Get at the conversion stmt - we know it's the single use
3786 of the last stmt of the reduction chain. */
3787 use_operand_p use_p;
3788 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3789 &use_p, &scalar_def);
3790 gcc_assert (r);
3791 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3792 next_info = vect_stmt_to_vectorize (next_info);
3793 scalar_stmts = vNULL;
3794 scalar_stmts.create (group_size);
3795 for (unsigned i = 0; i < group_size; ++i)
3796 scalar_stmts.quick_push (next_info);
3797 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3798 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3799 SLP_TREE_CHILDREN (conv).quick_push (node);
3800 SLP_INSTANCE_TREE (new_instance) = conv;
3801 /* We also have to fake this conversion stmt as SLP reduction
3802 group so we don't have to mess with too much code
3803 elsewhere. */
3804 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3805 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3807 /* Fill the backedge child of the PHI SLP node. The
3808 general matching code cannot find it because the
3809 scalar code does not reflect how we vectorize the
3810 reduction. */
3811 use_operand_p use_p;
3812 imm_use_iterator imm_iter;
3813 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3814 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3815 gimple_get_lhs (scalar_def))
3816 /* There are exactly two non-debug uses, the reduction
3817 PHI and the loop-closed PHI node. */
3818 if (!is_gimple_debug (USE_STMT (use_p))
3819 && gimple_bb (USE_STMT (use_p)) == loop->header)
3821 auto_vec<stmt_vec_info, 64> phis (group_size);
3822 stmt_vec_info phi_info
3823 = vinfo->lookup_stmt (USE_STMT (use_p));
3824 for (unsigned i = 0; i < group_size; ++i)
3825 phis.quick_push (phi_info);
3826 slp_tree *phi_node = bst_map->get (phis);
3827 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3828 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3829 = SLP_INSTANCE_TREE (new_instance);
3830 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3834 vinfo->slp_instances.safe_push (new_instance);
3836 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3837 the number of scalar stmts in the root in a few places.
3838 Verify that assumption holds. */
3839 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3840 .length () == group_size);
3842 if (dump_enabled_p ())
3844 dump_printf_loc (MSG_NOTE, vect_location,
3845 "Final SLP tree for instance %p:\n",
3846 (void *) new_instance);
3847 vect_print_slp_graph (MSG_NOTE, vect_location,
3848 SLP_INSTANCE_TREE (new_instance));
3851 return true;
3854 /* Failed to SLP. */
3856 stmt_vec_info stmt_info = stmt_info_;
3857 /* Try to break the group up into pieces. */
3858 if (*limit > 0 && kind == slp_inst_kind_store)
3860 /* ??? We could delay all the actual splitting of store-groups
3861 until after SLP discovery of the original group completed.
3862 Then we can recurse to vect_build_slp_instance directly. */
3863 for (i = 0; i < group_size; i++)
3864 if (!matches[i])
3865 break;
3867 /* For basic block SLP, try to break the group up into multiples of
3868 a vector size. */
3869 if (is_a <bb_vec_info> (vinfo)
3870 && (i > 1 && i < group_size))
3872 /* Free the allocated memory. */
3873 scalar_stmts.release ();
3875 tree scalar_type
3876 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3877 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3878 1 << floor_log2 (i));
3879 unsigned HOST_WIDE_INT const_nunits;
3880 if (vectype
3881 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3883 /* Split into two groups at the first vector boundary. */
3884 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3885 unsigned group1_size = i & ~(const_nunits - 1);
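/* For instance, with const_nunits == 4 and the first mismatch at
   i == 6, group1_size becomes 4: the first four stores are
   re-analyzed as one group, stmts 4..5 form a second group that is
   re-analyzed below, and the mismatching tail is split off
   separately.  */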
3887 if (dump_enabled_p ())
3888 dump_printf_loc (MSG_NOTE, vect_location,
3889 "Splitting SLP group at stmt %u\n", i);
3890 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3891 group1_size);
3892 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3893 kind, max_tree_size,
3894 limit);
3895 /* Split the rest at the failure point and possibly
3896 re-analyze the remaining matching part if it has
3897 at least two lanes. */
3898 if (group1_size < i
3899 && (i + 1 < group_size
3900 || i - group1_size > 1))
3902 stmt_vec_info rest2 = rest;
3903 rest = vect_split_slp_store_group (rest, i - group1_size);
3904 if (i - group1_size > 1)
3905 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3906 kind, max_tree_size,
3907 limit);
3909 /* Re-analyze the non-matching tail if it has at least
3910 two lanes. */
3911 if (i + 1 < group_size)
3912 res |= vect_analyze_slp_instance (vinfo, bst_map,
3913 rest, kind, max_tree_size,
3914 limit);
3915 return res;
3919 /* For loop vectorization split the RHS into arbitrary pieces of
3920 size >= 1. */
3921 else if (is_a <loop_vec_info> (vinfo)
3922 && (group_size != 1 && i < group_size))
3924 /* There are targets that cannot do even/odd interleaving schemes
3925 so they absolutely need to use load/store-lanes. For now
3926 force single-lane SLP for them - they would be happy with
3927 uniform power-of-two lanes (but depending on element size),
3928 but even if we can use 'i' as indicator we would need to
3929 backtrack when later lanes fail to discover with the same
3930 granularity. We cannot turn strided or scatter stores
3931 into store-lanes. */
3932 /* ??? If this is not in sync with what get_load_store_type
3933 later decides the SLP representation is not good for other
3934 store vectorization methods. */
3935 bool want_store_lanes
3936 = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
3937 && ! STMT_VINFO_STRIDED_P (stmt_info)
3938 && compare_step_with_zero (vinfo, stmt_info) > 0
3939 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
3940 group_size, 1));
3941 if (want_store_lanes || force_single_lane)
3942 i = 1;
3944 /* A fatal discovery fail doesn't always mean single-lane SLP
3945 isn't a possibility, so try. */
3946 if (i == 0)
3947 i = 1;
3949 if (dump_enabled_p ())
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "Splitting SLP group at stmt %u\n", i);
3953 /* Analyze the stored values and pinch them together with
3954 a permute node so we can preserve the whole store group. */
3955 auto_vec<slp_tree> rhs_nodes;
3957 /* Calculate the unrolling factor based on the smallest type. */
3958 poly_uint64 unrolling_factor = 1;
3960 unsigned int start = 0, end = i;
3961 while (start < group_size)
3963 gcc_assert (end - start >= 1);
3964 vec<stmt_vec_info> substmts;
3965 substmts.create (end - start);
3966 for (unsigned j = start; j < end; ++j)
3967 substmts.quick_push (scalar_stmts[j]);
3968 max_nunits = 1;
3969 node = vect_build_slp_tree (vinfo, substmts, end - start,
3970 &max_nunits,
3971 matches, limit, &tree_size, bst_map);
3972 if (node)
3974 /* ??? Possibly not safe, but not sure how to check
3975 and fail SLP build? */
3976 unrolling_factor
3977 = force_common_multiple (unrolling_factor,
3978 calculate_unrolling_factor
3979 (max_nunits, end - start));
3980 rhs_nodes.safe_push (node);
3981 start = end;
3982 if (want_store_lanes || force_single_lane)
3983 end = start + 1;
3984 else
3985 end = group_size;
3987 else
3989 substmts.release ();
3990 if (end - start == 1)
3992 /* Single-lane discovery failed. Free resources. */
3993 for (auto node : rhs_nodes)
3994 vect_free_slp_tree (node);
3995 scalar_stmts.release ();
3996 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE, vect_location,
3998 "SLP discovery failed\n");
3999 return false;
4002 /* ??? It really happens that we soft-fail SLP
4003 build at a mismatch but the matching part hard-fails
4004 later. As we know we arrived here with a group
4005 larger than one, try a group of size one! */
4006 if (!matches[0])
4007 end = start + 1;
4008 else
4009 for (unsigned j = start; j < end; j++)
4010 if (!matches[j - start])
4012 end = j;
4013 break;
4018 /* Now we assume we can build the root SLP node from all stores. */
4019 if (want_store_lanes)
4021 /* For store-lanes feed the store node with all RHS nodes
4022 in order. */
4023 node = vect_create_new_slp_node (scalar_stmts,
4024 SLP_TREE_CHILDREN
4025 (rhs_nodes[0]).length ());
4026 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4027 node->ldst_lanes = true;
4028 SLP_TREE_CHILDREN (node)
4029 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4030 + rhs_nodes.length () - 1);
4031 /* First store value and possibly mask. */
4032 SLP_TREE_CHILDREN (node)
4033 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4034 /* Rest of the store values. All mask nodes are the same,
4035 this should be guaranteed by dataref group discovery. */
4036 for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4037 SLP_TREE_CHILDREN (node)
4038 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4039 for (slp_tree child : SLP_TREE_CHILDREN (node))
4040 child->refcnt++;
4042 else
4043 node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
4045 while (!rhs_nodes.is_empty ())
4046 vect_free_slp_tree (rhs_nodes.pop ());
4048 /* Create a new SLP instance. */
4049 slp_instance new_instance = XNEW (class _slp_instance);
4050 SLP_INSTANCE_TREE (new_instance) = node;
4051 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
4052 SLP_INSTANCE_LOADS (new_instance) = vNULL;
4053 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4054 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4055 SLP_INSTANCE_KIND (new_instance) = kind;
4056 new_instance->reduc_phis = NULL;
4057 new_instance->cost_vec = vNULL;
4058 new_instance->subgraph_entries = vNULL;
4060 if (dump_enabled_p ())
4061 dump_printf_loc (MSG_NOTE, vect_location,
4062 "SLP size %u vs. limit %u.\n",
4063 tree_size, max_tree_size);
4065 vinfo->slp_instances.safe_push (new_instance);
4067 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4068 the number of scalar stmts in the root in a few places.
4069 Verify that assumption holds. */
4070 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4071 .length () == group_size);
4073 if (dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE, vect_location,
4076 "Final SLP tree for instance %p:\n",
4077 (void *) new_instance);
4078 vect_print_slp_graph (MSG_NOTE, vect_location,
4079 SLP_INSTANCE_TREE (new_instance));
4081 return true;
4083 else
4084 /* Free the allocated memory. */
4085 scalar_stmts.release ();
4087 /* Even though the first vector did not all match, we might be able to SLP
4088 (some) of the remainder. FORNOW ignore this possibility. */
4090 else
4091 /* Free the allocated memory. */
4092 scalar_stmts.release ();
4094 /* Failed to SLP. */
4095 if (dump_enabled_p ())
4096 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4097 return false;
4101 /* Analyze an SLP instance starting from a group of grouped stores. Call
4102 vect_build_slp_tree to build a tree of packed stmts if possible.
4103 Return FALSE if it's impossible to SLP any stmt in the loop. */
4105 static bool
4106 vect_analyze_slp_instance (vec_info *vinfo,
4107 scalar_stmts_to_slp_tree_map_t *bst_map,
4108 stmt_vec_info stmt_info,
4109 slp_instance_kind kind,
4110 unsigned max_tree_size, unsigned *limit,
4111 bool force_single_lane)
4113 vec<stmt_vec_info> scalar_stmts;
4115 if (is_a <bb_vec_info> (vinfo))
4116 vect_location = stmt_info->stmt;
4118 stmt_vec_info next_info = stmt_info;
4119 if (kind == slp_inst_kind_store)
4121 /* Collect the stores and store them in scalar_stmts. */
4122 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4123 while (next_info)
4125 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4126 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4129 else if (kind == slp_inst_kind_reduc_chain)
4131 /* Collect the reduction stmts and store them in scalar_stmts. */
4132 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4133 while (next_info)
4135 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4136 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4138 /* Mark the first element of the reduction chain as reduction to properly
4139 transform the node. In the reduction analysis phase only the last
4140 element of the chain is marked as reduction. */
4141 STMT_VINFO_DEF_TYPE (stmt_info)
4142 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4143 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4144 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4146 else
4147 gcc_unreachable ();
4149 vec<stmt_vec_info> roots = vNULL;
4150 vec<tree> remain = vNULL;
4151 /* Build the tree for the SLP instance. */
4152 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4153 roots, remain,
4154 max_tree_size, limit, bst_map,
4155 kind == slp_inst_kind_store
4156 ? stmt_info : NULL, force_single_lane);
4158 /* ??? If this is slp_inst_kind_store and the above succeeded here's
4159 where we should do store group splitting. */
4161 return res;
4164 /* qsort comparator ordering SLP load nodes. */
4166 static int
4167 vllp_cmp (const void *a_, const void *b_)
4169 const slp_tree a = *(const slp_tree *)a_;
4170 const slp_tree b = *(const slp_tree *)b_;
4171 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4172 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4173 if (STMT_VINFO_GROUPED_ACCESS (a0)
4174 && STMT_VINFO_GROUPED_ACCESS (b0)
4175 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4177 /* Same group, order after lanes used. */
4178 if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4179 return 1;
4180 else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4181 return -1;
4182 else
4184 /* Try to order loads using the same lanes together, breaking
4185 the tie with the lane number that first differs. */
4186 if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4187 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4188 return 0;
4189 else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4190 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4191 return 1;
4192 else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4193 && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4194 return -1;
4195 else
4197 for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4198 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4199 != SLP_TREE_LOAD_PERMUTATION (b)[i])
4201 /* In-order lane first, that's what the above case for
4202 no permutation does. */
4203 if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4204 return -1;
4205 else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4206 return 1;
4207 else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4208 < SLP_TREE_LOAD_PERMUTATION (b)[i])
4209 return -1;
4210 else
4211 return 1;
4213 return 0;
4217 else /* Different groups or non-groups. */
4220 /* Order groups by their first element to keep them together. */
4220 if (STMT_VINFO_GROUPED_ACCESS (a0))
4221 a0 = DR_GROUP_FIRST_ELEMENT (a0);
4222 if (STMT_VINFO_GROUPED_ACCESS (b0))
4223 b0 = DR_GROUP_FIRST_ELEMENT (b0);
4224 if (a0 == b0)
4225 return 0;
4226 /* Tie using UID. */
4227 else if (gimple_uid (STMT_VINFO_STMT (a0))
4228 < gimple_uid (STMT_VINFO_STMT (b0)))
4229 return -1;
4230 else
4232 gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4233 != gimple_uid (STMT_VINFO_STMT (b0)));
4234 return 1;
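/* As an illustration of the resulting order: loads from the same
   dataref group sort with the higher lane count first; among equal
   lane counts an unpermuted load sorts before a permuted one, and
   otherwise the load whose first differing lane is in order comes
   first - e.g. of two four-lane loads with permutations
   { 0, 1, 2, 3 } and { 2, 3, 0, 1 } the former sorts first.  Loads
   from different groups are ordered by the UID of the group's first
   statement.  */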
4239 /* Process the set of LOADS that are all from the same dataref group. */
4241 static void
4242 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4243 scalar_stmts_to_slp_tree_map_t *bst_map,
4244 const array_slice<slp_tree> &loads)
4246 /* At this point we want to lower without a fixed VF or vector
4247 size in mind which means we cannot actually compute whether we
4248 need three or more vectors for a load permutation yet. So always
4249 lower. */
4250 stmt_vec_info first
4251 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4252 unsigned group_lanes = DR_GROUP_SIZE (first);
4254 /* Verify if all load permutations can be implemented with a suitably
4255 large element load-lanes operation. */
4256 unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4257 if (STMT_VINFO_STRIDED_P (first)
4258 || compare_step_with_zero (loop_vinfo, first) <= 0
4259 || exact_log2 (ld_lanes_lanes) == -1
4260 /* ??? For now only support the single-lane case as there is
4261 missing support on the store-lane side and code generation
4262 isn't up to the task yet. */
4263 || ld_lanes_lanes != 1
4264 || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4265 group_lanes / ld_lanes_lanes,
4266 false) == IFN_LAST)
4267 ld_lanes_lanes = 0;
4268 else
4269 /* Verify the loads access the same number of lanes aligned to
4270 ld_lanes_lanes. */
4271 for (slp_tree load : loads)
4273 if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4275 ld_lanes_lanes = 0;
4276 break;
4278 unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4279 if (first % ld_lanes_lanes != 0)
4281 ld_lanes_lanes = 0;
4282 break;
4284 for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4285 if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4287 ld_lanes_lanes = 0;
4288 break;
4292 /* Only a power-of-two number of lanes matches interleaving with N levels.
4293 ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4294 at each step. */
4295 if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4296 return;
4298 for (slp_tree load : loads)
4300 /* Leave masked or gather loads alone for now. */
4301 if (!SLP_TREE_CHILDREN (load).is_empty ())
4302 continue;
4304 /* We want to pattern-match special cases here and keep those
4305 alone. Candidates are splats and load-lane. */
4307 /* We need to lower only loads of less than half of the group's
4308 lanes, including duplicate lanes. Note this leaves nodes
4309 with a non-1:1 load permutation around instead of canonicalizing
4310 those into a load and a permute node. Removing this early
4311 check would do such canonicalization. */
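/* For instance, with a six-lane group loads of one or two lanes are
   candidates for lowering here, while loads of three or more lanes
   keep their load permutation.  */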
4312 if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4313 && ld_lanes_lanes == 0)
4314 continue;
4316 /* Build the permute to get the original load permutation order. */
4317 bool contiguous = true;
4318 lane_permutation_t final_perm;
4319 final_perm.create (SLP_TREE_LANES (load));
4320 for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4322 final_perm.quick_push
4323 (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4324 if (i != 0
4325 && (SLP_TREE_LOAD_PERMUTATION (load)[i]
4326 != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
4327 contiguous = false;
4330 /* When the load permutation accesses a contiguous unpermuted,
4331 power-of-two aligned and sized chunk, leave the load alone.
4332 We can likely (re-)load it more efficiently rather than
4333 extracting it from the larger load.
4334 ??? Long-term some of the lowering should move to where
4335 the vector types involved are fixed. */
4336 if (ld_lanes_lanes == 0
4337 && contiguous
4338 && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
4339 && pow2p_hwi (SLP_TREE_LANES (load))
4340 && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
4341 && group_lanes % SLP_TREE_LANES (load) == 0)
4343 final_perm.release ();
4344 continue;
4347 /* First build (and possibly re-use) a load node for the
4348 unpermuted group. Gaps in the middle and on the end are
4349 represented with NULL stmts. */
4350 vec<stmt_vec_info> stmts;
4351 stmts.create (group_lanes);
4352 for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4354 if (s != first)
4355 for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4356 stmts.quick_push (NULL);
4357 stmts.quick_push (s);
4359 for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4360 stmts.quick_push (NULL);
4361 poly_uint64 max_nunits = 1;
4362 bool *matches = XALLOCAVEC (bool, group_lanes);
4363 unsigned limit = 1;
4364 unsigned tree_size = 0;
4365 slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4366 group_lanes,
4367 &max_nunits, matches, &limit,
4368 &tree_size, bst_map);
4370 if (ld_lanes_lanes != 0)
4372 /* ??? If this is not in sync with what get_load_store_type
4373 later decides the SLP representation is not good for other
4374 store vectorization methods. */
4375 l0->ldst_lanes = true;
4376 load->ldst_lanes = true;
4379 while (1)
4381 unsigned group_lanes = SLP_TREE_LANES (l0);
4382 if (ld_lanes_lanes != 0
4383 || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4384 break;
4386 /* Try to lower by reducing the group to half its size using an
4387 interleaving scheme. For this try to compute whether all
4388 elements needed for this load are in even or odd elements of
4389 an even/odd decomposition with N consecutive elements.
4390 Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4391 with N == 2. */
4392 /* ??? Only an even number of lanes can be handled this way, but the
4393 fallback below could work for any number. We have to make sure
4394 to round up in that case. */
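/* A worked example of the masks computed below, assuming an
   eight-lane group where the load only needs lanes { 2, 3 }:
   EVEN ends up as 4 and ODD as 2, so the even extraction is taken
   with LEVEL == 4 and keeps lanes { 0, 1, 2, 3 }, halving the group
   in one step.  */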
4395 gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4396 unsigned even = 0, odd = 0;
4397 if ((group_lanes & 1) == 0)
4399 even = (1 << ceil_log2 (group_lanes)) - 1;
4400 odd = even;
4401 for (auto l : final_perm)
4403 even &= ~l.second;
4404 odd &= l.second;
4408 /* Now build an even or odd extraction from the unpermuted load. */
4409 lane_permutation_t perm;
4410 perm.create ((group_lanes + 1) / 2);
4411 unsigned level;
4412 if (even
4413 && ((level = 1 << ctz_hwi (even)), true)
4414 && group_lanes % (2 * level) == 0)
4416 /* { 0, 1, ... 4, 5 ..., } */
4417 unsigned level = 1 << ctz_hwi (even);
4418 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4419 for (unsigned j = 0; j < level; ++j)
4420 perm.quick_push (std::make_pair (0, 2 * i * level + j));
4422 else if (odd)
4424 /* { ..., 2, 3, ... 6, 7 } */
4425 unsigned level = 1 << ctz_hwi (odd);
4426 gcc_assert (group_lanes % (2 * level) == 0);
4427 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4428 for (unsigned j = 0; j < level; ++j)
4429 perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
4431 else
4433 /* As fallback extract all used lanes and fill to half the
4434 group size by repeating the last element.
4435 ??? This is quite a bad strategy for re-use - we could
4436 brute force our way to find more optimal filling lanes to
4437 maximize re-use when looking at all loads from the group. */
4438 auto_bitmap l;
4439 for (auto p : final_perm)
4440 bitmap_set_bit (l, p.second);
4441 unsigned i = 0;
4442 bitmap_iterator bi;
4443 EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4444 perm.quick_push (std::make_pair (0, i));
4445 while (perm.length () < (group_lanes + 1) / 2)
4446 perm.quick_push (perm.last ());
4449 /* Update final_perm with the intermediate permute. */
4450 for (unsigned i = 0; i < final_perm.length (); ++i)
4452 unsigned l = final_perm[i].second;
4453 unsigned j;
4454 for (j = 0; j < perm.length (); ++j)
4455 if (perm[j].second == l)
4457 final_perm[i].second = j;
4458 break;
4460 gcc_assert (j < perm.length ());
4463 /* And create scalar stmts. */
4464 vec<stmt_vec_info> perm_stmts;
4465 perm_stmts.create (perm.length ());
4466 for (unsigned i = 0; i < perm.length (); ++i)
4467 perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4469 slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4470 SLP_TREE_CHILDREN (p).quick_push (l0);
4471 SLP_TREE_LANE_PERMUTATION (p) = perm;
4472 SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4473 SLP_TREE_LANES (p) = perm.length ();
4474 SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4475 /* ??? As we have scalar stmts for this intermediate permute we
4476 could CSE it via bst_map but we do not want to pick up
4477 another SLP node with a load permutation. We instead should
4478 have a "local" CSE map here. */
4479 SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4481 /* We now have a node for (group_lanes + 1) / 2 lanes. */
4482 l0 = p;
4485 /* And finally from the ordered reduction node create the
4486 permute to shuffle the lanes into the original load-permutation
4487 order. We replace the original load node with this. */
4488 SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4489 SLP_TREE_LOAD_PERMUTATION (load).release ();
4490 SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4491 SLP_TREE_CHILDREN (load).create (1);
4492 SLP_TREE_CHILDREN (load).quick_push (l0);
4496 /* Transform SLP loads in the SLP graph created by SLP discovery to
4497 group loads from the same group and lower load permutations that
4498 are unlikely to be supported into a series of permutes.
4499 In the degenerate case of having only single-lane SLP instances
4500 this should result in a series of permute nodes emulating an
4501 interleaving scheme. */
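/* For instance, four single-lane loads of a four-lane group end up
   sharing one unpermuted group load, with each original load
   rewritten into a halving even/odd extraction followed by a final
   single-lane permute node.  */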
4503 static void
4504 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4505 scalar_stmts_to_slp_tree_map_t *bst_map)
4507 /* Gather and sort loads across all instances. */
4508 hash_set<slp_tree> visited;
4509 auto_vec<slp_tree> loads;
4510 for (auto inst : loop_vinfo->slp_instances)
4511 vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4512 if (loads.is_empty ())
4513 return;
4514 loads.qsort (vllp_cmp);
4516 /* Now process each dataref group separately. */
4517 unsigned firsti = 0;
4518 for (unsigned i = 1; i < loads.length (); ++i)
4520 slp_tree first = loads[firsti];
4521 slp_tree next = loads[i];
4522 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4523 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4524 if (STMT_VINFO_GROUPED_ACCESS (a0)
4525 && STMT_VINFO_GROUPED_ACCESS (b0)
4526 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4527 continue;
4528 /* Now we have one or multiple SLP loads of the same group from
4529 firsti to i - 1. */
4530 if (STMT_VINFO_GROUPED_ACCESS (a0))
4531 vect_lower_load_permutations (loop_vinfo, bst_map,
4532 make_array_slice (&loads[firsti],
4533 i - firsti));
4534 firsti = i;
4536 if (firsti < loads.length ()
4537 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
4538 vect_lower_load_permutations (loop_vinfo, bst_map,
4539 make_array_slice (&loads[firsti],
4540 loads.length () - firsti));
4543 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
4544 trees of packed scalar stmts if SLP is possible. */
4546 opt_result
4547 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
4549 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4550 unsigned int i;
4551 stmt_vec_info first_element;
4552 slp_instance instance;
4554 DUMP_VECT_SCOPE ("vect_analyze_slp");
4556 unsigned limit = max_tree_size;
4558 scalar_stmts_to_slp_tree_map_t *bst_map
4559 = new scalar_stmts_to_slp_tree_map_t ();
4561 /* Find SLP sequences starting from groups of grouped stores. */
4562 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4563 vect_analyze_slp_instance (vinfo, bst_map, first_element,
4564 slp_inst_kind_store, max_tree_size, &limit);
4566 /* For loops also start SLP discovery from non-grouped stores. */
4567 if (loop_vinfo)
4569 data_reference_p dr;
4570 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
4571 if (DR_IS_WRITE (dr))
4573 stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
4574 /* Grouped stores are already handled above. */
4575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
4576 continue;
4577 vec<stmt_vec_info> stmts;
4578 vec<stmt_vec_info> roots = vNULL;
4579 vec<tree> remain = vNULL;
4580 stmts.create (1);
4581 stmts.quick_push (stmt_info);
4582 vect_build_slp_instance (vinfo, slp_inst_kind_store,
4583 stmts, roots, remain, max_tree_size,
4584 &limit, bst_map, NULL);
4588 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4590 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4592 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4593 /* Apply patterns. */
4594 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4595 bb_vinfo->roots[i].stmts[j]
4596 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4597 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4598 bb_vinfo->roots[i].stmts,
4599 bb_vinfo->roots[i].roots,
4600 bb_vinfo->roots[i].remain,
4601 max_tree_size, &limit, bst_map, NULL))
4603 bb_vinfo->roots[i].stmts = vNULL;
4604 bb_vinfo->roots[i].roots = vNULL;
4605 bb_vinfo->roots[i].remain = vNULL;
4610 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4612 /* Find SLP sequences starting from reduction chains. */
4613 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4614 if (! STMT_VINFO_RELEVANT_P (first_element)
4615 && ! STMT_VINFO_LIVE_P (first_element))
4617 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4618 slp_inst_kind_reduc_chain,
4619 max_tree_size, &limit))
4621 /* Dissolve reduction chain group. */
4622 stmt_vec_info vinfo = first_element;
4623 stmt_vec_info last = NULL;
4624 while (vinfo)
4626 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4627 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4628 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4629 last = vinfo;
4630 vinfo = next;
4632 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4633 /* It can still be vectorized as part of an SLP reduction. */
4634 loop_vinfo->reductions.safe_push (last);
4637 /* Find SLP sequences starting from groups of reductions. */
4638 if (loop_vinfo->reductions.length () > 0)
4640 /* Collect reduction statements we can combine into
4641 a SLP reduction. */
4642 vec<stmt_vec_info> scalar_stmts;
4643 scalar_stmts.create (loop_vinfo->reductions.length ());
4644 for (auto next_info : loop_vinfo->reductions)
4646 next_info = vect_stmt_to_vectorize (next_info);
4647 if ((STMT_VINFO_RELEVANT_P (next_info)
4648 || STMT_VINFO_LIVE_P (next_info))
4649 /* ??? Make sure we didn't skip a conversion around a
4650 reduction path. In that case we'd have to reverse
4651 engineer that conversion stmt following the chain using
4652 reduc_idx and from the PHI using reduc_def. */
4653 && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4654 || (STMT_VINFO_DEF_TYPE (next_info)
4655 == vect_double_reduction_def)))
4657 /* Do not discover SLP reductions combining lane-reducing
4658 ops, that will fail later. */
4659 if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4660 scalar_stmts.quick_push (next_info);
4661 else
4663 /* Do SLP discovery for single-lane reductions. */
4664 vec<stmt_vec_info> stmts;
4665 vec<stmt_vec_info> roots = vNULL;
4666 vec<tree> remain = vNULL;
4667 stmts.create (1);
4668 stmts.quick_push (next_info);
4669 vect_build_slp_instance (vinfo,
4670 slp_inst_kind_reduc_group,
4671 stmts, roots, remain,
4672 max_tree_size, &limit,
4673 bst_map, NULL);
4677 /* Save for re-processing on failure. */
4678 vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4679 vec<stmt_vec_info> roots = vNULL;
4680 vec<tree> remain = vNULL;
4681 if (scalar_stmts.length () <= 1
4682 || !vect_build_slp_instance (loop_vinfo,
4683 slp_inst_kind_reduc_group,
4684 scalar_stmts, roots, remain,
4685 max_tree_size, &limit, bst_map,
4686 NULL))
4688 if (scalar_stmts.length () <= 1)
4689 scalar_stmts.release ();
4690 /* Do SLP discovery for single-lane reductions. */
4691 for (auto stmt_info : saved_stmts)
4693 vec<stmt_vec_info> stmts;
4694 vec<stmt_vec_info> roots = vNULL;
4695 vec<tree> remain = vNULL;
4696 stmts.create (1);
4697 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4698 vect_build_slp_instance (vinfo,
4699 slp_inst_kind_reduc_group,
4700 stmts, roots, remain,
4701 max_tree_size, &limit,
4702 bst_map, NULL);
4704 saved_stmts.release ();
4708 /* Make sure to vectorize only-live stmts, usually inductions. */
4709 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
4710 for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
4711 gsi_next (&gsi))
4713 gphi *lc_phi = *gsi;
4714 tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
4715 stmt_vec_info stmt_info;
4716 if (TREE_CODE (def) == SSA_NAME
4717 && !virtual_operand_p (def)
4718 && (stmt_info = loop_vinfo->lookup_def (def))
4719 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
4720 && STMT_VINFO_LIVE_P (stmt_info)
4721 && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
4722 || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
4723 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
4725 vec<stmt_vec_info> stmts;
4726 vec<stmt_vec_info> roots = vNULL;
4727 vec<tree> remain = vNULL;
4728 stmts.create (1);
4729 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4730 vect_build_slp_instance (vinfo,
4731 slp_inst_kind_reduc_group,
4732 stmts, roots, remain,
4733 max_tree_size, &limit,
4734 bst_map, NULL);
4739 hash_set<slp_tree> visited_patterns;
4740 slp_tree_to_load_perm_map_t perm_cache;
4741 slp_compat_nodes_map_t compat_cache;
4743 /* See if any patterns can be found in the SLP tree. */
4744 bool pattern_found = false;
4745 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4746 pattern_found |= vect_match_slp_patterns (instance, vinfo,
4747 &visited_patterns, &perm_cache,
4748 &compat_cache);
4750 /* If any were found optimize permutations of loads. */
4751 if (pattern_found)
4753 hash_map<slp_tree, slp_tree> load_map;
4754 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4756 slp_tree root = SLP_INSTANCE_TREE (instance);
4757 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
4758 &load_map, root);
4762 /* Check whether we should force some SLP instances to use load/store-lanes
4763 and do so by forcing SLP re-discovery with single lanes. We used
4764 to cancel SLP when this applied to all instances in a loop but now
4765 we decide this per SLP instance. It's important to do this only
4766 after SLP pattern recognition. */
4767 if (is_a <loop_vec_info> (vinfo))
4768 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4769 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
4770 && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
4772 slp_tree slp_root = SLP_INSTANCE_TREE (instance);
4773 int group_size = SLP_TREE_LANES (slp_root);
4774 tree vectype = SLP_TREE_VECTYPE (slp_root);
4776 stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
4777 gimple *rep = STMT_VINFO_STMT (rep_info);
4778 bool masked = (is_gimple_call (rep)
4779 && gimple_call_internal_p (rep)
4780 && internal_fn_mask_index
4781 (gimple_call_internal_fn (rep)) != -1);
4782 if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
4783 || slp_root->ldst_lanes
4784 || (vect_store_lanes_supported (vectype, group_size, masked)
4785 == IFN_LAST))
4786 continue;
4788 auto_vec<slp_tree> loads;
4789 hash_set<slp_tree> visited;
4790 vect_gather_slp_loads (loads, slp_root, visited);
4792 /* Check whether any load in the SLP instance is possibly
4793 permuted. */
4794 bool loads_permuted = false;
4795 slp_tree load_node;
4796 unsigned j;
4797 FOR_EACH_VEC_ELT (loads, j, load_node)
4799 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
4800 continue;
4801 unsigned k;
4802 stmt_vec_info load_info;
4803 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
4804 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
4806 loads_permuted = true;
4807 break;
4811 /* If the loads and stores can use load/store-lanes force re-discovery
4812 with single lanes. */
4813 if (loads_permuted)
4815 bool can_use_lanes = true;
4816 FOR_EACH_VEC_ELT (loads, j, load_node)
4817 if (STMT_VINFO_GROUPED_ACCESS
4818 (SLP_TREE_REPRESENTATIVE (load_node)))
4820 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
4821 (SLP_TREE_REPRESENTATIVE (load_node));
4822 rep = STMT_VINFO_STMT (stmt_vinfo);
4823 masked = (is_gimple_call (rep)
4824 && gimple_call_internal_p (rep)
4825 && internal_fn_mask_index
4826 (gimple_call_internal_fn (rep)) != -1);
4827 /* Use SLP for strided accesses (or if we can't
4828 load-lanes). */
4829 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
4830 || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
4831 || vect_load_lanes_supported
4832 (STMT_VINFO_VECTYPE (stmt_vinfo),
4833 DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
4834 /* ??? During SLP re-discovery with a single lane
4835 a masked grouped load will appear permuted and
4836 discovery will fail. We have to rework this
4837 on the discovery side - for now avoid ICEing. */
4838 || masked)
4840 can_use_lanes = false;
4841 break;
4845 if (can_use_lanes)
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_NOTE, vect_location,
4849 "SLP instance %p can use load/store-lanes,"
4850 " re-discovering with single-lanes\n",
4851 (void *) instance);
4853 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
4855 vect_free_slp_instance (instance);
4856 limit = max_tree_size;
4857 bool res = vect_analyze_slp_instance (vinfo, bst_map,
4858 stmt_info,
4859 slp_inst_kind_store,
4860 max_tree_size, &limit,
4861 true);
4862 gcc_assert (res);
4863 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
4864 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
4869 /* When we end up with load permutations that we cannot possibly handle,
4870 like those requiring three vector inputs, lower them using interleaving
4871 like schemes. */
4872 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4874 vect_lower_load_permutations (loop_vinfo, bst_map);
4875 if (dump_enabled_p ())
4877 dump_printf_loc (MSG_NOTE, vect_location,
4878 "SLP graph after lowering permutations:\n");
4879 hash_set<slp_tree> visited;
4880 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4881 vect_print_slp_graph (MSG_NOTE, vect_location,
4882 SLP_INSTANCE_TREE (instance), visited);
4886 release_scalar_stmts_to_slp_tree_map (bst_map);
4888 if (pattern_found && dump_enabled_p ())
4890 dump_printf_loc (MSG_NOTE, vect_location,
4891 "Pattern matched SLP tree\n");
4892 hash_set<slp_tree> visited;
4893 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4894 vect_print_slp_graph (MSG_NOTE, vect_location,
4895 SLP_INSTANCE_TREE (instance), visited);
4898 return opt_result::success ();
4901 /* Estimates the cost of inserting layout changes into the SLP graph.
4902 It can also say that the insertion is impossible. */
4904 struct slpg_layout_cost
4906 slpg_layout_cost () = default;
4907 slpg_layout_cost (sreal, bool);
4909 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
4910 bool is_possible () const { return depth != sreal::max (); }
4912 bool operator== (const slpg_layout_cost &) const;
4913 bool operator!= (const slpg_layout_cost &) const;
4915 bool is_better_than (const slpg_layout_cost &, bool) const;
4917 void add_parallel_cost (const slpg_layout_cost &);
4918 void add_serial_cost (const slpg_layout_cost &);
4919 void split (unsigned int);
4921 /* The longest sequence of layout changes needed during any traversal
4922 of the partition dag, weighted by execution frequency.
4924 This is the most important metric when optimizing for speed, since
4925 it helps to ensure that we keep the number of operations on
4926 critical paths to a minimum. */
4927 sreal depth = 0;
4929 /* An estimate of the total number of operations needed. It is weighted by
4930 execution frequency when optimizing for speed but not when optimizing for
4931 size. In order to avoid double-counting, a node with a fanout of N will
4932 distribute 1/N of its total cost to each successor.
4934 This is the most important metric when optimizing for size, since
4935 it helps to keep the total number of operations to a minimum. */
4936 sreal total = 0;
4939 /* Construct costs for a node with weight WEIGHT. A higher weight
4940 indicates more frequent execution. IS_FOR_SIZE is true if we are
4941 optimizing for size rather than speed. */
4943 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
4944 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
4948 bool
4949 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
4951 return depth == other.depth && total == other.total;
4954 bool
4955 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
4957 return !operator== (other);
4960 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
4961 true if we are optimizing for size rather than speed. */
4963 bool
4964 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
4965 bool is_for_size) const
4967 if (is_for_size)
4969 if (total != other.total)
4970 return total < other.total;
4971 return depth < other.depth;
4973 else
4975 if (depth != other.depth)
4976 return depth < other.depth;
4977 return total < other.total;
4981 /* Increase the costs to account for something with cost INPUT_COST
4982 happening in parallel with the current costs. */
4984 void
4985 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
4987 depth = std::max (depth, input_cost.depth);
4988 total += input_cost.total;
4991 /* Increase the costs to account for something with cost INPUT_COST
4992 happening in series with the current costs. */
4994 void
4995 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
4997 depth += other.depth;
4998 total += other.total;
5001 /* Split the total cost among TIMES successors or predecessors. */
5003 void
5004 slpg_layout_cost::split (unsigned int times)
5006 if (times > 1)
5007 total /= times;
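/* A small usage sketch of the combinators above: starting from a cost
   of { depth 1, total 1 } and an input that also costs { 1, 1 },
   add_parallel_cost gives { 1, 2 }; adding a serial layout change of
   cost { 1, 1 } with add_serial_cost gives { 2, 3 }; and split (2)
   then charges { 2, 1.5 } towards each of two successors.  */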
5010 /* Information about one node in the SLP graph, for use during
5011 vect_optimize_slp_pass. */
5013 struct slpg_vertex
5015 slpg_vertex (slp_tree node_) : node (node_) {}
5017 /* The node itself. */
5018 slp_tree node;
5020 /* Which partition the node belongs to, or -1 if none. Nodes outside of
5021 partitions are flexible; they can have whichever layout consumers
5022 want them to have. */
5023 int partition = -1;
5025 /* The number of nodes that directly use the result of this one
5026 (i.e. the number of nodes that count this one as a child). */
5027 unsigned int out_degree = 0;
5029 /* The execution frequency of the node. */
5030 sreal weight = 0;
5032 /* The total execution frequency of all nodes that directly use the
5033 result of this one. */
5034 sreal out_weight = 0;
5037 /* Information about one partition of the SLP graph, for use during
5038 vect_optimize_slp_pass. */
5040 struct slpg_partition_info
5042 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
5043 of m_partitioned_nodes. */
5044 unsigned int node_begin = 0;
5045 unsigned int node_end = 0;
5047 /* Which layout we've chosen to use for this partition, or -1 if
5048 we haven't picked one yet. */
5049 int layout = -1;
5051 /* The number of predecessors and successors in the partition dag.
5052 The predecessors always have lower partition numbers and the
5053 successors always have higher partition numbers.
5055 Note that the directions of these edges are not necessarily the
5056 same as in the data flow graph. For example, if an SCC has separate
5057 partitions for an inner loop and an outer loop, the inner loop's
5058 partition will have at least two incoming edges from the outer loop's
5059 partition: one for a live-in value and one for a live-out value.
5060 In data flow terms, one of these edges would also be from the outer loop
5061 to the inner loop, but the other would be in the opposite direction. */
5062 unsigned int in_degree = 0;
5063 unsigned int out_degree = 0;
5066 /* Information about the costs of using a particular layout for a
5067 particular partition. It can also say that the combination is
5068 impossible. */
5070 struct slpg_partition_layout_costs
5072 bool is_possible () const { return internal_cost.is_possible (); }
5073 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
5075 /* The costs inherited from predecessor partitions. */
5076 slpg_layout_cost in_cost;
5078 /* The inherent cost of the layout within the node itself. For example,
5079 this is nonzero for a load if choosing a particular layout would require
5080 the load to permute the loaded elements. It is nonzero for a
5081 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5082 to full-vector moves. */
5083 slpg_layout_cost internal_cost;
5085 /* The costs inherited from successor partitions. */
5086 slpg_layout_cost out_cost;
5089 /* This class tries to optimize the layout of vectors in order to avoid
5090 unnecessary shuffling. At the moment, the set of possible layouts are
5091 restricted to bijective permutations.
5093 The goal of the pass depends on whether we're optimizing for size or
5094 for speed. When optimizing for size, the goal is to reduce the overall
5095 number of layout changes (including layout changes implied by things
5096 like load permutations). When optimizing for speed, the goal is to
5097 reduce the maximum latency attributable to layout changes on any
5098 non-cyclical path through the data flow graph.
5100 For example, when optimizing a loop nest for speed, we will prefer
5101 to make layout changes outside of a loop rather than inside of a loop,
5102 and will prefer to make layout changes in parallel rather than serially,
5103 even if that increases the overall number of layout changes.
5105 The high-level procedure is:
5107 (1) Build a graph in which edges go from uses (parents) to definitions
5108 (children).
5110 (2) Divide the graph into a dag of strongly-connected components (SCCs).
5112 (3) When optimizing for speed, partition the nodes in each SCC based
5113 on their containing cfg loop. When optimizing for size, treat
5114 each SCC as a single partition.
5116 This gives us a dag of partitions. The goal is now to assign a
5117 layout to each partition.
5119 (4) Construct a set of vector layouts that are worth considering.
5120 Record which nodes must keep their current layout.
5122 (5) Perform a forward walk over the partition dag (from loads to stores)
5123 accumulating the "forward" cost of using each layout. When visiting
5124 each partition, assign a tentative choice of layout to the partition
5125 and use that choice when calculating the cost of using a different
5126 layout in successor partitions.
5128 (6) Perform a backward walk over the partition dag (from stores to loads),
5129 accumulating the "backward" cost of using each layout. When visiting
5130 each partition, make a final choice of layout for that partition based
5131 on the accumulated forward costs (from (5)) and backward costs
5132 (from (6)).
5134 (7) Apply the chosen layouts to the SLP graph.
5136 For example, consider the SLP statements:
5138 S1: a_1 = load
5139 loop:
5140 S2: a_2 = PHI<a_1, a_3>
5141 S3: b_1 = load
5142 S4: a_3 = a_2 + b_1
5143 exit:
5144 S5: a_4 = PHI<a_3>
5145 S6: store a_4
5147 S2 and S4 form an SCC and are part of the same loop. Every other
5148 statement is in a singleton SCC. In this example there is a one-to-one
5149 mapping between SCCs and partitions and the partition dag looks like this:
5151 S1 S3
5153 S2+S4
5159 S2, S3 and S4 will have a higher execution frequency than the other
5160 statements, so when optimizing for speed, the goal is to avoid any
5161 layout changes:
5163 - within S3
5164 - within S2+S4
5165 - on the S3->S2+S4 edge
5167 For example, if S3 was originally a reversing load, the goal of the
5168 pass is to make it an unreversed load and change the layout on the
5169 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5170 on S1->S2+S4 and S5->S6 would also be acceptable.)
5172 The difference between SCCs and partitions becomes important if we
5173 add an outer loop:
5175 S1: a_1 = ...
5176 loop1:
5177 S2: a_2 = PHI<a_1, a_6>
5178 S3: b_1 = load
5179 S4: a_3 = a_2 + b_1
5180 loop2:
5181 S5: a_4 = PHI<a_3, a_5>
5182 S6: c_1 = load
5183 S7: a_5 = a_4 + c_1
5184 exit2:
5185 S8: a_6 = PHI<a_5>
5186 S9: store a_6
5187 exit1:
5189 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5190 for speed, we usually do not want restrictions in the outer loop to "infect"
5191 the decision for the inner loop. For example, if an outer-loop node
5192 in the SCC contains a statement with a fixed layout, that should not
5193 prevent the inner loop from using a different layout. Conversely,
5194 the inner loop should not dictate a layout to the outer loop: if the
5195 outer loop does a lot of computation, then it may not be efficient to
5196 do all of that computation in the inner loop's preferred layout.
5198 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5199 and S5+S7 (inner). We also try to arrange partitions so that:
5201 - the partition for an outer loop comes before the partition for
5202 an inner loop
5204 - if a sibling loop A dominates a sibling loop B, A's partition
5205 comes before B's
5207 This gives the following partition dag for the example above:
5209 S1 S3
5211 S2+S4+S8 S6
5212 | \\ /
5213 | S5+S7
5217 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5218 one for a reversal of the edge S7->S8.
5220 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5221 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5222 preferred layout against the cost of changing the layout on entry to the
5223 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5225 Although this works well when optimizing for speed, it has the downside
5226 when optimizing for size that the choice of layout for S5+S7 is completely
5227 independent of S9, which lessens the chance of reducing the overall number
5228 of permutations. We therefore do not partition SCCs when optimizing
5229 for size.
5231 To give a concrete example of the difference between optimizing
5232 for size and speed, consider:
5234 a[0] = (b[1] << c[3]) - d[1];
5235 a[1] = (b[0] << c[2]) - d[0];
5236 a[2] = (b[3] << c[1]) - d[3];
5237 a[3] = (b[2] << c[0]) - d[2];
5239 There are three different layouts here: one for a, one for b and d,
5240 and one for c. When optimizing for speed it is better to permute each
5241 of b, c and d into the order required by a, since those permutations
5242 happen in parallel. But when optimizing for size, it is better to:
5244 - permute c into the same order as b
5245 - do the arithmetic
5246 - permute the result into the order required by a
5248 This gives 2 permutations rather than 3. */
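/* In the terms of slpg_layout_cost below, the speed metric prefers the
   first option for the example above because the three input
   permutations happen in parallel (roughly depth 1, total 3), whereas
   the second option puts two layout changes in series on the path
   through c (roughly depth 2, total 2); the size metric compares the
   totals and so prefers the second.  */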
5250 class vect_optimize_slp_pass
5252 public:
5253 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5254 void run ();
5256 private:
5257 /* Graph building. */
5258 struct loop *containing_loop (slp_tree);
5259 bool is_cfg_latch_edge (graph_edge *);
5260 void build_vertices (hash_set<slp_tree> &, slp_tree);
5261 void build_vertices ();
5262 void build_graph ();
5264 /* Partitioning. */
5265 void create_partitions ();
5266 template<typename T> void for_each_partition_edge (unsigned int, T);
5268 /* Layout selection. */
5269 bool is_compatible_layout (slp_tree, unsigned int);
5270 int change_layout_cost (slp_tree, unsigned int, unsigned int);
5271 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5272 unsigned int);
5273 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5274 int, unsigned int);
5275 int internal_node_cost (slp_tree, int, unsigned int);
5276 void start_choosing_layouts ();
5278 /* Cost propagation. */
5279 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5280 unsigned int, unsigned int);
5281 slpg_layout_cost total_in_cost (unsigned int);
5282 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5283 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5284 void forward_pass ();
5285 void backward_pass ();
5287 /* Rematerialization. */
5288 slp_tree get_result_with_layout (slp_tree, unsigned int);
5289 void materialize ();
5291 /* Clean-up. */
5292 void remove_redundant_permutations ();
5294 void dump ();
5296 vec_info *m_vinfo;
5298 /* True if we should optimize the graph for size, false if we should
5299 optimize it for speed. (It wouldn't be easy to make this decision
5300 more locally.) */
5301 bool m_optimize_size;
5303 /* A graph of all SLP nodes, with edges leading from uses to definitions.
5304 In other words, a node's predecessors are its slp_tree parents and
5305 a node's successors are its slp_tree children. */
5306 graph *m_slpg = nullptr;
5308 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5309 auto_vec<slpg_vertex> m_vertices;
5311 /* The list of all leaves of M_SLPG, such as external definitions, constants,
5312 and loads. */
5313 auto_vec<int> m_leafs;
5315 /* This array has one entry for every vector layout that we're considering.
5316 Element 0 is null and indicates "no change". Other entries describe
5317 permutations that are inherent in the current graph and that we would
5318 like to reverse if possible.
5320 For example, a permutation { 1, 2, 3, 0 } means that something has
5321 effectively been permuted in that way, such as a load group
5322 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5323 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5324 in order to put things "back" in order. */
5325 auto_vec<vec<unsigned> > m_perms;
5327 /* A partitioning of the nodes for which a layout must be chosen.
5328 Each partition represents an <SCC, cfg loop> pair; that is,
5329 nodes in different SCCs belong to different partitions, and nodes
5330 within an SCC can be further partitioned according to a containing
5331 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5333 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5334 from leaves (such as loads) to roots (such as stores).
5336 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5337 auto_vec<slpg_partition_info> m_partitions;
5339 /* The list of all nodes for which a layout must be chosen. Nodes for
5340 partition P come before the nodes for partition P+1. Nodes within a
5341 partition are in reverse postorder. */
5342 auto_vec<unsigned int> m_partitioned_nodes;
5344 /* Index P * num-layouts + L contains the cost of using layout L
5345 for partition P. */
5346 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5348 /* Index N * num-layouts + L, if nonnull, is a node that provides the
5349 original output of node N adjusted to have layout L. */
5350 auto_vec<slp_tree> m_node_layouts;
5353 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
5354 Also record whether we should optimize anything for speed rather
5355 than size. */
5357 void
5358 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5359 slp_tree node)
5361 unsigned i;
5362 slp_tree child;
5364 if (visited.add (node))
5365 return;
5367 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5369 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5370 if (optimize_bb_for_speed_p (bb))
5371 m_optimize_size = false;
5374 node->vertex = m_vertices.length ();
5375 m_vertices.safe_push (slpg_vertex (node));
5377 bool leaf = true;
5378 bool force_leaf = false;
5379 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5380 if (child)
5382 leaf = false;
5383 build_vertices (visited, child);
5385 else
5386 force_leaf = true;
5387 /* Since SLP discovery works along use-def edges all cycles have an
5388 entry - but there's the exception of cycles where we do not handle
5389 the entry explicitly (but with a NULL SLP node), like some reductions
5390 and inductions. Force those SLP PHIs to act as leaves to make them
5391 backwards reachable. */
5392 if (leaf || force_leaf)
5393 m_leafs.safe_push (node->vertex);
5396 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
5398 void
5399 vect_optimize_slp_pass::build_vertices ()
5401 hash_set<slp_tree> visited;
5402 unsigned i;
5403 slp_instance instance;
5404 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5405 build_vertices (visited, SLP_INSTANCE_TREE (instance));
5408 /* Apply the bijective permutation PERM to VEC, or its inverse if REVERSE. */
5410 template <class T>
5411 static void
5412 vect_slp_permute (vec<unsigned> perm,
5413 vec<T> &vec, bool reverse)
5415 auto_vec<T, 64> saved;
5416 saved.create (vec.length ());
5417 for (unsigned i = 0; i < vec.length (); ++i)
5418 saved.quick_push (vec[i]);
5420 if (reverse)
5422 for (unsigned i = 0; i < vec.length (); ++i)
5423 vec[perm[i]] = saved[i];
5424 for (unsigned i = 0; i < vec.length (); ++i)
5425 gcc_assert (vec[perm[i]] == saved[i]);
5427 else
5429 for (unsigned i = 0; i < vec.length (); ++i)
5430 vec[i] = saved[perm[i]];
5431 for (unsigned i = 0; i < vec.length (); ++i)
5432 gcc_assert (vec[i] == saved[perm[i]]);
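/* For example (illustrative only): with PERM = { 1, 2, 3, 0 } and
   VEC = { a, b, c, d },

     vect_slp_permute (perm, vec, false) yields { b, c, d, a }
       (vec[i] = saved[perm[i]], the forward application)

     vect_slp_permute (perm, vec, true) yields { d, a, b, c }
       (vec[perm[i]] = saved[i], the inverse application)

   so applying one after the other restores the original order.  */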
5436 /* Return the cfg loop that contains NODE. */
5438 struct loop *
5439 vect_optimize_slp_pass::containing_loop (slp_tree node)
5441 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5442 if (!rep)
5443 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5444 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5447 /* Return true if UD (an edge from a use to a definition) is associated
5448 with a loop latch edge in the cfg. */
5450 bool
5451 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5453 slp_tree use = m_vertices[ud->src].node;
5454 slp_tree def = m_vertices[ud->dest].node;
5455 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5456 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5457 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5458 return false;
5460 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5461 return (is_a<gphi *> (use_rep->stmt)
5462 && bb_loop_header_p (gimple_bb (use_rep->stmt))
5463 && containing_loop (def) == containing_loop (use));
5466 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5467 a nonnull data field. */
5469 void
5470 vect_optimize_slp_pass::build_graph ()
5472 m_optimize_size = true;
5473 build_vertices ();
5475 m_slpg = new_graph (m_vertices.length ());
5476 for (slpg_vertex &v : m_vertices)
5477 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5478 if (child)
5480 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5481 if (is_cfg_latch_edge (ud))
5482 ud->data = this;
5486 /* Return true if E corresponds to a loop latch edge in the cfg. */
5488 static bool
5489 skip_cfg_latch_edges (graph_edge *e)
5491 return e->data;
5494 /* Create the node partitions. */
5496 void
5497 vect_optimize_slp_pass::create_partitions ()
5499 /* Calculate a postorder of the graph, ignoring edges that correspond
5500 to natural latch edges in the cfg. Reading the vector from the end
5501 to the beginning gives the reverse postorder. */
5502 auto_vec<int> initial_rpo;
5503 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5504 false, NULL, skip_cfg_latch_edges);
5505 gcc_assert (initial_rpo.length () == m_vertices.length ());
5507 /* Calculate the strongly connected components of the graph. */
5508 auto_vec<int> scc_grouping;
5509 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5511 /* Create a new index order in which all nodes from the same SCC are
5512 consecutive. Use scc_pos to record the index of the first node in
5513 each SCC. */
5514 auto_vec<unsigned int> scc_pos (num_sccs);
5515 int last_component = -1;
5516 unsigned int node_count = 0;
5517 for (unsigned int node_i : scc_grouping)
5519 if (last_component != m_slpg->vertices[node_i].component)
5521 last_component = m_slpg->vertices[node_i].component;
5522 gcc_assert (last_component == int (scc_pos.length ()));
5523 scc_pos.quick_push (node_count);
5525 node_count += 1;
5527 gcc_assert (node_count == initial_rpo.length ()
5528 && last_component + 1 == int (num_sccs));
5530 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5531 inside each SCC following the RPO we calculated above. The fact that
5532 we ignored natural latch edges when calculating the RPO should ensure
5533 that, for natural loop nests:
5535 - the first node that we encounter in a cfg loop is the loop header phi
5536 - the loop header phis are in dominance order
5538 Arranging for this is an optimization (see below) rather than a
5539 correctness issue. Unnatural loops with a tangled mess of backedges
5540 will still work correctly, but might give poorer results.
5542 Also update scc_pos so that it gives 1 + the index of the last node
5543 in the SCC. */
5544 m_partitioned_nodes.safe_grow (node_count);
5545 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5547 unsigned int node_i = initial_rpo[old_i];
5548 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5549 m_partitioned_nodes[new_i] = node_i;
5552 /* When optimizing for speed, partition each SCC based on the containing
5553 cfg loop. The order we constructed above should ensure that, for natural
5554 cfg loops, we'll create sub-SCC partitions for outer loops before
5555 the corresponding sub-SCC partitions for inner loops. Similarly,
5556 when one sibling loop A dominates another sibling loop B, we should
5557 create a sub-SCC partition for A before a sub-SCC partition for B.
5559 As above, nothing depends for correctness on whether this achieves
5560 a natural nesting, but we should get better results when it does. */
5561 m_partitions.reserve (m_vertices.length ());
5562 unsigned int next_partition_i = 0;
5563 hash_map<struct loop *, int> loop_partitions;
5564 unsigned int rpo_begin = 0;
5565 unsigned int num_partitioned_nodes = 0;
5566 for (unsigned int rpo_end : scc_pos)
5568 loop_partitions.empty ();
5569 unsigned int partition_i = next_partition_i;
5570 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5572 /* Handle externals and constants optimistically throughout.
5573 But treat existing vectors as fixed since we do not handle
5574 permuting them. */
5575 unsigned int node_i = m_partitioned_nodes[rpo_i];
5576 auto &vertex = m_vertices[node_i];
5577 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5578 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5579 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5580 vertex.partition = -1;
5581 else
5583 bool existed;
5584 if (m_optimize_size)
5585 existed = next_partition_i > partition_i;
5586 else
5588 struct loop *loop = containing_loop (vertex.node);
5589 auto &entry = loop_partitions.get_or_insert (loop, &existed);
5590 if (!existed)
5591 entry = next_partition_i;
5592 partition_i = entry;
5594 if (!existed)
5596 m_partitions.quick_push (slpg_partition_info ());
5597 next_partition_i += 1;
5599 vertex.partition = partition_i;
5600 num_partitioned_nodes += 1;
5601 m_partitions[partition_i].node_end += 1;
5604 rpo_begin = rpo_end;
5607 /* Assign ranges of consecutive node indices to each partition,
5608 in partition order. Start with node_end being the same as
5609 node_begin so that the next loop can use it as a counter. */
5610 unsigned int node_begin = 0;
5611 for (auto &partition : m_partitions)
5613 partition.node_begin = node_begin;
5614 node_begin += partition.node_end;
5615 partition.node_end = partition.node_begin;
5617 gcc_assert (node_begin == num_partitioned_nodes);
5619 /* Finally build the list of nodes in partition order. */
5620 m_partitioned_nodes.truncate (num_partitioned_nodes);
5621 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5623 int partition_i = m_vertices[node_i].partition;
5624 if (partition_i >= 0)
5626 unsigned int order_i = m_partitions[partition_i].node_end++;
5627 m_partitioned_nodes[order_i] = node_i;
5632 /* Look for edges from earlier partitions into node NODE_I and edges from
5633 node NODE_I into later partitions. Call:
5635 FN (ud, other_node_i)
5637 for each such use-to-def edge ud, where other_node_i is the node at the
5638 other end of the edge. */
5640 template<typename T>
5641 void
5642 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5644 int partition_i = m_vertices[node_i].partition;
5645 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5646 pred; pred = pred->pred_next)
5648 int src_partition_i = m_vertices[pred->src].partition;
5649 if (src_partition_i >= 0 && src_partition_i != partition_i)
5650 fn (pred, pred->src);
5652 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5653 succ; succ = succ->succ_next)
5655 int dest_partition_i = m_vertices[succ->dest].partition;
5656 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
5657 fn (succ, succ->dest);
5661 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
5662 that NODE would operate on. This test is independent of NODE's actual
5663 operation. */
5665 bool
5666 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
5667 unsigned int layout_i)
5669 if (layout_i == 0)
5670 return true;
5672 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
5673 return false;
5675 return true;
5678 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
5679 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
5680 layouts is incompatible with NODE or if the change is not possible for
5681 some other reason.
5683 The properties taken from NODE include the number of lanes and the
5684 vector type. The actual operation doesn't matter. */
5687 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
5688 unsigned int from_layout_i,
5689 unsigned int to_layout_i)
5691 if (!is_compatible_layout (node, from_layout_i)
5692 || !is_compatible_layout (node, to_layout_i))
5693 return -1;
5695 if (from_layout_i == to_layout_i)
5696 return 0;
5698 auto_vec<slp_tree, 1> children (1);
5699 children.quick_push (node);
5700 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
5701 if (from_layout_i > 0)
5702 for (unsigned int i : m_perms[from_layout_i])
5703 perm.quick_push ({ 0, i });
5704 else
5705 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
5706 perm.quick_push ({ 0, i });
5707 if (to_layout_i > 0)
5708 vect_slp_permute (m_perms[to_layout_i], perm, true);
5709 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
5710 children, false);
5711 if (count >= 0)
5712 return MAX (count, 1);
5714 /* ??? In principle we could try changing via layout 0, giving two
5715 layout changes rather than 1. Doing that would require
5716 corresponding support in get_result_with_layout. */
5717 return -1;
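/* For example (illustrative only): changing a 4-lane node from layout 0
   to layout { 1, 0, 3, 2 } is modelled as the single-input permutation
   { (0,1), (0,0), (0,3), (0,2) } applied to NODE itself; the result is
   the number of vector permutations the target would need for that,
   clamped to a minimum of 1.  */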
5720 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
5722 inline slpg_partition_layout_costs &
5723 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
5724 unsigned int layout_i)
5726 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
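/* For example (illustrative only): with three layouts, the costs for
   partition 2 occupy indices 6, 7 and 8 of m_partition_layout_costs.  */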
5729 /* Change PERM in one of two ways:
5731 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
5732 chosen for child I of NODE.
5734 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
5736 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
5738 void
5739 vect_optimize_slp_pass::
5740 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
5741 int in_layout_i, unsigned int out_layout_i)
5743 for (auto &entry : perm)
5745 int this_in_layout_i = in_layout_i;
5746 if (this_in_layout_i < 0)
5748 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
5749 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
5750 if (in_partition_i == -1u)
5751 continue;
5752 this_in_layout_i = m_partitions[in_partition_i].layout;
5754 if (this_in_layout_i > 0)
5755 entry.second = m_perms[this_in_layout_i][entry.second];
5757 if (out_layout_i > 0)
5758 vect_slp_permute (m_perms[out_layout_i], perm, true);
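/* For example (illustrative only): if an entry (1, 0) selects lane 0 of
   child 1, and that child's partition has chosen layout { 1, 3, 2, 0 },
   then the child's old lane 0 will sit in position 1 once that layout is
   materialized, so the entry is rewritten to (1, 1). A nonzero
   OUT_LAYOUT_I then reorders the whole permutation vector so that the
   node's output appears in that layout.  */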
5761 /* Check whether the target allows NODE to be rearranged so that the node's
5762 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
5763 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
5765 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
5766 NODE can adapt to the layout changes that have (perhaps provisionally)
5767 been chosen for NODE's children, so that no extra permutations are
5768 needed on either the input or the output of NODE.
5770 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
5771 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
5773 IN_LAYOUT_I has no meaning for other types of node.
5775 Keeping the node as-is is always valid. If the target doesn't appear
5776 to support the node as-is, but might realistically support other layouts,
5777 then layout 0 instead has the cost of a worst-case permutation. On the
5778 one hand, this ensures that every node has at least one valid layout,
5779 avoiding what would otherwise be an awkward special case. On the other,
5780 it still encourages the pass to change an invalid pre-existing layout
5781 choice into a valid one. */
5784 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
5785 unsigned int out_layout_i)
5787 const int fallback_cost = 1;
5789 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5791 auto_lane_permutation_t tmp_perm;
5792 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5794 /* Check that the child nodes support the chosen layout. Checking
5795 the first child is enough, since any second child would have the
5796 same shape. */
5797 auto first_child = SLP_TREE_CHILDREN (node)[0];
5798 if (in_layout_i > 0
5799 && !is_compatible_layout (first_child, in_layout_i))
5800 return -1;
5802 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
5803 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
5804 node, tmp_perm,
5805 SLP_TREE_CHILDREN (node),
5806 false);
5807 if (count < 0)
5809 if (in_layout_i == 0 && out_layout_i == 0)
5811 /* Use the fallback cost if the node could in principle support
5812 some nonzero layout for both the inputs and the outputs.
5813 Otherwise assume that the node will be rejected later
5814 and rebuilt from scalars. */
5815 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
5816 return fallback_cost;
5817 return 0;
5819 return -1;
5822 /* We currently have no way of telling whether the new layout is cheaper
5823 or more expensive than the old one. But at least in principle,
5824 it should be worth making zero permutations (whole-vector shuffles)
5825 cheaper than real permutations, in case the pass is able to remove
5826 the latter. */
5827 return count == 0 ? 0 : 1;
5830 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5831 if (rep
5832 && STMT_VINFO_DATA_REF (rep)
5833 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
5834 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
5836 auto_load_permutation_t tmp_perm;
5837 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5838 if (out_layout_i > 0)
5839 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
5841 poly_uint64 vf = 1;
5842 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
5843 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5844 unsigned int n_perms;
5845 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
5846 nullptr, vf, true, false, &n_perms))
5848 auto rep = SLP_TREE_REPRESENTATIVE (node);
5849 if (out_layout_i == 0)
5851 /* Use the fallback cost if the load is an N-to-N permutation.
5852 Otherwise assume that the node will be rejected later
5853 and rebuilt from scalars. */
5854 if (STMT_VINFO_GROUPED_ACCESS (rep)
5855 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
5856 == SLP_TREE_LANES (node)))
5857 return fallback_cost;
5858 return 0;
5860 return -1;
5863 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
5864 return n_perms == 0 ? 0 : 1;
5867 return 0;
5870 /* Decide which element layouts we should consider using. Calculate the
5871 weights associated with inserting layout changes on partition edges.
5872 Also mark partitions that cannot change layout, by setting their
5873 layout to zero. */
5875 void
5876 vect_optimize_slp_pass::start_choosing_layouts ()
5878 /* Used to assign unique permutation indices. */
5879 using perm_hash = unbounded_hashmap_traits<
5880 vec_free_hash_base<int_hash_base<unsigned>>,
5881 int_hash<int, -1, -2>
5883 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
5885 /* Layout 0 is "no change". */
5886 m_perms.safe_push (vNULL);
5888 /* Create layouts from existing permutations. */
5889 auto_load_permutation_t tmp_perm;
5890 for (unsigned int node_i : m_partitioned_nodes)
5892 /* Leaves also double as entries to the reverse graph. Allow the
5893 layout of those to be changed. */
5894 auto &vertex = m_vertices[node_i];
5895 auto &partition = m_partitions[vertex.partition];
5896 if (!m_slpg->vertices[node_i].succ)
5897 partition.layout = 0;
5899 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
5900 slp_tree node = vertex.node;
5901 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
5902 slp_tree child;
5903 unsigned HOST_WIDE_INT imin, imax = 0;
5904 bool any_permute = false;
5905 tmp_perm.truncate (0);
5906 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
5908 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
5909 unpermuted, record a layout that reverses this permutation.
5911 We would need more work to cope with loads that are internally
5912 permuted and also have inputs (such as masks for
5913 IFN_MASK_LOADs). */
5914 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
5915 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
5917 partition.layout = -1;
5918 continue;
5920 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
5921 imin = DR_GROUP_SIZE (dr_stmt) + 1;
5922 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5924 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
5925 && SLP_TREE_CHILDREN (node).length () == 1
5926 && (child = SLP_TREE_CHILDREN (node)[0])
5927 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
5928 .is_constant (&imin)))
5930 /* If the child has the same vector size as this node,
5931 reversing the permutation can make the permutation a no-op.
5932 In other cases it can change a true permutation into a
5933 full-vector extract. */
5934 tmp_perm.reserve (SLP_TREE_LANES (node));
5935 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5936 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
5938 else
5939 continue;
5941 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5943 unsigned idx = tmp_perm[j];
5944 imin = MIN (imin, idx);
5945 imax = MAX (imax, idx);
5946 if (idx - tmp_perm[0] != j)
5947 any_permute = true;
5949 /* If the span doesn't match we'd disrupt VF computation; avoid
5950 that for now. */
5951 if (imax - imin + 1 != SLP_TREE_LANES (node))
5952 continue;
5953 /* If there's no permute, there's no need to split one out. In this case
5954 we can consider turning a load into a permuted load, if that
5955 turns out to be cheaper than alternatives. */
5956 if (!any_permute)
5958 partition.layout = -1;
5959 continue;
5962 /* For now only handle true permutes, like
5963 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
5964 when permuting constants and invariants while keeping the permute
5965 bijective. */
5966 auto_sbitmap load_index (SLP_TREE_LANES (node));
5967 bitmap_clear (load_index);
5968 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5969 bitmap_set_bit (load_index, tmp_perm[j] - imin);
5970 unsigned j;
5971 for (j = 0; j < SLP_TREE_LANES (node); ++j)
5972 if (!bitmap_bit_p (load_index, j))
5973 break;
5974 if (j != SLP_TREE_LANES (node))
5975 continue;
5977 vec<unsigned> perm = vNULL;
5978 perm.safe_grow (SLP_TREE_LANES (node), true);
5979 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5980 perm[j] = tmp_perm[j] - imin;
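/* For example (illustrative only): a load permutation { 5, 7, 6, 4 }
   from a group of 8 elements has imin == 4, covers a contiguous span of
   SLP_TREE_LANES (node) == 4 elements and is bijective, so it records
   the candidate layout { 1, 3, 2, 0 }.  */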
5982 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
5984 /* Continue to use existing layouts, but don't add any more. */
5985 int *entry = layout_ids.get (perm);
5986 partition.layout = entry ? *entry : 0;
5987 perm.release ();
5989 else
5991 bool existed;
5992 int &layout_i = layout_ids.get_or_insert (perm, &existed);
5993 if (existed)
5994 perm.release ();
5995 else
5997 layout_i = m_perms.length ();
5998 m_perms.safe_push (perm);
6000 partition.layout = layout_i;
6004 /* Initially assume that every layout is possible and has zero cost
6005 in every partition. */
6006 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
6007 * m_perms.length ());
6009 /* We have to mark outgoing permutations facing non-associating-reduction
6010 graph entries that are not represented as to be materialized.
6011 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
6012 for (slp_instance instance : m_vinfo->slp_instances)
6013 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
6015 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6016 m_partitions[m_vertices[node_i].partition].layout = 0;
6018 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
6020 stmt_vec_info stmt_info
6021 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
6022 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
6023 if (needs_fold_left_reduction_p (TREE_TYPE
6024 (gimple_get_lhs (stmt_info->stmt)),
6025 STMT_VINFO_REDUC_CODE (reduc_info)))
6027 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6028 m_partitions[m_vertices[node_i].partition].layout = 0;
6032 /* Check which layouts each node and partition can handle. Calculate the
6033 weights associated with inserting layout changes on edges. */
6034 for (unsigned int node_i : m_partitioned_nodes)
6036 auto &vertex = m_vertices[node_i];
6037 auto &partition = m_partitions[vertex.partition];
6038 slp_tree node = vertex.node;
6040 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6042 vertex.weight = vect_slp_node_weight (node);
6044 /* We do not handle stores with a permutation, so all
6045 incoming permutations must have been materialized.
6047 We also don't handle masked grouped loads, which lack a
6048 permutation vector. In this case the memory locations
6049 form an implicit second input to the loads, on top of the
6050 explicit mask input, and the memory input's layout cannot
6051 be changed.
6053 On the other hand, we do support permuting gather loads and
6054 masked gather loads, where each scalar load is independent
6055 of the others. This can be useful if the address/index input
6056 benefits from permutation. */
6057 if (STMT_VINFO_DATA_REF (rep)
6058 && STMT_VINFO_GROUPED_ACCESS (rep)
6059 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
6060 partition.layout = 0;
6062 /* We cannot change the layout of an operation that is
6063 not lane-independent. Note this is an explicit
6064 negative list since that's much shorter than the respective
6065 positive one but it's critical to keep maintaining it. */
6066 if (is_gimple_call (STMT_VINFO_STMT (rep)))
6067 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
6069 case CFN_COMPLEX_ADD_ROT90:
6070 case CFN_COMPLEX_ADD_ROT270:
6071 case CFN_COMPLEX_MUL:
6072 case CFN_COMPLEX_MUL_CONJ:
6073 case CFN_VEC_ADDSUB:
6074 case CFN_VEC_FMADDSUB:
6075 case CFN_VEC_FMSUBADD:
6076 partition.layout = 0;
6077 default:;
6081 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6083 auto &other_vertex = m_vertices[other_node_i];
6085 /* Count the number of edges from earlier partitions and the number
6086 of edges to later partitions. */
6087 if (other_vertex.partition < vertex.partition)
6088 partition.in_degree += 1;
6089 else
6090 partition.out_degree += 1;
6092 /* If the current node uses the result of OTHER_NODE_I, accumulate
6093 the effects of that. */
6094 if (ud->src == int (node_i))
6096 other_vertex.out_weight += vertex.weight;
6097 other_vertex.out_degree += 1;
6100 for_each_partition_edge (node_i, process_edge);
6104 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6105 its current (provisional) choice of layout. The inputs do not necessarily
6106 have the same layout as each other. */
6108 slpg_layout_cost
6109 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6111 auto &vertex = m_vertices[node_i];
6112 slpg_layout_cost cost;
6113 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6115 auto &other_vertex = m_vertices[other_node_i];
6116 if (other_vertex.partition < vertex.partition)
6118 auto &other_partition = m_partitions[other_vertex.partition];
6119 auto &other_costs = partition_layout_costs (other_vertex.partition,
6120 other_partition.layout);
6121 slpg_layout_cost this_cost = other_costs.in_cost;
6122 this_cost.add_serial_cost (other_costs.internal_cost);
6123 this_cost.split (other_partition.out_degree);
6124 cost.add_parallel_cost (this_cost);
6127 for_each_partition_edge (node_i, add_cost);
6128 return cost;
6131 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6132 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6133 slpg_layout_cost::impossible () if the change isn't possible. */
6135 slpg_layout_cost
6136 vect_optimize_slp_pass::
6137 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6138 unsigned int layout2_i)
6140 auto &def_vertex = m_vertices[ud->dest];
6141 auto &use_vertex = m_vertices[ud->src];
6142 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6143 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6144 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6145 use_layout_i);
6146 if (factor < 0)
6147 return slpg_layout_cost::impossible ();
6149 /* We have a choice of putting the layout change at the site of the
6150 definition or at the site of the use. Prefer the former when
6151 optimizing for size or when the execution frequency of the
6152 definition is no greater than the combined execution frequencies of
6153 the uses. When putting the layout change at the site of the definition,
6154 divvy up the cost among all consumers. */
6155 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6157 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6158 cost.split (def_vertex.out_degree);
6159 return cost;
6161 return { use_vertex.weight * factor, m_optimize_size };
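/* For example (illustrative only): if the definition runs once per outer
   loop iteration (weight 1) and feeds two uses in an inner loop (weight
   10 each, so out_weight 20 and out_degree 2), the change is placed at
   the definition and each use-def edge is charged (1 * factor) / 2.
   When optimizing for speed and the definition is hotter than its
   combined uses, the full use-site cost is charged instead.  */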
6164 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6165 partition; FROM_NODE_I could be the definition node or the use node.
6166 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6167 Return the cost of any necessary fix-ups on edge UD, or return
6168 slpg_layout_cost::impossible () if the change isn't possible.
6170 At this point, FROM_NODE_I's partition has chosen the cheapest
6171 layout based on the information available so far, but this choice
6172 is only provisional. */
6174 slpg_layout_cost
6175 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6176 unsigned int to_layout_i)
6178 auto &from_vertex = m_vertices[from_node_i];
6179 unsigned int from_partition_i = from_vertex.partition;
6180 slpg_partition_info &from_partition = m_partitions[from_partition_i];
6181 gcc_assert (from_partition.layout >= 0);
6183 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6184 with its current layout preference. */
6185 slpg_layout_cost cost = slpg_layout_cost::impossible ();
6186 auto edge_cost = edge_layout_cost (ud, from_node_i,
6187 from_partition.layout, to_layout_i);
6188 if (edge_cost.is_possible ())
6190 auto &from_costs = partition_layout_costs (from_partition_i,
6191 from_partition.layout);
6192 cost = from_costs.in_cost;
6193 cost.add_serial_cost (from_costs.internal_cost);
6194 cost.split (from_partition.out_degree);
6195 cost.add_serial_cost (edge_cost);
6197 else if (from_partition.layout == 0)
6198 /* We must allow the source partition to have layout 0 as a fallback,
6199 in case all other options turn out to be impossible. */
6200 return cost;
6202 /* Take the minimum of that cost and the cost that applies if
6203 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6204 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6205 to_layout_i);
6206 if (direct_layout_costs.is_possible ())
6208 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6209 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6210 direct_cost.split (from_partition.out_degree);
6211 if (!cost.is_possible ()
6212 || direct_cost.is_better_than (cost, m_optimize_size))
6213 cost = direct_cost;
6216 return cost;
6219 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6220 partition; TO_NODE_I could be the definition node or the use node.
6221 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6222 return the cost of any necessary fix-ups on edge UD, or
6223 slpg_layout_cost::impossible () if the choice cannot be made.
6225 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6227 slpg_layout_cost
6228 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6229 unsigned int from_layout_i)
6231 auto &to_vertex = m_vertices[to_node_i];
6232 unsigned int to_partition_i = to_vertex.partition;
6233 slpg_partition_info &to_partition = m_partitions[to_partition_i];
6234 gcc_assert (to_partition.layout >= 0);
6236 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6237 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6238 any other inputs keep their current choice of layout. */
6239 auto &to_costs = partition_layout_costs (to_partition_i,
6240 to_partition.layout);
6241 if (ud->src == int (to_node_i)
6242 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6244 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6245 auto old_layout = from_partition.layout;
6246 from_partition.layout = from_layout_i;
6247 int factor = internal_node_cost (to_vertex.node, -1,
6248 to_partition.layout);
6249 from_partition.layout = old_layout;
6250 if (factor >= 0)
6252 slpg_layout_cost cost = to_costs.out_cost;
6253 cost.add_serial_cost ({ to_vertex.weight * factor,
6254 m_optimize_size });
6255 cost.split (to_partition.in_degree);
6256 return cost;
6260 /* Compute the cost if we insert any necessary layout change on edge UD. */
6261 auto edge_cost = edge_layout_cost (ud, to_node_i,
6262 to_partition.layout, from_layout_i);
6263 if (edge_cost.is_possible ())
6265 slpg_layout_cost cost = to_costs.out_cost;
6266 cost.add_serial_cost (to_costs.internal_cost);
6267 cost.split (to_partition.in_degree);
6268 cost.add_serial_cost (edge_cost);
6269 return cost;
6272 return slpg_layout_cost::impossible ();
6275 /* Make a forward pass through the partitions, accumulating input costs.
6276 Make a tentative (provisional) choice of layout for each partition,
6277 ensuring that this choice still allows later partitions to keep
6278 their original layout. */
6280 void
6281 vect_optimize_slp_pass::forward_pass ()
6283 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6284 ++partition_i)
6286 auto &partition = m_partitions[partition_i];
6288 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6289 the incoming cost that would apply if every predecessor partition
6290 keeps its current layout. This is used within the loop below. */
6291 slpg_layout_cost in_cost;
6292 slp_tree single_node = nullptr;
6293 if (partition.node_end == partition.node_begin + 1)
6295 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6296 single_node = m_vertices[node_i].node;
6297 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6298 in_cost = total_in_cost (node_i);
6301 /* Go through the possible layouts. Decide which ones are valid
6302 for this partition and record which of the valid layouts has
6303 the lowest cost. */
6304 unsigned int min_layout_i = 0;
6305 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6306 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6308 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6309 if (!layout_costs.is_possible ())
6310 continue;
6312 /* If the recorded layout is already 0 then the layout cannot
6313 change. */
6314 if (partition.layout == 0 && layout_i != 0)
6316 layout_costs.mark_impossible ();
6317 continue;
6320 bool is_possible = true;
6321 for (unsigned int order_i = partition.node_begin;
6322 order_i < partition.node_end; ++order_i)
6324 unsigned int node_i = m_partitioned_nodes[order_i];
6325 auto &vertex = m_vertices[node_i];
6327 /* Reject the layout if it is individually incompatible
6328 with any node in the partition. */
6329 if (!is_compatible_layout (vertex.node, layout_i))
6331 is_possible = false;
6332 break;
6335 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6337 auto &other_vertex = m_vertices[other_node_i];
6338 if (other_vertex.partition < vertex.partition)
6340 /* Accumulate the incoming costs from earlier
6341 partitions, plus the cost of any layout changes
6342 on UD itself. */
6343 auto cost = forward_cost (ud, other_node_i, layout_i);
6344 if (!cost.is_possible ())
6345 is_possible = false;
6346 else
6347 layout_costs.in_cost.add_parallel_cost (cost);
6349 else
6350 /* Reject the layout if it would make layout 0 impossible
6351 for later partitions. This amounts to testing that the
6352 target supports reversing the layout change on edges
6353 to later partitions.
6355 In principle, it might be possible to push a layout
6356 change all the way down a graph, so that it never
6357 needs to be reversed and so that the target doesn't
6358 need to support the reverse operation. But it would
6359 be awkward to bail out if we hit a partition that
6360 does not support the new layout, especially since
6361 we are not dealing with a lattice. */
6362 is_possible &= edge_layout_cost (ud, other_node_i, 0,
6363 layout_i).is_possible ();
6365 for_each_partition_edge (node_i, add_cost);
6367 /* Accumulate the cost of using LAYOUT_I within NODE,
6368 both for the inputs and the outputs. */
6369 int factor = internal_node_cost (vertex.node, layout_i,
6370 layout_i);
6371 if (factor < 0)
6373 is_possible = false;
6374 break;
6376 else if (factor)
6377 layout_costs.internal_cost.add_serial_cost
6378 ({ vertex.weight * factor, m_optimize_size });
6380 if (!is_possible)
6382 layout_costs.mark_impossible ();
6383 continue;
6386 /* Combine the incoming and partition-internal costs. */
6387 slpg_layout_cost combined_cost = layout_costs.in_cost;
6388 combined_cost.add_serial_cost (layout_costs.internal_cost);
6390 /* If this partition consists of a single VEC_PERM_EXPR, see
6391 if the VEC_PERM_EXPR can be changed to support output layout
6392 LAYOUT_I while keeping all the provisional choices of input
6393 layout. */
6394 if (single_node
6395 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6397 int factor = internal_node_cost (single_node, -1, layout_i);
6398 if (factor >= 0)
6400 auto weight = m_vertices[single_node->vertex].weight;
6401 slpg_layout_cost internal_cost
6402 = { weight * factor, m_optimize_size };
6404 slpg_layout_cost alt_cost = in_cost;
6405 alt_cost.add_serial_cost (internal_cost);
6406 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6408 combined_cost = alt_cost;
6409 layout_costs.in_cost = in_cost;
6410 layout_costs.internal_cost = internal_cost;
6415 /* Record the layout with the lowest cost. Prefer layout 0 in
6416 the event of a tie between it and another layout. */
6417 if (!min_layout_cost.is_possible ()
6418 || combined_cost.is_better_than (min_layout_cost,
6419 m_optimize_size))
6421 min_layout_i = layout_i;
6422 min_layout_cost = combined_cost;
6426 /* This loop's handling of earlier partitions should ensure that
6427 choosing the original layout for the current partition is no
6428 less valid than it was in the original graph, even with the
6429 provisional layout choices for those earlier partitions. */
6430 gcc_assert (min_layout_cost.is_possible ());
6431 partition.layout = min_layout_i;
6435 /* Make a backward pass through the partitions, accumulating output costs.
6436 Make a final choice of layout for each partition. */
6438 void
6439 vect_optimize_slp_pass::backward_pass ()
6441 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6443 auto &partition = m_partitions[partition_i];
6445 unsigned int min_layout_i = 0;
6446 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6447 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6449 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6450 if (!layout_costs.is_possible ())
6451 continue;
6453 /* Accumulate the costs from successor partitions. */
6454 bool is_possible = true;
6455 for (unsigned int order_i = partition.node_begin;
6456 order_i < partition.node_end; ++order_i)
6458 unsigned int node_i = m_partitioned_nodes[order_i];
6459 auto &vertex = m_vertices[node_i];
6460 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6462 auto &other_vertex = m_vertices[other_node_i];
6463 auto &other_partition = m_partitions[other_vertex.partition];
6464 if (other_vertex.partition > vertex.partition)
6466 /* Accumulate the incoming costs from later
6467 partitions, plus the cost of any layout changes
6468 on UD itself. */
6469 auto cost = backward_cost (ud, other_node_i, layout_i);
6470 if (!cost.is_possible ())
6471 is_possible = false;
6472 else
6473 layout_costs.out_cost.add_parallel_cost (cost);
6475 else
6476 /* Make sure that earlier partitions can (if necessary
6477 or beneficial) keep the layout that they chose in
6478 the forward pass. This ensures that there is at
6479 least one valid choice of layout. */
6480 is_possible &= edge_layout_cost (ud, other_node_i,
6481 other_partition.layout,
6482 layout_i).is_possible ();
6484 for_each_partition_edge (node_i, add_cost);
6486 if (!is_possible)
6488 layout_costs.mark_impossible ();
6489 continue;
6492 /* Locally combine the costs from the forward and backward passes.
6493 (This combined cost is not passed on, since that would lead
6494 to double counting.) */
6495 slpg_layout_cost combined_cost = layout_costs.in_cost;
6496 combined_cost.add_serial_cost (layout_costs.internal_cost);
6497 combined_cost.add_serial_cost (layout_costs.out_cost);
6499 /* Record the layout with the lowest cost. Prefer layout 0 in
6500 the event of a tie between it and another layout. */
6501 if (!min_layout_cost.is_possible ()
6502 || combined_cost.is_better_than (min_layout_cost,
6503 m_optimize_size))
6505 min_layout_i = layout_i;
6506 min_layout_cost = combined_cost;
6510 gcc_assert (min_layout_cost.is_possible ());
6511 partition.layout = min_layout_i;
6515 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6516 NODE already has the layout that was selected for its partition. */
6518 slp_tree
6519 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6520 unsigned int to_layout_i)
6522 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6523 slp_tree result = m_node_layouts[result_i];
6524 if (result)
6525 return result;
6527 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6528 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6529 /* We can't permute vector defs in place. */
6530 && SLP_TREE_VEC_DEFS (node).is_empty ()))
6532 /* If the vector is uniform or unchanged, there's nothing to do. */
6533 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6534 result = node;
6535 else
6537 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6538 result = vect_create_new_slp_node (scalar_ops);
6539 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6542 else
6544 unsigned int partition_i = m_vertices[node->vertex].partition;
6545 unsigned int from_layout_i = m_partitions[partition_i].layout;
6546 if (from_layout_i == to_layout_i)
6547 return node;
6549 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6550 permutation instead of a serial one. Leave the new permutation
6551 in TMP_PERM on success. */
6552 auto_lane_permutation_t tmp_perm;
6553 unsigned int num_inputs = 1;
6554 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6556 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6557 if (from_layout_i != 0)
6558 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6559 if (to_layout_i != 0)
6560 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6561 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6562 tmp_perm,
6563 SLP_TREE_CHILDREN (node),
6564 false) >= 0)
6565 num_inputs = SLP_TREE_CHILDREN (node).length ();
6566 else
6567 tmp_perm.truncate (0);
6570 if (dump_enabled_p ())
6572 if (tmp_perm.length () > 0)
6573 dump_printf_loc (MSG_NOTE, vect_location,
6574 "duplicating permutation node %p with"
6575 " layout %d\n",
6576 (void *) node, to_layout_i);
6577 else
6578 dump_printf_loc (MSG_NOTE, vect_location,
6579 "inserting permutation node in place of %p\n",
6580 (void *) node);
6583 unsigned int num_lanes = SLP_TREE_LANES (node);
6584 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6585 if (SLP_TREE_SCALAR_STMTS (node).length ())
6587 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6588 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6589 if (from_layout_i != 0)
6590 vect_slp_permute (m_perms[from_layout_i], stmts, false);
6591 if (to_layout_i != 0)
6592 vect_slp_permute (m_perms[to_layout_i], stmts, true);
6594 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6595 SLP_TREE_LANES (result) = num_lanes;
6596 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6597 result->vertex = -1;
6599 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6600 if (tmp_perm.length ())
6602 lane_perm.safe_splice (tmp_perm);
6603 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6605 else
6607 lane_perm.create (num_lanes);
6608 for (unsigned j = 0; j < num_lanes; ++j)
6609 lane_perm.quick_push ({ 0, j });
6610 if (from_layout_i != 0)
6611 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6612 if (to_layout_i != 0)
6613 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6614 SLP_TREE_CHILDREN (result).safe_push (node);
6616 for (slp_tree child : SLP_TREE_CHILDREN (result))
6617 child->refcnt++;
6619 m_node_layouts[result_i] = result;
6620 return result;
6623 /* Apply the chosen vector layouts to the SLP graph. */
6625 void
6626 vect_optimize_slp_pass::materialize ()
6628 /* We no longer need the costs, so avoid having two O(N * P) arrays
6629 live at the same time. */
6630 m_partition_layout_costs.release ();
6631 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6633 auto_sbitmap fully_folded (m_vertices.length ());
6634 bitmap_clear (fully_folded);
6635 for (unsigned int node_i : m_partitioned_nodes)
6637 auto &vertex = m_vertices[node_i];
6638 slp_tree node = vertex.node;
6639 int layout_i = m_partitions[vertex.partition].layout;
6640 gcc_assert (layout_i >= 0);
6642 /* Rearrange the scalar statements to match the chosen layout. */
6643 if (layout_i > 0)
6644 vect_slp_permute (m_perms[layout_i],
6645 SLP_TREE_SCALAR_STMTS (node), true);
6647 /* Update load and lane permutations. */
6648 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6650 /* First try to absorb the input vector layouts. If that fails,
6651 force the inputs to have layout LAYOUT_I too. We checked that
6652 that was possible before deciding to use nonzero output layouts.
6653 (Note that at this stage we don't really have any guarantee that
6654 the target supports the original VEC_PERM_EXPR.) */
6655 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6656 auto_lane_permutation_t tmp_perm;
6657 tmp_perm.safe_splice (perm);
6658 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
6659 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6660 tmp_perm,
6661 SLP_TREE_CHILDREN (node),
6662 false) >= 0)
6664 if (dump_enabled_p ()
6665 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
6666 perm.begin ()))
6667 dump_printf_loc (MSG_NOTE, vect_location,
6668 "absorbing input layouts into %p\n",
6669 (void *) node);
6670 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
6671 bitmap_set_bit (fully_folded, node_i);
6673 else
6675 /* Not MSG_MISSED because it would make no sense to users. */
6676 if (dump_enabled_p ())
6677 dump_printf_loc (MSG_NOTE, vect_location,
6678 "failed to absorb input layouts into %p\n",
6679 (void *) node);
6680 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
6683 else
6685 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
6686 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
6687 if (layout_i > 0)
6688 /* ??? When we handle non-bijective permutes the idea
6689 is that we can force the load-permutation to be
6690 { min, min + 1, min + 2, ... max }. But then the
6691 scalar defs might no longer match the lane content
6692 which means wrong-code with live lane vectorization.
6693 So we possibly have to have NULL entries for those. */
6694 vect_slp_permute (m_perms[layout_i], load_perm, true);
6698 /* Do this before any nodes disappear, since it involves a walk
6699 over the leaves. */
6700 remove_redundant_permutations ();
6702 /* Replace each child with a correctly laid-out version. */
6703 for (unsigned int node_i : m_partitioned_nodes)
6705 /* Skip nodes that have already been handled above. */
6706 if (bitmap_bit_p (fully_folded, node_i))
6707 continue;
6709 auto &vertex = m_vertices[node_i];
6710 int in_layout_i = m_partitions[vertex.partition].layout;
6711 gcc_assert (in_layout_i >= 0);
6713 unsigned j;
6714 slp_tree child;
6715 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
6717 if (!child)
6718 continue;
6720 slp_tree new_child = get_result_with_layout (child, in_layout_i);
6721 if (new_child != child)
6723 vect_free_slp_tree (child);
6724 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
6725 new_child->refcnt += 1;
6731 /* Elide load permutations that are not necessary. Such permutations might
6732 be pre-existing, rather than created by the layout optimizations. */
6734 void
6735 vect_optimize_slp_pass::remove_redundant_permutations ()
6737 for (unsigned int node_i : m_leafs)
6739 slp_tree node = m_vertices[node_i].node;
6740 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
6741 continue;
6743 /* In basic block vectorization we allow any subchain of an interleaving
6744 chain.
6745 FORNOW: not in loop SLP because of realignment complications. */
6746 if (is_a <bb_vec_info> (m_vinfo))
6748 bool subchain_p = true;
6749 stmt_vec_info next_load_info = NULL;
6750 stmt_vec_info load_info;
6751 unsigned j;
6752 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6754 if (j != 0
6755 && (next_load_info != load_info
6756 || ! load_info
6757 || DR_GROUP_GAP (load_info) != 1))
6759 subchain_p = false;
6760 break;
6762 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
6764 if (subchain_p)
6766 SLP_TREE_LOAD_PERMUTATION (node).release ();
6767 continue;
6770 else
6772 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
6773 stmt_vec_info load_info;
6774 bool this_load_permuted = false;
6775 unsigned j;
6776 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6777 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
6779 this_load_permuted = true;
6780 break;
6782 /* When this isn't a grouped access we know it's a single element
6783 and contiguous. */
6784 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
6786 if (!this_load_permuted
6787 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6788 || SLP_TREE_LANES (node) == 1))
6789 SLP_TREE_LOAD_PERMUTATION (node).release ();
6790 continue;
6792 stmt_vec_info first_stmt_info
6793 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
6794 if (!this_load_permuted
6795 /* The load requires permutation when unrolling exposes
6796 a gap either because the group is larger than the SLP
6797 group-size or because there is a gap between the groups. */
6798 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6799 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
6800 && DR_GROUP_GAP (first_stmt_info) == 0)))
6802 SLP_TREE_LOAD_PERMUTATION (node).release ();
6803 continue;
6809 /* Print the partition graph and layout information to the dump file. */
6811 void
6812 vect_optimize_slp_pass::dump ()
6814 dump_printf_loc (MSG_NOTE, vect_location,
6815 "SLP optimize permutations:\n");
6816 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
6818 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
6819 const char *sep = "";
6820 for (unsigned int idx : m_perms[layout_i])
6822 dump_printf (MSG_NOTE, "%s%d", sep, idx);
6823 sep = ", ";
6825 dump_printf (MSG_NOTE, " }\n");
6827 dump_printf_loc (MSG_NOTE, vect_location,
6828 "SLP optimize partitions:\n");
6829 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6830 ++partition_i)
6832 auto &partition = m_partitions[partition_i];
6833 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
6834 dump_printf_loc (MSG_NOTE, vect_location,
6835 " partition %d (layout %d):\n",
6836 partition_i, partition.layout);
6837 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
6838 for (unsigned int order_i = partition.node_begin;
6839 order_i < partition.node_end; ++order_i)
6841 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
6842 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
6843 (void *) vertex.node);
6844 dump_printf_loc (MSG_NOTE, vect_location,
6845 " weight: %f\n",
6846 vertex.weight.to_double ());
6847 if (vertex.out_degree)
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 " out weight: %f (degree %d)\n",
6850 vertex.out_weight.to_double (),
6851 vertex.out_degree);
6852 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
6853 dump_printf_loc (MSG_NOTE, vect_location,
6854 " op: VEC_PERM_EXPR\n");
6855 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
6856 dump_printf_loc (MSG_NOTE, vect_location,
6857 " op template: %G", rep->stmt);
6859 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
6860 for (unsigned int order_i = partition.node_begin;
6861 order_i < partition.node_end; ++order_i)
6863 unsigned int node_i = m_partitioned_nodes[order_i];
6864 auto &vertex = m_vertices[node_i];
6865 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
6867 auto &other_vertex = m_vertices[other_node_i];
6868 if (other_vertex.partition < vertex.partition)
6869 dump_printf_loc (MSG_NOTE, vect_location,
6870 " - %p [%d] --> %p\n",
6871 (void *) other_vertex.node,
6872 other_vertex.partition,
6873 (void *) vertex.node);
6874 else
6875 dump_printf_loc (MSG_NOTE, vect_location,
6876 " - %p --> [%d] %p\n",
6877 (void *) vertex.node,
6878 other_vertex.partition,
6879 (void *) other_vertex.node);
6881 for_each_partition_edge (node_i, print_edge);
6884 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6886 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6887 if (layout_costs.is_possible ())
6889 dump_printf_loc (MSG_NOTE, vect_location,
6890 " layout %d:%s\n", layout_i,
6891 partition.layout == int (layout_i)
6892 ? " (*)" : "");
6893 slpg_layout_cost combined_cost = layout_costs.in_cost;
6894 combined_cost.add_serial_cost (layout_costs.internal_cost);
6895 combined_cost.add_serial_cost (layout_costs.out_cost);
6896 #define TEMPLATE "{depth: %f, total: %f}"
6897 dump_printf_loc (MSG_NOTE, vect_location,
6898 " " TEMPLATE "\n",
6899 layout_costs.in_cost.depth.to_double (),
6900 layout_costs.in_cost.total.to_double ());
6901 dump_printf_loc (MSG_NOTE, vect_location,
6902 " + " TEMPLATE "\n",
6903 layout_costs.internal_cost.depth.to_double (),
6904 layout_costs.internal_cost.total.to_double ());
6905 dump_printf_loc (MSG_NOTE, vect_location,
6906 " + " TEMPLATE "\n",
6907 layout_costs.out_cost.depth.to_double (),
6908 layout_costs.out_cost.total.to_double ());
6909 dump_printf_loc (MSG_NOTE, vect_location,
6910 " = " TEMPLATE "\n",
6911 combined_cost.depth.to_double (),
6912 combined_cost.total.to_double ());
6913 #undef TEMPLATE
6915 else
6916 dump_printf_loc (MSG_NOTE, vect_location,
6917 " layout %d: rejected\n", layout_i);
6922 /* Main entry point for the SLP graph optimization pass. */
6924 void
6925 vect_optimize_slp_pass::run ()
6927 build_graph ();
6928 create_partitions ();
6929 start_choosing_layouts ();
6930 if (m_perms.length () > 1)
6932 forward_pass ();
6933 backward_pass ();
6934 if (dump_enabled_p ())
6935 dump ();
6936 materialize ();
6937 while (!m_perms.is_empty ())
6938 m_perms.pop ().release ();
6940 else
6941 remove_redundant_permutations ();
6942 free_graph (m_slpg);
6945 /* Apply CSE to NODE and its children using BST_MAP. */
6947 static void
6948 vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
6950 bool put_p = false;
6951 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
6952 /* Besides some VEC_PERM_EXPR, two-operator nodes also
6953 lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
6954 we'd have something that works for all internal and external nodes. */
6955 && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
6957 slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
6958 if (leader)
6960 /* We've visited this node already. */
6961 if (!*leader || *leader == node)
6962 return;
6964 if (dump_enabled_p ())
6965 dump_printf_loc (MSG_NOTE, vect_location,
6966 "re-using SLP tree %p for %p\n",
6967 (void *)*leader, (void *)node);
6968 vect_free_slp_tree (node);
6969 (*leader)->refcnt += 1;
6970 node = *leader;
6971 return;
6974 /* Avoid creating a cycle by populating the map only after recursion. */
6975 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
6976 node->refcnt += 1;
6977 put_p = true;
6978 /* And recurse. */
6981 for (slp_tree &child : SLP_TREE_CHILDREN (node))
6982 if (child)
6983 vect_cse_slp_nodes (bst_map, child);
6985 /* Now record the node for CSE in other siblings. */
6986 if (put_p)
6987 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
6990 /* Optimize the SLP graph of VINFO. */
6992 void
6993 vect_optimize_slp (vec_info *vinfo)
6995 if (vinfo->slp_instances.is_empty ())
6996 return;
6997 vect_optimize_slp_pass (vinfo).run ();
6999 /* Apply CSE again to nodes after permute optimization. */
7000 scalar_stmts_to_slp_tree_map_t *bst_map
7001 = new scalar_stmts_to_slp_tree_map_t ();
7003 for (auto inst : vinfo->slp_instances)
7004 vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
7006 release_scalar_stmts_to_slp_tree_map (bst_map);
7009 /* Gather loads reachable from the individual SLP graph entries. */
7011 void
7012 vect_gather_slp_loads (vec_info *vinfo)
7014 unsigned i;
7015 slp_instance instance;
7016 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
7018 hash_set<slp_tree> visited;
7019 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
7020 SLP_INSTANCE_TREE (instance), visited);
7025 /* For each possible SLP instance decide whether to SLP it and calculate overall
7026 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
7027 least one instance. */
7029 bool
7030 vect_make_slp_decision (loop_vec_info loop_vinfo)
7032 unsigned int i;
7033 poly_uint64 unrolling_factor = 1;
7034 const vec<slp_instance> &slp_instances
7035 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7036 slp_instance instance;
7037 int decided_to_slp = 0;
7039 DUMP_VECT_SCOPE ("vect_make_slp_decision");
7041 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7043 /* FORNOW: SLP if you can. */
7044 /* All unroll factors have the form:
7046 GET_MODE_SIZE (vinfo->vector_mode) * X
7048 for some rational X, so they must have a common multiple. */
7049 unrolling_factor
7050 = force_common_multiple (unrolling_factor,
7051 SLP_INSTANCE_UNROLLING_FACTOR (instance));
7053 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
7054 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
7055 loop-based vectorization. Such stmts will be marked as HYBRID. */
7056 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7057 decided_to_slp++;
7060 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
7062 if (decided_to_slp && dump_enabled_p ())
7064 dump_printf_loc (MSG_NOTE, vect_location,
7065 "Decided to SLP %d instances. Unrolling factor ",
7066 decided_to_slp);
7067 dump_dec (MSG_NOTE, unrolling_factor);
7068 dump_printf (MSG_NOTE, "\n");
7071 return (decided_to_slp > 0);
7074 /* Private data for vect_detect_hybrid_slp. */
7075 struct vdhs_data
7077 loop_vec_info loop_vinfo;
7078 vec<stmt_vec_info> *worklist;
7081 /* Walker for walk_gimple_op. */
7083 static tree
7084 vect_detect_hybrid_slp (tree *tp, int *, void *data)
7086 walk_stmt_info *wi = (walk_stmt_info *)data;
7087 vdhs_data *dat = (vdhs_data *)wi->info;
7089 if (wi->is_lhs)
7090 return NULL_TREE;
7092 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7093 if (!def_stmt_info)
7094 return NULL_TREE;
7095 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
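 /* The stmt being walked needs loop-based vectorization, so a pure-SLP
    def it uses must also be available as a scalar; mark that def hybrid
    and queue it so its own operands get processed as well.  */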
7096 if (PURE_SLP_STMT (def_stmt_info))
7098 if (dump_enabled_p ())
7099 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7100 def_stmt_info->stmt);
7101 STMT_SLP_TYPE (def_stmt_info) = hybrid;
7102 dat->worklist->safe_push (def_stmt_info);
7105 return NULL_TREE;
 7108 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
 7109 if so, otherwise push it to WORKLIST. */
7111 static void
7112 maybe_push_to_hybrid_worklist (vec_info *vinfo,
7113 vec<stmt_vec_info> &worklist,
7114 stmt_vec_info stmt_info)
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_NOTE, vect_location,
7118 "Processing hybrid candidate : %G", stmt_info->stmt);
7119 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7120 imm_use_iterator iter2;
7121 ssa_op_iter iter1;
7122 use_operand_p use_p;
7123 def_operand_p def_p;
7124 bool any_def = false;
7125 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7127 any_def = true;
7128 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7130 if (is_gimple_debug (USE_STMT (use_p)))
7131 continue;
7132 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
 7133 /* An out-of-loop use means this is a loop_vect sink. */
7134 if (!use_info)
7136 if (dump_enabled_p ())
7137 dump_printf_loc (MSG_NOTE, vect_location,
7138 "Found loop_vect sink: %G", stmt_info->stmt);
7139 worklist.safe_push (stmt_info);
7140 return;
7142 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7144 if (dump_enabled_p ())
7145 dump_printf_loc (MSG_NOTE, vect_location,
7146 "Found loop_vect use: %G", use_info->stmt);
7147 worklist.safe_push (stmt_info);
7148 return;
 7152 /* No def means this is a loop_vect sink. */
7153 if (!any_def)
7155 if (dump_enabled_p ())
7156 dump_printf_loc (MSG_NOTE, vect_location,
7157 "Found loop_vect sink: %G", stmt_info->stmt);
7158 worklist.safe_push (stmt_info);
7159 return;
7161 if (dump_enabled_p ())
7162 dump_printf_loc (MSG_NOTE, vect_location,
7163 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7164 STMT_SLP_TYPE (stmt_info) = pure_slp;
7167 /* Find stmts that must be both vectorized and SLPed. */
7169 void
7170 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7172 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7174 /* All stmts participating in SLP are marked pure_slp, all other
7175 stmts are loop_vect.
7176 First collect all loop_vect stmts into a worklist.
7177 SLP patterns cause not all original scalar stmts to appear in
7178 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
 7179 Rectify this here and do a backward walk over the IL, only considering
 7180 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
 7181 marking them as pure_slp. */
7182 auto_vec<stmt_vec_info> worklist;
7183 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7185 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7186 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7187 gsi_next (&gsi))
7189 gphi *phi = gsi.phi ();
7190 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7191 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7192 maybe_push_to_hybrid_worklist (loop_vinfo,
7193 worklist, stmt_info);
7195 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7196 gsi_prev (&gsi))
7198 gimple *stmt = gsi_stmt (gsi);
7199 if (is_gimple_debug (stmt))
7200 continue;
7201 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7202 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7204 for (gimple_stmt_iterator gsi2
7205 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7206 !gsi_end_p (gsi2); gsi_next (&gsi2))
7208 stmt_vec_info patt_info
7209 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7210 if (!STMT_SLP_TYPE (patt_info)
7211 && STMT_VINFO_RELEVANT (patt_info))
7212 maybe_push_to_hybrid_worklist (loop_vinfo,
7213 worklist, patt_info);
7215 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7217 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7218 maybe_push_to_hybrid_worklist (loop_vinfo,
7219 worklist, stmt_info);
7223 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
7224 mark any SLP vectorized stmt as hybrid.
7225 ??? We're visiting def stmts N times (once for each non-SLP and
7226 once for each hybrid-SLP use). */
7227 walk_stmt_info wi;
7228 vdhs_data dat;
7229 dat.worklist = &worklist;
7230 dat.loop_vinfo = loop_vinfo;
7231 memset (&wi, 0, sizeof (wi));
7232 wi.info = (void *)&dat;
7233 while (!worklist.is_empty ())
7235 stmt_vec_info stmt_info = worklist.pop ();
7236 /* Since SSA operands are not set up for pattern stmts we need
7237 to use walk_gimple_op. */
7238 wi.is_lhs = 0;
7239 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
 7240 /* For gather/scatter make sure to walk the offset operand, which
 7241 can be a scaling and conversion away. */
7242 gather_scatter_info gs_info;
7243 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7244 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7246 int dummy;
7247 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7253 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
7255 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7256 : vec_info (vec_info::bb, shared),
7257 roots (vNULL)
7259 /* The region we are operating on. bbs[0] is the entry, excluding
7260 its PHI nodes. In the future we might want to track an explicit
7261 entry edge to cover bbs[0] PHI nodes and have a region entry
7262 insert location. */
7263 bbs = _bbs.address ();
7264 nbbs = _bbs.length ();
7266 for (unsigned i = 0; i < nbbs; ++i)
7268 if (i != 0)
7269 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7270 gsi_next (&si))
7272 gphi *phi = si.phi ();
7273 gimple_set_uid (phi, 0);
7274 add_stmt (phi);
7276 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7277 !gsi_end_p (gsi); gsi_next (&gsi))
7279 gimple *stmt = gsi_stmt (gsi);
7280 gimple_set_uid (stmt, 0);
7281 if (is_gimple_debug (stmt))
7282 continue;
7283 add_stmt (stmt);
7289 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7290 stmts in the basic block. */
7292 _bb_vec_info::~_bb_vec_info ()
7294 /* Reset region marker. */
7295 for (unsigned i = 0; i < nbbs; ++i)
7297 if (i != 0)
7298 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7299 gsi_next (&si))
7301 gphi *phi = si.phi ();
7302 gimple_set_uid (phi, -1);
7304 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7305 !gsi_end_p (gsi); gsi_next (&gsi))
7307 gimple *stmt = gsi_stmt (gsi);
7308 gimple_set_uid (stmt, -1);
7312 for (unsigned i = 0; i < roots.length (); ++i)
7314 roots[i].stmts.release ();
7315 roots[i].roots.release ();
7316 roots[i].remain.release ();
7318 roots.release ();
7321 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
 7322 given that child nodes have already been processed, and that
7323 their def types currently match their SLP node's def type. */
7325 static bool
7326 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7327 slp_instance node_instance,
7328 stmt_vector_for_cost *cost_vec)
7330 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7332 /* Calculate the number of vector statements to be created for the scalar
7333 stmts in this node. It is the number of scalar elements in one scalar
7334 iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
 7335 elements in a vector. For a single-defuse-cycle, a lane-reducing op, and a
 7336 PHI statement that starts a reduction comprised of only lane-reducing ops,
 7337 the number is larger than the number of vector statements actually required. */
7338 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7340 /* Handle purely internal nodes. */
7341 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7343 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7344 return false;
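 /* The permutation itself has been costed above; also verify that any
    live lanes can be extracted via vectorizable_live_operation.  */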
7346 stmt_vec_info slp_stmt_info;
7347 unsigned int i;
7348 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7350 if (slp_stmt_info
7351 && STMT_VINFO_LIVE_P (slp_stmt_info)
7352 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7353 node_instance, i,
7354 false, cost_vec))
7355 return false;
7357 return true;
7360 bool dummy;
7361 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7362 node, node_instance, cost_vec);
7365 /* Try to build NODE from scalars, returning true on success.
7366 NODE_INSTANCE is the SLP instance that contains NODE. */
7368 static bool
7369 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7370 slp_instance node_instance)
7372 stmt_vec_info stmt_info;
7373 unsigned int i;
7375 if (!is_a <bb_vec_info> (vinfo)
7376 || node == SLP_INSTANCE_TREE (node_instance)
7377 || !SLP_TREE_SCALAR_STMTS (node).exists ()
7378 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7379 /* Force the mask use to be built from scalars instead. */
7380 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
7381 return false;
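 /* Building from scalars requires a scalar stmt for every lane.  */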
7383 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7384 if (!stmt_info)
7385 return false;
7387 if (dump_enabled_p ())
7388 dump_printf_loc (MSG_NOTE, vect_location,
7389 "Building vector operands of %p from scalars instead\n",
7390 (void *) node);
7392 /* Don't remove and free the child nodes here, since they could be
7393 referenced by other structures. The analysis and scheduling phases
7394 (need to) ignore child nodes of anything that isn't vect_internal_def. */
7395 unsigned int group_size = SLP_TREE_LANES (node);
7396 SLP_TREE_DEF_TYPE (node) = vect_external_def;
7397 /* Invariants get their vector type from the uses. */
7398 SLP_TREE_VECTYPE (node) = NULL_TREE;
7399 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7400 SLP_TREE_LOAD_PERMUTATION (node).release ();
7401 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7403 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7404 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7406 return true;
7409 /* Return true if all elements of the slice are the same. */
7410 bool
7411 vect_scalar_ops_slice::all_same_p () const
7413 for (unsigned int i = 1; i < length; ++i)
7414 if (!operand_equal_p (op (0), op (i)))
7415 return false;
7416 return true;
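 /* Hash a slice of scalar operands by iteratively hashing each operand.  */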
7419 hashval_t
7420 vect_scalar_ops_slice_hash::hash (const value_type &s)
7422 hashval_t hash = 0;
7423 for (unsigned i = 0; i < s.length; ++i)
7424 hash = iterative_hash_expr (s.op (i), hash);
7425 return hash;
7428 bool
7429 vect_scalar_ops_slice_hash::equal (const value_type &s1,
7430 const compare_type &s2)
7432 if (s1.length != s2.length)
7433 return false;
7434 for (unsigned i = 0; i < s1.length; ++i)
7435 if (!operand_equal_p (s1.op (i), s2.op (i)))
7436 return false;
7437 return true;
7440 /* Compute the prologue cost for invariant or constant operands represented
7441 by NODE. */
7443 static void
7444 vect_prologue_cost_for_slp (slp_tree node,
7445 stmt_vector_for_cost *cost_vec)
7447 /* There's a special case of an existing vector, that costs nothing. */
7448 if (SLP_TREE_SCALAR_OPS (node).length () == 0
7449 && !SLP_TREE_VEC_DEFS (node).is_empty ())
7450 return;
 7451 /* Without looking at the actual initializer a vector of
 7452 constants can be implemented as a load from the constant pool.
7453 When all elements are the same we can use a splat. */
7454 tree vectype = SLP_TREE_VECTYPE (node);
7455 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7456 unsigned HOST_WIDE_INT const_nunits;
7457 unsigned nelt_limit;
7458 auto ops = &SLP_TREE_SCALAR_OPS (node);
7459 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7460 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7461 && ! multiple_p (const_nunits, group_size))
7463 nelt_limit = const_nunits;
7464 hash_set<vect_scalar_ops_slice_hash> vector_ops;
7465 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7466 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
7467 starts.quick_push (i * const_nunits);
7469 else
7471 /* If either the vector has variable length or the vectors
7472 are composed of repeated whole groups we only need to
7473 cost construction once. All vectors will be the same. */
7474 nelt_limit = group_size;
7475 starts.quick_push (0);
7477 /* ??? We're just tracking whether vectors in a single node are the same.
7478 Ideally we'd do something more global. */
7479 bool passed = false;
7480 for (unsigned int start : starts)
7482 vect_cost_for_stmt kind;
7483 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7484 kind = vector_load;
7485 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7486 kind = scalar_to_vec;
7487 else
7488 kind = vec_construct;
7489 /* The target cost hook has no idea which part of the SLP node
7490 we are costing so avoid passing it down more than once. Pass
7491 it to the first vec_construct or scalar_to_vec part since for those
7492 the x86 backend tries to account for GPR to XMM register moves. */
7493 record_stmt_cost (cost_vec, 1, kind,
7494 (kind != vector_load && !passed) ? node : nullptr,
7495 vectype, 0, vect_prologue);
7496 if (kind != vector_load)
7497 passed = true;
7501 /* Analyze statements contained in SLP tree NODE after recursively analyzing
7502 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7504 Return true if the operations are supported. */
7506 static bool
7507 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7508 slp_instance node_instance,
7509 hash_set<slp_tree> &visited_set,
7510 vec<slp_tree> &visited_vec,
7511 stmt_vector_for_cost *cost_vec)
7513 int i, j;
7514 slp_tree child;
7516 /* Assume we can code-generate all invariants. */
7517 if (!node
7518 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7519 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7520 return true;
7522 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_NOTE, vect_location,
7526 "Failed cyclic SLP reference in %p\n", (void *) node);
7527 return false;
7529 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
7531 /* If we already analyzed the exact same set of scalar stmts we're done.
7532 We share the generated vector stmts for those. */
7533 if (visited_set.add (node))
7534 return true;
7535 visited_vec.safe_push (node);
7537 bool res = true;
7538 unsigned visited_rec_start = visited_vec.length ();
7539 unsigned cost_vec_rec_start = cost_vec->length ();
7540 bool seen_non_constant_child = false;
7541 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7543 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
7544 visited_set, visited_vec,
7545 cost_vec);
7546 if (!res)
7547 break;
7548 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
7549 seen_non_constant_child = true;
7551 /* We're having difficulties scheduling nodes with just constant
7552 operands and no scalar stmts since we then cannot compute a stmt
7553 insertion place. */
7554 if (res
7555 && !seen_non_constant_child
7556 && SLP_TREE_SCALAR_STMTS (node).is_empty ())
7558 if (dump_enabled_p ())
7559 dump_printf_loc (MSG_NOTE, vect_location,
7560 "Cannot vectorize all-constant op node %p\n",
7561 (void *) node);
7562 res = false;
7565 if (res)
7566 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
7567 cost_vec);
 7568 /* If analysis failed we have to pop all recursively visited nodes
 7569 plus ourselves. */
7570 if (!res)
7572 while (visited_vec.length () >= visited_rec_start)
7573 visited_set.remove (visited_vec.pop ());
7574 cost_vec->truncate (cost_vec_rec_start);
7577 /* When the node can be vectorized cost invariant nodes it references.
 7578 This is not done in DFS order to allow the referring node's
 7579 vectorizable_* calls to nail down the invariant nodes' vector type
 7580 and possibly unshare it if it needs a different vector type than
 7581 other referrers. */
7582 if (res)
7583 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
7584 if (child
7585 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
7586 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
7587 /* Perform usual caching, note code-generation still
7588 code-gens these nodes multiple times but we expect
7589 to CSE them later. */
7590 && !visited_set.add (child))
7592 visited_vec.safe_push (child);
7593 /* ??? After auditing more code paths make a "default"
7594 and push the vector type from NODE to all children
7595 if it is not already set. */
7596 /* Compute the number of vectors to be generated. */
7597 tree vector_type = SLP_TREE_VECTYPE (child);
7598 if (!vector_type)
7600 /* For shifts with a scalar argument we don't need
7601 to cost or code-generate anything.
 7602 ??? Represent this more explicitly. */
7603 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
7604 == shift_vec_info_type)
7605 && j == 1);
7606 continue;
7609 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
7610 = vect_get_num_copies (vinfo, child);
7611 /* And cost them. */
7612 vect_prologue_cost_for_slp (child, cost_vec);
7615 /* If this node or any of its children can't be vectorized, try pruning
7616 the tree here rather than felling the whole thing. */
7617 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
 7619 /* We'll need to revisit this for invariant costing and for setting
 7620 the number of vectorized stmts. */
7621 res = true;
7624 return res;
 7627 /* Given a definition DEF, analyze whether it will have any live scalar use
 7628 after performing the SLP vectorization described by BB_VINFO, and record
 7629 the result in hash map SCALAR_USE_MAP as a cache for later fast checks.
 7630 If recursion DEPTH exceeds a limit, stop the analysis and make a
 7631 conservative assumption. Return 0 if there is no scalar use, 1 if there
 7632 is one, and -1 if recursion was limited. */
7634 static int
7635 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
7636 hash_map<tree, int> &scalar_use_map,
7637 int depth = 0)
7639 const int depth_limit = 2;
7640 imm_use_iterator use_iter;
7641 gimple *use_stmt;
7643 if (int *res = scalar_use_map.get (def))
7644 return *res;
7646 int scalar_use = 1;
7648 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
7650 if (is_gimple_debug (use_stmt))
7651 continue;
7653 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
7655 if (!use_stmt_info)
7656 break;
7658 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7659 continue;
 7661 /* Do not step forward when encountering a PHI statement, since it may
 7662 involve a cyclic reference and cause infinite recursion. */
7663 if (gimple_code (use_stmt) == GIMPLE_PHI)
7664 break;
 7666 /* When pattern recognition is involved, a statement whose definition is
 7667 consumed in some pattern may not be included in the final replacement
 7668 pattern statements, and so would be skipped when building the SLP graph.
7670 * Original
7671 char a_c = *(char *) a;
7672 char b_c = *(char *) b;
7673 unsigned short a_s = (unsigned short) a_c;
7674 int a_i = (int) a_s;
7675 int b_i = (int) b_c;
7676 int r_i = a_i - b_i;
7678 * After pattern replacement
7679 a_s = (unsigned short) a_c;
7680 a_i = (int) a_s;
7682 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
7683 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
7685 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
7686 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
 7688 The definitions of a_i (original statement) and b_i (pattern statement)
 7689 are related to, but not actually part of, the widen_minus pattern.
 7690 Vectorizing the pattern does not cause these definition statements to
 7691 be marked as PURE_SLP. For this case, we need to recursively check
 7692 whether their uses are all absorbed into vectorized code. There is an
 7693 exception, though: some use may participate in a vectorized
 7694 operation via an external SLP node containing that use as an element.
 7695 The parameter "scalar_use_map" tags such SSA names as having a scalar
 7696 use in advance. */
7697 tree lhs = gimple_get_lhs (use_stmt);
7699 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
7700 break;
7702 if (depth_limit && depth >= depth_limit)
7703 return -1;
7705 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
7706 depth + 1)))
7707 break;
7710 if (end_imm_use_stmt_p (&use_iter))
7711 scalar_use = 0;
7713 /* If recursion is limited, do not cache result for non-root defs. */
7714 if (!depth || scalar_use >= 0)
7716 bool added = scalar_use_map.put (def, scalar_use);
7717 gcc_assert (!added);
7720 return scalar_use;
7723 /* Mark lanes of NODE that are live outside of the basic-block vectorized
7724 region and that can be vectorized using vectorizable_live_operation
 7725 with STMT_VINFO_LIVE_P. Unhandled live operations will cause the
 7726 scalar code computing them to be retained. */
7728 static void
7729 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
7730 slp_instance instance,
7731 stmt_vector_for_cost *cost_vec,
7732 hash_map<tree, int> &scalar_use_map,
7733 hash_set<stmt_vec_info> &svisited,
7734 hash_set<slp_tree> &visited)
7736 if (visited.add (node))
7737 return;
7739 unsigned i;
7740 stmt_vec_info stmt_info;
7741 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
7742 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7744 if (!stmt_info || svisited.contains (stmt_info))
7745 continue;
7746 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7747 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
7748 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
7749 /* Only the pattern root stmt computes the original scalar value. */
7750 continue;
7751 bool mark_visited = true;
7752 gimple *orig_stmt = orig_stmt_info->stmt;
7753 ssa_op_iter op_iter;
7754 def_operand_p def_p;
7755 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
7757 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
7758 scalar_use_map))
7760 STMT_VINFO_LIVE_P (stmt_info) = true;
7761 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
7762 instance, i, false, cost_vec))
7763 /* ??? So we know we can vectorize the live stmt from one SLP
7764 node. If we cannot do so from all or none consistently
7765 we'd have to record which SLP node (and lane) we want to
7766 use for the live operation. So make sure we can
7767 code-generate from all nodes. */
7768 mark_visited = false;
7769 else
7770 STMT_VINFO_LIVE_P (stmt_info) = false;
7773 /* We have to verify whether we can insert the lane extract
7774 before all uses. The following is a conservative approximation.
7775 We cannot put this into vectorizable_live_operation because
7776 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
7777 doesn't work.
 7778 Note that while the fact that we emit code for loads at the
 7779 first load should make this a non-problem, leaves we construct
 7780 from scalars are vectorized after the last scalar def.
7781 ??? If we'd actually compute the insert location during
7782 analysis we could use sth less conservative than the last
7783 scalar stmt in the node for the dominance check. */
7784 /* ??? What remains is "live" uses in vector CTORs in the same
7785 SLP graph which is where those uses can end up code-generated
7786 right after their definition instead of close to their original
7787 use. But that would restrict us to code-generate lane-extracts
7788 from the latest stmt in a node. So we compensate for this
7789 during code-generation, simply not replacing uses for those
7790 hopefully rare cases. */
7791 imm_use_iterator use_iter;
7792 gimple *use_stmt;
7793 stmt_vec_info use_stmt_info;
7795 if (STMT_VINFO_LIVE_P (stmt_info))
7796 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
7797 if (!is_gimple_debug (use_stmt)
7798 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
7799 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7800 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
7802 if (dump_enabled_p ())
7803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7804 "Cannot determine insertion place for "
7805 "lane extract\n");
7806 STMT_VINFO_LIVE_P (stmt_info) = false;
7807 mark_visited = true;
7810 if (mark_visited)
7811 svisited.add (stmt_info);
7814 slp_tree child;
7815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7816 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7817 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
7818 scalar_use_map, svisited, visited);
7821 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
7822 are live outside of the basic-block vectorized region and that can be
7823 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
7825 static void
7826 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
7828 if (bb_vinfo->slp_instances.is_empty ())
7829 return;
7831 hash_set<stmt_vec_info> svisited;
7832 hash_set<slp_tree> visited;
7833 hash_map<tree, int> scalar_use_map;
7834 auto_vec<slp_tree> worklist;
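 /* Pre-seed SCALAR_USE_MAP: SSA names consumed as remaining defs of a BB
    reduction or as scalar operands of external SLP nodes keep a scalar
    use even when their definition is vectorized.  */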
7836 for (slp_instance instance : bb_vinfo->slp_instances)
7838 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
7839 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
7840 if (TREE_CODE (op) == SSA_NAME)
7841 scalar_use_map.put (op, 1);
7842 if (!visited.add (SLP_INSTANCE_TREE (instance)))
7843 worklist.safe_push (SLP_INSTANCE_TREE (instance));
7848 slp_tree node = worklist.pop ();
7850 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
7852 for (tree op : SLP_TREE_SCALAR_OPS (node))
7853 if (TREE_CODE (op) == SSA_NAME)
7854 scalar_use_map.put (op, 1);
7856 else
7858 for (slp_tree child : SLP_TREE_CHILDREN (node))
7859 if (child && !visited.add (child))
7860 worklist.safe_push (child);
7863 while (!worklist.is_empty ());
7865 visited.empty ();
7867 for (slp_instance instance : bb_vinfo->slp_instances)
7869 vect_location = instance->location ();
7870 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
7871 instance, &instance->cost_vec,
7872 scalar_use_map, svisited, visited);
7876 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
7878 static bool
7879 vectorizable_bb_reduc_epilogue (slp_instance instance,
7880 stmt_vector_for_cost *cost_vec)
7882 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
7883 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
7884 if (reduc_code == MINUS_EXPR)
7885 reduc_code = PLUS_EXPR;
7886 internal_fn reduc_fn;
7887 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
7888 if (!vectype
7889 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7890 || reduc_fn == IFN_LAST
7891 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
7892 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
7893 TREE_TYPE (vectype)))
7895 if (dump_enabled_p ())
7896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897 "not vectorized: basic block reduction epilogue "
7898 "operation unsupported.\n");
7899 return false;
7902 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
7903 cost log2 vector operations plus shuffles and one extraction. */
7904 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
7905 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
7906 vectype, 0, vect_body);
7907 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
7908 vectype, 0, vect_body);
7909 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
7910 vectype, 0, vect_body);
7912 /* Since we replace all stmts of a possibly longer scalar reduction
7913 chain account for the extra scalar stmts for that. */
7914 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
7915 instance->root_stmts[0], 0, vect_body);
7916 return true;
7919 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
7920 and recurse to children. */
7922 static void
7923 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
7924 hash_set<slp_tree> &visited)
7926 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7927 || visited.add (node))
7928 return;
7930 stmt_vec_info stmt;
7931 unsigned i;
7932 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
7933 if (stmt)
7934 roots.remove (vect_orig_stmt (stmt));
7936 slp_tree child;
7937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7938 if (child)
7939 vect_slp_prune_covered_roots (child, roots, visited);
7942 /* Analyze statements in SLP instances of VINFO. Return true if the
7943 operations are supported. */
7945 bool
7946 vect_slp_analyze_operations (vec_info *vinfo)
7948 slp_instance instance;
7949 int i;
7951 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
7953 hash_set<slp_tree> visited;
7954 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
7956 auto_vec<slp_tree> visited_vec;
7957 stmt_vector_for_cost cost_vec;
7958 cost_vec.create (2);
7959 if (is_a <bb_vec_info> (vinfo))
7960 vect_location = instance->location ();
7961 if (!vect_slp_analyze_node_operations (vinfo,
7962 SLP_INSTANCE_TREE (instance),
7963 instance, visited, visited_vec,
7964 &cost_vec)
7965 /* CTOR instances require vectorized defs for the SLP tree root. */
7966 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
7967 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
7968 != vect_internal_def
7969 /* Make sure we vectorized with the expected type. */
7970 || !useless_type_conversion_p
7971 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
7972 (instance->root_stmts[0]->stmt))),
7973 TREE_TYPE (SLP_TREE_VECTYPE
7974 (SLP_INSTANCE_TREE (instance))))))
7975 /* Check we can vectorize the reduction. */
7976 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
7977 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
7979 cost_vec.release ();
7980 slp_tree node = SLP_INSTANCE_TREE (instance);
7981 stmt_vec_info stmt_info;
7982 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7983 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
7984 else
7985 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7986 if (is_a <loop_vec_info> (vinfo))
7988 if (dump_enabled_p ())
7989 dump_printf_loc (MSG_NOTE, vect_location,
7990 "unsupported SLP instance starting from: %G",
7991 stmt_info->stmt);
7992 return false;
7994 if (dump_enabled_p ())
7995 dump_printf_loc (MSG_NOTE, vect_location,
7996 "removing SLP instance operations starting from: %G",
7997 stmt_info->stmt);
7998 vect_free_slp_instance (instance);
7999 vinfo->slp_instances.ordered_remove (i);
8000 while (!visited_vec.is_empty ())
8001 visited.remove (visited_vec.pop ());
8003 else
8005 i++;
8006 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
8008 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
8009 cost_vec.release ();
8011 else
8012 /* For BB vectorization remember the SLP graph entry
8013 cost for later. */
8014 instance->cost_vec = cost_vec;
8018 /* Now look for SLP instances with a root that are covered by other
8019 instances and remove them. */
8020 hash_set<stmt_vec_info> roots;
8021 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8022 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8023 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
8024 if (!roots.is_empty ())
8026 visited.empty ();
8027 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8028 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
8029 visited);
8030 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8031 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
8032 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
8034 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8035 if (dump_enabled_p ())
8036 dump_printf_loc (MSG_NOTE, vect_location,
8037 "removing SLP instance operations starting "
8038 "from: %G", root->stmt);
8039 vect_free_slp_instance (instance);
8040 vinfo->slp_instances.ordered_remove (i);
8042 else
8043 ++i;
8046 /* Compute vectorizable live stmts. */
8047 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
8048 vect_bb_slp_mark_live_stmts (bb_vinfo);
8050 return !vinfo->slp_instances.is_empty ();
 8053 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
 8054 transitively compressing the leader chain along the way. */
8056 static slp_instance
8057 get_ultimate_leader (slp_instance instance,
8058 hash_map<slp_instance, slp_instance> &instance_leader)
8060 auto_vec<slp_instance *, 8> chain;
8061 slp_instance *tem;
8062 while (*(tem = instance_leader.get (instance)) != instance)
8064 chain.safe_push (tem);
8065 instance = *tem;
8067 while (!chain.is_empty ())
8068 *chain.pop () = instance;
8069 return instance;
8072 namespace {
8073 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
8074 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
8075 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
8077 INSTANCE_LEADER is as for get_ultimate_leader. */
8079 template<typename T>
8080 bool
8081 vect_map_to_instance (slp_instance instance, T key,
8082 hash_map<T, slp_instance> &key_to_instance,
8083 hash_map<slp_instance, slp_instance> &instance_leader)
8085 bool existed_p;
8086 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
8087 if (!existed_p)
8089 else if (key_instance != instance)
8091 /* If we're running into a previously marked key make us the
8092 leader of the current ultimate leader. This keeps the
8093 leader chain acyclic and works even when the current instance
8094 connects two previously independent graph parts. */
8095 slp_instance key_leader
8096 = get_ultimate_leader (key_instance, instance_leader);
8097 if (key_leader != instance)
8098 instance_leader.put (key_leader, instance);
8100 key_instance = instance;
8101 return existed_p;
8105 /* Worker of vect_bb_partition_graph, recurse on NODE. */
8107 static void
8108 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8109 slp_instance instance, slp_tree node,
8110 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8111 hash_map<slp_tree, slp_instance> &node_to_instance,
8112 hash_map<slp_instance, slp_instance> &instance_leader)
8114 stmt_vec_info stmt_info;
8115 unsigned i;
8117 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8118 if (stmt_info)
8119 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8120 instance_leader);
8122 if (vect_map_to_instance (instance, node, node_to_instance,
8123 instance_leader))
8124 return;
8126 slp_tree child;
8127 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8128 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8129 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8130 node_to_instance, instance_leader);
8133 /* Partition the SLP graph into pieces that can be costed independently. */
8135 static void
8136 vect_bb_partition_graph (bb_vec_info bb_vinfo)
8138 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8140 /* First walk the SLP graph assigning each involved scalar stmt a
8141 corresponding SLP graph entry and upon visiting a previously
 8142 marked stmt, make the stmt's leader the current SLP graph entry. */
8143 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8144 hash_map<slp_tree, slp_instance> node_to_instance;
8145 hash_map<slp_instance, slp_instance> instance_leader;
8146 slp_instance instance;
8147 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8149 instance_leader.put (instance, instance);
8150 vect_bb_partition_graph_r (bb_vinfo,
8151 instance, SLP_INSTANCE_TREE (instance),
8152 stmt_to_instance, node_to_instance,
8153 instance_leader);
8156 /* Then collect entries to each independent subgraph. */
8157 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8159 slp_instance leader = get_ultimate_leader (instance, instance_leader);
8160 leader->subgraph_entries.safe_push (instance);
8161 if (dump_enabled_p ()
8162 && leader != instance)
8163 dump_printf_loc (MSG_NOTE, vect_location,
8164 "instance %p is leader of %p\n",
8165 (void *) leader, (void *) instance);
8169 /* Compute the set of scalar stmts participating in internal and external
8170 nodes. */
8172 static void
8173 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8174 hash_set<slp_tree> &visited,
8175 hash_set<stmt_vec_info> &vstmts,
8176 hash_set<stmt_vec_info> &estmts)
8178 int i;
8179 stmt_vec_info stmt_info;
8180 slp_tree child;
8182 if (visited.add (node))
8183 return;
8185 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8187 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8188 if (stmt_info)
8189 vstmts.add (stmt_info);
8191 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8192 if (child)
8193 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8194 vstmts, estmts);
8196 else
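 /* For external nodes the scalar definitions of the operands have to be
    preserved, so collect them separately.  */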
8197 for (tree def : SLP_TREE_SCALAR_OPS (node))
8199 stmt_vec_info def_stmt = vinfo->lookup_def (def);
8200 if (def_stmt)
8201 estmts.add (def_stmt);
 8206 /* Compute the scalar cost of the SLP node NODE and its children
 8207 and record it in COST_VEC. Do not account defs that are marked in LIFE and
 8208 update LIFE according to uses of NODE. */
8210 static void
8211 vect_bb_slp_scalar_cost (vec_info *vinfo,
8212 slp_tree node, vec<bool, va_heap> *life,
8213 stmt_vector_for_cost *cost_vec,
8214 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8215 hash_set<slp_tree> &visited)
8217 unsigned i;
8218 stmt_vec_info stmt_info;
8219 slp_tree child;
8221 if (visited.add (node))
8222 return;
8224 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8226 ssa_op_iter op_iter;
8227 def_operand_p def_p;
8229 if (!stmt_info || (*life)[i])
8230 continue;
8232 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8233 gimple *orig_stmt = orig_stmt_info->stmt;
8235 /* If there is a non-vectorized use of the defs then the scalar
8236 stmt is kept live in which case we do not account it or any
8237 required defs in the SLP children in the scalar cost. This
8238 way we make the vectorization more costly when compared to
8239 the scalar cost. */
8240 if (!STMT_VINFO_LIVE_P (stmt_info))
8242 auto_vec<gimple *, 8> worklist;
8243 hash_set<gimple *> *worklist_visited = NULL;
8244 worklist.quick_push (orig_stmt);
8247 gimple *work_stmt = worklist.pop ();
8248 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8250 imm_use_iterator use_iter;
8251 gimple *use_stmt;
8252 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8253 DEF_FROM_PTR (def_p))
8254 if (!is_gimple_debug (use_stmt))
8256 stmt_vec_info use_stmt_info
8257 = vinfo->lookup_stmt (use_stmt);
8258 if (!use_stmt_info
8259 || !vectorized_scalar_stmts.contains (use_stmt_info))
8261 if (use_stmt_info
8262 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
 8264 /* For stmts participating in patterns we have
 8265 to check their uses recursively. */
8266 if (!worklist_visited)
8267 worklist_visited = new hash_set<gimple *> ();
8268 if (!worklist_visited->add (use_stmt))
8269 worklist.safe_push (use_stmt);
8270 continue;
8272 (*life)[i] = true;
8273 goto next_lane;
8278 while (!worklist.is_empty ());
8279 next_lane:
8280 if (worklist_visited)
8281 delete worklist_visited;
8282 if ((*life)[i])
8283 continue;
8286 /* Count scalar stmts only once. */
8287 if (gimple_visited_p (orig_stmt))
8288 continue;
8289 gimple_set_visited (orig_stmt, true);
8291 vect_cost_for_stmt kind;
8292 if (STMT_VINFO_DATA_REF (orig_stmt_info))
8294 data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8295 tree base = get_base_address (DR_REF (dr));
8296 /* When the scalar access is to a non-global not address-taken
8297 decl that is not BLKmode assume we can access it with a single
8298 non-load/store instruction. */
8299 if (DECL_P (base)
8300 && !is_global_var (base)
8301 && !TREE_ADDRESSABLE (base)
8302 && DECL_MODE (base) != BLKmode)
8303 kind = scalar_stmt;
8304 else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8305 kind = scalar_load;
8306 else
8307 kind = scalar_store;
8309 else if (vect_nop_conversion_p (orig_stmt_info))
8310 continue;
8311 /* For single-argument PHIs assume coalescing which means zero cost
8312 for the scalar and the vector PHIs. This avoids artificially
8313 favoring the vector path (but may pessimize it in some cases). */
8314 else if (is_a <gphi *> (orig_stmt_info->stmt)
8315 && gimple_phi_num_args
8316 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8317 continue;
8318 else
8319 kind = scalar_stmt;
8320 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8321 SLP_TREE_VECTYPE (node), 0, vect_body);
8324 auto_vec<bool, 20> subtree_life;
8325 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8327 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8329 /* Do not directly pass LIFE to the recursive call, copy it to
8330 confine changes in the callee to the current child/subtree. */
8331 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8333 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8334 for (unsigned j = 0;
8335 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8337 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8338 if (perm.first == i)
8339 subtree_life[perm.second] = (*life)[j];
8342 else
8344 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8345 subtree_life.safe_splice (*life);
8347 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8348 vectorized_scalar_stmts, visited);
8349 subtree_life.truncate (0);
8354 /* Comparator for the loop-index sorted cost vectors. */
8356 static int
8357 li_cost_vec_cmp (const void *a_, const void *b_)
8359 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8360 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8361 if (a->first < b->first)
8362 return -1;
8363 else if (a->first == b->first)
8364 return 0;
8365 return 1;
8368 /* Check if vectorization of the basic block is profitable for the
8369 subgraph denoted by SLP_INSTANCES. */
8371 static bool
8372 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8373 vec<slp_instance> slp_instances,
8374 loop_p orig_loop)
8376 slp_instance instance;
8377 int i;
8378 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8379 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8381 if (dump_enabled_p ())
8383 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8384 hash_set<slp_tree> visited;
8385 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8386 vect_print_slp_graph (MSG_NOTE, vect_location,
8387 SLP_INSTANCE_TREE (instance), visited);
8390 /* Compute the set of scalar stmts we know will go away 'locally' when
8391 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8392 not accurate for nodes promoted extern late or for scalar stmts that
8393 are used both in extern defs and in vectorized defs. */
8394 hash_set<stmt_vec_info> vectorized_scalar_stmts;
8395 hash_set<stmt_vec_info> scalar_stmts_in_externs;
8396 hash_set<slp_tree> visited;
8397 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8399 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8400 SLP_INSTANCE_TREE (instance),
8401 visited,
8402 vectorized_scalar_stmts,
8403 scalar_stmts_in_externs);
8404 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8405 vectorized_scalar_stmts.add (rstmt);
 8407 /* Scalar stmts used as defs in external nodes need to be preserved, so
8408 remove them from vectorized_scalar_stmts. */
8409 for (stmt_vec_info stmt : scalar_stmts_in_externs)
8410 vectorized_scalar_stmts.remove (stmt);
8412 /* Calculate scalar cost and sum the cost for the vector stmts
8413 previously collected. */
8414 stmt_vector_for_cost scalar_costs = vNULL;
8415 stmt_vector_for_cost vector_costs = vNULL;
8416 visited.empty ();
8417 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8419 auto_vec<bool, 20> life;
8420 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8421 true);
8422 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8423 record_stmt_cost (&scalar_costs,
8424 SLP_INSTANCE_ROOT_STMTS (instance).length (),
8425 scalar_stmt,
8426 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8427 vect_bb_slp_scalar_cost (bb_vinfo,
8428 SLP_INSTANCE_TREE (instance),
8429 &life, &scalar_costs, vectorized_scalar_stmts,
8430 visited);
8431 vector_costs.safe_splice (instance->cost_vec);
8432 instance->cost_vec.release ();
8435 if (dump_enabled_p ())
8436 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
 8438 /* When costing non-loop vectorization we need to consider each covered
 8439 loop independently and make sure vectorization is profitable. For
 8440 now we assume a loop may not be entered or may execute an arbitrary
 8441 number of iterations (??? static information can provide more
 8442 precise info here), which means we can simply cost the stmts of each
 8443 containing loop separately. */
8445 /* First produce cost vectors sorted by loop index. */
8446 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8447 li_scalar_costs (scalar_costs.length ());
8448 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8449 li_vector_costs (vector_costs.length ());
8450 stmt_info_for_cost *cost;
8451 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8453 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8454 li_scalar_costs.quick_push (std::make_pair (l, cost));
 8456 /* Use an arbitrary loop from the scalar costs as fallback in case the first
 8457 vector_costs entry does not have a stmt_info associated with it. */
8458 unsigned l = li_scalar_costs[0].first;
8459 FOR_EACH_VEC_ELT (vector_costs, i, cost)
 8461 /* We inherit the loop from the previous COST; invariants, externals and
 8462 extracts immediately follow the cost for the related stmt. */
8463 if (cost->stmt_info)
8464 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8465 li_vector_costs.quick_push (std::make_pair (l, cost));
8467 li_scalar_costs.qsort (li_cost_vec_cmp);
8468 li_vector_costs.qsort (li_cost_vec_cmp);
8470 /* Now cost the portions individually. */
8471 unsigned vi = 0;
8472 unsigned si = 0;
8473 bool profitable = true;
8474 while (si < li_scalar_costs.length ()
8475 && vi < li_vector_costs.length ())
8477 unsigned sl = li_scalar_costs[si].first;
8478 unsigned vl = li_vector_costs[vi].first;
8479 if (sl != vl)
8481 if (dump_enabled_p ())
8482 dump_printf_loc (MSG_NOTE, vect_location,
8483 "Scalar %d and vector %d loop part do not "
8484 "match up, skipping scalar part\n", sl, vl);
8485 /* Skip the scalar part, assuming zero cost on the vector side. */
8488 si++;
8490 while (si < li_scalar_costs.length ()
8491 && li_scalar_costs[si].first == sl);
8492 continue;
8495 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8498 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
8499 si++;
8501 while (si < li_scalar_costs.length ()
8502 && li_scalar_costs[si].first == sl);
8503 unsigned dummy;
8504 finish_cost (scalar_target_cost_data, nullptr,
8505 &dummy, &scalar_cost, &dummy);
8507 /* Complete the target-specific vector cost calculation. */
8508 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
8511 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
8512 vi++;
8514 while (vi < li_vector_costs.length ()
8515 && li_vector_costs[vi].first == vl);
8516 finish_cost (vect_target_cost_data, scalar_target_cost_data,
8517 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
8518 delete scalar_target_cost_data;
8519 delete vect_target_cost_data;
8521 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
8523 if (dump_enabled_p ())
8525 dump_printf_loc (MSG_NOTE, vect_location,
8526 "Cost model analysis for part in loop %d:\n", sl);
8527 dump_printf (MSG_NOTE, " Vector cost: %d\n",
8528 vec_inside_cost + vec_outside_cost);
8529 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
 8532 /* Vectorization is profitable if its cost does not exceed the cost of the
 8533 scalar version. Note that we err on the vector side for equal cost because
8534 the cost estimate is otherwise quite pessimistic (constant uses are
8535 free on the scalar side but cost a load on the vector side for
8536 example). */
8537 if (vec_outside_cost + vec_inside_cost > scalar_cost)
8539 profitable = false;
8540 break;
8543 if (profitable && vi < li_vector_costs.length ())
8545 if (dump_enabled_p ())
8546 dump_printf_loc (MSG_NOTE, vect_location,
8547 "Excess vector cost for part in loop %d:\n",
8548 li_vector_costs[vi].first);
8549 profitable = false;
8552 /* Unset visited flag. This is delayed when the subgraph is profitable
8553 and we process the loop for remaining unvectorized if-converted code. */
8554 if (!orig_loop || !profitable)
8555 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8556 gimple_set_visited (cost->stmt_info->stmt, false);
8558 scalar_costs.release ();
8559 vector_costs.release ();
8561 return profitable;
8564 /* qsort comparator for lane defs. */
8566 static int
8567 vld_cmp (const void *a_, const void *b_)
8569 auto *a = (const std::pair<unsigned, tree> *)a_;
8570 auto *b = (const std::pair<unsigned, tree> *)b_;
8571 return a->first - b->first;
8574 /* Return true if USE_STMT is a vector lane insert into VEC and set
8575 *THIS_LANE to the lane number that is set. */
8577 static bool
8578 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
8580 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
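 /* Match a BIT_INSERT_EXPR into VEC (or, when VEC is NULL, into whatever
    vector RHS1 is) of a matching element at a constant lane position.  */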
8581 if (!use_ass
8582 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
8583 || (vec
8584 ? gimple_assign_rhs1 (use_ass) != vec
8585 : ((vec = gimple_assign_rhs1 (use_ass)), false))
8586 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
8587 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
8588 || !constant_multiple_p
8589 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
8590 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
8591 this_lane))
8592 return false;
8593 return true;
 8596 /* Find any vectorizable constructors and other SLP graph roots and record
 8597 them in the roots array of BB_VINFO. */
8599 static void
8600 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
8602 for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
8603 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
8604 !gsi_end_p (gsi); gsi_next (&gsi))
8606 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
8607 if (!assign)
8608 continue;
8610 tree rhs = gimple_assign_rhs1 (assign);
8611 enum tree_code code = gimple_assign_rhs_code (assign);
8612 use_operand_p use_p;
8613 gimple *use_stmt;
8614 if (code == CONSTRUCTOR)
8616 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8617 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
8618 CONSTRUCTOR_NELTS (rhs))
8619 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
8620 || uniform_vector_p (rhs))
8621 continue;
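 /* All constructor elements have to be SSA names defined in the region.  */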
8623 unsigned j;
8624 tree val;
8625 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8626 if (TREE_CODE (val) != SSA_NAME
8627 || !bb_vinfo->lookup_def (val))
8628 break;
8629 if (j != CONSTRUCTOR_NELTS (rhs))
8630 continue;
8632 vec<stmt_vec_info> roots = vNULL;
8633 roots.safe_push (bb_vinfo->lookup_stmt (assign));
8634 vec<stmt_vec_info> stmts;
8635 stmts.create (CONSTRUCTOR_NELTS (rhs));
8636 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8637 stmts.quick_push
8638 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
8639 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8640 stmts, roots));
8642 else if (code == BIT_INSERT_EXPR
8643 && VECTOR_TYPE_P (TREE_TYPE (rhs))
8644 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
8645 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
8646 && integer_zerop (gimple_assign_rhs3 (assign))
8647 && useless_type_conversion_p
8648 (TREE_TYPE (TREE_TYPE (rhs)),
8649 TREE_TYPE (gimple_assign_rhs2 (assign)))
8650 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
8652 /* We start to match on insert to lane zero but since the
8653 inserts need not be ordered we'd have to search both
8654 the def and the use chains. */
8655 tree vectype = TREE_TYPE (rhs);
8656 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
8657 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
8658 auto_sbitmap lanes (nlanes);
8659 bitmap_clear (lanes);
8660 bitmap_set_bit (lanes, 0);
8661 tree def = gimple_assign_lhs (assign);
8662 lane_defs.quick_push
8663 (std::make_pair (0, gimple_assign_rhs2 (assign)));
8664 unsigned lanes_found = 1;
8665 /* Start with the use chains, the last stmt will be the root. */
8666 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
8667 vec<stmt_vec_info> roots = vNULL;
8668 roots.safe_push (last);
8671 use_operand_p use_p;
8672 gimple *use_stmt;
8673 if (!single_imm_use (def, &use_p, &use_stmt))
8674 break;
8675 unsigned this_lane;
8676 if (!bb_vinfo->lookup_stmt (use_stmt)
8677 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
8678 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
8679 break;
8680 if (bitmap_bit_p (lanes, this_lane))
8681 break;
8682 lanes_found++;
8683 bitmap_set_bit (lanes, this_lane);
8684 gassign *use_ass = as_a <gassign *> (use_stmt);
8685 lane_defs.quick_push (std::make_pair
8686 (this_lane, gimple_assign_rhs2 (use_ass)));
8687 last = bb_vinfo->lookup_stmt (use_ass);
8688 roots.safe_push (last);
8689 def = gimple_assign_lhs (use_ass);
8691 while (lanes_found < nlanes);
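 /* The last insert seen on the use chain is the root stmt of the group;
    move it to the front of ROOTS.  */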
8692 if (roots.length () > 1)
8693 std::swap(roots[0], roots[roots.length () - 1]);
8694 if (lanes_found < nlanes)
8696 /* Now search the def chain. */
8697 def = gimple_assign_rhs1 (assign);
8700 if (TREE_CODE (def) != SSA_NAME
8701 || !has_single_use (def))
8702 break;
8703 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
8704 unsigned this_lane;
8705 if (!bb_vinfo->lookup_stmt (def_stmt)
8706 || !vect_slp_is_lane_insert (def_stmt,
8707 NULL_TREE, &this_lane)
8708 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
8709 break;
8710 if (bitmap_bit_p (lanes, this_lane))
8711 break;
8712 lanes_found++;
8713 bitmap_set_bit (lanes, this_lane);
8714 lane_defs.quick_push (std::make_pair
8715 (this_lane,
8716 gimple_assign_rhs2 (def_stmt)));
8717 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
8718 def = gimple_assign_rhs1 (def_stmt);
8720 while (lanes_found < nlanes);
8722 if (lanes_found == nlanes)
8724 /* Sort lane_defs by lane index and register the root. */
8725 lane_defs.qsort (vld_cmp);
8726 vec<stmt_vec_info> stmts;
8727 stmts.create (nlanes);
8728 for (unsigned i = 0; i < nlanes; ++i)
8729 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
8730 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8731 stmts, roots));
8733 else
8734 roots.release ();
8736 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8737 && (associative_tree_code (code) || code == MINUS_EXPR)
8738 /* ??? This pessimizes a two-element reduction. PR54400.
8739 ??? In-order reduction could be handled if we only
8740 traverse one operand chain in vect_slp_linearize_chain. */
8741 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
8742 /* Ops with constants at the tail can be stripped here. */
8743 && TREE_CODE (rhs) == SSA_NAME
8744 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
8745 /* Should be the chain end. */
8746 && (!single_imm_use (gimple_assign_lhs (assign),
8747 &use_p, &use_stmt)
8748 || !is_gimple_assign (use_stmt)
8749 || (gimple_assign_rhs_code (use_stmt) != code
8750 && ((code != PLUS_EXPR && code != MINUS_EXPR)
8751 || (gimple_assign_rhs_code (use_stmt)
8752 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
8754 /* We start the match at the end of a possible association
8755 chain. */
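     /* Editorial sketch (made-up SSA names, only to illustrate the
        matching): for a scalar chain like
          t1 = a + b;  t2 = t1 + c;  t3 = t2 + d;
        we match at the chain end t3, vect_slp_linearize_chain collects
        the leaf operands a, b, c, d, and those become the lanes of a
        slp_inst_kind_bb_reduc root that is later reduced with the
        target's reduction support.  */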
8756 auto_vec<chain_op_t> chain;
8757 auto_vec<std::pair<tree_code, gimple *> > worklist;
8758 auto_vec<gimple *> chain_stmts;
8759 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
8760 if (code == MINUS_EXPR)
8761 code = PLUS_EXPR;
8762 internal_fn reduc_fn;
8763 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
8764 || reduc_fn == IFN_LAST)
8765 continue;
8766 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
8767 /* ??? */
8768 code_stmt, alt_code_stmt, &chain_stmts);
8769 if (chain.length () > 1)
8771 /* Sort the chain according to def_type and operation. */
8772 chain.sort (dt_sort_cmp, bb_vinfo);
8773 /* ??? Now we'd want to strip externals and constants
8774 but record those to be handled in the epilogue. */
8775 /* ??? For now do not allow mixing ops or externs/constants. */
8776 bool invalid = false;
8777 unsigned remain_cnt = 0;
8778 unsigned last_idx = 0;
8779 for (unsigned i = 0; i < chain.length (); ++i)
8781 if (chain[i].code != code)
8783 invalid = true;
8784 break;
8786 if (chain[i].dt != vect_internal_def
8787 /* Avoid stmts where the def is not the LHS, like
8788 ASMs. */
8789 || (gimple_get_lhs (bb_vinfo->lookup_def
8790 (chain[i].op)->stmt)
8791 != chain[i].op))
8792 remain_cnt++;
8793 else
8794 last_idx = i;
8796 /* Make sure to have an even number of lanes as we later do
8797 all-or-nothing discovery, not trying to split further. */
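     /* Editorial example (illustrative only): with five qualifying
        internal defs in the chain, remain_cnt is bumped from 0 to 1 so
        that four lanes go into SLP discovery and the fifth operand is
        added back via the "remain" vector in the reduction epilogue.  */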
8798 if ((chain.length () - remain_cnt) & 1)
8799 remain_cnt++;
8800 if (!invalid && chain.length () - remain_cnt > 1)
8802 vec<stmt_vec_info> stmts;
8803 vec<tree> remain = vNULL;
8804 stmts.create (chain.length ());
8805 if (remain_cnt > 0)
8806 remain.create (remain_cnt);
8807 for (unsigned i = 0; i < chain.length (); ++i)
8809 stmt_vec_info stmt_info;
8810 if (chain[i].dt == vect_internal_def
8811 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
8812 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
8813 && (i != last_idx
8814 || (stmts.length () & 1)))
8815 stmts.quick_push (stmt_info);
8816 else
8817 remain.quick_push (chain[i].op);
8819 vec<stmt_vec_info> roots;
8820 roots.create (chain_stmts.length ());
8821 for (unsigned i = 0; i < chain_stmts.length (); ++i)
8822 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
8823 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
8824 stmts, roots, remain));
8831 /* Walk the grouped store chains and replace entries with their
8832 pattern variant if any. */
8834 static void
8835 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
8837 stmt_vec_info first_element;
8838 unsigned i;
8840 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
8842 /* We also have CTORs in this array. */
8843 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
8844 continue;
8845 if (STMT_VINFO_IN_PATTERN_P (first_element))
8847 stmt_vec_info orig = first_element;
8848 first_element = STMT_VINFO_RELATED_STMT (first_element);
8849 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
8850 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
8851 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
8852 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
8853 vinfo->grouped_stores[i] = first_element;
8855 stmt_vec_info prev = first_element;
8856 while (DR_GROUP_NEXT_ELEMENT (prev))
8858 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
8859 if (STMT_VINFO_IN_PATTERN_P (elt))
8861 stmt_vec_info orig = elt;
8862 elt = STMT_VINFO_RELATED_STMT (elt);
8863 DR_GROUP_NEXT_ELEMENT (prev) = elt;
8864 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
8865 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
8867 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
8868 prev = elt;
8873 /* Check if the region described by BB_VINFO can be vectorized, returning
8874 true if so. When returning false, set FATAL to true if the same failure
8875 would prevent vectorization at other vector sizes, false if it is still
8876 worth trying other sizes. N_STMTS is the number of statements in the
8877 region. */
8879 static bool
8880 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
8881 vec<int> *dataref_groups)
8883 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
8885 slp_instance instance;
8886 int i;
8887 poly_uint64 min_vf = 2;
8889 /* The first group of checks is independent of the vector size. */
8890 fatal = true;
8892 /* Analyze the data references. */
8894 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
8896 if (dump_enabled_p ())
8897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8898 "not vectorized: unhandled data-ref in basic "
8899 "block.\n");
8900 return false;
8903 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
8905 if (dump_enabled_p ())
8906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8907 "not vectorized: unhandled data access in "
8908 "basic block.\n");
8909 return false;
8912 vect_slp_check_for_roots (bb_vinfo);
8914 /* If there are no grouped stores and no constructors in the region
8915 there is no need to continue with pattern recog as vect_analyze_slp
8916 will fail anyway. */
8917 if (bb_vinfo->grouped_stores.is_empty ()
8918 && bb_vinfo->roots.is_empty ())
8920 if (dump_enabled_p ())
8921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8922 "not vectorized: no grouped stores in "
8923 "basic block.\n");
8924 return false;
8927 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not necessarily fatal. */
8928 fatal = false;
8930 vect_pattern_recog (bb_vinfo);
8932 /* Update store groups from pattern processing. */
8933 vect_fixup_store_groups_with_patterns (bb_vinfo);
8935 /* Check the SLP opportunities in the basic block, analyze and build SLP
8936 trees. */
8937 if (!vect_analyze_slp (bb_vinfo, n_stmts))
8939 if (dump_enabled_p ())
8941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8942 "Failed to SLP the basic block.\n");
8943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8944 "not vectorized: failed to find SLP opportunities "
8945 "in basic block.\n");
8947 return false;
8950 /* Optimize permutations. */
8951 vect_optimize_slp (bb_vinfo);
8953 /* Gather the loads reachable from the SLP graph entries. */
8954 vect_gather_slp_loads (bb_vinfo);
8956 vect_record_base_alignments (bb_vinfo);
8958 /* Analyze and verify the alignment of data references and the
8959 dependence in the SLP instances. */
8960 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
8962 vect_location = instance->location ();
8963 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
8964 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
8966 slp_tree node = SLP_INSTANCE_TREE (instance);
8967 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_NOTE, vect_location,
8970 "removing SLP instance operations starting from: %G",
8971 stmt_info->stmt);
8972 vect_free_slp_instance (instance);
8973 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
8974 continue;
8977 /* Mark all the statements that we want to vectorize as pure SLP and
8978 relevant. */
8979 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
8980 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
8981 unsigned j;
8982 stmt_vec_info root;
8983 /* Likewise consider instance root stmts as vectorized. */
8984 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
8985 STMT_SLP_TYPE (root) = pure_slp;
8987 i++;
8989 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
8990 return false;
8992 if (!vect_slp_analyze_operations (bb_vinfo))
8994 if (dump_enabled_p ())
8995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8996 "not vectorized: bad operation in basic block.\n");
8997 return false;
9000 vect_bb_partition_graph (bb_vinfo);
9002 return true;
9005 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
9006 basic blocks in BBS, returning true on success.
9007 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
9009 static bool
9010 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
9011 vec<int> *dataref_groups, unsigned int n_stmts,
9012 loop_p orig_loop)
9014 bb_vec_info bb_vinfo;
9015 auto_vector_modes vector_modes;
9017 /* Autodetect first vector size we try. */
9018 machine_mode next_vector_mode = VOIDmode;
9019 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
9020 unsigned int mode_i = 0;
9022 vec_info_shared shared;
9024 machine_mode autodetected_vector_mode = VOIDmode;
9025 while (1)
9027 bool vectorized = false;
9028 bool fatal = false;
9029 bb_vinfo = new _bb_vec_info (bbs, &shared);
9031 bool first_time_p = shared.datarefs.is_empty ();
9032 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
9033 if (first_time_p)
9034 bb_vinfo->shared->save_datarefs ();
9035 else
9036 bb_vinfo->shared->check_datarefs ();
9037 bb_vinfo->vector_mode = next_vector_mode;
9039 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
9041 if (dump_enabled_p ())
9043 dump_printf_loc (MSG_NOTE, vect_location,
9044 "***** Analysis succeeded with vector mode"
9045 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
9046 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
9049 bb_vinfo->shared->check_datarefs ();
9051 bool force_clear = false;
9052 auto_vec<slp_instance> profitable_subgraphs;
9053 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
9055 if (instance->subgraph_entries.is_empty ())
9056 continue;
9058 dump_user_location_t saved_vect_location = vect_location;
9059 vect_location = instance->location ();
9060 if (!unlimited_cost_model (NULL)
9061 && !vect_bb_vectorization_profitable_p
9062 (bb_vinfo, instance->subgraph_entries, orig_loop))
9064 if (dump_enabled_p ())
9065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9066 "not vectorized: vectorization is not "
9067 "profitable.\n");
9068 vect_location = saved_vect_location;
9069 continue;
9072 vect_location = saved_vect_location;
9073 if (!dbg_cnt (vect_slp))
9075 force_clear = true;
9076 continue;
9079 profitable_subgraphs.safe_push (instance);
9082 /* When we're vectorizing an if-converted loop body make sure
9083 we vectorized all if-converted code. */
9084 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
9086 gcc_assert (bb_vinfo->nbbs == 1);
9087 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
9088 !gsi_end_p (gsi); gsi_next (&gsi))
9090 /* The costing above left us with DCEable vectorized scalar
9091 stmts having the visited flag set on profitable
9092 subgraphs. Do the delayed clearing of the flag here. */
9093 if (gimple_visited_p (gsi_stmt (gsi)))
9095 gimple_set_visited (gsi_stmt (gsi), false);
9096 continue;
9098 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9099 continue;
9101 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9102 if (gimple_assign_rhs_code (ass) == COND_EXPR)
9104 if (!profitable_subgraphs.is_empty ()
9105 && dump_enabled_p ())
9106 dump_printf_loc (MSG_NOTE, vect_location,
9107 "not profitable because of "
9108 "unprofitable if-converted scalar "
9109 "code\n");
9110 profitable_subgraphs.truncate (0);
9115 /* Finally schedule the profitable subgraphs. */
9116 for (slp_instance instance : profitable_subgraphs)
9118 if (!vectorized && dump_enabled_p ())
9119 dump_printf_loc (MSG_NOTE, vect_location,
9120 "Basic block will be vectorized "
9121 "using SLP\n");
9122 vectorized = true;
9124 /* Dump before scheduling as store vectorization will remove
9125 the original stores and mess with the instance tree
9126 so querying its location will eventually ICE. */
9127 if (flag_checking)
9128 for (slp_instance sub : instance->subgraph_entries)
9129 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9130 unsigned HOST_WIDE_INT bytes;
9131 if (dump_enabled_p ())
9132 for (slp_instance sub : instance->subgraph_entries)
9134 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9135 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9136 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9137 sub->location (),
9138 "basic block part vectorized using %wu "
9139 "byte vectors\n", bytes);
9140 else
9141 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9142 sub->location (),
9143 "basic block part vectorized using "
9144 "variable length vectors\n");
9147 dump_user_location_t saved_vect_location = vect_location;
9148 vect_location = instance->location ();
9150 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9152 vect_location = saved_vect_location;
9155 else
9157 if (dump_enabled_p ())
9158 dump_printf_loc (MSG_NOTE, vect_location,
9159 "***** Analysis failed with vector mode %s\n",
9160 GET_MODE_NAME (bb_vinfo->vector_mode));
9163 if (mode_i == 0)
9164 autodetected_vector_mode = bb_vinfo->vector_mode;
9166 if (!fatal)
9167 while (mode_i < vector_modes.length ()
9168 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9170 if (dump_enabled_p ())
9171 dump_printf_loc (MSG_NOTE, vect_location,
9172 "***** The result for vector mode %s would"
9173 " be the same\n",
9174 GET_MODE_NAME (vector_modes[mode_i]));
9175 mode_i += 1;
9178 delete bb_vinfo;
9180 if (mode_i < vector_modes.length ()
9181 && VECTOR_MODE_P (autodetected_vector_mode)
9182 && (related_vector_mode (vector_modes[mode_i],
9183 GET_MODE_INNER (autodetected_vector_mode))
9184 == autodetected_vector_mode)
9185 && (related_vector_mode (autodetected_vector_mode,
9186 GET_MODE_INNER (vector_modes[mode_i]))
9187 == vector_modes[mode_i]))
9189 if (dump_enabled_p ())
9190 dump_printf_loc (MSG_NOTE, vect_location,
9191 "***** Skipping vector mode %s, which would"
9192 " repeat the analysis for %s\n",
9193 GET_MODE_NAME (vector_modes[mode_i]),
9194 GET_MODE_NAME (autodetected_vector_mode));
9195 mode_i += 1;
9198 if (vectorized
9199 || mode_i == vector_modes.length ()
9200 || autodetected_vector_mode == VOIDmode
9201 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9202 vector sizes will fail do not bother iterating. */
9203 || fatal)
9204 return vectorized;
9206 /* Try the next biggest vector size. */
9207 next_vector_mode = vector_modes[mode_i++];
9208 if (dump_enabled_p ())
9209 dump_printf_loc (MSG_NOTE, vect_location,
9210 "***** Re-trying analysis with vector mode %s\n",
9211 GET_MODE_NAME (next_vector_mode));
9216 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
9217 true if anything in the basic-block was vectorized. */
9219 static bool
9220 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9222 vec<data_reference_p> datarefs = vNULL;
9223 auto_vec<int> dataref_groups;
9224 int insns = 0;
9225 int current_group = 0;
9227 for (unsigned i = 0; i < bbs.length (); i++)
9229 basic_block bb = bbs[i];
9230 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9231 gsi_next (&gsi))
9233 gimple *stmt = gsi_stmt (gsi);
9234 if (is_gimple_debug (stmt))
9235 continue;
9237 insns++;
9239 if (gimple_location (stmt) != UNKNOWN_LOCATION)
9240 vect_location = stmt;
9242 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9243 &dataref_groups, current_group))
9244 ++current_group;
9246 /* New BBs always start a new DR group. */
9247 ++current_group;
9250 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9253 /* Special entry for the BB vectorizer. Analyze and transform a single
9254 if-converted BB with ORIG_LOOP's body being the non-if-converted
9255 representation. Returns true if anything in the basic-block was
9256 vectorized. */
9258 bool
9259 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9261 auto_vec<basic_block> bbs;
9262 bbs.safe_push (bb);
9263 return vect_slp_bbs (bbs, orig_loop);
9266 /* Main entry for the BB vectorizer. Analyze and transform the basic
9267 blocks of FUN, returning true if anything was vectorized. */
9269 bool
9270 vect_slp_function (function *fun)
9272 bool r = false;
9273 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9274 auto_bitmap exit_bbs;
9275 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9276 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9277 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9278 true, rpo, NULL);
9280 /* For the moment split the function into pieces to avoid making
9281 the iteration on the vector mode moot. Split at points we know
9282 we do not handle well, which are CFG merges (SLP discovery doesn't
9283 handle non-loop-header PHIs) and loop exits. Since pattern
9284 recog requires reverse iteration to visit uses before defs
9285 simply chop RPO into pieces. */
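   /* Editorial sketch (made-up block numbers, only to illustrate the
      splitting below): if a region currently starts at the then-block
      bb3 of an if, the join block bb5 that follows in the RPO is not
      dominated by bb3, so the region collected so far is handed to
      vect_slp_bbs and a fresh region is started at bb5.  Regions are
      split the same way when the RPO leaves the loop containing the
      region's first block or reaches a dont_vectorize loop header.  */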
9286 auto_vec<basic_block> bbs;
9287 for (unsigned i = 0; i < n; i++)
9289 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9290 bool split = false;
9292 /* Split when a BB is not dominated by the first block. */
9293 if (!bbs.is_empty ()
9294 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9296 if (dump_enabled_p ())
9297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9298 "splitting region at dominance boundary bb%d\n",
9299 bb->index);
9300 split = true;
9302 /* Split when the loop determined by the first block
9303 is exited. This is because we eventually insert
9304 invariants at region begin. */
9305 else if (!bbs.is_empty ()
9306 && bbs[0]->loop_father != bb->loop_father
9307 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9309 if (dump_enabled_p ())
9310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9311 "splitting region at loop %d exit at bb%d\n",
9312 bbs[0]->loop_father->num, bb->index);
9313 split = true;
9315 else if (!bbs.is_empty ()
9316 && bb->loop_father->header == bb
9317 && bb->loop_father->dont_vectorize)
9319 if (dump_enabled_p ())
9320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9321 "splitting region at dont-vectorize loop %d "
9322 "entry at bb%d\n",
9323 bb->loop_father->num, bb->index);
9324 split = true;
9327 if (split && !bbs.is_empty ())
9329 r |= vect_slp_bbs (bbs, NULL);
9330 bbs.truncate (0);
9333 if (bbs.is_empty ())
9335 /* We need to be able to insert at the head of the region, which
9336 we cannot do for a region starting with a returns-twice call. */
9337 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9338 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9340 if (dump_enabled_p ())
9341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9342 "skipping bb%d as start of region as it "
9343 "starts with returns-twice call\n",
9344 bb->index);
9345 continue;
9347 /* If the loop this BB belongs to is marked as not to be vectorized
9348 honor that also for BB vectorization. */
9349 if (bb->loop_father->dont_vectorize)
9350 continue;
9353 bbs.safe_push (bb);
9355 /* When a stmt ends this block and defines a value, inserting
9356 a vector containing its definition after it would require
9357 inserting on edges. Avoid this for now. */
9358 if (gimple *last = *gsi_last_bb (bb))
9359 if (gimple_get_lhs (last)
9360 && is_ctrl_altering_stmt (last))
9362 if (dump_enabled_p ())
9363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9364 "splitting region at control altering "
9365 "definition %G", last);
9366 r |= vect_slp_bbs (bbs, NULL);
9367 bbs.truncate (0);
9371 if (!bbs.is_empty ())
9372 r |= vect_slp_bbs (bbs, NULL);
9374 free (rpo);
9376 return r;
9379 /* Build a variable-length vector in which the elements in ELTS are repeated
9380 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9381 RESULTS and add any new instructions to SEQ.
9383 The approach we use is:
9385 (1) Find a vector mode VM with integer elements of mode IM.
9387 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9388 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9389 from small vectors to IM.
9391 (3) Duplicate each ELTS'[I] into a vector of mode VM.
9393 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9394 correct byte contents.
9396 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9398 We try to find the largest IM for which this sequence works, in order
9399 to cut down on the number of interleaves. */
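/* Editorial worked example (illustrative only; the modes are assumed,
   not mandated by the code): with NELTS == 8 32-bit elements a0...a7,
   one possible choice is IM == DImode.  The pairs {a0,a1} ... {a6,a7}
   are view-converted to four DImode scalars (step 2), each scalar is
   duplicated into a vector of DImode elements (step 3), a tree of
   interleaving VEC_PERM_EXPRs restores the a0 a1 a2 ... byte order
   (step 4) and the results are view-converted to VECTOR_TYPE (step 5).  */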
9401 void
9402 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9403 const vec<tree> &elts, unsigned int nresults,
9404 vec<tree> &results)
9406 unsigned int nelts = elts.length ();
9407 tree element_type = TREE_TYPE (vector_type);
9409 /* (1) Find a vector mode VM with integer elements of mode IM. */
9410 unsigned int nvectors = 1;
9411 tree new_vector_type;
9412 tree permutes[2];
9413 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9414 &nvectors, &new_vector_type,
9415 permutes))
9416 gcc_unreachable ();
9418 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9419 unsigned int partial_nelts = nelts / nvectors;
9420 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9422 tree_vector_builder partial_elts;
9423 auto_vec<tree, 32> pieces (nvectors * 2);
9424 pieces.quick_grow_cleared (nvectors * 2);
9425 for (unsigned int i = 0; i < nvectors; ++i)
9427 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9428 ELTS' has mode IM. */
9429 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9430 for (unsigned int j = 0; j < partial_nelts; ++j)
9431 partial_elts.quick_push (elts[i * partial_nelts + j]);
9432 tree t = gimple_build_vector (seq, &partial_elts);
9433 t = gimple_build (seq, VIEW_CONVERT_EXPR,
9434 TREE_TYPE (new_vector_type), t);
9436 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9437 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9440 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9441 correct byte contents.
9443 Conceptually, we need to repeat the following operation log2(nvectors)
9444 times, where hi_start = nvectors / 2:
9446 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9447 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9449 However, if each input repeats every N elements and the VF is
9450 a multiple of N * 2, the HI result is the same as the LO result.
9451 This will be true for the first N1 iterations of the outer loop,
9452 followed by N2 iterations for which both the LO and HI results
9453 are needed. I.e.:
9455 N1 + N2 = log2(nvectors)
9457 Each "N1 iteration" doubles the number of redundant vectors and the
9458 effect of the process as a whole is to have a sequence of nvectors/2**N1
9459 vectors that repeats 2**N1 times. Rather than generate these redundant
9460 vectors, we halve the number of vectors for each N1 iteration. */
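  /* Editorial example (illustrative only, assuming the multiple_p check
     in the loop below succeeds on the first iteration): with
     nvectors == 4 the first "N1 iteration" keeps only the even-numbered
     outputs, so new_nvectors drops from 4 to 2, and the next iteration
     then needs both the LO and HI permutes of those two vectors.  */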
9461 unsigned int in_start = 0;
9462 unsigned int out_start = nvectors;
9463 unsigned int new_nvectors = nvectors;
9464 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9466 unsigned int hi_start = new_nvectors / 2;
9467 unsigned int out_i = 0;
9468 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9470 if ((in_i & 1) != 0
9471 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9472 2 * in_repeat))
9473 continue;
9475 tree output = make_ssa_name (new_vector_type);
9476 tree input1 = pieces[in_start + (in_i / 2)];
9477 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9478 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9479 input1, input2,
9480 permutes[in_i & 1]);
9481 gimple_seq_add_stmt (seq, stmt);
9482 pieces[out_start + out_i] = output;
9483 out_i += 1;
9485 std::swap (in_start, out_start);
9486 new_nvectors = out_i;
9489 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9490 results.reserve (nresults);
9491 for (unsigned int i = 0; i < nresults; ++i)
9492 if (i < new_nvectors)
9493 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
9494 pieces[in_start + i]));
9495 else
9496 results.quick_push (results[i - new_nvectors]);
9500 /* For constant and loop invariant defs in OP_NODE this function creates
9501 vector defs that will be used in the vectorized stmts and stores them
9502 to SLP_TREE_VEC_DEFS of OP_NODE. */
9504 static void
9505 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
9507 unsigned HOST_WIDE_INT nunits;
9508 tree vec_cst;
9509 unsigned j, number_of_places_left_in_vector;
9510 tree vector_type;
9511 tree vop;
9512 int group_size = op_node->ops.length ();
9513 unsigned int vec_num, i;
9514 unsigned number_of_copies = 1;
9515 bool constant_p;
9516 gimple_seq ctor_seq = NULL;
9517 auto_vec<tree, 16> permute_results;
9519 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
9520 vector_type = SLP_TREE_VECTYPE (op_node);
9522 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
9523 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
9524 auto_vec<tree> voprnds (number_of_vectors);
9526 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
9527 created vectors. It is greater than 1 if unrolling is performed.
9529 For example, we have two scalar operands, s1 and s2 (e.g., group of
9530 strided accesses of size two), while NUNITS is four (i.e., four scalars
9531 of this type can be packed in a vector). The output vector will contain
9532 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
9533 will be 2).
9535 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
9536 containing the operands.
9538 For example, NUNITS is four as before, and the group size is 8
9539 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
9540 {s5, s6, s7, s8}. */
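  /* Editorial example (illustrative numbers): with GROUP_SIZE == 2,
     NUNITS == 4 and NUMBER_OF_VECTORS == 1 the computation below gives
     NUMBER_OF_COPIES == 4 * 1 / 2 == 2, matching the {s1, s2, s1, s2}
     case above; with GROUP_SIZE == 8 and NUMBER_OF_VECTORS == 2 it
     gives 4 * 2 / 8 == 1 copy spread over two vectors.  */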
9542 /* When using duplicate_and_interleave, we just need one element for
9543 each scalar statement. */
9544 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
9545 nunits = group_size;
9547 number_of_copies = nunits * number_of_vectors / group_size;
9549 number_of_places_left_in_vector = nunits;
9550 constant_p = true;
9551 tree uniform_elt = NULL_TREE;
9552 tree_vector_builder elts (vector_type, nunits, 1);
9553 elts.quick_grow (nunits);
9554 stmt_vec_info insert_after = NULL;
9555 for (j = 0; j < number_of_copies; j++)
9557 tree op;
9558 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
9560 /* Create 'vect_ = {op0,op1,...,opn}'. */
9561 tree orig_op = op;
9562 if (number_of_places_left_in_vector == nunits)
9563 uniform_elt = op;
9564 else if (uniform_elt && operand_equal_p (uniform_elt, op))
9565 op = elts[number_of_places_left_in_vector];
9566 else
9567 uniform_elt = NULL_TREE;
9568 number_of_places_left_in_vector--;
9569 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
9571 if (CONSTANT_CLASS_P (op))
9573 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9575 /* Can't use VIEW_CONVERT_EXPR for booleans because
9576 of possibly different sizes of scalar value and
9577 vector element. */
9578 if (integer_zerop (op))
9579 op = build_int_cst (TREE_TYPE (vector_type), 0);
9580 else if (integer_onep (op))
9581 op = build_all_ones_cst (TREE_TYPE (vector_type));
9582 else
9583 gcc_unreachable ();
9585 else
9586 op = fold_unary (VIEW_CONVERT_EXPR,
9587 TREE_TYPE (vector_type), op);
9588 gcc_assert (op && CONSTANT_CLASS_P (op));
9590 else
9592 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
9593 gimple *init_stmt;
9594 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9596 tree true_val
9597 = build_all_ones_cst (TREE_TYPE (vector_type));
9598 tree false_val
9599 = build_zero_cst (TREE_TYPE (vector_type));
9600 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
9601 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
9602 op, true_val,
9603 false_val);
9605 else
9607 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
9608 op);
9609 init_stmt
9610 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
9611 op);
9613 gimple_seq_add_stmt (&ctor_seq, init_stmt);
9614 op = new_temp;
9617 elts[number_of_places_left_in_vector] = op;
9618 if (!CONSTANT_CLASS_P (op))
9619 constant_p = false;
9620 /* For BB vectorization we have to compute an insert location
9621 when a def is inside the analyzed region since we cannot
9622 simply insert at the BB start in this case. */
9623 stmt_vec_info opdef;
9624 if (TREE_CODE (orig_op) == SSA_NAME
9625 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
9626 && is_a <bb_vec_info> (vinfo)
9627 && (opdef = vinfo->lookup_def (orig_op)))
9629 if (!insert_after)
9630 insert_after = opdef;
9631 else
9632 insert_after = get_later_stmt (insert_after, opdef);
9635 if (number_of_places_left_in_vector == 0)
9637 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
9638 if (uniform_elt)
9639 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
9640 elts[0]);
9641 else if (constant_p
9642 ? multiple_p (type_nunits, nunits)
9643 : known_eq (type_nunits, nunits))
9644 vec_cst = gimple_build_vector (&ctor_seq, &elts);
9645 else
9647 if (permute_results.is_empty ())
9648 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
9649 elts, number_of_vectors,
9650 permute_results);
9651 vec_cst = permute_results[number_of_vectors - j - 1];
9653 if (!gimple_seq_empty_p (ctor_seq))
9655 if (insert_after)
9657 gimple_stmt_iterator gsi;
9658 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
9660 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
9661 gsi_insert_seq_before (&gsi, ctor_seq,
9662 GSI_CONTINUE_LINKING);
9664 else if (!stmt_ends_bb_p (insert_after->stmt))
9666 gsi = gsi_for_stmt (insert_after->stmt);
9667 gsi_insert_seq_after (&gsi, ctor_seq,
9668 GSI_CONTINUE_LINKING);
9670 else
9672 /* When we want to insert after a def whose
9673 defining stmt throws, insert on the fallthru
9674 edge. */
9675 edge e = find_fallthru_edge
9676 (gimple_bb (insert_after->stmt)->succs);
9677 basic_block new_bb
9678 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
9679 gcc_assert (!new_bb);
9682 else
9683 vinfo->insert_seq_on_entry (NULL, ctor_seq);
9684 ctor_seq = NULL;
9686 voprnds.quick_push (vec_cst);
9687 insert_after = NULL;
9688 number_of_places_left_in_vector = nunits;
9689 constant_p = true;
9690 elts.new_vector (vector_type, nunits, 1);
9691 elts.quick_grow (nunits);
9696 /* Since the vectors are created in the reverse order, we should invert
9697 them. */
9698 vec_num = voprnds.length ();
9699 for (j = vec_num; j != 0; j--)
9701 vop = voprnds[j - 1];
9702 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9705 /* When VF is greater than the unrolling factor needed for the SLP
9706 group of stmts, NUMBER_OF_VECTORS to be created is greater than
9707 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
9708 to replicate the vectors. */
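  /* Editorial example (illustrative only): if NUMBER_OF_VECTORS is 4 but
     only two defs {v0, v1} were pushed above, the loop below appends v0
     and v1 once more so the defs become {v0, v1, v0, v1}.  */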
9709 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
9710 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
9711 i++)
9712 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9715 /* Get the Ith vectorized definition from SLP_NODE. */
9717 tree
9718 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
9720 return SLP_TREE_VEC_DEFS (slp_node)[i];
9723 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
9725 void
9726 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
9728 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
9729 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
9732 /* Get N vectorized definitions for SLP_NODE. */
9734 void
9735 vect_get_slp_defs (vec_info *,
9736 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
9738 if (n == -1U)
9739 n = SLP_TREE_CHILDREN (slp_node).length ();
9741 for (unsigned i = 0; i < n; ++i)
9743 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9744 vec<tree> vec_defs = vNULL;
9745 vect_get_slp_defs (child, &vec_defs);
9746 vec_oprnds->quick_push (vec_defs);
9750 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
9751 - PERM gives the permutation that the caller wants to use for NODE,
9752 which might be different from SLP_LOAD_PERMUTATION.
9753 - DUMP_P controls whether the function dumps information. */
9755 static bool
9756 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
9757 load_permutation_t &perm,
9758 const vec<tree> &dr_chain,
9759 gimple_stmt_iterator *gsi, poly_uint64 vf,
9760 bool analyze_only, bool dump_p,
9761 unsigned *n_perms, unsigned int *n_loads,
9762 bool dce_chain)
9764 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9765 int vec_index = 0;
9766 tree vectype = SLP_TREE_VECTYPE (node);
9767 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
9768 unsigned int mask_element;
9769 unsigned dr_group_size;
9770 machine_mode mode;
9772 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
9773 dr_group_size = 1;
9774 else
9776 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9777 dr_group_size = DR_GROUP_SIZE (stmt_info);
9780 mode = TYPE_MODE (vectype);
9781 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9782 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9784 /* Initialize the vect stmts of NODE to properly insert the generated
9785 stmts later. */
9786 if (! analyze_only)
9787 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
9788 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
9790 /* Generate permutation masks for every NODE. Number of masks for each NODE
9791 is equal to GROUP_SIZE.
9792 E.g., we have a group of three nodes with three loads from the same
9793 location in each node, and the vector size is 4. I.e., we have an
9794 a0b0c0a1b1c1... sequence and we need to create the following vectors:
9795 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
9796 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
9799 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
9800 The last mask is illegal since we assume two operands for permute
9801 operation, and the mask element values can't be outside that range.
9802 Hence, the last mask must be converted into {2,5,5,5}.
9803 For the first two permutations we need the first and the second input
9804 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
9805 we need the second and the third vectors: {b1,c1,a2,b2} and
9806 {c2,a3,b3,c3}. */
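   /* Editorial note (restating the example above in terms of the code
      below): each mask element is computed as
        i = iter_num * dr_group_size + perm[stmt_num]
      so for the a's, with dr_group_size == 3 and perm == {0, 0, 0}, this
      yields the lane sequence 0,0,0,3,3,3,6,6,6,9,9,9 which is then
      chopped into the per-vector masks shown above.  */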
9808 int vect_stmts_counter = 0;
9809 unsigned int index = 0;
9810 int first_vec_index = -1;
9811 int second_vec_index = -1;
9812 bool noop_p = true;
9813 *n_perms = 0;
9815 vec_perm_builder mask;
9816 unsigned int nelts_to_build;
9817 unsigned int nvectors_per_build;
9818 unsigned int in_nlanes;
9819 bool repeating_p = (group_size == dr_group_size
9820 && multiple_p (nunits, group_size));
9821 if (repeating_p)
9823 /* A single vector contains a whole number of copies of the node, so:
9824 (a) all permutes can use the same mask; and
9825 (b) the permutes only need a single vector input. */
9826 mask.new_vector (nunits, group_size, 3);
9827 nelts_to_build = mask.encoded_nelts ();
9828 /* It's possible to obtain zero nstmts during analyze_only, so make
9829 it at least one to ensure the later computation for n_perms
9830 proceeds. */
9831 nvectors_per_build = nstmts > 0 ? nstmts : 1;
9832 in_nlanes = dr_group_size * 3;
9834 else
9836 /* We need to construct a separate mask for each vector statement. */
9837 unsigned HOST_WIDE_INT const_nunits, const_vf;
9838 if (!nunits.is_constant (&const_nunits)
9839 || !vf.is_constant (&const_vf))
9840 return false;
9841 mask.new_vector (const_nunits, const_nunits, 1);
9842 nelts_to_build = const_vf * group_size;
9843 nvectors_per_build = 1;
9844 in_nlanes = const_vf * dr_group_size;
9846 auto_sbitmap used_in_lanes (in_nlanes);
9847 bitmap_clear (used_in_lanes);
9848 auto_bitmap used_defs;
9850 unsigned int count = mask.encoded_nelts ();
9851 mask.quick_grow (count);
9852 vec_perm_indices indices;
9854 for (unsigned int j = 0; j < nelts_to_build; j++)
9856 unsigned int iter_num = j / group_size;
9857 unsigned int stmt_num = j % group_size;
9858 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
9859 bitmap_set_bit (used_in_lanes, i);
9860 if (repeating_p)
9862 first_vec_index = 0;
9863 mask_element = i;
9865 else
9867 /* Enforced before the loop when !repeating_p. */
9868 unsigned int const_nunits = nunits.to_constant ();
9869 vec_index = i / const_nunits;
9870 mask_element = i % const_nunits;
9871 if (vec_index == first_vec_index
9872 || first_vec_index == -1)
9874 first_vec_index = vec_index;
9876 else if (vec_index == second_vec_index
9877 || second_vec_index == -1)
9879 second_vec_index = vec_index;
9880 mask_element += const_nunits;
9882 else
9884 if (dump_p)
9885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9886 "permutation requires at "
9887 "least three vectors %G",
9888 stmt_info->stmt);
9889 gcc_assert (analyze_only);
9890 return false;
9893 gcc_assert (mask_element < 2 * const_nunits);
9896 if (mask_element != index)
9897 noop_p = false;
9898 mask[index++] = mask_element;
9900 if (index == count)
9902 if (!noop_p)
9904 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
9905 if (!can_vec_perm_const_p (mode, mode, indices))
9907 if (dump_p)
9909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9910 "unsupported vect permute { ");
9911 for (i = 0; i < count; ++i)
9913 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9914 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9916 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9918 gcc_assert (analyze_only);
9919 return false;
9922 tree mask_vec = NULL_TREE;
9923 if (!analyze_only)
9924 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9926 if (second_vec_index == -1)
9927 second_vec_index = first_vec_index;
9929 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9931 ++*n_perms;
9932 if (analyze_only)
9933 continue;
9934 /* Generate the permute statement if necessary. */
9935 tree first_vec = dr_chain[first_vec_index + ri];
9936 tree second_vec = dr_chain[second_vec_index + ri];
9937 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
9938 tree perm_dest
9939 = vect_create_destination_var (gimple_assign_lhs (stmt),
9940 vectype);
9941 perm_dest = make_ssa_name (perm_dest);
9942 gimple *perm_stmt
9943 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
9944 second_vec, mask_vec);
9945 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
9946 gsi);
9947 if (dce_chain)
9949 bitmap_set_bit (used_defs, first_vec_index + ri);
9950 bitmap_set_bit (used_defs, second_vec_index + ri);
9953 /* Store the vector statement in NODE. */
9954 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
9957 else if (!analyze_only)
9959 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9961 tree first_vec = dr_chain[first_vec_index + ri];
9962 /* If mask was NULL_TREE generate the requested
9963 identity transform. */
9964 if (dce_chain)
9965 bitmap_set_bit (used_defs, first_vec_index + ri);
9967 /* Store the vector statement in NODE. */
9968 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
9972 index = 0;
9973 first_vec_index = -1;
9974 second_vec_index = -1;
9975 noop_p = true;
9979 if (n_loads)
9981 if (repeating_p)
9982 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9983 else
9985 /* Enforced above when !repeating_p. */
9986 unsigned int const_nunits = nunits.to_constant ();
9987 *n_loads = 0;
9988 bool load_seen = false;
9989 for (unsigned i = 0; i < in_nlanes; ++i)
9991 if (i % const_nunits == 0)
9993 if (load_seen)
9994 *n_loads += 1;
9995 load_seen = false;
9997 if (bitmap_bit_p (used_in_lanes, i))
9998 load_seen = true;
10000 if (load_seen)
10001 *n_loads += 1;
10005 if (dce_chain)
10006 for (unsigned i = 0; i < dr_chain.length (); ++i)
10007 if (!bitmap_bit_p (used_defs, i))
10009 tree def = dr_chain[i];
10012 gimple *stmt = SSA_NAME_DEF_STMT (def);
10013 if (is_gimple_assign (stmt)
10014 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
10015 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
10016 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
10017 else
10018 def = NULL;
10019 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
10020 gsi_remove (&rgsi, true);
10021 release_defs (stmt);
10023 while (def);
10026 return true;
10029 /* Generate vector permute statements from a list of loads in DR_CHAIN.
10030 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
10031 permute statements for the SLP node NODE. Store the number of vector
10032 permute instructions in *N_PERMS and the number of vector load
10033 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
10034 that were not needed. */
10036 bool
10037 vect_transform_slp_perm_load (vec_info *vinfo,
10038 slp_tree node, const vec<tree> &dr_chain,
10039 gimple_stmt_iterator *gsi, poly_uint64 vf,
10040 bool analyze_only, unsigned *n_perms,
10041 unsigned int *n_loads, bool dce_chain)
10043 return vect_transform_slp_perm_load_1 (vinfo, node,
10044 SLP_TREE_LOAD_PERMUTATION (node),
10045 dr_chain, gsi, vf, analyze_only,
10046 dump_enabled_p (), n_perms, n_loads,
10047 dce_chain);
10050 /* Produce the next vector result for SLP permutation NODE by adding a vector
10051 statement at GSI. If MASK_VEC is nonnull, add:
10053 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
10055 otherwise add:
10057 <new SSA name> = FIRST_DEF. */
10059 static void
10060 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10061 slp_tree node, tree first_def, tree second_def,
10062 tree mask_vec, poly_uint64 identity_offset)
10064 tree vectype = SLP_TREE_VECTYPE (node);
10066 /* ??? We SLP match existing vector element extracts but
10067 allow punning, which we need to re-instantiate at uses
10068 but have no good way of representing explicitly. */
10069 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
10070 && !types_compatible_p (TREE_TYPE (first_def), vectype))
10072 gassign *conv_stmt
10073 = gimple_build_assign (make_ssa_name (vectype),
10074 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
10075 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10076 first_def = gimple_assign_lhs (conv_stmt);
10078 gassign *perm_stmt;
10079 tree perm_dest = make_ssa_name (vectype);
10080 if (mask_vec)
10082 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
10083 TYPE_SIZE (vectype))
10084 && !types_compatible_p (TREE_TYPE (second_def), vectype))
10086 gassign *conv_stmt
10087 = gimple_build_assign (make_ssa_name (vectype),
10088 build1 (VIEW_CONVERT_EXPR,
10089 vectype, second_def));
10090 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10091 second_def = gimple_assign_lhs (conv_stmt);
10093 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10094 first_def, second_def,
10095 mask_vec);
10097 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10099 /* For identity permutes we still need to handle the case
10100 of offsetted extracts or concats. */
10101 unsigned HOST_WIDE_INT c;
10102 auto first_def_nunits
10103 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10104 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10106 unsigned HOST_WIDE_INT elsz
10107 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10108 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10109 TYPE_SIZE (vectype),
10110 bitsize_int (identity_offset * elsz));
10111 perm_stmt = gimple_build_assign (perm_dest, lowpart);
10113 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10114 first_def_nunits, &c) && c == 2)
10116 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10117 NULL_TREE, second_def);
10118 perm_stmt = gimple_build_assign (perm_dest, ctor);
10120 else
10121 gcc_unreachable ();
10123 else
10125 /* We need a copy here in case the def was external. */
10126 perm_stmt = gimple_build_assign (perm_dest, first_def);
10128 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10129 /* Store the vector statement in NODE. */
10130 node->push_vec_def (perm_stmt);
10133 /* Subroutine of vectorizable_slp_permutation. Check whether the target
10134 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10135 If GSI is nonnull, emit the permutation there.
10137 When GSI is null, the only purpose of NODE is to give properties
10138 of the result, such as the vector type and number of SLP lanes.
10139 The node does not need to be a VEC_PERM_EXPR.
10141 If the target supports the operation, return the number of individual
10142 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10143 dump file if DUMP_P is true. */
10145 static int
10146 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10147 slp_tree node, lane_permutation_t &perm,
10148 vec<slp_tree> &children, bool dump_p)
10150 tree vectype = SLP_TREE_VECTYPE (node);
10152 /* ??? We currently only support inputs that all have the same vector
10153 type, while the SLP IL should really do a concat + select and thus
10154 accept arbitrary mismatches. */
10155 slp_tree child;
10156 unsigned i;
10157 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10158 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10159 tree op_vectype = NULL_TREE;
10160 FOR_EACH_VEC_ELT (children, i, child)
10161 if (SLP_TREE_VECTYPE (child))
10163 op_vectype = SLP_TREE_VECTYPE (child);
10164 break;
10166 if (!op_vectype)
10167 op_vectype = vectype;
10168 FOR_EACH_VEC_ELT (children, i, child)
10170 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10171 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10172 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10173 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10175 if (dump_p)
10176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10177 "Unsupported vector types in lane permutation\n");
10178 return -1;
10180 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
10181 repeating_p = false;
10184 gcc_assert (perm.length () == SLP_TREE_LANES (node));
10186 /* Load-lanes permute. This permute only acts as a forwarder to
10187 select the correct vector def of the load-lanes load which
10188 has the permuted vectors in its vector defs like
10189 { v0, w0, r0, v1, w1, r1 ... } for a ld3. */
10190 if (node->ldst_lanes)
10192 gcc_assert (children.length () == 1);
10193 if (!gsi)
10194 /* This is a trivial op always supported. */
10195 return 1;
10196 slp_tree child = children[0];
10197 unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10198 / SLP_TREE_LANES (node));
10199 unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
10200 for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10202 tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10203 node->push_vec_def (def);
10205 return 1;
10208 /* REPEATING_P is true if every output vector is guaranteed to use the
10209 same permute vector. We can handle that case for both variable-length
10210 and constant-length vectors, but we only handle other cases for
10211 constant-length vectors.
10213 Set:
10215 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10216 mask vector that we want to build.
10218 - NCOPIES to the number of copies of PERM that we need in order
10219 to build the necessary permute mask vectors.
10221 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
10222 for each permute mask vector. This is only relevant when GSI is
10223 nonnull. */
10224 uint64_t npatterns;
10225 unsigned nelts_per_pattern;
10226 uint64_t ncopies;
10227 unsigned noutputs_per_mask;
10228 if (repeating_p)
10230 /* We need a single permute mask vector that has the form:
10232 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10234 In other words, the original n-element permute in PERM is
10235 "unrolled" to fill a full vector. The stepped vector encoding
10236 that we use for permutes requires 3n elements. */
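      /* Editorial example (illustrative only): a two-lane reversal
         { op0[1], op0[0] } is encoded here with npatterns == 2 and
         nelts_per_pattern == 3 as the series { 1, 0, 3, 2, 5, 4 }, from
         which the mask for any number of two-lane groups follows.  */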
10237 npatterns = SLP_TREE_LANES (node);
10238 nelts_per_pattern = ncopies = 3;
10239 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10241 else
10243 /* Calculate every element of every permute mask vector explicitly,
10244 instead of relying on the pattern described above. */
10245 if (!nunits.is_constant (&npatterns)
10246 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10247 return -1;
10248 nelts_per_pattern = ncopies = 1;
10249 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
10250 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10251 return -1;
10252 noutputs_per_mask = 1;
10254 unsigned olanes = ncopies * SLP_TREE_LANES (node);
10255 gcc_assert (repeating_p || multiple_p (olanes, nunits));
10257 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10258 from the { SLP operand, scalar lane } permutation as recorded in the
10259 SLP node as an intermediate step. This part should already work
10260 with SLP children with an arbitrary number of lanes.
10261 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
10262 auto_vec<unsigned> active_lane;
10263 vperm.create (olanes);
10264 active_lane.safe_grow_cleared (children.length (), true);
10265 for (unsigned i = 0; i < ncopies; ++i)
10267 for (unsigned pi = 0; pi < perm.length (); ++pi)
10269 std::pair<unsigned, unsigned> p = perm[pi];
10270 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10271 if (repeating_p)
10272 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
10273 else
10275 /* We checked above that the vectors are constant-length. */
10276 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
10277 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
10278 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
10279 vperm.quick_push ({{p.first, vi}, vl});
10282 /* Advance to the next group. */
10283 for (unsigned j = 0; j < children.length (); ++j)
10284 active_lane[j] += SLP_TREE_LANES (children[j]);
10287 if (dump_p)
10289 dump_printf_loc (MSG_NOTE, vect_location,
10290 "vectorizing permutation %p", (void *)node);
10291 for (unsigned i = 0; i < perm.length (); ++i)
10292 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10293 if (repeating_p)
10294 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
10295 dump_printf (MSG_NOTE, "\n");
10296 dump_printf_loc (MSG_NOTE, vect_location, "as");
10297 for (unsigned i = 0; i < vperm.length (); ++i)
10299 if (i != 0
10300 && (repeating_p
10301 ? multiple_p (i, npatterns)
10302 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10303 dump_printf (MSG_NOTE, ",");
10304 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
10305 vperm[i].first.first, vperm[i].first.second,
10306 vperm[i].second);
10308 dump_printf (MSG_NOTE, "\n");
10311 /* We can only handle two-vector permutes; everything else should
10312 be lowered on the SLP level. The following is closely inspired
10313 by vect_transform_slp_perm_load and is supposed to eventually
10314 replace it.
10315 ??? As intermediate step do code-gen in the SLP tree representation
10316 somehow? */
10317 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10318 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10319 unsigned int index = 0;
10320 poly_uint64 mask_element;
10321 vec_perm_builder mask;
10322 mask.new_vector (nunits, npatterns, nelts_per_pattern);
10323 unsigned int count = mask.encoded_nelts ();
10324 mask.quick_grow (count);
10325 vec_perm_indices indices;
10326 unsigned nperms = 0;
10327 for (unsigned i = 0; i < vperm.length (); ++i)
10329 mask_element = vperm[i].second;
10330 if (first_vec.first == -1U
10331 || first_vec == vperm[i].first)
10332 first_vec = vperm[i].first;
10333 else if (second_vec.first == -1U
10334 || second_vec == vperm[i].first)
10336 second_vec = vperm[i].first;
10337 mask_element += nunits;
10339 else
10341 if (dump_p)
10342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10343 "permutation requires at "
10344 "least three vectors\n");
10345 gcc_assert (!gsi);
10346 return -1;
10349 mask[index++] = mask_element;
10351 if (index == count)
10353 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10354 TYPE_VECTOR_SUBPARTS (op_vectype));
10355 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10356 && constant_multiple_p (mask[0], nunits));
10357 machine_mode vmode = TYPE_MODE (vectype);
10358 machine_mode op_vmode = TYPE_MODE (op_vectype);
10359 unsigned HOST_WIDE_INT c;
10360 if ((!identity_p
10361 && !can_vec_perm_const_p (vmode, op_vmode, indices))
10362 || (identity_p
10363 && !known_le (nunits,
10364 TYPE_VECTOR_SUBPARTS (op_vectype))
10365 && (!constant_multiple_p (nunits,
10366 TYPE_VECTOR_SUBPARTS (op_vectype),
10367 &c) || c != 2)))
10369 if (dump_p)
10371 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10372 vect_location,
10373 "unsupported vect permute { ");
10374 for (i = 0; i < count; ++i)
10376 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10377 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10379 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10381 gcc_assert (!gsi);
10382 return -1;
10385 if (!identity_p)
10386 nperms++;
10387 if (gsi)
10389 if (second_vec.first == -1U)
10390 second_vec = first_vec;
10392 slp_tree
10393 first_node = children[first_vec.first],
10394 second_node = children[second_vec.first];
10396 tree mask_vec = NULL_TREE;
10397 if (!identity_p)
10398 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10400 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
10402 tree first_def
10403 = vect_get_slp_vect_def (first_node,
10404 first_vec.second + vi);
10405 tree second_def
10406 = vect_get_slp_vect_def (second_node,
10407 second_vec.second + vi);
10408 vect_add_slp_permutation (vinfo, gsi, node, first_def,
10409 second_def, mask_vec, mask[0]);
10413 index = 0;
10414 first_vec = std::make_pair (-1U, -1U);
10415 second_vec = std::make_pair (-1U, -1U);
10419 return nperms;
10422 /* Vectorize the SLP permutations in NODE as specified
10423 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
10424 child number and lane number.
10425 Interleaving of two two-lane two-child SLP subtrees (not supported):
10426 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
10427 A blend of two four-lane two-child SLP subtrees:
10428 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
10429 Highpart of a four-lane one-child SLP subtree (not supported):
10430 [ { 0, 2 }, { 0, 3 } ]
10431 Currently only a subset of these forms is supported by the code generation below. */
10433 static bool
10434 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10435 slp_tree node, stmt_vector_for_cost *cost_vec)
10437 tree vectype = SLP_TREE_VECTYPE (node);
10438 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
10439 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
10440 SLP_TREE_CHILDREN (node),
10441 dump_enabled_p ());
10442 if (nperms < 0)
10443 return false;
10445 if (!gsi)
10446 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
10448 return true;
10451 /* Vectorize SLP NODE. */
10453 static void
10454 vect_schedule_slp_node (vec_info *vinfo,
10455 slp_tree node, slp_instance instance)
10457 gimple_stmt_iterator si;
10458 int i;
10459 slp_tree child;
10461 /* Vectorize externals and constants. */
10462 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
10463 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
10465 /* ??? vectorizable_shift can end up using a scalar operand which is
10466 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
10467 node in this case. */
10468 if (!SLP_TREE_VECTYPE (node))
10469 return;
10471 /* There are two reasons vector defs might already exist. The first
10472 is that we are vectorizing an existing vector def. The second is
10473 that when performing BB vectorization shared constant/external nodes
10474 are not split apart during partitioning, so during the code-gen
10475 DFS walk we can end up visiting them twice. */
10476 if (! SLP_TREE_VEC_DEFS (node).exists ())
10477 vect_create_constant_vectors (vinfo, node);
10478 return;
10481 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
10483 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
10485 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
10486 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
10488 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10489 && STMT_VINFO_DATA_REF (stmt_info))
10491 /* Vectorized loads go before the first scalar load to make it
10492 ready early; vectorized stores go before the last scalar
10493 stmt, which is where all uses are ready. */
10494 stmt_vec_info last_stmt_info = NULL;
10495 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
10496 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
10497 else /* DR_IS_WRITE */
10498 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
10499 si = gsi_for_stmt (last_stmt_info->stmt);
10501 else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10502 && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
10503 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
10504 || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
10506 /* For PHI node vectorization we do not use the insertion iterator. */
10507 si = gsi_none ();
10509 else
10511 /* Emit other stmts after the children's vectorized defs, which is
10512 the earliest possible insertion point. */
10513 gimple *last_stmt = NULL;
10514 bool seen_vector_def = false;
10515 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10516 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
10518 /* For fold-left reductions we are retaining the scalar
10519 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
10520 set so the representation isn't perfect. Resort to the
10521 last scalar def here. */
10522 if (SLP_TREE_VEC_DEFS (child).is_empty ())
10524 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
10525 == cycle_phi_info_type);
10526 gphi *phi = as_a <gphi *>
10527 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
10528 if (!last_stmt
10529 || vect_stmt_dominates_stmt_p (last_stmt, phi))
10530 last_stmt = phi;
10532 /* We are emitting all vectorized stmts in the same place, so the
10533 last def emitted is also the last one in the IL.
10534 ??? Unless we have a load permutation applied and that
10535 happens to re-use an earlier generated load. */
10536 unsigned j;
10537 tree vdef;
10538 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10540 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10541 if (!last_stmt
10542 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10543 last_stmt = vstmt;
10546 else if (!SLP_TREE_VECTYPE (child))
10548 /* For externals used unvectorized, look at all their scalar defs. */
10549 unsigned j;
10550 tree def;
10551 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
10552 if (TREE_CODE (def) == SSA_NAME
10553 && !SSA_NAME_IS_DEFAULT_DEF (def))
10555 gimple *stmt = SSA_NAME_DEF_STMT (def);
10556 if (!last_stmt
10557 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
10558 last_stmt = stmt;
10561 else
10563 /* For externals we have to look at all defs since their
10564 insertion place is decided per vector. But beware
10565 of pre-existing vectors where we need to make sure
10566 we do not insert before the region boundary. */
10567 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
10568 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
10569 seen_vector_def = true;
10570 else
10572 unsigned j;
10573 tree vdef;
10574 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10575 if (TREE_CODE (vdef) == SSA_NAME
10576 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
10578 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10579 if (!last_stmt
10580 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10581 last_stmt = vstmt;
10585 /* This can happen when all children are pre-existing vectors or
10586 constants. */
10587 if (!last_stmt)
10588 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
10589 if (!last_stmt)
10591 gcc_assert (seen_vector_def);
10592 si = gsi_after_labels (vinfo->bbs[0]);
10594 else if (is_ctrl_altering_stmt (last_stmt))
10596 /* We split regions to vectorize at control-altering stmts
10597 with a definition, so this must be an external which
10598 we can insert at the start of the region. */
10599 si = gsi_after_labels (vinfo->bbs[0]);
10601 else if (is_a <bb_vec_info> (vinfo)
10602 && SLP_TREE_CODE (node) != VEC_PERM_EXPR
10603 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
10604 && gimple_could_trap_p (stmt_info->stmt))
10606 /* We've constrained possibly trapping operations to all come
10607 from the same basic-block; if vectorized defs would allow earlier
10608 scheduling, still force vectorized stmts to the original block.
10609 This is only necessary for BB vectorization since for loop vect
10610 all operations are in a single BB and scalar-stmt-based
10611 placement doesn't play well with epilogue vectorization. */
10612 gcc_assert (dominated_by_p (CDI_DOMINATORS,
10613 gimple_bb (stmt_info->stmt),
10614 gimple_bb (last_stmt)));
10615 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
10617 else if (is_a <gphi *> (last_stmt))
10618 si = gsi_after_labels (gimple_bb (last_stmt));
10619 else
10621 si = gsi_for_stmt (last_stmt);
10622 gsi_next (&si);
10624 /* Avoid scheduling internal defs outside of the loop when
10625 we might have only implicitly tracked loop mask/len defs. */
10626 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
10627 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10628 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10630 gimple_stmt_iterator si2
10631 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
10632 if ((gsi_end_p (si2)
10633 && (LOOP_VINFO_LOOP (loop_vinfo)->header
10634 != gimple_bb (last_stmt))
10635 && dominated_by_p (CDI_DOMINATORS,
10636 LOOP_VINFO_LOOP (loop_vinfo)->header,
10637 gimple_bb (last_stmt)))
10638 || (!gsi_end_p (si2)
10639 && last_stmt != *si2
10640 && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
10641 si = si2;
10646 /* Handle purely internal nodes. */
10647 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
10649 if (dump_enabled_p ())
10650 dump_printf_loc (MSG_NOTE, vect_location,
10651 "------>vectorizing SLP permutation node\n");
10652 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
10653 be shared with different SLP nodes (but usually it's the same
10654 operation, apart from the case where the stmt is only there to denote
10655 the actual scalar lane defs ...). So do not call vect_transform_stmt
10656 but open-code it here (partly). */
10657 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
10658 gcc_assert (done);
10659 stmt_vec_info slp_stmt_info;
10660 unsigned int i;
10661 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
10662 if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
10664 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
10665 instance, i, true, NULL);
10666 gcc_assert (done);
10669 else
10671 if (dump_enabled_p ())
10672 dump_printf_loc (MSG_NOTE, vect_location,
10673 "------>vectorizing SLP node starting from: %G",
10674 stmt_info->stmt);
10675 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
10679 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
10680 For loop vectorization this is done in vectorizable_call, but for SLP
10681 it needs to be deferred until end of vect_schedule_slp, because multiple
10682 SLP instances may refer to the same scalar stmt. */
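/* Illustrative example (not taken from any particular testcase): a
 vectorized scalar call in the SLP node such as
 x_1 = foo (a_2, b_3);
 is replaced below by the trivial assignment
 x_1 = 0;
 (zero of the lhs type), or by a GIMPLE_NOP when the call has no lhs. */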
10684 static void
10685 vect_remove_slp_scalar_calls (vec_info *vinfo,
10686 slp_tree node, hash_set<slp_tree> &visited)
10688 gimple *new_stmt;
10689 gimple_stmt_iterator gsi;
10690 int i;
10691 slp_tree child;
10692 tree lhs;
10693 stmt_vec_info stmt_info;
10695 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10696 return;
10698 if (visited.add (node))
10699 return;
10701 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10702 vect_remove_slp_scalar_calls (vinfo, child, visited);
10704 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
10706 if (!stmt_info)
10707 continue;
10708 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
10709 if (!stmt || gimple_bb (stmt) == NULL)
10710 continue;
10711 if (is_pattern_stmt_p (stmt_info)
10712 || !PURE_SLP_STMT (stmt_info))
10713 continue;
10714 lhs = gimple_call_lhs (stmt);
10715 if (lhs)
10716 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
10717 else
10719 new_stmt = gimple_build_nop ();
10720 unlink_stmt_vdef (stmt_info->stmt);
10722 gsi = gsi_for_stmt (stmt);
10723 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
10724 if (lhs)
10725 SSA_NAME_DEF_STMT (lhs) = new_stmt;
10729 static void
10730 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
10732 hash_set<slp_tree> visited;
10733 vect_remove_slp_scalar_calls (vinfo, node, visited);
10736 /* Vectorize the instance root. */
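/* For example (illustrative): for a constructor root
 x = {a_1, b_2, c_3, d_4};
 with a single vector def the root stmt simply becomes
 x = vect_def_1;
 possibly through a VIEW_CONVERT_EXPR when the vector types differ,
 while multiple vector defs are re-assembled into a CONSTRUCTOR of
 the root's vector type. */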
10738 void
10739 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
10741 gassign *rstmt = NULL;
10743 if (instance->kind == slp_inst_kind_ctor)
10745 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
10747 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
10748 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10749 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
10750 TREE_TYPE (vect_lhs)))
10751 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
10752 vect_lhs);
10753 rstmt = gimple_build_assign (root_lhs, vect_lhs);
10755 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
10757 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10758 tree child_def;
10759 int j;
10760 vec<constructor_elt, va_gc> *v;
10761 vec_alloc (v, nelts);
10763 /* A CTOR can handle V16HI composition from VNx8HI so we
10764 do not need to convert vector elements if the types
10765 do not match. */
10766 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
10767 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
10768 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10769 tree rtype
10770 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
10771 tree r_constructor = build_constructor (rtype, v);
10772 rstmt = gimple_build_assign (lhs, r_constructor);
10775 else if (instance->kind == slp_inst_kind_bb_reduc)
10777 /* Largely inspired by reduction chain epilogue handling in
10778 vect_create_epilog_for_reduction. */
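/* Rough sketch of the epilogue built below (illustrative, for a signed
 integer PLUS reduction with two vector defs v0 and v1):
 v0' = VIEW_CONVERT_EXPR <unsigned-vectype> (v0);
 v1' = VIEW_CONVERT_EXPR <unsigned-vectype> (v1);
 sum = v0' + v1';
 scalar = .REDUC_PLUS (sum);
 followed by folding in SLP_INSTANCE_REMAIN_DEFS and converting the
 result back to the original type. */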
10779 vec<tree> vec_defs = vNULL;
10780 vect_get_slp_defs (node, &vec_defs);
10781 enum tree_code reduc_code
10782 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
10783 /* ??? We actually have to reflect signs somewhere. */
10784 if (reduc_code == MINUS_EXPR)
10785 reduc_code = PLUS_EXPR;
10786 gimple_seq epilogue = NULL;
10787 /* We may end up with more than one vector result, reduce them
10788 to one vector. */
10789 tree vec_def = vec_defs[0];
10790 tree vectype = TREE_TYPE (vec_def);
10791 tree compute_vectype = vectype;
10792 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
10793 && TYPE_OVERFLOW_UNDEFINED (vectype)
10794 && operation_can_overflow (reduc_code));
10795 if (pun_for_overflow_p)
10797 compute_vectype = unsigned_type_for (vectype);
10798 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10799 compute_vectype, vec_def);
10801 for (unsigned i = 1; i < vec_defs.length (); ++i)
10803 tree def = vec_defs[i];
10804 if (pun_for_overflow_p)
10805 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10806 compute_vectype, def);
10807 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
10808 vec_def, def);
10810 vec_defs.release ();
10811 /* ??? Support other schemes than direct internal fn. */
10812 internal_fn reduc_fn;
10813 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
10814 || reduc_fn == IFN_LAST)
10815 gcc_unreachable ();
10816 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
10817 TREE_TYPE (compute_vectype), vec_def);
10818 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
10820 tree rem_def = NULL_TREE;
10821 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
10823 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
10824 if (!rem_def)
10825 rem_def = def;
10826 else
10827 rem_def = gimple_build (&epilogue, reduc_code,
10828 TREE_TYPE (scalar_def),
10829 rem_def, def);
10831 scalar_def = gimple_build (&epilogue, reduc_code,
10832 TREE_TYPE (scalar_def),
10833 scalar_def, rem_def);
10835 scalar_def = gimple_convert (&epilogue,
10836 TREE_TYPE (vectype), scalar_def);
10837 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10838 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
10839 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
10840 update_stmt (gsi_stmt (rgsi));
10841 return;
10843 else
10844 gcc_unreachable ();
10846 gcc_assert (rstmt);
10848 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10849 gsi_replace (&rgsi, rstmt, true);
10852 struct slp_scc_info
10853 {
10854 bool on_stack;
10855 int dfs;
10856 int lowlink;
10857 };
10859 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
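/* This is in essence Tarjan's SCC algorithm: info->dfs records the DFS
 preorder number, info->lowlink the smallest preorder number reachable
 from the node, and a node closes an SCC once info->lowlink == info->dfs
 after visiting all children. Non-internal leafs are scheduled
 immediately and never pushed onto the stack (summary for orientation
 only). */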
10861 static void
10862 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
10863 hash_map<slp_tree, slp_scc_info> &scc_info,
10864 int &maxdfs, vec<slp_tree> &stack)
10866 bool existed_p;
10867 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
10868 gcc_assert (!existed_p);
10869 info->dfs = maxdfs;
10870 info->lowlink = maxdfs;
10871 maxdfs++;
10873 /* Leaf. */
10874 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10876 info->on_stack = false;
10877 vect_schedule_slp_node (vinfo, node, instance);
10878 return;
10881 info->on_stack = true;
10882 stack.safe_push (node);
10884 unsigned i;
10885 slp_tree child;
10886 /* DFS recurse. */
10887 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10889 if (!child)
10890 continue;
10891 slp_scc_info *child_info = scc_info.get (child);
10892 if (!child_info)
10894 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
10895 /* Recursion might have re-allocated the node. */
10896 info = scc_info.get (node);
10897 child_info = scc_info.get (child);
10898 info->lowlink = MIN (info->lowlink, child_info->lowlink);
10900 else if (child_info->on_stack)
10901 info->lowlink = MIN (info->lowlink, child_info->dfs);
10903 if (info->lowlink != info->dfs)
10904 return;
10906 auto_vec<slp_tree, 4> phis_to_fixup;
10908 /* Singleton. */
10909 if (stack.last () == node)
10911 stack.pop ();
10912 info->on_stack = false;
10913 vect_schedule_slp_node (vinfo, node, instance);
10914 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10915 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
10916 phis_to_fixup.quick_push (node);
10918 else
10920 /* SCC. */
10921 int last_idx = stack.length () - 1;
10922 while (stack[last_idx] != node)
10923 last_idx--;
10924 /* We can break the cycle at PHIs which have at least one child
10925 code generated. Then we could re-start the DFS walk until
10926 all nodes in the SCC are covered (we might have new entries
10927 for only back-reachable nodes). But it's simpler to just
10928 iterate and schedule those that are ready. */
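/* For example (illustrative): for a simple reduction SCC consisting of
 a PHI and an ADD, the PHI is ready first because its preheader child
 is already code generated, the ADD becomes ready once the PHI is off
 the stack, and the missing backedge PHI argument is filled in by the
 fixup loop at the end of this function. */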
10929 unsigned todo = stack.length () - last_idx;
10932 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
10934 slp_tree entry = stack[idx];
10935 if (!entry)
10936 continue;
10937 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
10938 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
10939 bool ready = !phi;
10940 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
10941 if (!child)
10943 gcc_assert (phi);
10944 ready = true;
10945 break;
10947 else if (scc_info.get (child)->on_stack)
10949 if (!phi)
10951 ready = false;
10952 break;
10955 else
10957 if (phi)
10959 ready = true;
10960 break;
10963 if (ready)
10965 vect_schedule_slp_node (vinfo, entry, instance);
10966 scc_info.get (entry)->on_stack = false;
10967 stack[idx] = NULL;
10968 todo--;
10969 if (phi)
10970 phis_to_fixup.safe_push (entry);
10974 while (todo != 0);
10976 /* Pop the SCC. */
10977 stack.truncate (last_idx);
10980 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
10981 slp_tree phi_node;
10982 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
10984 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
10985 edge_iterator ei;
10986 edge e;
10987 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
10989 unsigned dest_idx = e->dest_idx;
10990 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
10991 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
10992 continue;
10993 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
10994 /* Simply fill all args. */
10995 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
10996 != vect_first_order_recurrence)
10997 for (unsigned i = 0; i < n; ++i)
10999 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
11000 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11001 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
11002 e, gimple_phi_arg_location (phi, dest_idx));
11004 else
11006 /* Unless it is a first order recurrence which needs
11007 args filled in for both the PHI node and the permutes. */
11008 gimple *perm
11009 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
11010 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
11011 add_phi_arg (as_a <gphi *> (rphi),
11012 vect_get_slp_vect_def (child, n - 1),
11013 e, gimple_phi_arg_location (phi, dest_idx));
11014 for (unsigned i = 0; i < n; ++i)
11016 gimple *perm
11017 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
11018 if (i > 0)
11019 gimple_assign_set_rhs1 (perm,
11020 vect_get_slp_vect_def (child, i - 1));
11021 gimple_assign_set_rhs2 (perm,
11022 vect_get_slp_vect_def (child, i));
11023 update_stmt (perm);
11030 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
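/* This proceeds in two passes over SLP_INSTANCES (summary): first each
 instance tree is scheduled via vect_schedule_scc and any instance
 root stmt is vectorized, then scalar calls (for loop vectorization)
 and the original scalar stmts of vectorized stores are removed. */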
11032 void
11033 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
11035 slp_instance instance;
11036 unsigned int i;
11038 hash_map<slp_tree, slp_scc_info> scc_info;
11039 int maxdfs = 0;
11040 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11042 slp_tree node = SLP_INSTANCE_TREE (instance);
11043 if (dump_enabled_p ())
11045 dump_printf_loc (MSG_NOTE, vect_location,
11046 "Vectorizing SLP tree:\n");
11047 /* ??? Dump all? */
11048 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11049 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
11050 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
11051 vect_print_slp_graph (MSG_NOTE, vect_location,
11052 SLP_INSTANCE_TREE (instance));
11054 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
11055 have a PHI be the node breaking the cycle. */
11056 auto_vec<slp_tree> stack;
11057 if (!scc_info.get (node))
11058 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
11060 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11061 vectorize_slp_instance_root_stmt (node, instance);
11063 if (dump_enabled_p ())
11064 dump_printf_loc (MSG_NOTE, vect_location,
11065 "vectorizing stmts using SLP.\n");
11068 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11070 slp_tree root = SLP_INSTANCE_TREE (instance);
11071 stmt_vec_info store_info;
11072 unsigned int j;
11074 /* Remove scalar call stmts. Do not do this for basic-block
11075 vectorization as not all uses may be vectorized.
11076 ??? Why should this be necessary? DCE should be able to
11077 remove the stmts itself.
11078 ??? For BB vectorization we can as well remove scalar
11079 stmts starting from the SLP tree root if they have no
11080 uses. */
11081 if (is_a <loop_vec_info> (vinfo))
11082 vect_remove_slp_scalar_calls (vinfo, root);
11084 /* Remove the original scalar stmts of vectorized stores. */
11085 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
11087 if (!STMT_VINFO_DATA_REF (store_info)
11088 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
11089 break;
11091 store_info = vect_orig_stmt (store_info);
11092 /* Free the attached stmt_vec_info and remove the stmt. */
11093 vinfo->remove_stmt (store_info);
11095 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
11096 so we do not crash in vect_free_slp_tree later. */
11097 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11098 SLP_TREE_REPRESENTATIVE (root) = NULL;