[official-gcc.git] / gcc / tree-vect-slp.cc
blob 1342913affa1e65fb902b9577f556a3ea17dbd02
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #define INCLUDE_MEMORY
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "gimple.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "insn-config.h"
36 #include "recog.h" /* FIXME: for insn_data */
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "cfgloop.h"
41 #include "tree-vectorizer.h"
42 #include "langhooks.h"
43 #include "gimple-walk.h"
44 #include "dbgcnt.h"
45 #include "tree-vector-builder.h"
46 #include "vec-perm-indices.h"
47 #include "gimple-fold.h"
48 #include "internal-fn.h"
49 #include "dump-context.h"
50 #include "cfganal.h"
51 #include "tree-eh.h"
52 #include "tree-cfg.h"
53 #include "alloc-pool.h"
54 #include "sreal.h"
55 #include "predict.h"
57 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
58 load_permutation_t &,
59 const vec<tree> &,
60 gimple_stmt_iterator *,
61 poly_uint64, bool, bool,
62 unsigned *,
63 unsigned * = nullptr,
64 bool = false);
65 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
66 slp_tree, lane_permutation_t &,
67 vec<slp_tree> &, bool);
68 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
69 slp_tree, stmt_vector_for_cost *);
70 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
72 static object_allocator<_slp_tree> *slp_tree_pool;
73 static slp_tree slp_first_node;
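/* Every SLP node is allocated from SLP_TREE_POOL and also linked into the
   list headed by SLP_FIRST_NODE (see the _slp_tree constructor below), so
   that vect_slp_fini can reclaim any nodes still live when the pass
   finishes.  */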
75 void
76 vect_slp_init (void)
78 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
81 void
82 vect_slp_fini (void)
84 while (slp_first_node)
85 delete slp_first_node;
86 delete slp_tree_pool;
87 slp_tree_pool = NULL;
90 void *
91 _slp_tree::operator new (size_t n)
93 gcc_assert (n == sizeof (_slp_tree));
94 return slp_tree_pool->allocate_raw ();
97 void
98 _slp_tree::operator delete (void *node, size_t n)
100 gcc_assert (n == sizeof (_slp_tree));
101 slp_tree_pool->remove_raw (node);
105 /* Initialize a SLP node. */
107 _slp_tree::_slp_tree ()
109 this->prev_node = NULL;
110 if (slp_first_node)
111 slp_first_node->prev_node = this;
112 this->next_node = slp_first_node;
113 slp_first_node = this;
114 SLP_TREE_SCALAR_STMTS (this) = vNULL;
115 SLP_TREE_SCALAR_OPS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
122 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 SLP_TREE_CODE (this) = ERROR_MARK;
124 this->ldst_lanes = false;
125 SLP_TREE_VECTYPE (this) = NULL_TREE;
126 SLP_TREE_REPRESENTATIVE (this) = NULL;
127 SLP_TREE_REF_COUNT (this) = 1;
128 this->failed = NULL;
129 this->max_nunits = 1;
130 this->lanes = 0;
133 /* Tear down a SLP node. */
135 _slp_tree::~_slp_tree ()
137 if (this->prev_node)
138 this->prev_node->next_node = this->next_node;
139 else
140 slp_first_node = this->next_node;
141 if (this->next_node)
142 this->next_node->prev_node = this->prev_node;
143 SLP_TREE_CHILDREN (this).release ();
144 SLP_TREE_SCALAR_STMTS (this).release ();
145 SLP_TREE_SCALAR_OPS (this).release ();
146 SLP_TREE_VEC_DEFS (this).release ();
147 SLP_TREE_LOAD_PERMUTATION (this).release ();
148 SLP_TREE_LANE_PERMUTATION (this).release ();
149 SLP_TREE_SIMD_CLONE_INFO (this).release ();
150 if (this->failed)
151 free (failed);
154 /* Push the single SSA definition in DEF to the vector of vector defs. */
156 void
157 _slp_tree::push_vec_def (gimple *def)
159 if (gphi *phi = dyn_cast <gphi *> (def))
160 vec_defs.quick_push (gimple_phi_result (phi));
161 else
163 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
164 vec_defs.quick_push (get_def_from_ptr (defop));
168 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
170 void
171 vect_free_slp_tree (slp_tree node)
173 int i;
174 slp_tree child;
176 if (--SLP_TREE_REF_COUNT (node) != 0)
177 return;
179 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
180 if (child)
181 vect_free_slp_tree (child);
183 /* If the node defines any SLP only patterns then those patterns are no
184 longer valid and should be removed. */
185 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
186 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
188 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
189 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
190 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
193 delete node;
196 /* Return a location suitable for dumps related to the SLP instance. */
198 dump_user_location_t
199 _slp_instance::location () const
201 if (!root_stmts.is_empty ())
202 return root_stmts[0]->stmt;
203 else
204 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
208 /* Free the memory allocated for the SLP instance. */
210 void
211 vect_free_slp_instance (slp_instance instance)
213 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
214 SLP_INSTANCE_LOADS (instance).release ();
215 SLP_INSTANCE_ROOT_STMTS (instance).release ();
216 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
217 instance->subgraph_entries.release ();
218 instance->cost_vec.release ();
219 free (instance);
223 /* Create an SLP node for SCALAR_STMTS. */
225 slp_tree
226 vect_create_new_slp_node (unsigned nops, tree_code code)
228 slp_tree node = new _slp_tree;
229 SLP_TREE_SCALAR_STMTS (node) = vNULL;
230 SLP_TREE_CHILDREN (node).create (nops);
231 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
232 SLP_TREE_CODE (node) = code;
233 return node;
235 /* Create an SLP node for SCALAR_STMTS. */
237 static slp_tree
238 vect_create_new_slp_node (slp_tree node,
239 vec<stmt_vec_info> scalar_stmts, unsigned nops)
241 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
242 SLP_TREE_CHILDREN (node).create (nops);
243 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
244 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
245 SLP_TREE_LANES (node) = scalar_stmts.length ();
246 return node;
249 /* Create an SLP node for SCALAR_STMTS. */
251 static slp_tree
252 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
254 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
257 /* Create an SLP node for OPS. */
259 static slp_tree
260 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
262 SLP_TREE_SCALAR_OPS (node) = ops;
263 SLP_TREE_DEF_TYPE (node) = vect_external_def;
264 SLP_TREE_LANES (node) = ops.length ();
265 return node;
268 /* Create an SLP node for OPS. */
270 static slp_tree
271 vect_create_new_slp_node (vec<tree> ops)
273 return vect_create_new_slp_node (new _slp_tree, ops);
277 /* This structure is used in creation of an SLP tree. Each instance
278 corresponds to the same operand in a group of scalar stmts in an SLP
279 node. */
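/* For instance, for the two-lane group { a0 = b0 + c0; a1 = b1 + c1 } two
   such instances are created: one collecting the defs of b0 and b1, the
   other those of c0 and c1.  */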
280 typedef struct _slp_oprnd_info
282 /* Def-stmts for the operands. */
283 vec<stmt_vec_info> def_stmts;
284 /* Operands. */
285 vec<tree> ops;
286 /* Information about the first statement, its vector def-type, type, the
287 operand itself in case it's constant, and an indication if it's a pattern
288 stmt and gather/scatter info. */
289 tree first_op_type;
290 enum vect_def_type first_dt;
291 bool any_pattern;
292 bool first_gs_p;
293 gather_scatter_info first_gs_info;
294 } *slp_oprnd_info;
297 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
298 operand. */
299 static vec<slp_oprnd_info>
300 vect_create_oprnd_info (int nops, int group_size)
302 int i;
303 slp_oprnd_info oprnd_info;
304 vec<slp_oprnd_info> oprnds_info;
306 oprnds_info.create (nops);
307 for (i = 0; i < nops; i++)
309 oprnd_info = XNEW (struct _slp_oprnd_info);
310 oprnd_info->def_stmts.create (group_size);
311 oprnd_info->ops.create (group_size);
312 oprnd_info->first_dt = vect_uninitialized_def;
313 oprnd_info->first_op_type = NULL_TREE;
314 oprnd_info->any_pattern = false;
315 oprnd_info->first_gs_p = false;
316 oprnds_info.quick_push (oprnd_info);
319 return oprnds_info;
323 /* Free operands info. */
325 static void
326 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
328 int i;
329 slp_oprnd_info oprnd_info;
331 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
333 oprnd_info->def_stmts.release ();
334 oprnd_info->ops.release ();
335 XDELETE (oprnd_info);
338 oprnds_info.release ();
341 /* Return the execution frequency of NODE (so that a higher value indicates
342 a "more important" node when optimizing for speed). */
344 static sreal
345 vect_slp_node_weight (slp_tree node)
347 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
348 basic_block bb = gimple_bb (stmt_info->stmt);
349 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
352 /* Return true if STMTS contains a pattern statement. */
354 static bool
355 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
357 stmt_vec_info stmt_info;
358 unsigned int i;
359 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
360 if (stmt_info && is_pattern_stmt_p (stmt_info))
361 return true;
362 return false;
365 /* Return true when all lanes in the external or constant NODE have
366 the same value. */
368 static bool
369 vect_slp_tree_uniform_p (slp_tree node)
371 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
372 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
374 /* Pre-existing vectors. */
375 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
376 return false;
378 unsigned i;
379 tree op, first = NULL_TREE;
380 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
381 if (!first)
382 first = op;
383 else if (!operand_equal_p (first, op, 0))
384 return false;
386 return true;
389 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
390 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
391 of the chain. */
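/* For a contiguous interleaving group the place is simply the element's
   index within the group; gaps recorded in DR_GROUP_GAP increase the
   place accordingly.  */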
394 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
395 stmt_vec_info first_stmt_info)
397 stmt_vec_info next_stmt_info = first_stmt_info;
398 int result = 0;
400 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
401 return -1;
405 if (next_stmt_info == stmt_info)
406 return result;
407 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
408 if (next_stmt_info)
409 result += DR_GROUP_GAP (next_stmt_info);
411 while (next_stmt_info);
413 return -1;
416 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
417 using the method implemented by duplicate_and_interleave. Return true
418 if so, returning the number of intermediate vectors in *NVECTORS_OUT
419 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
420 (if nonnull). */
422 bool
423 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
424 tree elt_type, unsigned int *nvectors_out,
425 tree *vector_type_out,
426 tree *permutes)
428 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
429 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
430 return false;
432 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
433 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
434 unsigned int nvectors = 1;
435 for (;;)
437 scalar_int_mode int_mode;
438 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
439 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
441 /* Get the natural vector type for this SLP group size. */
442 tree int_type = build_nonstandard_integer_type
443 (GET_MODE_BITSIZE (int_mode), 1);
444 tree vector_type
445 = get_vectype_for_scalar_type (vinfo, int_type, count);
446 poly_int64 half_nelts;
447 if (vector_type
448 && VECTOR_MODE_P (TYPE_MODE (vector_type))
449 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
450 GET_MODE_SIZE (base_vector_mode))
451 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
452 2, &half_nelts))
454 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
455 together into elements of type INT_TYPE and using the result
456 to build NVECTORS vectors. */
457 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
458 vec_perm_builder sel1 (nelts, 2, 3);
459 vec_perm_builder sel2 (nelts, 2, 3);
461 for (unsigned int i = 0; i < 3; ++i)
463 sel1.quick_push (i);
464 sel1.quick_push (i + nelts);
465 sel2.quick_push (half_nelts + i);
466 sel2.quick_push (half_nelts + i + nelts);
468 vec_perm_indices indices1 (sel1, 2, nelts);
469 vec_perm_indices indices2 (sel2, 2, nelts);
470 machine_mode vmode = TYPE_MODE (vector_type);
471 if (can_vec_perm_const_p (vmode, vmode, indices1)
472 && can_vec_perm_const_p (vmode, vmode, indices2))
474 if (nvectors_out)
475 *nvectors_out = nvectors;
476 if (vector_type_out)
477 *vector_type_out = vector_type;
478 if (permutes)
480 permutes[0] = vect_gen_perm_mask_checked (vector_type,
481 indices1);
482 permutes[1] = vect_gen_perm_mask_checked (vector_type,
483 indices2);
485 return true;
489 if (!multiple_p (elt_bytes, 2, &elt_bytes))
490 return false;
491 nvectors *= 2;
495 /* Return true if DTA and DTB match. */
497 static bool
498 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
500 return (dta == dtb
501 || ((dta == vect_external_def || dta == vect_constant_def)
502 && (dtb == vect_external_def || dtb == vect_constant_def)));
505 static const int cond_expr_maps[3][5] = {
506 { 4, -1, -2, 1, 2 },
507 { 4, -2, -1, 1, 2 },
508 { 4, -1, -2, 2, 1 }
510 static const int arg0_map[] = { 1, 0 };
511 static const int arg1_map[] = { 1, 1 };
512 static const int arg2_map[] = { 1, 2 };
513 static const int arg1_arg4_map[] = { 2, 1, 4 };
514 static const int arg3_arg2_map[] = { 2, 3, 2 };
515 static const int op1_op0_map[] = { 2, 1, 0 };
516 static const int off_map[] = { 1, -3 };
517 static const int off_op0_map[] = { 2, -3, 0 };
518 static const int off_arg2_map[] = { 2, -3, 2 };
519 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
520 static const int mask_call_maps[6][7] = {
521 { 1, 1, },
522 { 2, 1, 2, },
523 { 3, 1, 2, 3, },
524 { 4, 1, 2, 3, 4, },
525 { 5, 1, 2, 3, 4, 5, },
526 { 6, 1, 2, 3, 4, 5, 6 },
529 /* For most SLP statements, there is a one-to-one mapping between
530 gimple arguments and child nodes. If that is not true for STMT,
531 return an array that contains:
533 - the number of child nodes, followed by
534 - for each child node, the index of the argument associated with that node.
535 The special index -1 is the first operand of an embedded comparison and
536 the special index -2 is the second operand of an embedded comparison.
537 The special index -3 is the offset of a gather as analyzed by
538 vect_check_gather_scatter.
540 SWAP is as for vect_get_and_check_slp_defs. */
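/* For example, arg2_map above reads { 1, 2 }: one SLP child node,
   corresponding to call argument 2 (the mask of an IFN_MASK_LOAD).
   Likewise cond_expr_maps[0] reads { 4, -1, -2, 1, 2 }: four children,
   the first two being the operands of the embedded comparison and the
   remaining two mapping to gimple arguments 1 and 2 (the then and else
   values) of the COND_EXPR.  */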
542 static const int *
543 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
544 unsigned char swap = 0)
546 if (auto assign = dyn_cast<const gassign *> (stmt))
548 if (gimple_assign_rhs_code (assign) == COND_EXPR
549 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
550 return cond_expr_maps[swap];
551 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
552 && swap)
553 return op1_op0_map;
554 if (gather_scatter_p)
555 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
556 ? off_op0_map : off_map);
558 gcc_assert (!swap);
559 if (auto call = dyn_cast<const gcall *> (stmt))
561 if (gimple_call_internal_p (call))
562 switch (gimple_call_internal_fn (call))
564 case IFN_MASK_LOAD:
565 return gather_scatter_p ? off_arg2_map : arg2_map;
567 case IFN_GATHER_LOAD:
568 return arg1_map;
570 case IFN_MASK_GATHER_LOAD:
571 case IFN_MASK_LEN_GATHER_LOAD:
572 return arg1_arg4_map;
574 case IFN_MASK_STORE:
575 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
577 case IFN_MASK_CALL:
579 unsigned nargs = gimple_call_num_args (call);
580 if (nargs >= 2 && nargs <= 7)
581 return mask_call_maps[nargs-2];
582 else
583 return nullptr;
586 case IFN_CLZ:
587 case IFN_CTZ:
588 return arg0_map;
590 default:
591 break;
594 return nullptr;
597 /* Return the SLP node child index for operand OP of STMT. */
600 vect_slp_child_index_for_operand (const gimple *stmt, int op,
601 bool gather_scatter_p)
603 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
604 if (!opmap)
605 return op;
606 for (int i = 1; i < 1 + opmap[0]; ++i)
607 if (opmap[i] == op)
608 return i - 1;
609 gcc_unreachable ();
612 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
613 they are of a valid type and that they match the defs of the first stmt of
614 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
615 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
616 indicates swap is required for cond_expr stmts. Specifically, SWAP
617 is 1 if STMT is cond and operands of comparison need to be swapped;
618 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
620 If there was a fatal error return -1; if the error could be corrected by
621 swapping operands of father node of this one, return 1; if everything is
622 ok return 0. */
623 static int
624 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
625 bool *skip_args,
626 vec<stmt_vec_info> stmts, unsigned stmt_num,
627 vec<slp_oprnd_info> *oprnds_info)
629 stmt_vec_info stmt_info = stmts[stmt_num];
630 tree oprnd;
631 unsigned int i, number_of_oprnds;
632 enum vect_def_type dt = vect_uninitialized_def;
633 slp_oprnd_info oprnd_info;
634 gather_scatter_info gs_info;
635 unsigned int gs_op = -1u;
636 unsigned int commutative_op = -1U;
637 bool first = stmt_num == 0;
639 if (!is_a<gcall *> (stmt_info->stmt)
640 && !is_a<gassign *> (stmt_info->stmt)
641 && !is_a<gphi *> (stmt_info->stmt))
642 return -1;
644 number_of_oprnds = gimple_num_args (stmt_info->stmt);
645 const int *map
646 = vect_get_operand_map (stmt_info->stmt,
647 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
648 if (map)
649 number_of_oprnds = *map++;
650 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
652 if (gimple_call_internal_p (stmt))
654 internal_fn ifn = gimple_call_internal_fn (stmt);
655 commutative_op = first_commutative_argument (ifn);
658 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
660 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
661 commutative_op = 0;
664 bool swapped = (swap != 0);
665 bool backedge = false;
666 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
667 for (i = 0; i < number_of_oprnds; i++)
669 oprnd_info = (*oprnds_info)[i];
670 int opno = map ? map[i] : int (i);
671 if (opno == -3)
673 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
674 if (!is_a <loop_vec_info> (vinfo)
675 || !vect_check_gather_scatter (stmt_info,
676 as_a <loop_vec_info> (vinfo),
677 first ? &oprnd_info->first_gs_info
678 : &gs_info))
679 return -1;
681 if (first)
683 oprnd_info->first_gs_p = true;
684 oprnd = oprnd_info->first_gs_info.offset;
686 else
688 gs_op = i;
689 oprnd = gs_info.offset;
692 else if (opno < 0)
693 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
694 else
696 oprnd = gimple_arg (stmt_info->stmt, opno);
697 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
699 edge e = gimple_phi_arg_edge (stmt, opno);
700 backedge = (is_a <bb_vec_info> (vinfo)
701 ? e->flags & EDGE_DFS_BACK
702 : dominated_by_p (CDI_DOMINATORS, e->src,
703 gimple_bb (stmt_info->stmt)));
706 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
707 oprnd = TREE_OPERAND (oprnd, 0);
709 stmt_vec_info def_stmt_info;
710 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Build SLP failed: can't analyze def for %T\n",
715 oprnd);
717 return -1;
720 if (skip_args[i])
722 oprnd_info->def_stmts.quick_push (NULL);
723 oprnd_info->ops.quick_push (NULL_TREE);
724 oprnd_info->first_dt = vect_uninitialized_def;
725 continue;
728 oprnd_info->def_stmts.quick_push (def_stmt_info);
729 oprnd_info->ops.quick_push (oprnd);
731 if (def_stmt_info
732 && is_pattern_stmt_p (def_stmt_info))
734 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
735 != def_stmt_info)
736 oprnd_info->any_pattern = true;
737 else
738 /* If we promote this to external use the original stmt def. */
739 oprnd_info->ops.last ()
740 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
743 /* If there's an extern def on a backedge make sure we can
744 code-generate at the region start.
745 ??? This is another case that could be fixed by adjusting
746 how we split the function but at the moment we'd have conflicting
747 goals there. */
748 if (backedge
749 && dts[i] == vect_external_def
750 && is_a <bb_vec_info> (vinfo)
751 && TREE_CODE (oprnd) == SSA_NAME
752 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
753 && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
754 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: extern def %T only defined "
759 "on backedge\n", oprnd);
760 return -1;
763 if (first)
765 tree type = TREE_TYPE (oprnd);
766 dt = dts[i];
768 /* For the swapping logic below force vect_reduction_def
769 for the reduction op in a SLP reduction group. */
770 if (!STMT_VINFO_DATA_REF (stmt_info)
771 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
772 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
773 && def_stmt_info)
774 dts[i] = dt = vect_reduction_def;
776 /* Check the types of the definition. */
777 switch (dt)
779 case vect_external_def:
780 case vect_constant_def:
781 case vect_internal_def:
782 case vect_reduction_def:
783 case vect_double_reduction_def:
784 case vect_induction_def:
785 case vect_nested_cycle:
786 case vect_first_order_recurrence:
787 break;
789 default:
790 /* FORNOW: Not supported. */
791 if (dump_enabled_p ())
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "Build SLP failed: illegal type of def %T\n",
794 oprnd);
795 return -1;
798 oprnd_info->first_dt = dt;
799 oprnd_info->first_op_type = type;
802 if (first)
803 return 0;
805 /* Now match the operand definition types to that of the first stmt. */
806 for (i = 0; i < number_of_oprnds;)
808 if (skip_args[i])
810 ++i;
811 continue;
814 oprnd_info = (*oprnds_info)[i];
815 dt = dts[i];
816 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
817 oprnd = oprnd_info->ops[stmt_num];
818 tree type = TREE_TYPE (oprnd);
820 if (!types_compatible_p (oprnd_info->first_op_type, type))
822 if (dump_enabled_p ())
823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
824 "Build SLP failed: different operand types\n");
825 return 1;
828 if ((gs_op == i) != oprnd_info->first_gs_p)
830 if (dump_enabled_p ())
831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
832 "Build SLP failed: mixed gather and non-gather\n");
833 return 1;
835 else if (gs_op == i)
837 if (!operand_equal_p (oprnd_info->first_gs_info.base,
838 gs_info.base))
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
842 "Build SLP failed: different gather base\n");
843 return 1;
845 if (oprnd_info->first_gs_info.scale != gs_info.scale)
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
849 "Build SLP failed: different gather scale\n");
850 return 1;
854 /* Not first stmt of the group, check that the def-stmt/s match
855 the def-stmt/s of the first stmt. Allow different definition
856 types for reduction chains: the first stmt must be a
857 vect_reduction_def (a phi node), and the rest
858 end in the reduction chain. */
859 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
860 && !(oprnd_info->first_dt == vect_reduction_def
861 && !STMT_VINFO_DATA_REF (stmt_info)
862 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
863 && def_stmt_info
864 && !STMT_VINFO_DATA_REF (def_stmt_info)
865 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
866 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
867 || (!STMT_VINFO_DATA_REF (stmt_info)
868 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
869 && ((!def_stmt_info
870 || STMT_VINFO_DATA_REF (def_stmt_info)
871 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
872 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
873 != (oprnd_info->first_dt != vect_reduction_def))))
875 /* Try swapping operands if we got a mismatch. For BB
876 vectorization only in case it will clearly improve things. */
877 if (i == commutative_op && !swapped
878 && (!is_a <bb_vec_info> (vinfo)
879 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
880 dts[i+1])
881 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
882 || vect_def_types_match
883 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "trying swapped operands\n");
888 std::swap (dts[i], dts[i+1]);
889 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
890 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
891 std::swap ((*oprnds_info)[i]->ops[stmt_num],
892 (*oprnds_info)[i+1]->ops[stmt_num]);
893 /* After swapping some operands we lost track whether an
894 operand has any pattern defs so be conservative here. */
895 if ((*oprnds_info)[i]->any_pattern
896 || (*oprnds_info)[i+1]->any_pattern)
897 (*oprnds_info)[i]->any_pattern
898 = (*oprnds_info)[i+1]->any_pattern = true;
899 swapped = true;
900 continue;
903 if (is_a <bb_vec_info> (vinfo)
904 && !oprnd_info->any_pattern)
906 /* Now for commutative ops we should see whether we can
907 make the other operand matching. */
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "treating operand as external\n");
911 oprnd_info->first_dt = dt = vect_external_def;
913 else
915 if (dump_enabled_p ())
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "Build SLP failed: different types\n");
918 return 1;
922 /* Make sure to demote the overall operand to external. */
923 if (dt == vect_external_def)
924 oprnd_info->first_dt = vect_external_def;
925 /* For a SLP reduction chain we want to duplicate the reduction to
926 each of the chain members. That gets us a sane SLP graph (still
927 the stmts are not 100% correct wrt the initial values). */
928 else if ((dt == vect_internal_def
929 || dt == vect_reduction_def)
930 && oprnd_info->first_dt == vect_reduction_def
931 && !STMT_VINFO_DATA_REF (stmt_info)
932 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
933 && !STMT_VINFO_DATA_REF (def_stmt_info)
934 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
935 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
937 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
938 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
941 ++i;
944 /* Swap operands. */
945 if (swapped)
947 if (dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "swapped operands to match def types in %G",
950 stmt_info->stmt);
953 return 0;
956 /* Return true if call statements CALL1 and CALL2 are similar enough
957 to be combined into the same SLP group. */
959 bool
960 compatible_calls_p (gcall *call1, gcall *call2)
962 unsigned int nargs = gimple_call_num_args (call1);
963 if (nargs != gimple_call_num_args (call2))
964 return false;
966 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
967 return false;
969 if (gimple_call_internal_p (call1))
971 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
972 TREE_TYPE (gimple_call_lhs (call2))))
973 return false;
974 for (unsigned int i = 0; i < nargs; ++i)
975 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
976 TREE_TYPE (gimple_call_arg (call2, i))))
977 return false;
979 else
981 if (!operand_equal_p (gimple_call_fn (call1),
982 gimple_call_fn (call2), 0))
983 return false;
985 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
986 return false;
989 /* Check that any unvectorized arguments are equal. */
990 if (const int *map = vect_get_operand_map (call1))
992 unsigned int nkept = *map++;
993 unsigned int mapi = 0;
994 for (unsigned int i = 0; i < nargs; ++i)
995 if (mapi < nkept && map[mapi] == int (i))
996 mapi += 1;
997 else if (!operand_equal_p (gimple_call_arg (call1, i),
998 gimple_call_arg (call2, i)))
999 return false;
1002 return true;
1005 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1006 caller's attempt to find the vector type in STMT_INFO with the narrowest
1007 element type. Return true if VECTYPE is nonnull and if it is valid
1008 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1009 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1010 vect_build_slp_tree. */
1012 static bool
1013 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1014 unsigned int group_size,
1015 tree vectype, poly_uint64 *max_nunits)
1017 if (!vectype)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Build SLP failed: unsupported data-type in %G\n",
1022 stmt_info->stmt);
1023 /* Fatal mismatch. */
1024 return false;
1027 /* If populating the vector type requires unrolling then fail
1028 before adjusting *max_nunits for basic-block vectorization. */
1029 if (is_a <bb_vec_info> (vinfo)
1030 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Build SLP failed: unrolling required "
1035 "in basic block SLP\n");
1036 /* Fatal mismatch. */
1037 return false;
1040 /* In case of multiple types we need to detect the smallest type. */
1041 vect_update_max_nunits (max_nunits, vectype);
1042 return true;
1045 /* Verify if the scalar stmts STMTS are isomorphic, require data
1046 permutation or are of unsupported types of operation. Return
1047 true if they are, otherwise return false and indicate in *MATCHES
1048 which stmts are not isomorphic to the first one. If MATCHES[0]
1049 is false then this indicates the comparison could not be
1050 carried out or the stmts will never be vectorized by SLP.
1052 Note COND_EXPR is possibly isomorphic to another one after swapping its
1053 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1054 the first stmt by swapping the two operands of comparison; set SWAP[i]
1055 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1056 of comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
1057 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1059 static bool
1060 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1061 vec<stmt_vec_info> stmts, unsigned int group_size,
1062 poly_uint64 *max_nunits, bool *matches,
1063 bool *two_operators, tree *node_vectype)
1065 unsigned int i;
1066 stmt_vec_info first_stmt_info = stmts[0];
1067 code_helper first_stmt_code = ERROR_MARK;
1068 code_helper alt_stmt_code = ERROR_MARK;
1069 code_helper rhs_code = ERROR_MARK;
1070 code_helper first_cond_code = ERROR_MARK;
1071 tree lhs;
1072 bool need_same_oprnds = false;
1073 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1074 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1075 bool first_stmt_ldst_p = false, ldst_p = false;
1076 bool first_stmt_phi_p = false, phi_p = false;
1077 int first_reduc_idx = -1;
1078 bool maybe_soft_fail = false;
1079 tree soft_fail_nunits_vectype = NULL_TREE;
1081 /* For every stmt in NODE find its def stmt/s. */
1082 stmt_vec_info stmt_info;
1083 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1085 swap[i] = 0;
1086 matches[i] = false;
1087 if (!stmt_info)
1089 matches[i] = true;
1090 continue;
1093 gimple *stmt = stmt_info->stmt;
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1097 /* Fail to vectorize statements marked as unvectorizable, throw
1098 or are volatile. */
1099 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1100 || stmt_can_throw_internal (cfun, stmt)
1101 || gimple_has_volatile_ops (stmt))
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1105 "Build SLP failed: unvectorizable statement %G",
1106 stmt);
1107 /* ??? For BB vectorization we want to commutate operands in a way
1108 to shuffle all unvectorizable defs into one operand and have
1109 the other still vectorized. The following doesn't reliably
1110 work for this though but it's the easiest we can do here. */
1111 if (is_a <bb_vec_info> (vinfo) && i != 0)
1112 continue;
1113 /* Fatal mismatch. */
1114 matches[0] = false;
1115 return false;
1118 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1119 lhs = gimple_get_lhs (stmt);
1120 if (lhs == NULL_TREE
1121 && (!call_stmt
1122 || !gimple_call_internal_p (stmt)
1123 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Build SLP failed: not GIMPLE_ASSIGN nor "
1128 "GIMPLE_CALL %G", stmt);
1129 if (is_a <bb_vec_info> (vinfo) && i != 0)
1130 continue;
1131 /* Fatal mismatch. */
1132 matches[0] = false;
1133 return false;
1136 tree nunits_vectype;
1137 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1138 &nunits_vectype, group_size))
1140 if (is_a <bb_vec_info> (vinfo) && i != 0)
1141 continue;
1142 /* Fatal mismatch. */
1143 matches[0] = false;
1144 return false;
1146 /* Record nunits required but continue analysis, producing matches[]
1147 as if nunits was not an issue. This allows splitting of groups
1148 to happen. */
1149 if (nunits_vectype
1150 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1151 nunits_vectype, max_nunits))
1153 gcc_assert (is_a <bb_vec_info> (vinfo));
1154 maybe_soft_fail = true;
1155 soft_fail_nunits_vectype = nunits_vectype;
1158 gcc_assert (vectype);
1160 if (call_stmt)
1162 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1163 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1164 rhs_code = cfn;
1165 else
1166 rhs_code = CALL_EXPR;
1168 if (cfn == CFN_MASK_LOAD
1169 || cfn == CFN_GATHER_LOAD
1170 || cfn == CFN_MASK_GATHER_LOAD
1171 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1172 ldst_p = true;
1173 else if (cfn == CFN_MASK_STORE)
1175 ldst_p = true;
1176 rhs_code = CFN_MASK_STORE;
1178 else if ((cfn != CFN_LAST
1179 && cfn != CFN_MASK_CALL
1180 && internal_fn_p (cfn)
1181 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1182 || gimple_call_tail_p (call_stmt)
1183 || gimple_call_noreturn_p (call_stmt)
1184 || gimple_call_chain (call_stmt))
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "Build SLP failed: unsupported call type %G",
1189 (gimple *) call_stmt);
1190 if (is_a <bb_vec_info> (vinfo) && i != 0)
1191 continue;
1192 /* Fatal mismatch. */
1193 matches[0] = false;
1194 return false;
1197 else if (gimple_code (stmt) == GIMPLE_PHI)
1199 rhs_code = ERROR_MARK;
1200 phi_p = true;
1202 else
1204 rhs_code = gimple_assign_rhs_code (stmt);
1205 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1208 /* Check the operation. */
1209 if (i == 0)
1211 *node_vectype = vectype;
1212 first_stmt_code = rhs_code;
1213 first_stmt_ldst_p = ldst_p;
1214 first_stmt_phi_p = phi_p;
1215 first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1217 /* Shift arguments should be equal in all the packed stmts for a
1218 vector shift with scalar shift operand. */
1219 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1220 || rhs_code == LROTATE_EXPR
1221 || rhs_code == RROTATE_EXPR)
1223 /* First see if we have a vector/vector shift. */
1224 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1226 /* No vector/vector shift, try for a vector/scalar shift. */
1227 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 "Build SLP failed: "
1232 "op not supported by target.\n");
1233 if (is_a <bb_vec_info> (vinfo) && i != 0)
1234 continue;
1235 /* Fatal mismatch. */
1236 matches[0] = false;
1237 return false;
1239 need_same_oprnds = true;
1240 first_op1 = gimple_assign_rhs2 (stmt);
1243 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1245 need_same_oprnds = true;
1246 first_op1 = gimple_assign_rhs2 (stmt);
1248 else if (!ldst_p
1249 && rhs_code == BIT_FIELD_REF)
1251 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1252 if (!is_a <bb_vec_info> (vinfo)
1253 || TREE_CODE (vec) != SSA_NAME
1254 /* When the element types are not compatible we pun the
1255 source to the target vectype which requires equal size. */
1256 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1257 || !types_compatible_p (TREE_TYPE (vectype),
1258 TREE_TYPE (TREE_TYPE (vec))))
1259 && !operand_equal_p (TYPE_SIZE (vectype),
1260 TYPE_SIZE (TREE_TYPE (vec)))))
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "Build SLP failed: "
1265 "BIT_FIELD_REF not supported\n");
1266 /* Fatal mismatch. */
1267 matches[0] = false;
1268 return false;
1271 else if (rhs_code == CFN_DIV_POW2)
1273 need_same_oprnds = true;
1274 first_op1 = gimple_call_arg (call_stmt, 1);
1277 else
1279 if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1280 /* For SLP reduction groups the index isn't necessarily
1281 uniform but only that of the first stmt matters. */
1282 && !(first_reduc_idx != -1
1283 && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1284 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1286 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "Build SLP failed: different reduc_idx "
1290 "%d instead of %d in %G",
1291 STMT_VINFO_REDUC_IDX (stmt_info),
1292 first_reduc_idx, stmt);
1294 /* Mismatch. */
1295 continue;
1297 if (first_stmt_code != rhs_code
1298 && alt_stmt_code == ERROR_MARK)
1299 alt_stmt_code = rhs_code;
1300 if ((first_stmt_code != rhs_code
1301 && (first_stmt_code != IMAGPART_EXPR
1302 || rhs_code != REALPART_EXPR)
1303 && (first_stmt_code != REALPART_EXPR
1304 || rhs_code != IMAGPART_EXPR)
1305 /* Handle mismatches in plus/minus by computing both
1306 and merging the results. */
1307 && !((first_stmt_code == PLUS_EXPR
1308 || first_stmt_code == MINUS_EXPR)
1309 && (alt_stmt_code == PLUS_EXPR
1310 || alt_stmt_code == MINUS_EXPR)
1311 && rhs_code == alt_stmt_code)
1312 && !(first_stmt_code.is_tree_code ()
1313 && rhs_code.is_tree_code ()
1314 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1315 == tcc_comparison)
1316 && (swap_tree_comparison (tree_code (first_stmt_code))
1317 == tree_code (rhs_code)))
1318 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1319 && (first_stmt_code == ARRAY_REF
1320 || first_stmt_code == BIT_FIELD_REF
1321 || first_stmt_code == COMPONENT_REF
1322 || first_stmt_code == REALPART_EXPR
1323 || first_stmt_code == IMAGPART_EXPR
1324 || first_stmt_code == MEM_REF)
1325 && (rhs_code == ARRAY_REF
1326 || rhs_code == BIT_FIELD_REF
1327 || rhs_code == COMPONENT_REF
1328 || rhs_code == REALPART_EXPR
1329 || rhs_code == IMAGPART_EXPR
1330 || rhs_code == MEM_REF)))
1331 || (ldst_p
1332 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1333 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1334 || (ldst_p
1335 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1336 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1337 || first_stmt_ldst_p != ldst_p
1338 || first_stmt_phi_p != phi_p)
1340 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "Build SLP failed: different operation "
1344 "in stmt %G", stmt);
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "original stmt %G", first_stmt_info->stmt);
1348 /* Mismatch. */
1349 continue;
1352 if (!ldst_p
1353 && first_stmt_code == BIT_FIELD_REF
1354 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1355 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1359 "Build SLP failed: different BIT_FIELD_REF "
1360 "arguments in %G", stmt);
1361 /* Mismatch. */
1362 continue;
1365 if (call_stmt
1366 && first_stmt_code != CFN_MASK_LOAD
1367 && first_stmt_code != CFN_MASK_STORE)
1369 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1370 call_stmt))
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "Build SLP failed: different calls in %G",
1375 stmt);
1376 /* Mismatch. */
1377 continue;
1381 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1382 && (gimple_bb (first_stmt_info->stmt)
1383 != gimple_bb (stmt_info->stmt)))
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "Build SLP failed: different BB for PHI "
1388 "or possibly trapping operation in %G", stmt);
1389 /* Mismatch. */
1390 continue;
1393 if (need_same_oprnds)
1395 tree other_op1 = gimple_arg (stmt, 1);
1396 if (!operand_equal_p (first_op1, other_op1, 0))
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "Build SLP failed: different shift "
1401 "arguments in %G", stmt);
1402 /* Mismatch. */
1403 continue;
1407 if (!types_compatible_p (vectype, *node_vectype))
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 "Build SLP failed: different vector type "
1412 "in %G", stmt);
1413 /* Mismatch. */
1414 continue;
1418 /* Grouped store or load. */
1419 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1421 gcc_assert (ldst_p);
1422 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1424 /* Store. */
1425 gcc_assert (rhs_code == CFN_MASK_STORE
1426 || REFERENCE_CLASS_P (lhs)
1427 || DECL_P (lhs));
1429 else
1431 /* Load. */
1432 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1433 if (prev_first_load)
1435 /* Check that there are no loads from different interleaving
1436 chains in the same node. */
1437 if (prev_first_load != first_load)
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1441 vect_location,
1442 "Build SLP failed: different "
1443 "interleaving chains in one node %G",
1444 stmt);
1445 /* Mismatch. */
1446 continue;
1449 else
1450 prev_first_load = first_load;
1453 /* Non-grouped store or load. */
1454 else if (ldst_p)
1456 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1457 && rhs_code != CFN_GATHER_LOAD
1458 && rhs_code != CFN_MASK_GATHER_LOAD
1459 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1460 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1461 /* Not grouped loads are handled as externals for BB
1462 vectorization. For loop vectorization we can handle
1463 splats the same we handle single element interleaving. */
1464 && (is_a <bb_vec_info> (vinfo)
1465 || stmt_info != first_stmt_info))
1467 /* Not grouped load. */
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "Build SLP failed: not grouped load %G", stmt);
1472 if (i != 0)
1473 continue;
1474 /* Fatal mismatch. */
1475 matches[0] = false;
1476 return false;
1479 /* Not memory operation. */
1480 else
1482 if (!phi_p
1483 && rhs_code.is_tree_code ()
1484 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1485 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1486 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1487 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1488 && rhs_code != VIEW_CONVERT_EXPR
1489 && rhs_code != CALL_EXPR
1490 && rhs_code != BIT_FIELD_REF)
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "Build SLP failed: operation unsupported %G",
1495 stmt);
1496 if (is_a <bb_vec_info> (vinfo) && i != 0)
1497 continue;
1498 /* Fatal mismatch. */
1499 matches[0] = false;
1500 return false;
1503 if (rhs_code == COND_EXPR)
1505 tree cond_expr = gimple_assign_rhs1 (stmt);
1506 enum tree_code cond_code = TREE_CODE (cond_expr);
1507 enum tree_code swap_code = ERROR_MARK;
1508 enum tree_code invert_code = ERROR_MARK;
1510 if (i == 0)
1511 first_cond_code = TREE_CODE (cond_expr);
1512 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1514 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1515 swap_code = swap_tree_comparison (cond_code);
1516 invert_code = invert_tree_comparison (cond_code, honor_nans);
1519 if (first_cond_code == cond_code)
1521 /* Isomorphic can be achieved by swapping. */
1522 else if (first_cond_code == swap_code)
1523 swap[i] = 1;
1524 /* Isomorphic can be achieved by inverting. */
1525 else if (first_cond_code == invert_code)
1526 swap[i] = 2;
1527 else
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531 "Build SLP failed: different"
1532 " operation %G", stmt);
1533 /* Mismatch. */
1534 continue;
1538 if (rhs_code.is_tree_code ()
1539 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1540 && (swap_tree_comparison ((tree_code)first_stmt_code)
1541 == (tree_code)rhs_code))
1542 swap[i] = 1;
1545 matches[i] = true;
1548 for (i = 0; i < group_size; ++i)
1549 if (!matches[i])
1550 return false;
1552 /* If we allowed a two-operation SLP node verify the target can cope
1553 with the permute we are going to use. */
1554 if (alt_stmt_code != ERROR_MARK
1555 && (!alt_stmt_code.is_tree_code ()
1556 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1557 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1559 *two_operators = true;
1562 if (maybe_soft_fail)
1564 unsigned HOST_WIDE_INT const_nunits;
1565 if (!TYPE_VECTOR_SUBPARTS
1566 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1567 || const_nunits > group_size)
1568 matches[0] = false;
1569 else
1571 /* With constant vector elements simulate a mismatch at the
1572 point we need to split. */
1573 unsigned tail = group_size & (const_nunits - 1);
1574 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1576 return false;
1579 return true;
1582 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1583 Note we never remove apart from at destruction time so we do not
1584 need a special value for deleted that differs from empty. */
1585 struct bst_traits
1587 typedef vec <stmt_vec_info> value_type;
1588 typedef vec <stmt_vec_info> compare_type;
1589 static inline hashval_t hash (value_type);
1590 static inline bool equal (value_type existing, value_type candidate);
1591 static inline bool is_empty (value_type x) { return !x.exists (); }
1592 static inline bool is_deleted (value_type x) { return !x.exists (); }
1593 static const bool empty_zero_p = true;
1594 static inline void mark_empty (value_type &x) { x.release (); }
1595 static inline void mark_deleted (value_type &x) { x.release (); }
1596 static inline void remove (value_type &x) { x.release (); }
1598 inline hashval_t
1599 bst_traits::hash (value_type x)
1601 inchash::hash h;
1602 for (unsigned i = 0; i < x.length (); ++i)
1603 h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1604 return h.end ();
1606 inline bool
1607 bst_traits::equal (value_type existing, value_type candidate)
1609 if (existing.length () != candidate.length ())
1610 return false;
1611 for (unsigned i = 0; i < existing.length (); ++i)
1612 if (existing[i] != candidate[i])
1613 return false;
1614 return true;
1617 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1618 simple_hashmap_traits <bst_traits, slp_tree> >
1619 scalar_stmts_to_slp_tree_map_t;
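/* The map above memoizes SLP discovery: vect_build_slp_tree records the
   node built (or a stub recording the per-lane failure flags) for each
   vector of scalar stmts, so a later attempt to discover the same stmts
   re-uses that result instead of recursing again.  */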
1621 /* Release BST_MAP. */
1623 static void
1624 release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1626 /* The map keeps a reference on SLP nodes built, release that. */
1627 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1628 it != bst_map->end (); ++it)
1629 if ((*it).second)
1630 vect_free_slp_tree ((*it).second);
1631 delete bst_map;
1634 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1635 but then vec::insert does memmove and that's not compatible with
1636 std::pair. */
1637 struct chain_op_t
1639 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1640 : code (code_), dt (dt_), op (op_) {}
1641 tree_code code;
1642 vect_def_type dt;
1643 tree op;
1646 /* Comparator for sorting associatable chains. */
1648 static int
1649 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1651 auto *op1 = (const chain_op_t *) op1_;
1652 auto *op2 = (const chain_op_t *) op2_;
1653 if (op1->dt != op2->dt)
1654 return (int)op1->dt - (int)op2->dt;
1655 return (int)op1->code - (int)op2->code;
1658 /* Linearize the associatable expression chain at START with the
1659 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1660 filling CHAIN with the result and using WORKLIST as intermediate storage.
1661 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1662 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1663 stmts, starting with START. */
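/* For instance, a lane computing ((a - b) + c) is linearized into the
   chain entries { +, a }, { -, b } and { +, c } (in no particular order),
   each also recording the vect_def_type of its operand.  */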
1665 static void
1666 vect_slp_linearize_chain (vec_info *vinfo,
1667 vec<std::pair<tree_code, gimple *> > &worklist,
1668 vec<chain_op_t> &chain,
1669 enum tree_code code, gimple *start,
1670 gimple *&code_stmt, gimple *&alt_code_stmt,
1671 vec<gimple *> *chain_stmts)
1673 /* For each lane linearize the addition/subtraction (or other
1674 uniform associatable operation) expression tree. */
1675 worklist.safe_push (std::make_pair (code, start));
1676 while (!worklist.is_empty ())
1678 auto entry = worklist.pop ();
1679 gassign *stmt = as_a <gassign *> (entry.second);
1680 enum tree_code in_code = entry.first;
1681 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1682 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1683 if (!code_stmt
1684 && gimple_assign_rhs_code (stmt) == code)
1685 code_stmt = stmt;
1686 else if (!alt_code_stmt
1687 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1688 alt_code_stmt = stmt;
1689 if (chain_stmts)
1690 chain_stmts->safe_push (stmt);
1691 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1693 tree op = gimple_op (stmt, opnum);
1694 vect_def_type dt;
1695 stmt_vec_info def_stmt_info;
1696 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1697 gcc_assert (res);
1698 if (dt == vect_internal_def
1699 && is_pattern_stmt_p (def_stmt_info))
1700 op = gimple_get_lhs (def_stmt_info->stmt);
1701 gimple *use_stmt;
1702 use_operand_p use_p;
1703 if (dt == vect_internal_def
1704 && single_imm_use (op, &use_p, &use_stmt)
1705 && is_gimple_assign (def_stmt_info->stmt)
1706 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1707 || (code == PLUS_EXPR
1708 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1709 == MINUS_EXPR))))
1711 tree_code op_def_code = this_code;
1712 if (op_def_code == MINUS_EXPR && opnum == 1)
1713 op_def_code = PLUS_EXPR;
1714 if (in_code == MINUS_EXPR)
1715 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1716 worklist.safe_push (std::make_pair (op_def_code,
1717 def_stmt_info->stmt));
1719 else
1721 tree_code op_def_code = this_code;
1722 if (op_def_code == MINUS_EXPR && opnum == 1)
1723 op_def_code = PLUS_EXPR;
1724 if (in_code == MINUS_EXPR)
1725 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1726 chain.safe_push (chain_op_t (op_def_code, dt, op));
1732 static slp_tree
1733 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1734 vec<stmt_vec_info> stmts, unsigned int group_size,
1735 poly_uint64 *max_nunits,
1736 bool *matches, unsigned *limit, unsigned *tree_size,
1737 scalar_stmts_to_slp_tree_map_t *bst_map);
1739 static slp_tree
1740 vect_build_slp_tree (vec_info *vinfo,
1741 vec<stmt_vec_info> stmts, unsigned int group_size,
1742 poly_uint64 *max_nunits,
1743 bool *matches, unsigned *limit, unsigned *tree_size,
1744 scalar_stmts_to_slp_tree_map_t *bst_map)
1746 if (slp_tree *leader = bst_map->get (stmts))
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1750 !(*leader)->failed ? "" : "failed ",
1751 (void *) *leader);
1752 if (!(*leader)->failed)
1754 SLP_TREE_REF_COUNT (*leader)++;
1755 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1756 stmts.release ();
1757 return *leader;
1759 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1760 return NULL;
1763 /* Single-lane SLP doesn't have the chance of run-away, so do not account
1764 it against the limit. */
1765 if (stmts.length () > 1)
1767 if (*limit == 0)
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "SLP discovery limit exceeded\n");
1772 memset (matches, 0, sizeof (bool) * group_size);
1773 return NULL;
1775 --*limit;
1778 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1779 so we can pick up backedge destinations during discovery. */
1780 slp_tree res = new _slp_tree;
1781 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1782 SLP_TREE_SCALAR_STMTS (res) = stmts;
1783 bst_map->put (stmts.copy (), res);
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_NOTE, vect_location,
1787 "starting SLP discovery for node %p\n", (void *) res);
1789 poly_uint64 this_max_nunits = 1;
1790 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1791 &this_max_nunits,
1792 matches, limit, tree_size, bst_map);
1793 if (!res_)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "SLP discovery for node %p failed\n", (void *) res);
1798 /* Mark the node invalid so we can detect those when still in use
1799 as backedge destinations. */
1800 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1801 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1802 res->failed = XNEWVEC (bool, group_size);
1803 if (flag_checking)
1805 unsigned i;
1806 for (i = 0; i < group_size; ++i)
1807 if (!matches[i])
1808 break;
1809 gcc_assert (i < group_size);
1811 memcpy (res->failed, matches, sizeof (bool) * group_size);
1813 else
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_NOTE, vect_location,
1817 "SLP discovery for node %p succeeded\n",
1818 (void *) res);
1819 gcc_assert (res_ == res);
1820 res->max_nunits = this_max_nunits;
1821 vect_update_max_nunits (max_nunits, this_max_nunits);
1822 /* Keep a reference for the bst_map use. */
1823 SLP_TREE_REF_COUNT (res)++;
1825 return res_;
1828 /* Helper for building an associated SLP node chain. */
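/* This builds the layout used for two-operator groups, e.g. lanes mixing
   PLUS_EXPR and MINUS_EXPR: both operations are materialized as separate
   children computing all lanes, and PERM becomes a VEC_PERM_EXPR node whose
   lane permutation LPERM selects, per lane, the result of the appropriate
   child.  */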
1830 static void
1831 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1832 slp_tree op0, slp_tree op1,
1833 stmt_vec_info oper1, stmt_vec_info oper2,
1834 vec<std::pair<unsigned, unsigned> > lperm)
1836 unsigned group_size = SLP_TREE_LANES (op1);
1838 slp_tree child1 = new _slp_tree;
1839 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1840 SLP_TREE_VECTYPE (child1) = vectype;
1841 SLP_TREE_LANES (child1) = group_size;
1842 SLP_TREE_CHILDREN (child1).create (2);
1843 SLP_TREE_CHILDREN (child1).quick_push (op0);
1844 SLP_TREE_CHILDREN (child1).quick_push (op1);
1845 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1847 slp_tree child2 = new _slp_tree;
1848 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1849 SLP_TREE_VECTYPE (child2) = vectype;
1850 SLP_TREE_LANES (child2) = group_size;
1851 SLP_TREE_CHILDREN (child2).create (2);
1852 SLP_TREE_CHILDREN (child2).quick_push (op0);
1853 SLP_TREE_REF_COUNT (op0)++;
1854 SLP_TREE_CHILDREN (child2).quick_push (op1);
1855 SLP_TREE_REF_COUNT (op1)++;
1856 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1858 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1859 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1860 SLP_TREE_VECTYPE (perm) = vectype;
1861 SLP_TREE_LANES (perm) = group_size;
1862 /* ??? We should set this NULL but that's not expected. */
1863 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1864 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1865 SLP_TREE_CHILDREN (perm).quick_push (child1);
1866 SLP_TREE_CHILDREN (perm).quick_push (child2);
1869 /* Recursively build an SLP tree starting from NODE.
1870 Fail (and return NULL) if def-stmts are not
1871 isomorphic, require data permutation or are of unsupported types of
1872 operation. Otherwise, return the built node. MATCHES indicates
1873 which lanes matched the first lane when discovery
1874 failed. */
1876 static slp_tree
1877 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1878 vec<stmt_vec_info> stmts, unsigned int group_size,
1879 poly_uint64 *max_nunits,
1880 bool *matches, unsigned *limit, unsigned *tree_size,
1881 scalar_stmts_to_slp_tree_map_t *bst_map)
1883 unsigned nops, i, this_tree_size = 0;
1884 poly_uint64 this_max_nunits = *max_nunits;
1886 matches[0] = false;
1888 stmt_vec_info stmt_info = stmts[0];
1889 if (!is_a<gcall *> (stmt_info->stmt)
1890 && !is_a<gassign *> (stmt_info->stmt)
1891 && !is_a<gphi *> (stmt_info->stmt))
1892 return NULL;
1894 nops = gimple_num_args (stmt_info->stmt);
1895 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1896 STMT_VINFO_GATHER_SCATTER_P
1897 (stmt_info)))
1898 nops = map[0];
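/* For calls like masked or gather loads the operand map selects the call
arguments that act as SLP operands; its first element gives their number. */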
1900 /* If the SLP node is a PHI (induction or reduction), terminate
1901 the recursion. */
1902 bool *skip_args = XALLOCAVEC (bool, nops);
1903 memset (skip_args, 0, sizeof (bool) * nops);
1904 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1905 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1907 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1908 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1909 group_size);
1910 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1911 max_nunits))
1912 return NULL;
1914 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1915 if (def_type == vect_induction_def)
1917 /* Induction PHIs are not cycles but walk the initial
1918 value. Only for inner loops though; for outer loops
1919 we need to pick up the value from the actual PHIs
1920 to more easily support peeling and epilogue vectorization. */
1921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922 if (!nested_in_vect_loop_p (loop, stmt_info))
1923 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1924 else
1925 loop = loop->inner;
1926 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1928 else if (def_type == vect_reduction_def
1929 || def_type == vect_double_reduction_def
1930 || def_type == vect_nested_cycle
1931 || def_type == vect_first_order_recurrence)
1933 /* Else def types have to match. */
1934 stmt_vec_info other_info;
1935 bool all_same = true;
1936 FOR_EACH_VEC_ELT (stmts, i, other_info)
1938 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1939 return NULL;
1940 if (other_info != stmt_info)
1941 all_same = false;
1943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1944 /* Reduction initial values are not explicitly represented. */
1945 if (def_type != vect_first_order_recurrence
1946 && gimple_bb (stmt_info->stmt) == loop->header)
1947 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1948 /* Reduction chain backedge defs are filled manually.
1949 ??? Need a better way to identify a SLP reduction chain PHI.
1950 Or a better overall way to SLP match those. */
1951 if (stmts.length () > 1
1952 && all_same && def_type == vect_reduction_def)
1953 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1955 else if (def_type != vect_internal_def)
1956 return NULL;
1960 bool two_operators = false;
1961 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1962 tree vectype = NULL_TREE;
1963 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1964 &this_max_nunits, matches, &two_operators,
1965 &vectype))
1966 return NULL;
1968 /* If the SLP node is a load, terminate the recursion unless masked. */
1969 if (STMT_VINFO_DATA_REF (stmt_info)
1970 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1972 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1973 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1974 else
1976 *max_nunits = this_max_nunits;
1977 (*tree_size)++;
1978 node = vect_create_new_slp_node (node, stmts, 0);
1979 SLP_TREE_VECTYPE (node) = vectype;
1980 /* And compute the load permutation. Whether it is actually
1981 a permutation depends on the unrolling factor which is
1982 decided later. */
1983 vec<unsigned> load_permutation;
1984 int j;
1985 stmt_vec_info load_info;
1986 load_permutation.create (group_size);
1987 stmt_vec_info first_stmt_info
1988 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1989 bool any_permute = false;
1990 bool any_null = false;
1991 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1993 int load_place;
1994 if (! load_info)
1996 load_place = j;
1997 any_null = true;
1999 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000 load_place = vect_get_place_in_interleaving_chain
2001 (load_info, first_stmt_info);
2002 else
2003 load_place = 0;
2004 gcc_assert (load_place != -1);
2005 any_permute |= load_place != j;
2006 load_permutation.quick_push (load_place);
2008 if (any_null)
2010 gcc_assert (!any_permute);
2011 load_permutation.release ();
2014 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2016 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
2017 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
2018 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
2019 || gimple_call_internal_p (stmt,
2020 IFN_MASK_LEN_GATHER_LOAD));
2021 load_permutation.release ();
2022 /* We cannot handle permuted masked loads, see PR114375. */
2023 if (any_permute
2024 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 && DR_GROUP_SIZE (first_stmt_info) != group_size)
2026 || STMT_VINFO_STRIDED_P (stmt_info))
2028 matches[0] = false;
2029 return NULL;
2032 else
2034 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2035 return node;
2039 else if (gimple_assign_single_p (stmt_info->stmt)
2040 && !gimple_vuse (stmt_info->stmt)
2041 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2043 /* vect_build_slp_tree_1 determined that all BIT_FIELD_REFs reference
2044 the same SSA name vector of a type compatible with vectype. */
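/* For instance, lanes extracting v[1] and v[0] from the same vector V are
represented as a VEC_PERM_EXPR node selecting lanes { 1, 0 } from a child
node that merely holds the vector def V. */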
2045 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2046 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2047 stmt_vec_info estmt_info;
2048 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2050 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2051 tree bfref = gimple_assign_rhs1 (estmt);
2052 HOST_WIDE_INT lane;
2053 if (!known_eq (bit_field_size (bfref),
2054 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2055 || !constant_multiple_p (bit_field_offset (bfref),
2056 bit_field_size (bfref), &lane))
2058 lperm.release ();
2059 matches[0] = false;
2060 return NULL;
2062 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2064 slp_tree vnode = vect_create_new_slp_node (vNULL);
2065 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2066 /* ??? We record vectype here but we hide eventually necessary
2067 punning and instead rely on code generation to materialize
2068 VIEW_CONVERT_EXPRs as necessary. We instead should make
2069 this explicit somehow. */
2070 SLP_TREE_VECTYPE (vnode) = vectype;
2071 else
2073 /* For different size but compatible elements we can still
2074 use VEC_PERM_EXPR without punning. */
2075 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2076 && types_compatible_p (TREE_TYPE (vectype),
2077 TREE_TYPE (TREE_TYPE (vec))));
2078 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2080 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2081 unsigned HOST_WIDE_INT const_nunits;
2082 if (nunits.is_constant (&const_nunits))
2083 SLP_TREE_LANES (vnode) = const_nunits;
2084 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2085 /* We are always building a permutation node even if it is an identity
2086 permute to shield the rest of the vectorizer from the odd node
2087 representing an actual vector without any scalar ops.
2088 ??? We could hide it completely by making the permute node
2089 external? */
2090 node = vect_create_new_slp_node (node, stmts, 1);
2091 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2092 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2093 SLP_TREE_VECTYPE (node) = vectype;
2094 SLP_TREE_CHILDREN (node).quick_push (vnode);
2095 return node;
2097 /* When discovery reaches an associatable operation see whether we can
2098 improve that to match up lanes in a way superior to the operand
2099 swapping code which at most looks at two defs.
2100 ??? For BB vectorization we cannot do the brute-force search
2101 for matching as we can succeed by means of builds from scalars
2102 and have no good way to "cost" one build against another. */
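/* For example, with the lanes { (a + b) + c, (c + a) + b } linearizing the
chains to { a, b, c } and { c, a, b } allows matching the operands up by
re-association, which plain swapping of the two outermost operands cannot. */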
2103 else if (is_a <loop_vec_info> (vinfo)
2104 /* Do not bother for single-lane SLP. */
2105 && group_size > 1
2106 /* ??? We don't handle !vect_internal_def defs below. */
2107 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2108 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2109 mapping as long as that exists on the stmt_info level. */
2110 && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2111 && is_gimple_assign (stmt_info->stmt)
2112 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2113 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2114 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2115 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2116 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2118 /* See if we have a chain of (mixed) adds or subtracts or other
2119 associatable ops. */
2120 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2121 if (code == MINUS_EXPR)
2122 code = PLUS_EXPR;
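/* The MINUS to PLUS canonicalization above lets us linearize mixed
add/subtract chains as additions; the per-element codes recorded in each
chain remember which elements are in fact subtracted. */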
2123 stmt_vec_info other_op_stmt_info = NULL;
2124 stmt_vec_info op_stmt_info = NULL;
2125 unsigned chain_len = 0;
2126 auto_vec<chain_op_t> chain;
2127 auto_vec<std::pair<tree_code, gimple *> > worklist;
2128 auto_vec<vec<chain_op_t> > chains (group_size);
2129 auto_vec<slp_tree, 4> children;
2130 bool hard_fail = true;
2131 for (unsigned lane = 0; lane < group_size; ++lane)
2133 /* For each lane linearize the addition/subtraction (or other
2134 uniform associatable operation) expression tree. */
2135 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2136 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2137 stmts[lane]->stmt, op_stmt, other_op_stmt,
2138 NULL);
2139 if (!op_stmt_info && op_stmt)
2140 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2141 if (!other_op_stmt_info && other_op_stmt)
2142 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2143 if (chain.length () == 2)
2145 /* In a chain of just two elements resort to the regular
2146 operand swapping scheme. If we run into a length
2147 mismatch still hard-FAIL. */
2148 if (chain_len == 0)
2149 hard_fail = false;
2150 else
2152 matches[lane] = false;
2153 /* ??? We might want to process the other lanes, but
2154 make sure to not give false matching hints to the
2155 caller for lanes we did not process. */
2156 if (lane != group_size - 1)
2157 matches[0] = false;
2159 break;
2161 else if (chain_len == 0)
2162 chain_len = chain.length ();
2163 else if (chain.length () != chain_len)
2165 /* ??? Here we could slip in magic to compensate with
2166 neutral operands. */
2167 matches[lane] = false;
2168 if (lane != group_size - 1)
2169 matches[0] = false;
2170 break;
2172 chains.quick_push (chain.copy ());
2173 chain.truncate (0);
2175 if (chains.length () == group_size)
2177 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2178 if (!op_stmt_info)
2180 hard_fail = false;
2181 goto out;
2183 /* Now we have a set of chains with the same length. */
2184 /* 1. pre-sort according to def_type and operation. */
2185 for (unsigned lane = 0; lane < group_size; ++lane)
2186 chains[lane].stablesort (dt_sort_cmp, vinfo);
2187 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "pre-sorted chains of %s\n",
2191 get_tree_code_name (code));
2192 for (unsigned lane = 0; lane < group_size; ++lane)
2194 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2195 dump_printf (MSG_NOTE, "%s %T ",
2196 get_tree_code_name (chains[lane][opnum].code),
2197 chains[lane][opnum].op);
2198 dump_printf (MSG_NOTE, "\n");
2201 /* 2. try to build children nodes, associating as necessary. */
2202 for (unsigned n = 0; n < chain_len; ++n)
2204 vect_def_type dt = chains[0][n].dt;
2205 unsigned lane;
2206 for (lane = 0; lane < group_size; ++lane)
2207 if (chains[lane][n].dt != dt)
2209 if (dt == vect_constant_def
2210 && chains[lane][n].dt == vect_external_def)
2211 dt = vect_external_def;
2212 else if (dt == vect_external_def
2213 && chains[lane][n].dt == vect_constant_def)
2215 else
2216 break;
2218 if (lane != group_size)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "giving up on chain due to mismatched "
2223 "def types\n");
2224 matches[lane] = false;
2225 if (lane != group_size - 1)
2226 matches[0] = false;
2227 goto out;
2229 if (dt == vect_constant_def
2230 || dt == vect_external_def)
2232 /* Check whether we can build the invariant. If we can't
2233 we never will be able to. */
2234 tree type = TREE_TYPE (chains[0][n].op);
2235 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2236 && (TREE_CODE (type) == BOOLEAN_TYPE
2237 || !can_duplicate_and_interleave_p (vinfo, group_size,
2238 type)))
2240 matches[0] = false;
2241 goto out;
2243 vec<tree> ops;
2244 ops.create (group_size);
2245 for (lane = 0; lane < group_size; ++lane)
2246 ops.quick_push (chains[lane][n].op);
2247 slp_tree child = vect_create_new_slp_node (ops);
2248 SLP_TREE_DEF_TYPE (child) = dt;
2249 children.safe_push (child);
2251 else if (dt != vect_internal_def)
2253 /* Not sure, we might need something special.
2254 gcc.dg/vect/pr96854.c,
2255 gfortran.dg/vect/fast-math-pr37021.f90
2256 and gfortran.dg/vect/pr61171.f trigger. */
2257 /* Soft-fail for now. */
2258 hard_fail = false;
2259 goto out;
2261 else
2263 vec<stmt_vec_info> op_stmts;
2264 op_stmts.create (group_size);
2265 slp_tree child = NULL;
2266 /* Brute-force our way. We have to consider a lane
2267 failing after fixing an earlier fail up in the
2268 SLP discovery recursion. So track the current
2269 permute per lane. */
2270 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2271 memset (perms, 0, sizeof (unsigned) * group_size);
2274 op_stmts.truncate (0);
2275 for (lane = 0; lane < group_size; ++lane)
2276 op_stmts.quick_push
2277 (vinfo->lookup_def (chains[lane][n].op));
2278 child = vect_build_slp_tree (vinfo, op_stmts,
2279 group_size, &this_max_nunits,
2280 matches, limit,
2281 &this_tree_size, bst_map);
2282 /* ??? We're likely getting too many fatal mismatches
2283 here so maybe we want to ignore them (but then we
2284 have no idea which lanes fatally mismatched). */
2285 if (child || !matches[0])
2286 break;
2287 /* Swap another lane we have not yet matched up into
2288 lanes that did not match. If we run out of
2289 permute possibilities for a lane terminate the
2290 search. */
2291 bool term = false;
2292 for (lane = 1; lane < group_size; ++lane)
2293 if (!matches[lane])
2295 if (n + perms[lane] + 1 == chain_len)
2297 term = true;
2298 break;
2300 std::swap (chains[lane][n],
2301 chains[lane][n + perms[lane] + 1]);
2302 perms[lane]++;
2304 if (term)
2305 break;
2307 while (1);
2308 if (!child)
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "failed to match up op %d\n", n);
2313 op_stmts.release ();
2314 if (lane != group_size - 1)
2315 matches[0] = false;
2316 else
2317 matches[lane] = false;
2318 goto out;
2320 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "matched up op %d to\n", n);
2324 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2326 children.safe_push (child);
2329 /* 3. build SLP nodes to combine the chain. */
2330 for (unsigned lane = 0; lane < group_size; ++lane)
2331 if (chains[lane][0].code != code)
2333 /* See if there's any alternate all-PLUS entry. */
2334 unsigned n;
2335 for (n = 1; n < chain_len; ++n)
2337 for (lane = 0; lane < group_size; ++lane)
2338 if (chains[lane][n].code != code)
2339 break;
2340 if (lane == group_size)
2341 break;
2343 if (n != chain_len)
2345 /* Swap that in at first position. */
2346 std::swap (children[0], children[n]);
2347 for (lane = 0; lane < group_size; ++lane)
2348 std::swap (chains[lane][0], chains[lane][n]);
2350 else
2352 /* ??? When this triggers and we end up with two
2353 vect_constant/external_def up-front things break (ICE)
2354 spectacularly finding an insertion place for the
2355 all-constant op. We should have a fully
2356 vect_internal_def operand though(?) so we can swap
2357 that into first place and then prepend the all-zero
2358 constant. */
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location,
2361 "inserting constant zero to compensate "
2362 "for (partially) negated first "
2363 "operand\n");
2364 chain_len++;
2365 for (lane = 0; lane < group_size; ++lane)
2366 chains[lane].safe_insert
2367 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2368 vec<tree> zero_ops;
2369 zero_ops.create (group_size);
2370 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2371 for (lane = 1; lane < group_size; ++lane)
2372 zero_ops.quick_push (zero_ops[0]);
2373 slp_tree zero = vect_create_new_slp_node (zero_ops);
2374 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2375 children.safe_insert (0, zero);
2377 break;
2379 for (unsigned i = 1; i < children.length (); ++i)
2381 slp_tree op0 = children[i - 1];
2382 slp_tree op1 = children[i];
2383 bool this_two_op = false;
2384 for (unsigned lane = 0; lane < group_size; ++lane)
2385 if (chains[lane][i].code != chains[0][i].code)
2387 this_two_op = true;
2388 break;
2390 slp_tree child;
2391 if (i == children.length () - 1)
2392 child = vect_create_new_slp_node (node, stmts, 2);
2393 else
2394 child = vect_create_new_slp_node (2, ERROR_MARK);
2395 if (this_two_op)
2397 vec<std::pair<unsigned, unsigned> > lperm;
2398 lperm.create (group_size);
2399 for (unsigned lane = 0; lane < group_size; ++lane)
2400 lperm.quick_push (std::make_pair
2401 (chains[lane][i].code != chains[0][i].code, lane));
2402 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2403 (chains[0][i].code == code
2404 ? op_stmt_info
2405 : other_op_stmt_info),
2406 (chains[0][i].code == code
2407 ? other_op_stmt_info
2408 : op_stmt_info),
2409 lperm);
2411 else
2413 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2414 SLP_TREE_VECTYPE (child) = vectype;
2415 SLP_TREE_LANES (child) = group_size;
2416 SLP_TREE_CHILDREN (child).quick_push (op0);
2417 SLP_TREE_CHILDREN (child).quick_push (op1);
2418 SLP_TREE_REPRESENTATIVE (child)
2419 = (chains[0][i].code == code
2420 ? op_stmt_info : other_op_stmt_info);
2422 children[i] = child;
2424 *tree_size += this_tree_size + 1;
2425 *max_nunits = this_max_nunits;
2426 while (!chains.is_empty ())
2427 chains.pop ().release ();
2428 return node;
2430 out:
2431 while (!children.is_empty ())
2432 vect_free_slp_tree (children.pop ());
2433 while (!chains.is_empty ())
2434 chains.pop ().release ();
2435 /* Hard-fail, otherwise we might run into quadratic processing of the
2436 chains starting one stmt into the chain again. */
2437 if (hard_fail)
2438 return NULL;
2439 /* Fall thru to normal processing. */
2442 /* Get at the operands, verifying they are compatible. */
2443 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2444 slp_oprnd_info oprnd_info;
2445 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2447 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2448 stmts, i, &oprnds_info);
2449 if (res != 0)
2450 matches[(res == -1) ? 0 : i] = false;
2451 if (!matches[0])
2452 break;
2454 for (i = 0; i < group_size; ++i)
2455 if (!matches[i])
2457 vect_free_oprnd_info (oprnds_info);
2458 return NULL;
2460 swap = NULL;
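/* When both operands of a two_operators group are defined by the same set
of scalar stmts we can, below, use a single operand SLP node and encode the
lane selection with per-operand permutes instead. */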
2462 bool has_two_operators_perm = false;
2463 auto_vec<unsigned> two_op_perm_indices[2];
2464 vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2466 if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2468 unsigned idx = 0;
2469 hash_map<gimple *, unsigned> seen;
2470 vec<slp_oprnd_info> new_oprnds_info
2471 = vect_create_oprnd_info (1, group_size);
2472 bool success = true;
2474 enum tree_code code = ERROR_MARK;
2475 if (oprnds_info[0]->def_stmts[0]
2476 && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2477 code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2479 for (unsigned j = 0; j < group_size; ++j)
2481 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2483 stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2484 if (!stmt_info || !stmt_info->stmt
2485 || !is_a<gassign *> (stmt_info->stmt)
2486 || gimple_assign_rhs_code (stmt_info->stmt) != code
2487 || skip_args[i])
2489 success = false;
2490 break;
2493 bool exists;
2494 unsigned &stmt_idx
2495 = seen.get_or_insert (stmt_info->stmt, &exists);
2497 if (!exists)
2499 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2500 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2501 stmt_idx = idx;
2502 idx++;
2505 two_op_perm_indices[i].safe_push (stmt_idx);
2508 if (!success)
2509 break;
2512 if (success && idx == group_size)
2514 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location,
2517 "Replace two_operators operands:\n");
2519 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "Operand %u:\n", i);
2523 for (unsigned j = 0; j < group_size; j++)
2524 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2525 j, oprnd_info->def_stmts[j]->stmt);
2528 dump_printf_loc (MSG_NOTE, vect_location,
2529 "With a single operand:\n");
2530 for (unsigned j = 0; j < group_size; j++)
2531 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2532 j, new_oprnds_info[0]->def_stmts[j]->stmt);
2535 two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2536 two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2538 new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2539 new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2540 new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2541 new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2542 new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2544 vect_free_oprnd_info (oprnds_info);
2545 oprnds_info = new_oprnds_info;
2546 nops = 1;
2547 has_two_operators_perm = true;
2551 auto_vec<slp_tree, 4> children;
2553 stmt_info = stmts[0];
2555 /* Create SLP_TREE nodes for the definition node/s. */
2556 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2558 slp_tree child = nullptr;
2559 unsigned int j;
2561 /* We're skipping certain operands from processing, for example
2562 outer loop reduction initial defs. */
2563 if (skip_args[i])
2565 children.safe_push (NULL);
2566 continue;
2569 if (oprnd_info->first_dt == vect_uninitialized_def)
2571 /* COND_EXPRs end up with one operand too many if the condition
2572 is an SSA name. */
2573 gcc_assert (i == 3 && nops == 4);
2574 continue;
2577 if (is_a <bb_vec_info> (vinfo)
2578 && oprnd_info->first_dt == vect_internal_def
2579 && !oprnd_info->any_pattern)
2581 /* For BB vectorization, if all defs are the same do not
2582 bother to continue the build along the single-lane
2583 graph but use a splat of the scalar value. */
2584 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2585 for (j = 1; j < group_size; ++j)
2586 if (oprnd_info->def_stmts[j] != first_def)
2587 break;
2588 if (j == group_size
2589 /* But avoid doing this for loads where we may be
2590 able to CSE things, unless the stmt is not
2591 vectorizable. */
2592 && (!STMT_VINFO_VECTORIZABLE (first_def)
2593 || !gimple_vuse (first_def->stmt)))
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE, vect_location,
2597 "Using a splat of the uniform operand %G",
2598 first_def->stmt);
2599 oprnd_info->first_dt = vect_external_def;
2603 if (oprnd_info->first_dt == vect_external_def
2604 || oprnd_info->first_dt == vect_constant_def)
2606 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2608 tree op0;
2609 tree uniform_val = op0 = oprnd_info->ops[0];
2610 for (j = 1; j < oprnd_info->ops.length (); ++j)
2611 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2613 uniform_val = NULL_TREE;
2614 break;
2616 if (!uniform_val
2617 && !can_duplicate_and_interleave_p (vinfo,
2618 oprnd_info->ops.length (),
2619 TREE_TYPE (op0)))
2621 matches[j] = false;
2622 if (dump_enabled_p ())
2623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624 "Build SLP failed: invalid type of def "
2625 "for variable-length SLP %T\n", op0);
2626 goto fail;
2629 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2630 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2631 oprnd_info->ops = vNULL;
2632 children.safe_push (invnode);
2633 continue;
2636 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2637 group_size, &this_max_nunits,
2638 matches, limit,
2639 &this_tree_size, bst_map)) != NULL)
2641 oprnd_info->def_stmts = vNULL;
2642 children.safe_push (child);
2643 continue;
2646 /* If the SLP build for operand zero failed and operand zero
2647 and one can be commuted, try that for the scalar stmts
2648 that failed the match. */
2649 if (i == 0
2650 /* A first scalar stmt mismatch signals a fatal mismatch. */
2651 && matches[0]
2652 /* ??? For COND_EXPRs we can swap the comparison operands
2653 as well as the arms under some constraints. */
2654 && nops == 2
2655 && oprnds_info[1]->first_dt == vect_internal_def
2656 && is_gimple_assign (stmt_info->stmt)
2657 /* Swapping operands for reductions breaks assumptions later on. */
2658 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2660 /* See whether we can swap the matching or the non-matching
2661 stmt operands. */
2662 bool swap_not_matching = true;
2665 for (j = 0; j < group_size; ++j)
2667 if (matches[j] != !swap_not_matching)
2668 continue;
2669 stmt_vec_info stmt_info = stmts[j];
2670 /* Verify if we can swap operands of this stmt. */
2671 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2672 if (!stmt
2673 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2675 if (!swap_not_matching)
2676 goto fail;
2677 swap_not_matching = false;
2678 break;
2682 while (j != group_size);
2684 /* Swap mismatched definition stmts. */
2685 if (dump_enabled_p ())
2686 dump_printf_loc (MSG_NOTE, vect_location,
2687 "Re-trying with swapped operands of stmts ");
2688 for (j = 0; j < group_size; ++j)
2689 if (matches[j] == !swap_not_matching)
2691 std::swap (oprnds_info[0]->def_stmts[j],
2692 oprnds_info[1]->def_stmts[j]);
2693 std::swap (oprnds_info[0]->ops[j],
2694 oprnds_info[1]->ops[j]);
2695 if (dump_enabled_p ())
2696 dump_printf (MSG_NOTE, "%d ", j);
2698 if (dump_enabled_p ())
2699 dump_printf (MSG_NOTE, "\n");
2700 /* After swapping some operands we lost track whether an
2701 operand has any pattern defs so be conservative here. */
2702 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2703 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2704 /* And try again with scratch 'matches' ... */
2705 bool *tem = XALLOCAVEC (bool, group_size);
2706 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2707 group_size, &this_max_nunits,
2708 tem, limit,
2709 &this_tree_size, bst_map)) != NULL)
2711 oprnd_info->def_stmts = vNULL;
2712 children.safe_push (child);
2713 continue;
2716 fail:
2718 /* If the SLP build failed and we analyze a basic-block
2719 simply treat nodes we fail to build as externally defined
2720 (and thus build vectors from the scalar defs).
2721 The cost model will reject outright expensive cases.
2722 ??? This doesn't treat cases where permutation ultimately
2723 fails (or we don't try permutation below). Ideally we'd
2724 even compute a permutation that will end up with the maximum
2725 SLP tree size... */
2726 if (is_a <bb_vec_info> (vinfo)
2727 /* ??? Rejecting patterns this way doesn't work. We'd have to
2728 do extra work to cancel the pattern so the uses see the
2729 scalar version. */
2730 && !is_pattern_stmt_p (stmt_info)
2731 && !oprnd_info->any_pattern)
2733 /* But if there's a leading vector sized set of matching stmts
2734 fail here so we can split the group. This matches the condition
2735 vect_analyze_slp_instance uses. */
2736 /* ??? We might want to split here and combine the results to support
2737 multiple vector sizes better. */
2738 for (j = 0; j < group_size; ++j)
2739 if (!matches[j])
2740 break;
2741 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "Building vector operands from scalars\n");
2746 this_tree_size++;
2747 child = vect_create_new_slp_node (oprnd_info->ops);
2748 children.safe_push (child);
2749 oprnd_info->ops = vNULL;
2750 continue;
2754 gcc_assert (child == NULL);
2755 FOR_EACH_VEC_ELT (children, j, child)
2756 if (child)
2757 vect_free_slp_tree (child);
2758 vect_free_oprnd_info (oprnds_info);
2759 return NULL;
2762 vect_free_oprnd_info (oprnds_info);
2764 /* If all children of a node are built up from uniform scalars, or if
2765 building the node requires more than one possibly expensive vector
2766 construction, throw the node away and cause it to be built up from
2767 scalars instead. The exception is the SLP node for the vector store. */
2768 if (is_a <bb_vec_info> (vinfo)
2769 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2770 /* ??? Rejecting patterns this way doesn't work. We'd have to
2771 do extra work to cancel the pattern so the uses see the
2772 scalar version. */
2773 && !is_pattern_stmt_p (stmt_info))
2775 slp_tree child;
2776 unsigned j;
2777 bool all_uniform_p = true;
2778 unsigned n_vector_builds = 0;
2779 FOR_EACH_VEC_ELT (children, j, child)
2781 if (!child)
2783 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2784 all_uniform_p = false;
2785 else if (!vect_slp_tree_uniform_p (child))
2787 all_uniform_p = false;
2788 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2789 n_vector_builds++;
2792 if (all_uniform_p
2793 || n_vector_builds > 1
2794 || (n_vector_builds == children.length ()
2795 && is_a <gphi *> (stmt_info->stmt)))
2797 /* Roll back. */
2798 matches[0] = false;
2799 FOR_EACH_VEC_ELT (children, j, child)
2800 if (child)
2801 vect_free_slp_tree (child);
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_NOTE, vect_location,
2805 "Building parent vector operands from "
2806 "scalars instead\n");
2807 return NULL;
2811 *tree_size += this_tree_size + 1;
2812 *max_nunits = this_max_nunits;
2814 if (two_operators)
2816 /* ??? We'd likely want to either cache in bst_map something like
2817 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2818 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2819 explicit stmts to put in so the keying on 'stmts' doesn't
2820 work (but we have the same issue with nodes that use 'ops'). */
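/* For instance the group { a0+b0, a1-b1, a2+b2, a3-b3 } is built as one
node computing all PLUS lanes, one computing all MINUS lanes and a
VEC_PERM_EXPR node selecting, per lane, the result of the matching one. */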
2822 if (has_two_operators_perm)
2824 slp_tree child = children[0];
2825 children.truncate (0);
2826 for (i = 0; i < 2; i++)
2828 slp_tree pnode
2829 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
2830 SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
2831 SLP_TREE_VECTYPE (pnode) = vectype;
2832 SLP_TREE_CHILDREN (pnode).quick_push (child);
2833 SLP_TREE_CHILDREN (pnode).quick_push (child);
2834 lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
2835 children.safe_push (pnode);
2837 for (unsigned j = 0; j < stmts.length (); j++)
2838 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
2841 SLP_TREE_REF_COUNT (child) += 4;
2844 slp_tree one = new _slp_tree;
2845 slp_tree two = new _slp_tree;
2846 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2847 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2848 SLP_TREE_VECTYPE (one) = vectype;
2849 SLP_TREE_VECTYPE (two) = vectype;
2850 SLP_TREE_CHILDREN (one).safe_splice (children);
2851 SLP_TREE_CHILDREN (two).safe_splice (children);
2852 slp_tree child;
2853 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2854 SLP_TREE_REF_COUNT (child)++;
2856 /* Here we record the original defs since this
2857 node represents the final lane configuration. */
2858 node = vect_create_new_slp_node (node, stmts, 2);
2859 SLP_TREE_VECTYPE (node) = vectype;
2860 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2861 SLP_TREE_CHILDREN (node).quick_push (one);
2862 SLP_TREE_CHILDREN (node).quick_push (two);
2863 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2864 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2865 enum tree_code ocode = ERROR_MARK;
2866 stmt_vec_info ostmt_info;
2867 unsigned j = 0;
2868 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2870 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2871 if (gimple_assign_rhs_code (ostmt) != code0)
2873 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2874 ocode = gimple_assign_rhs_code (ostmt);
2875 j = i;
2877 else
2878 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2881 SLP_TREE_CODE (one) = code0;
2882 SLP_TREE_CODE (two) = ocode;
2883 SLP_TREE_LANES (one) = stmts.length ();
2884 SLP_TREE_LANES (two) = stmts.length ();
2885 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2886 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2888 return node;
2891 node = vect_create_new_slp_node (node, stmts, nops);
2892 SLP_TREE_VECTYPE (node) = vectype;
2893 SLP_TREE_CHILDREN (node).splice (children);
2894 return node;
2897 /* Dump a single SLP tree NODE. */
2899 static void
2900 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2901 slp_tree node)
2903 unsigned i, j;
2904 slp_tree child;
2905 stmt_vec_info stmt_info;
2906 tree op;
2908 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2909 dump_user_location_t user_loc = loc.get_user_location ();
2910 dump_printf_loc (metadata, user_loc,
2911 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2912 ", refcnt=%u)",
2913 SLP_TREE_DEF_TYPE (node) == vect_external_def
2914 ? " (external)"
2915 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2916 ? " (constant)"
2917 : ""), (void *) node,
2918 estimated_poly_value (node->max_nunits),
2919 SLP_TREE_REF_COUNT (node));
2920 if (SLP_TREE_VECTYPE (node))
2921 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2922 dump_printf (metadata, "\n");
2923 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2925 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2926 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2927 else
2928 dump_printf_loc (metadata, user_loc, "op template: %G",
2929 SLP_TREE_REPRESENTATIVE (node)->stmt);
2931 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2932 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2933 if (stmt_info)
2934 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
2935 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
2936 i, stmt_info->stmt);
2937 else
2938 dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
2939 else
2941 dump_printf_loc (metadata, user_loc, "\t{ ");
2942 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2943 dump_printf (metadata, "%T%s ", op,
2944 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2945 dump_printf (metadata, "}\n");
2947 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2949 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2950 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2951 dump_printf (dump_kind, " %u", j);
2952 dump_printf (dump_kind, " }\n");
2954 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2956 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2957 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2958 dump_printf (dump_kind, " %u[%u]",
2959 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2960 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2961 dump_printf (dump_kind, " }\n");
2963 if (SLP_TREE_CHILDREN (node).is_empty ())
2964 return;
2965 dump_printf_loc (metadata, user_loc, "\tchildren");
2966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2967 dump_printf (dump_kind, " %p", (void *)child);
2968 dump_printf (dump_kind, "\n");
2971 DEBUG_FUNCTION void
2972 debug (slp_tree node)
2974 debug_dump_context ctx;
2975 vect_print_slp_tree (MSG_NOTE,
2976 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2977 node);
2980 /* Recursive helper for the dot producer below. */
2982 static void
2983 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2985 if (visited.add (node))
2986 return;
2988 fprintf (f, "\"%p\" [label=\"", (void *)node);
2989 vect_print_slp_tree (MSG_NOTE,
2990 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2991 node);
2992 fprintf (f, "\"];\n");
2995 for (slp_tree child : SLP_TREE_CHILDREN (node))
2996 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2998 for (slp_tree child : SLP_TREE_CHILDREN (node))
2999 if (child)
3000 dot_slp_tree (f, child, visited);
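/* Write the SLP graph rooted at NODE in graphviz dot format to the file
FNAME; the output can be rendered with, for example, "dot -Tpdf". */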
3003 DEBUG_FUNCTION void
3004 dot_slp_tree (const char *fname, slp_tree node)
3006 FILE *f = fopen (fname, "w");
3007 fprintf (f, "digraph {\n");
3008 fflush (f);
3010 debug_dump_context ctx (f);
3011 hash_set<slp_tree> visited;
3012 dot_slp_tree (f, node, visited);
3014 fflush (f);
3015 fprintf (f, "}\n");
3016 fclose (f);
3019 DEBUG_FUNCTION void
3020 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3022 FILE *f = fopen (fname, "w");
3023 fprintf (f, "digraph {\n");
3024 fflush (f);
3026 debug_dump_context ctx (f);
3027 hash_set<slp_tree> visited;
3028 for (auto inst : slp_instances)
3029 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3031 fflush (f);
3032 fprintf (f, "}\n");
3033 fclose (f);
3036 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3038 static void
3039 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3040 slp_tree node, hash_set<slp_tree> &visited)
3042 unsigned i;
3043 slp_tree child;
3045 if (visited.add (node))
3046 return;
3048 vect_print_slp_tree (dump_kind, loc, node);
3050 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3051 if (child)
3052 vect_print_slp_graph (dump_kind, loc, child, visited);
3055 static void
3056 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3057 slp_tree entry)
3059 hash_set<slp_tree> visited;
3060 vect_print_slp_graph (dump_kind, loc, entry, visited);
3063 DEBUG_FUNCTION void
3064 debug (slp_instance instance)
3066 debug_dump_context ctx;
3067 vect_print_slp_graph (MSG_NOTE,
3068 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3069 SLP_INSTANCE_TREE (instance));
3072 /* Mark the tree rooted at NODE with PURE_SLP. */
3074 static void
3075 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
3077 int i;
3078 stmt_vec_info stmt_info;
3079 slp_tree child;
3081 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3082 return;
3084 if (visited.add (node))
3085 return;
3087 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3088 if (stmt_info)
3089 STMT_SLP_TYPE (stmt_info) = pure_slp;
3091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3092 if (child)
3093 vect_mark_slp_stmts (child, visited);
3096 static void
3097 vect_mark_slp_stmts (slp_tree node)
3099 hash_set<slp_tree> visited;
3100 vect_mark_slp_stmts (node, visited);
3103 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3105 static void
3106 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3108 int i;
3109 stmt_vec_info stmt_info;
3110 slp_tree child;
3112 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3113 return;
3115 if (visited.add (node))
3116 return;
3118 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3119 if (stmt_info)
3121 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3122 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3123 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3126 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3127 if (child)
3128 vect_mark_slp_stmts_relevant (child, visited);
3131 static void
3132 vect_mark_slp_stmts_relevant (slp_tree node)
3134 hash_set<slp_tree> visited;
3135 vect_mark_slp_stmts_relevant (node, visited);
3139 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
3141 static void
3142 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3143 hash_set<slp_tree> &visited)
3145 if (!node || visited.add (node))
3146 return;
3148 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3149 return;
3151 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3153 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3154 if (STMT_VINFO_DATA_REF (stmt_info)
3155 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3156 loads.safe_push (node);
3159 unsigned i;
3160 slp_tree child;
3161 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3162 vect_gather_slp_loads (loads, child, visited);
3166 /* Find the last scalar stmt in NODE. */
3168 stmt_vec_info
3169 vect_find_last_scalar_stmt_in_slp (slp_tree node)
3171 stmt_vec_info last = NULL;
3172 stmt_vec_info stmt_vinfo;
3174 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3175 if (stmt_vinfo)
3177 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3178 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3181 return last;
3184 /* Find the first stmt in NODE. */
3186 stmt_vec_info
3187 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3189 stmt_vec_info first = NULL;
3190 stmt_vec_info stmt_vinfo;
3192 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3193 if (stmt_vinfo)
3195 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3196 if (!first
3197 || get_later_stmt (stmt_vinfo, first) == first)
3198 first = stmt_vinfo;
3201 return first;
3204 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3205 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3206 (also containing the first GROUP1_SIZE stmts, since stores are
3207 consecutive), the second containing the remainder.
3208 Return the first stmt in the second group. */
3210 static stmt_vec_info
3211 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3213 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3214 gcc_assert (group1_size > 0);
3215 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3216 gcc_assert (group2_size > 0);
3217 DR_GROUP_SIZE (first_vinfo) = group1_size;
3219 stmt_vec_info stmt_info = first_vinfo;
3220 for (unsigned i = group1_size; i > 1; i--)
3222 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3223 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3225 /* STMT is now the last element of the first group. */
3226 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3227 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3229 DR_GROUP_SIZE (group2) = group2_size;
3230 for (stmt_info = group2; stmt_info;
3231 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3233 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3234 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3237 /* For the second group, the DR_GROUP_GAP is that before the original group,
3238 plus skipping over the first vector. */
3239 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3241 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3242 DR_GROUP_GAP (first_vinfo) += group2_size;
3244 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3246 group1_size, group2_size);
3248 return group2;
3251 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3252 statements and a vector of NUNITS elements. */
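/* For example, a group of three stores vectorized with four-element vectors
gets an unrolling factor of common_multiple (4, 3) / 3 = 4. */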
3254 static poly_uint64
3255 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3257 return exact_div (common_multiple (nunits, group_size), group_size);
3260 /* Helper that checks to see if a node is a load node. */
3262 static inline bool
3263 vect_is_slp_load_node (slp_tree root)
3265 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3266 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3267 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3271 /* Helper function of optimize_load_redistribution that performs the operation
3272 recursively. */
3274 static slp_tree
3275 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3276 vec_info *vinfo, unsigned int group_size,
3277 hash_map<slp_tree, slp_tree> *load_map,
3278 slp_tree root)
3280 if (slp_tree *leader = load_map->get (root))
3281 return *leader;
3283 slp_tree node;
3284 unsigned i;
3286 /* For now, we don't know anything about externals so do not do anything. */
3287 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3288 return NULL;
3289 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3291 /* First convert this node into a load node and add it to the leaves
3292 list and flatten the permute from a lane to a load one. If it's
3293 unneeded it will be elided later. */
3294 vec<stmt_vec_info> stmts;
3295 stmts.create (SLP_TREE_LANES (root));
3296 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3297 for (unsigned j = 0; j < lane_perm.length (); j++)
3299 std::pair<unsigned, unsigned> perm = lane_perm[j];
3300 node = SLP_TREE_CHILDREN (root)[perm.first];
3302 if (!vect_is_slp_load_node (node)
3303 || SLP_TREE_CHILDREN (node).exists ())
3305 stmts.release ();
3306 goto next;
3309 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "converting stmts on permute node %p\n",
3315 (void *) root);
3317 bool *matches = XALLOCAVEC (bool, group_size);
3318 poly_uint64 max_nunits = 1;
3319 unsigned tree_size = 0, limit = 1;
3320 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3321 matches, &limit, &tree_size, bst_map);
3322 if (!node)
3323 stmts.release ();
3325 load_map->put (root, node);
3326 return node;
3329 next:
3330 load_map->put (root, NULL);
3332 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3334 slp_tree value
3335 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3336 node);
3337 if (value)
3339 SLP_TREE_REF_COUNT (value)++;
3340 SLP_TREE_CHILDREN (root)[i] = value;
3341 /* ??? We know the original leaves of the replaced nodes will
3342 be referenced by bst_map, only the permutes created by
3343 pattern matching are not. */
3344 if (SLP_TREE_REF_COUNT (node) == 1)
3345 load_map->remove (node);
3346 vect_free_slp_tree (node);
3350 return NULL;
3353 /* Temporary workaround for loads not being CSEd during SLP build. This
3354 function will traverse the SLP tree rooted in ROOT and find VEC_PERM
3355 nodes that blend vectors from multiple nodes that all read from the
3356 same DR such that the final operation is equal to a permuted load. Such
3357 nodes are then directly converted into load nodes themselves. The nodes are
3358 CSEd using BST_MAP. */
3360 static void
3361 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3362 vec_info *vinfo, unsigned int group_size,
3363 hash_map<slp_tree, slp_tree> *load_map,
3364 slp_tree root)
3366 slp_tree node;
3367 unsigned i;
3369 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3371 slp_tree value
3372 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3373 node);
3374 if (value)
3376 SLP_TREE_REF_COUNT (value)++;
3377 SLP_TREE_CHILDREN (root)[i] = value;
3378 /* ??? We know the original leaves of the replaced nodes will
3379 be referenced by bst_map, only the permutes created by
3380 pattern matching are not. */
3381 if (SLP_TREE_REF_COUNT (node) == 1)
3382 load_map->remove (node);
3383 vect_free_slp_tree (node);
3388 /* Helper function of vect_match_slp_patterns.
3390 Attempts to match patterns against the slp tree rooted in REF_NODE using
3391 VINFO. Patterns are matched in post-order traversal.
3393 If matching is successful the node referenced by REF_NODE is replaced in
3394 place and true is returned, otherwise false is returned. */
3396 static bool
3397 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3398 slp_tree_to_load_perm_map_t *perm_cache,
3399 slp_compat_nodes_map_t *compat_cache,
3400 hash_set<slp_tree> *visited)
3402 unsigned i;
3403 slp_tree node = *ref_node;
3404 bool found_p = false;
3405 if (!node || visited->add (node))
3406 return false;
3408 slp_tree child;
3409 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3410 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3411 vinfo, perm_cache, compat_cache,
3412 visited);
3414 for (unsigned x = 0; x < num__slp_patterns; x++)
3416 vect_pattern *pattern
3417 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3418 if (pattern)
3420 pattern->build (vinfo);
3421 delete pattern;
3422 found_p = true;
3426 return found_p;
3429 /* Applies pattern matching to the SLP tree of INSTANCE using
3430 vec_info VINFO.
3432 Returns true if any pattern matched; the instance's SLP tree is then
3433 modified in place. Patterns are tried in order and multiple may match. */
3435 static bool
3436 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3437 hash_set<slp_tree> *visited,
3438 slp_tree_to_load_perm_map_t *perm_cache,
3439 slp_compat_nodes_map_t *compat_cache)
3441 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3442 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3444 if (dump_enabled_p ())
3445 dump_printf_loc (MSG_NOTE, vect_location,
3446 "Analyzing SLP tree %p for patterns\n",
3447 (void *) SLP_INSTANCE_TREE (instance));
3449 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3450 visited);
3453 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3454 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3455 Return true if we could use IFN_STORE_LANES instead and if that appears
3456 to be the better approach. */
3458 static bool
3459 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3460 unsigned int group_size,
3461 unsigned int new_group_size)
3463 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3464 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3465 if (!vectype)
3466 return false;
3467 /* Allow the split if one of the two new groups would operate on full
3468 vectors *within* rather than across one scalar loop iteration.
3469 This is purely a heuristic, but it should work well for group
3470 sizes of 3 and 4, where the possible splits are:
3472 3->2+1: OK if the vector has exactly two elements
3473 4->2+2: Likewise
3474 4->3+1: Less clear-cut. */
3475 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3476 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3477 return false;
3478 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3481 /* Analyze an SLP instance starting from a group of grouped stores. Call
3482 vect_build_slp_tree to build a tree of packed stmts if possible.
3483 Return FALSE if it's impossible to SLP any stmt in the loop. */
3485 static bool
3486 vect_analyze_slp_instance (vec_info *vinfo,
3487 scalar_stmts_to_slp_tree_map_t *bst_map,
3488 stmt_vec_info stmt_info, slp_instance_kind kind,
3489 unsigned max_tree_size, unsigned *limit,
3490 bool force_single_lane = false);
3492 /* Build an interleaving scheme for the store sources RHS_NODES from
3493 SCALAR_STMTS. */
3495 static slp_tree
3496 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3497 vec<stmt_vec_info> &scalar_stmts)
3499 unsigned int group_size = scalar_stmts.length ();
3500 slp_tree node = vect_create_new_slp_node (scalar_stmts,
3501 SLP_TREE_CHILDREN
3502 (rhs_nodes[0]).length ());
3503 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3504 for (unsigned l = 0;
3505 l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3507 /* And a permute merging all RHS SLP trees. */
3508 slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3509 VEC_PERM_EXPR);
3510 SLP_TREE_CHILDREN (node).quick_push (perm);
3511 SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3512 SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3513 SLP_TREE_LANES (perm) = group_size;
3514 /* ??? We should set this NULL but that's not expected. */
3515 SLP_TREE_REPRESENTATIVE (perm)
3516 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3517 for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3519 SLP_TREE_CHILDREN (perm)
3520 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3521 SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3522 for (unsigned k = 0;
3523 k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3525 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3526 or SLP_TREE_SCALAR_OPS but then we might have
3527 a mix of both in our children. */
3528 SLP_TREE_LANE_PERMUTATION (perm)
3529 .quick_push (std::make_pair (j, k));
3533 /* Now we have a single permute node but we cannot code-generate
3534 the case with more than two inputs.
3535 Perform pairwise reduction, reducing the two inputs
3536 with the least number of lanes to one and then repeat until
3537 we end up with two inputs. That scheme makes sure we end
3538 up with permutes satisfying the restriction of requiring at
3539 most two vector inputs to produce a single vector output
3540 when the number of lanes is even. */
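/* For example, with inputs of two, two and four lanes the two two-lane
nodes are merged into a four-lane node first, leaving the final permute
with just two four-lane inputs. */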
3541 while (SLP_TREE_CHILDREN (perm).length () > 2)
3543 /* When we have three equal sized groups left the pairwise
3544 reduction does not result in a scheme that avoids using
3545 three vectors. Instead merge the first two groups
3546 to the final size with do-not-care elements (chosen
3547 from the first group) and then merge with the third.
3548 { A0, B0, x, A1, B1, x, ... }
3549 -> { A0, B0, C0, A1, B1, C1, ... }
3550 This handles group size of three (and at least
3551 power-of-two multiples of that). */
3552 if (SLP_TREE_CHILDREN (perm).length () == 3
3553 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3554 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3555 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3556 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3558 int ai = 0;
3559 int bi = 1;
3560 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3561 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3562 unsigned n = SLP_TREE_LANES (perm);
3564 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3565 SLP_TREE_LANES (permab) = n;
3566 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3567 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3568 /* ??? Should be NULL but that's not expected. */
3569 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3570 SLP_TREE_CHILDREN (permab).quick_push (a);
3571 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3572 SLP_TREE_LANE_PERMUTATION (permab)
3573 .quick_push (std::make_pair (0, k));
3574 SLP_TREE_CHILDREN (permab).quick_push (b);
3575 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3576 SLP_TREE_LANE_PERMUTATION (permab)
3577 .quick_push (std::make_pair (1, k));
3578 /* Push the do-not-care lanes. */
3579 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3580 SLP_TREE_LANE_PERMUTATION (permab)
3581 .quick_push (std::make_pair (0, k));
3583 /* Put the merged node into 'perm', in place of a. */
3584 SLP_TREE_CHILDREN (perm)[ai] = permab;
3585 /* Adjust the references to b in the permutation
3586 of perm and to the later children which we'll
3587 remove. */
3588 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3590 std::pair<unsigned, unsigned> &p
3591 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3592 if (p.first == (unsigned) bi)
3594 p.first = ai;
3595 p.second += SLP_TREE_LANES (a);
3597 else if (p.first > (unsigned) bi)
3598 p.first--;
3600 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3601 break;
3604 /* Pick the two nodes with the least number of lanes,
3605 prefer the earliest candidate and maintain ai < bi. */
3606 int ai = -1;
3607 int bi = -1;
3608 for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3610 if (ai == -1)
3611 ai = ci;
3612 else if (bi == -1)
3613 bi = ci;
3614 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3615 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3616 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3617 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3619 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3620 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3621 bi = ci;
3622 else
3624 ai = bi;
3625 bi = ci;
3630 /* Produce a merge of nodes ai and bi. */
3631 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3632 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3633 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3634 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3635 SLP_TREE_LANES (permab) = n;
3636 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3637 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3638 /* ??? Should be NULL but that's not expected. */
3639 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3640 SLP_TREE_CHILDREN (permab).quick_push (a);
3641 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3642 SLP_TREE_LANE_PERMUTATION (permab)
3643 .quick_push (std::make_pair (0, k));
3644 SLP_TREE_CHILDREN (permab).quick_push (b);
3645 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3646 SLP_TREE_LANE_PERMUTATION (permab)
3647 .quick_push (std::make_pair (1, k));
3649 /* Put the merged node into 'perm', in place of a. */
3650 SLP_TREE_CHILDREN (perm)[ai] = permab;
3651 /* Adjust the references to b in the permutation
3652 of perm and to the later children which we'll
3653 remove. */
3654 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3656 std::pair<unsigned, unsigned> &p
3657 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3658 if (p.first == (unsigned) bi)
3660 p.first = ai;
3661 p.second += SLP_TREE_LANES (a);
3663 else if (p.first > (unsigned) bi)
3664 p.first--;
3666 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3670 return node;
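/* A minimal standalone sketch (using std::vector for the children's lane
   counts and made-up inputs; it does not use the vectorizer's slp_tree or
   vec types) of the merge order chosen above: repeatedly pick the two
   children with the fewest lanes, keep ai < bi, put the merged two-input
   permute in a's slot and drop b, until a single child remains.

     #include <cstdio>
     #include <vector>

     int main ()
     {
       // Lane counts of the children, e.g. four single-lane RHS nodes
       // and one two-lane RHS subtree.
       std::vector<unsigned> lanes = { 1, 1, 2, 1, 1 };

       while (lanes.size () > 1)
         {
           // Pick the two children with the least number of lanes,
           // preferring earlier candidates and maintaining ai < bi.
           int ai = -1, bi = -1;
           for (unsigned ci = 0; ci < lanes.size (); ++ci)
             {
               if (ai == -1)
                 ai = ci;
               else if (bi == -1)
                 bi = ci;
               else if (lanes[ci] < lanes[ai] || lanes[ci] < lanes[bi])
                 {
                   if (lanes[ai] <= lanes[bi])
                     bi = ci;
                   else
                     {
                       ai = bi;
                       bi = ci;
                     }
                 }
             }
           std::printf ("merge child %d (%u lanes) with child %d (%u lanes)\n",
                        ai, lanes[ai], bi, lanes[bi]);
           // The merged node takes a's place; b is removed.
           lanes[ai] += lanes[bi];
           lanes.erase (lanes.begin () + bi);
         }
       return 0;
     }  */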
3673 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3674 of KIND. Return true if successful. */
3676 static bool
3677 vect_build_slp_instance (vec_info *vinfo,
3678 slp_instance_kind kind,
3679 vec<stmt_vec_info> &scalar_stmts,
3680 vec<stmt_vec_info> &root_stmt_infos,
3681 vec<tree> &remain,
3682 unsigned max_tree_size, unsigned *limit,
3683 scalar_stmts_to_slp_tree_map_t *bst_map,
3684 /* ??? We need stmt_info for group splitting. */
3685 stmt_vec_info stmt_info_,
3686 bool force_single_lane = false)
3688 /* If there's no budget left bail out early. */
3689 if (*limit == 0)
3690 return false;
3692 if (kind == slp_inst_kind_ctor)
3694 if (dump_enabled_p ())
3695 dump_printf_loc (MSG_NOTE, vect_location,
3696 "Analyzing vectorizable constructor: %G\n",
3697 root_stmt_infos[0]->stmt);
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "Starting SLP discovery for\n");
3704 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3705 dump_printf_loc (MSG_NOTE, vect_location,
3706 " %G", scalar_stmts[i]->stmt);
3709 /* Build the tree for the SLP instance. */
3710 unsigned int group_size = scalar_stmts.length ();
3711 bool *matches = XALLOCAVEC (bool, group_size);
3712 poly_uint64 max_nunits = 1;
3713 unsigned tree_size = 0;
3714 unsigned i;
3716 slp_tree node = NULL;
3717 if (force_single_lane)
3719 matches[0] = true;
3720 matches[1] = false;
3722 else
3723 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3724 &max_nunits, matches, limit,
3725 &tree_size, bst_map);
3726 if (node != NULL)
3728 /* Calculate the unrolling factor based on the smallest type. */
3729 poly_uint64 unrolling_factor
3730 = calculate_unrolling_factor (max_nunits, group_size);
3732 if (maybe_ne (unrolling_factor, 1U)
3733 && is_a <bb_vec_info> (vinfo))
3735 unsigned HOST_WIDE_INT const_max_nunits;
3736 if (!max_nunits.is_constant (&const_max_nunits)
3737 || const_max_nunits > group_size)
3739 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741 "Build SLP failed: store group "
3742 "size not a multiple of the vector size "
3743 "in basic block SLP\n");
3744 vect_free_slp_tree (node);
3745 return false;
3747 /* Fatal mismatch. */
3748 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_NOTE, vect_location,
3750 "SLP discovery succeeded but node needs "
3751 "splitting\n");
3752 memset (matches, true, group_size);
3753 matches[group_size / const_max_nunits * const_max_nunits] = false;
3754 vect_free_slp_tree (node);
3756 else
3758 /* Create a new SLP instance. */
3759 slp_instance new_instance = XNEW (class _slp_instance);
3760 SLP_INSTANCE_TREE (new_instance) = node;
3761 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3762 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3763 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3764 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3765 SLP_INSTANCE_KIND (new_instance) = kind;
3766 new_instance->reduc_phis = NULL;
3767 new_instance->cost_vec = vNULL;
3768 new_instance->subgraph_entries = vNULL;
3770 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 "SLP size %u vs. limit %u.\n",
3773 tree_size, max_tree_size);
3775 /* Fixup SLP reduction chains. */
3776 if (kind == slp_inst_kind_reduc_chain)
3778 /* If this is a reduction chain with a conversion in front
3779 amend the SLP tree with a node for that. */
3780 gimple *scalar_def
3781 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3782 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3784 /* Get at the conversion stmt - we know it's the single use
3785 of the last stmt of the reduction chain. */
3786 use_operand_p use_p;
3787 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3788 &use_p, &scalar_def);
3789 gcc_assert (r);
3790 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3791 next_info = vect_stmt_to_vectorize (next_info);
3792 scalar_stmts = vNULL;
3793 scalar_stmts.create (group_size);
3794 for (unsigned i = 0; i < group_size; ++i)
3795 scalar_stmts.quick_push (next_info);
3796 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3797 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3798 SLP_TREE_CHILDREN (conv).quick_push (node);
3799 SLP_INSTANCE_TREE (new_instance) = conv;
3800 /* We also have to fake this conversion stmt as SLP reduction
3801 group so we don't have to mess with too much code
3802 elsewhere. */
3803 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3804 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3806 /* Fill the backedge child of the PHI SLP node. The
3807 general matching code cannot find it because the
3808 scalar code does not reflect how we vectorize the
3809 reduction. */
3810 use_operand_p use_p;
3811 imm_use_iterator imm_iter;
3812 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3813 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3814 gimple_get_lhs (scalar_def))
3815 /* There are exactly two non-debug uses, the reduction
3816 PHI and the loop-closed PHI node. */
3817 if (!is_gimple_debug (USE_STMT (use_p))
3818 && gimple_bb (USE_STMT (use_p)) == loop->header)
3820 auto_vec<stmt_vec_info, 64> phis (group_size);
3821 stmt_vec_info phi_info
3822 = vinfo->lookup_stmt (USE_STMT (use_p));
3823 for (unsigned i = 0; i < group_size; ++i)
3824 phis.quick_push (phi_info);
3825 slp_tree *phi_node = bst_map->get (phis);
3826 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3827 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3828 = SLP_INSTANCE_TREE (new_instance);
3829 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3833 vinfo->slp_instances.safe_push (new_instance);
3835 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3836 the number of scalar stmts in the root in a few places.
3837 Verify that assumption holds. */
3838 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3839 .length () == group_size);
3841 if (dump_enabled_p ())
3843 dump_printf_loc (MSG_NOTE, vect_location,
3844 "Final SLP tree for instance %p:\n",
3845 (void *) new_instance);
3846 vect_print_slp_graph (MSG_NOTE, vect_location,
3847 SLP_INSTANCE_TREE (new_instance));
3850 return true;
3853 /* Failed to SLP. */
3855 stmt_vec_info stmt_info = stmt_info_;
3856 /* Try to break the group up into pieces. */
3857 if (*limit > 0 && kind == slp_inst_kind_store)
3859 /* ??? We could delay all the actual splitting of store-groups
3860 until after SLP discovery of the original group completed.
3861 Then we can recurse to vect_build_slp_instance directly. */
3862 for (i = 0; i < group_size; i++)
3863 if (!matches[i])
3864 break;
3866 /* For basic block SLP, try to break the group up into multiples of
3867 a vector size. */
3868 if (is_a <bb_vec_info> (vinfo)
3869 && (i > 1 && i < group_size))
3871 /* Free the allocated memory. */
3872 scalar_stmts.release ();
3874 tree scalar_type
3875 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3876 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3877 1 << floor_log2 (i));
3878 unsigned HOST_WIDE_INT const_nunits;
3879 if (vectype
3880 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3882 /* Split into two groups at the first vector boundary. */
3883 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3884 unsigned group1_size = i & ~(const_nunits - 1);
3886 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_NOTE, vect_location,
3888 "Splitting SLP group at stmt %u\n", i);
3889 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3890 group1_size);
3891 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3892 kind, max_tree_size,
3893 limit);
3894 /* Split the rest at the failure point and possibly
3895 re-analyze the remaining matching part if it has
3896 at least two lanes. */
3897 if (group1_size < i
3898 && (i + 1 < group_size
3899 || i - group1_size > 1))
3901 stmt_vec_info rest2 = rest;
3902 rest = vect_split_slp_store_group (rest, i - group1_size);
3903 if (i - group1_size > 1)
3904 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3905 kind, max_tree_size,
3906 limit);
3908 /* Re-analyze the non-matching tail if it has at least
3909 two lanes. */
3910 if (i + 1 < group_size)
3911 res |= vect_analyze_slp_instance (vinfo, bst_map,
3912 rest, kind, max_tree_size,
3913 limit);
3914 return res;
3918 /* For loop vectorization split the RHS into arbitrary pieces of
3919 size >= 1. */
3920 else if (is_a <loop_vec_info> (vinfo)
3921 && (group_size != 1 && i < group_size))
3923 /* There are targets that cannot do even/odd interleaving schemes
3924 so they absolutely need to use load/store-lanes. For now
3925 force single-lane SLP for them - they would be happy with
3926 uniform power-of-two lanes (but depending on element size),
3927 but even if we can use 'i' as indicator we would need to
3928 backtrack when later lanes fail to discover with the same
3929 granularity. We cannot turn strided or scatter stores
3930 into store-lanes. */
3931 /* ??? If this is not in sync with what get_load_store_type
3932 later decides the SLP representation is not good for other
3933 store vectorization methods. */
3934 bool want_store_lanes
3935 = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
3936 && ! STMT_VINFO_STRIDED_P (stmt_info)
3937 && compare_step_with_zero (vinfo, stmt_info) > 0
3938 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
3939 group_size, 1));
3940 if (want_store_lanes || force_single_lane)
3941 i = 1;
3943 /* A fatal discovery fail doesn't always mean single-lane SLP
3944 isn't a possibility, so try. */
3945 if (i == 0)
3946 i = 1;
3948 if (dump_enabled_p ())
3949 dump_printf_loc (MSG_NOTE, vect_location,
3950 "Splitting SLP group at stmt %u\n", i);
3952 /* Analyze the stored values and pinch them together with
3953 a permute node so we can preserve the whole store group. */
3954 auto_vec<slp_tree> rhs_nodes;
3956 /* Calculate the unrolling factor based on the smallest type. */
3957 poly_uint64 unrolling_factor = 1;
3959 unsigned int start = 0, end = i;
3960 while (start < group_size)
3962 gcc_assert (end - start >= 1);
3963 vec<stmt_vec_info> substmts;
3964 substmts.create (end - start);
3965 for (unsigned j = start; j < end; ++j)
3966 substmts.quick_push (scalar_stmts[j]);
3967 max_nunits = 1;
3968 node = vect_build_slp_tree (vinfo, substmts, end - start,
3969 &max_nunits,
3970 matches, limit, &tree_size, bst_map);
3971 if (node)
3973 /* ??? Possibly not safe, but not sure how to check
3974 and fail SLP build? */
3975 unrolling_factor
3976 = force_common_multiple (unrolling_factor,
3977 calculate_unrolling_factor
3978 (max_nunits, end - start));
3979 rhs_nodes.safe_push (node);
3980 start = end;
3981 if (want_store_lanes || force_single_lane)
3982 end = start + 1;
3983 else
3984 end = group_size;
3986 else
3988 substmts.release ();
3989 if (end - start == 1)
3991 /* Single-lane discovery failed. Free resources. */
3992 for (auto node : rhs_nodes)
3993 vect_free_slp_tree (node);
3994 scalar_stmts.release ();
3995 if (dump_enabled_p ())
3996 dump_printf_loc (MSG_NOTE, vect_location,
3997 "SLP discovery failed\n");
3998 return false;
4001 /* ??? It really happens that we soft-fail SLP
4002 build at a mismatch but the matching part hard-fails
4003 later. As we know we arrived here with a group
4004 larger than one try a group of size one! */
4005 if (!matches[0])
4006 end = start + 1;
4007 else
4008 for (unsigned j = start; j < end; j++)
4009 if (!matches[j - start])
4011 end = j;
4012 break;
4017 /* Now we assume we can build the root SLP node from all stores. */
4018 if (want_store_lanes)
4020 /* For store-lanes feed the store node with all RHS nodes
4021 in order. */
4022 node = vect_create_new_slp_node (scalar_stmts,
4023 SLP_TREE_CHILDREN
4024 (rhs_nodes[0]).length ());
4025 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4026 node->ldst_lanes = true;
4027 SLP_TREE_CHILDREN (node)
4028 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4029 + rhs_nodes.length () - 1);
4030 /* First store value and possibly mask. */
4031 SLP_TREE_CHILDREN (node)
4032 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4033 /* Rest of the store values. All mask nodes are the same,
4034 this should be guaranteed by dataref group discovery. */
4035 for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4036 SLP_TREE_CHILDREN (node)
4037 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4038 for (slp_tree child : SLP_TREE_CHILDREN (node))
4039 child->refcnt++;
4041 else
4042 node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
4044 while (!rhs_nodes.is_empty ())
4045 vect_free_slp_tree (rhs_nodes.pop ());
4047 /* Create a new SLP instance. */
4048 slp_instance new_instance = XNEW (class _slp_instance);
4049 SLP_INSTANCE_TREE (new_instance) = node;
4050 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
4051 SLP_INSTANCE_LOADS (new_instance) = vNULL;
4052 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4053 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4054 SLP_INSTANCE_KIND (new_instance) = kind;
4055 new_instance->reduc_phis = NULL;
4056 new_instance->cost_vec = vNULL;
4057 new_instance->subgraph_entries = vNULL;
4059 if (dump_enabled_p ())
4060 dump_printf_loc (MSG_NOTE, vect_location,
4061 "SLP size %u vs. limit %u.\n",
4062 tree_size, max_tree_size);
4064 vinfo->slp_instances.safe_push (new_instance);
4066 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4067 the number of scalar stmts in the root in a few places.
4068 Verify that assumption holds. */
4069 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4070 .length () == group_size);
4072 if (dump_enabled_p ())
4074 dump_printf_loc (MSG_NOTE, vect_location,
4075 "Final SLP tree for instance %p:\n",
4076 (void *) new_instance);
4077 vect_print_slp_graph (MSG_NOTE, vect_location,
4078 SLP_INSTANCE_TREE (new_instance));
4080 return true;
4082 else
4083 /* Free the allocated memory. */
4084 scalar_stmts.release ();
4086 /* Even though the first vector did not all match, we might be able to SLP
4087 (some) of the remainder. FORNOW ignore this possibility. */
4089 else
4090 /* Free the allocated memory. */
4091 scalar_stmts.release ();
4093 /* Failed to SLP. */
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4096 return false;
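/* A small standalone arithmetic sketch (hypothetical numbers) of the
   basic-block store-group splitting done above: when discovery matched only
   the first I stmts of a GROUP_SIZE store group and the chosen vector type
   has CONST_NUNITS lanes, the group is first split at the last vector
   boundary at or before the mismatch, and the remaining pieces are
   re-analyzed when they have at least two lanes.

     #include <cstdio>

     int main ()
     {
       unsigned group_size = 11, i = 6, const_nunits = 4;

       // Same computation as "i & ~(const_nunits - 1)" above.
       unsigned group1_size = i & ~(const_nunits - 1);

       std::printf ("group1: stmts [0, %u)\n", group1_size);
       std::printf ("group2: stmts [%u, %u), re-analyzed if >= 2 lanes\n",
                    group1_size, i);
       std::printf ("group3: stmts [%u, %u), the non-matching tail\n",
                    i, group_size);
       return 0;
     }  */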
4100 /* Analyze an SLP instance starting from a group of grouped stores. Call
4101 vect_build_slp_tree to build a tree of packed stmts if possible.
4102 Return FALSE if it's impossible to SLP any stmt in the loop. */
4104 static bool
4105 vect_analyze_slp_instance (vec_info *vinfo,
4106 scalar_stmts_to_slp_tree_map_t *bst_map,
4107 stmt_vec_info stmt_info,
4108 slp_instance_kind kind,
4109 unsigned max_tree_size, unsigned *limit,
4110 bool force_single_lane)
4112 vec<stmt_vec_info> scalar_stmts;
4114 if (is_a <bb_vec_info> (vinfo))
4115 vect_location = stmt_info->stmt;
4117 stmt_vec_info next_info = stmt_info;
4118 if (kind == slp_inst_kind_store)
4120 /* Collect the stores and store them in scalar_stmts. */
4121 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4122 while (next_info)
4124 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4125 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4128 else if (kind == slp_inst_kind_reduc_chain)
4130 /* Collect the reduction stmts and store them in scalar_stmts. */
4131 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4132 while (next_info)
4134 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4135 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4137 /* Mark the first element of the reduction chain as reduction to properly
4138 transform the node. In the reduction analysis phase only the last
4139 element of the chain is marked as reduction. */
4140 STMT_VINFO_DEF_TYPE (stmt_info)
4141 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4142 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4143 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4145 else
4146 gcc_unreachable ();
4148 vec<stmt_vec_info> roots = vNULL;
4149 vec<tree> remain = vNULL;
4150 /* Build the tree for the SLP instance. */
4151 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4152 roots, remain,
4153 max_tree_size, limit, bst_map,
4154 kind == slp_inst_kind_store
4155 ? stmt_info : NULL, force_single_lane);
4157 /* ??? If this is slp_inst_kind_store and the above succeeded here's
4158 where we should do store group splitting. */
4160 return res;
4163 /* qsort comparator ordering SLP load nodes. */
4165 static int
4166 vllp_cmp (const void *a_, const void *b_)
4168 const slp_tree a = *(const slp_tree *)a_;
4169 const slp_tree b = *(const slp_tree *)b_;
4170 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4171 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4172 if (STMT_VINFO_GROUPED_ACCESS (a0)
4173 && STMT_VINFO_GROUPED_ACCESS (b0)
4174 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4176 /* Same group, order after lanes used. */
4177 if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4178 return 1;
4179 else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4180 return -1;
4181 else
4183 /* Try to order loads using the same lanes together, breaking
4184 the tie with the lane number that first differs. */
4185 if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4186 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4187 return 0;
4188 else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4189 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4190 return 1;
4191 else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4192 && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4193 return -1;
4194 else
4196 for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4197 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4198 != SLP_TREE_LOAD_PERMUTATION (b)[i])
4200 /* In-order lane first, that's what the above case for
4201 no permutation does. */
4202 if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4203 return -1;
4204 else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4205 return 1;
4206 else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4207 < SLP_TREE_LOAD_PERMUTATION (b)[i])
4208 return -1;
4209 else
4210 return 1;
4212 return 0;
4216 else /* Different groups or non-groups. */
4218 /* Order groups as their first element to keep them together. */
4219 if (STMT_VINFO_GROUPED_ACCESS (a0))
4220 a0 = DR_GROUP_FIRST_ELEMENT (a0);
4221 if (STMT_VINFO_GROUPED_ACCESS (b0))
4222 b0 = DR_GROUP_FIRST_ELEMENT (b0);
4223 if (a0 == b0)
4224 return 0;
4225 /* Tie using UID. */
4226 else if (gimple_uid (STMT_VINFO_STMT (a0))
4227 < gimple_uid (STMT_VINFO_STMT (b0)))
4228 return -1;
4229 else
4231 gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4232 != gimple_uid (STMT_VINFO_STMT (b0)));
4233 return 1;
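/* A simplified standalone analog of the ordering vllp_cmp implements
   (hypothetical data; the per-group tie-breaking is reduced to a plain
   lexicographic comparison of the load permutations and std::sort is used
   instead of qsort): loads of the same dataref group end up adjacent, with
   the loads using more lanes first.

     #include <algorithm>
     #include <cstdio>
     #include <vector>

     struct load
     {
       unsigned group;              // uid of the group's first element
       std::vector<unsigned> perm;  // lanes used, in load order
     };

     static bool
     vllp_less (const load &a, const load &b)
     {
       if (a.group != b.group)
         return a.group < b.group;
       if (a.perm.size () != b.perm.size ())
         return a.perm.size () > b.perm.size ();
       return a.perm < b.perm;
     }

     int main ()
     {
       std::vector<load> loads = { { 42, { 2 } },
                                   { 17, { 0, 1 } },
                                   { 42, { 0, 1, 2, 3 } },
                                   { 42, { 0 } } };
       std::sort (loads.begin (), loads.end (), vllp_less);
       for (const load &l : loads)
         {
           std::printf ("group %u:", l.group);
           for (unsigned p : l.perm)
             std::printf (" %u", p);
           std::printf ("\n");
         }
       return 0;
     }  */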
4238 /* Process the set of LOADS that are all from the same dataref group. */
4240 static void
4241 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4242 scalar_stmts_to_slp_tree_map_t *bst_map,
4243 const array_slice<slp_tree> &loads)
4245 /* At this point we want to lower without a fixed VF or vector
4246 size in mind, which means we cannot actually compute whether we
4247 need three or more vectors for a load permutation yet. So always
4248 lower. */
4249 stmt_vec_info first
4250 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4251 unsigned group_lanes = DR_GROUP_SIZE (first);
4253 /* Verify if all load permutations can be implemented with a suitably
4254 large element load-lanes operation. */
4255 unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4256 if (STMT_VINFO_STRIDED_P (first)
4257 || compare_step_with_zero (loop_vinfo, first) <= 0
4258 || exact_log2 (ld_lanes_lanes) == -1
4259 /* ??? For now only support the single-lane case as there is
4260 missing support on the store-lane side and code generation
4261 isn't up to the task yet. */
4262 || ld_lanes_lanes != 1
4263 || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4264 group_lanes / ld_lanes_lanes,
4265 false) == IFN_LAST)
4266 ld_lanes_lanes = 0;
4267 else
4268 /* Verify the loads access the same number of lanes aligned to
4269 ld_lanes_lanes. */
4270 for (slp_tree load : loads)
4272 if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4274 ld_lanes_lanes = 0;
4275 break;
4277 unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4278 if (first % ld_lanes_lanes != 0)
4280 ld_lanes_lanes = 0;
4281 break;
4283 for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4284 if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4286 ld_lanes_lanes = 0;
4287 break;
4291 /* Only a power-of-two number of lanes matches interleaving with N levels.
4292 ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4293 at each step. */
4294 if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4295 return;
4297 for (slp_tree load : loads)
4299 /* Leave masked or gather loads alone for now. */
4300 if (!SLP_TREE_CHILDREN (load).is_empty ())
4301 continue;
4303 /* We want to pattern-match special cases here and keep those
4304 alone. Candidates are splats and load-lane. */
4306 /* We need to lower only loads of less than half of the group's
4307 lanes, including duplicate lanes. Note this leaves nodes
4308 with a non-1:1 load permutation around instead of canonicalizing
4309 those into a load and a permute node. Removing this early
4310 check would do such canonicalization. */
4311 if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4312 && ld_lanes_lanes == 0)
4313 continue;
4315 /* First build (and possibly re-use) a load node for the
4316 unpermuted group. Gaps in the middle and on the end are
4317 represented with NULL stmts. */
4318 vec<stmt_vec_info> stmts;
4319 stmts.create (group_lanes);
4320 for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4322 if (s != first)
4323 for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4324 stmts.quick_push (NULL);
4325 stmts.quick_push (s);
4327 for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4328 stmts.quick_push (NULL);
4329 poly_uint64 max_nunits = 1;
4330 bool *matches = XALLOCAVEC (bool, group_lanes);
4331 unsigned limit = 1;
4332 unsigned tree_size = 0;
4333 slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4334 group_lanes,
4335 &max_nunits, matches, &limit,
4336 &tree_size, bst_map);
4338 /* Build the permute to get the original load permutation order. */
4339 lane_permutation_t final_perm;
4340 final_perm.create (SLP_TREE_LANES (load));
4341 for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4342 final_perm.quick_push
4343 (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4345 if (ld_lanes_lanes != 0)
4347 /* ??? If this is not in sync with what get_load_store_type
4348 later decides the SLP representation is not good for other
4349 store vectorization methods. */
4350 l0->ldst_lanes = true;
4351 load->ldst_lanes = true;
4354 while (1)
4356 unsigned group_lanes = SLP_TREE_LANES (l0);
4357 if (ld_lanes_lanes != 0
4358 || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4359 break;
4361 /* Try to lower by reducing the group to half its size using an
4362 interleaving scheme. For this try to compute whether all
4363 elements needed for this load are in even or odd elements of
4364 an even/odd decomposition with N consecutive elements.
4365 Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4366 with N == 2. */
4367 /* ??? Only an even number of lanes can be handled this way, but the
4368 fallback below could work for any number. We have to make sure
4369 to round up in that case. */
4370 gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4371 unsigned even = 0, odd = 0;
4372 if ((group_lanes & 1) == 0)
4374 even = (1 << ceil_log2 (group_lanes)) - 1;
4375 odd = even;
4376 for (auto l : final_perm)
4378 even &= ~l.second;
4379 odd &= l.second;
4383 /* Now build an even or odd extraction from the unpermuted load. */
4384 lane_permutation_t perm;
4385 perm.create ((group_lanes + 1) / 2);
4386 unsigned level;
4387 if (even
4388 && ((level = 1 << ctz_hwi (even)), true)
4389 && group_lanes % (2 * level) == 0)
4391 /* { 0, 1, ... 4, 5 ..., } */
4392 unsigned level = 1 << ctz_hwi (even);
4393 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4394 for (unsigned j = 0; j < level; ++j)
4395 perm.quick_push (std::make_pair (0, 2 * i * level + j));
4397 else if (odd)
4399 /* { ..., 2, 3, ... 6, 7 } */
4400 unsigned level = 1 << ctz_hwi (odd);
4401 gcc_assert (group_lanes % (2 * level) == 0);
4402 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4403 for (unsigned j = 0; j < level; ++j)
4404 perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
4406 else
4408 /* As a fallback, extract all used lanes and fill to half the
4409 group size by repeating the last element.
4410 ??? This is quite a bad strategy for re-use - we could
4411 brute force our way to find more optimal filling lanes to
4412 maximize re-use when looking at all loads from the group. */
4413 auto_bitmap l;
4414 for (auto p : final_perm)
4415 bitmap_set_bit (l, p.second);
4416 unsigned i = 0;
4417 bitmap_iterator bi;
4418 EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4419 perm.quick_push (std::make_pair (0, i));
4420 while (perm.length () < (group_lanes + 1) / 2)
4421 perm.quick_push (perm.last ());
4424 /* Update final_perm with the intermediate permute. */
4425 for (unsigned i = 0; i < final_perm.length (); ++i)
4427 unsigned l = final_perm[i].second;
4428 unsigned j;
4429 for (j = 0; j < perm.length (); ++j)
4430 if (perm[j].second == l)
4432 final_perm[i].second = j;
4433 break;
4435 gcc_assert (j < perm.length ());
4438 /* And create scalar stmts. */
4439 vec<stmt_vec_info> perm_stmts;
4440 perm_stmts.create (perm.length ());
4441 for (unsigned i = 0; i < perm.length (); ++i)
4442 perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4444 slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4445 SLP_TREE_CHILDREN (p).quick_push (l0);
4446 SLP_TREE_LANE_PERMUTATION (p) = perm;
4447 SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4448 SLP_TREE_LANES (p) = perm.length ();
4449 SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4450 /* ??? As we have scalar stmts for this intermediate permute we
4451 could CSE it via bst_map but we do not want to pick up
4452 another SLP node with a load permutation. We instead should
4453 have a "local" CSE map here. */
4454 SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4456 /* We now have a node for (group_lanes + 1) / 2 lanes. */
4457 l0 = p;
4460 /* And finally from the ordered reduction node create the
4461 permute to shuffle the lanes into the original load-permutation
4462 order. We replace the original load node with this. */
4463 SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4464 SLP_TREE_LOAD_PERMUTATION (load).release ();
4465 SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4466 SLP_TREE_CHILDREN (load).create (1);
4467 SLP_TREE_CHILDREN (load).quick_push (l0);
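/* A worked standalone example (hypothetical lane numbers) of the even/odd
   classification used above.  For a dataref group of 8 lanes and a load
   that uses lanes { 0, 1, 4, 5 }, the masks come out as even == 2 and
   odd == 0: bit 1 is clear in every used lane index, so one even-extraction
   step with N == 2 consecutive elements (lanes { 0, 1, 4, 5 } of the
   unpermuted load) already halves the group.  The GCC helpers ceil_log2 and
   ctz_hwi are replaced by literal values here.

     #include <cstdio>

     int main ()
     {
       unsigned wanted[] = { 0, 1, 4, 5 };

       unsigned even = 7, odd = 7;    // (1 << ceil_log2 (8)) - 1
       for (unsigned l : wanted)
         {
           even &= ~l;
           odd &= l;
         }
       // even == 2, odd == 0; level = 1 << ctz (even) == 2 and
       // 8 % (2 * level) == 0, so the even extraction applies.
       std::printf ("even = %u, odd = %u\n", even, odd);
       return 0;
     }  */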
4471 /* Transform SLP loads in the SLP graph created by SLP discovery to
4472 group loads from the same group and lower load permutations that
4473 are unlikely to be supported into a series of permutes.
4474 In the degenerate case of having only single-lane SLP instances
4475 this should result in a series of permute nodes emulating an
4476 interleaving scheme. */
4478 static void
4479 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4480 scalar_stmts_to_slp_tree_map_t *bst_map)
4482 /* Gather and sort loads across all instances. */
4483 hash_set<slp_tree> visited;
4484 auto_vec<slp_tree> loads;
4485 for (auto inst : loop_vinfo->slp_instances)
4486 vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4487 if (loads.is_empty ())
4488 return;
4489 loads.qsort (vllp_cmp);
4491 /* Now process each dataref group separately. */
4492 unsigned firsti = 0;
4493 for (unsigned i = 1; i < loads.length (); ++i)
4495 slp_tree first = loads[firsti];
4496 slp_tree next = loads[i];
4497 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4498 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4499 if (STMT_VINFO_GROUPED_ACCESS (a0)
4500 && STMT_VINFO_GROUPED_ACCESS (b0)
4501 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4502 continue;
4503 /* Just one SLP load of a possible group, leave those alone. */
4504 if (i == firsti + 1)
4506 firsti = i;
4507 continue;
4509 /* Now we have multiple SLP loads of the same group from
4510 firsti to i - 1. */
4511 vect_lower_load_permutations (loop_vinfo, bst_map,
4512 make_array_slice (&loads[firsti],
4513 i - firsti));
4514 firsti = i;
4516 if (firsti < loads.length () - 1)
4517 vect_lower_load_permutations (loop_vinfo, bst_map,
4518 make_array_slice (&loads[firsti],
4519 loads.length () - firsti));
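/* The grouping done by the driver above amounts to processing maximal runs
   of equal dataref groups in the sorted vector.  A standalone sketch with
   made-up group ids (slightly simplified: the trailing run is handled
   inside the loop instead of after it):

     #include <cstdio>
     #include <vector>

     int main ()
     {
       std::vector<int> group = { 7, 7, 7, 9, 3, 3 };

       unsigned firsti = 0;
       for (unsigned i = 1; i <= group.size (); ++i)
         {
           if (i < group.size () && group[i] == group[firsti])
             continue;
           if (i - firsti > 1)
             std::printf ("lower loads [%u, %u) of group %d together\n",
                          firsti, i, group[firsti]);
           else
             std::printf ("single load %u of group %d, left alone\n",
                          firsti, group[firsti]);
           firsti = i;
         }
       return 0;
     }  */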
4522 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
4523 trees of packed scalar stmts if SLP is possible. */
4525 opt_result
4526 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
4528 unsigned int i;
4529 stmt_vec_info first_element;
4530 slp_instance instance;
4532 DUMP_VECT_SCOPE ("vect_analyze_slp");
4534 unsigned limit = max_tree_size;
4536 scalar_stmts_to_slp_tree_map_t *bst_map
4537 = new scalar_stmts_to_slp_tree_map_t ();
4539 /* Find SLP sequences starting from groups of grouped stores. */
4540 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4541 vect_analyze_slp_instance (vinfo, bst_map, first_element,
4542 slp_inst_kind_store, max_tree_size, &limit);
4544 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4546 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4548 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4549 /* Apply patterns. */
4550 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4551 bb_vinfo->roots[i].stmts[j]
4552 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4553 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4554 bb_vinfo->roots[i].stmts,
4555 bb_vinfo->roots[i].roots,
4556 bb_vinfo->roots[i].remain,
4557 max_tree_size, &limit, bst_map, NULL))
4559 bb_vinfo->roots[i].stmts = vNULL;
4560 bb_vinfo->roots[i].roots = vNULL;
4561 bb_vinfo->roots[i].remain = vNULL;
4566 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4568 /* Find SLP sequences starting from reduction chains. */
4569 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4570 if (! STMT_VINFO_RELEVANT_P (first_element)
4571 && ! STMT_VINFO_LIVE_P (first_element))
4573 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4574 slp_inst_kind_reduc_chain,
4575 max_tree_size, &limit))
4577 /* Dissolve reduction chain group. */
4578 stmt_vec_info vinfo = first_element;
4579 stmt_vec_info last = NULL;
4580 while (vinfo)
4582 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4583 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4584 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4585 last = vinfo;
4586 vinfo = next;
4588 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4589 /* It can still be vectorized as part of an SLP reduction. */
4590 loop_vinfo->reductions.safe_push (last);
4593 /* Find SLP sequences starting from groups of reductions. */
4594 if (loop_vinfo->reductions.length () > 0)
4596 /* Collect reduction statements we can combine into
4597 a SLP reduction. */
4598 vec<stmt_vec_info> scalar_stmts;
4599 scalar_stmts.create (loop_vinfo->reductions.length ());
4600 for (auto next_info : loop_vinfo->reductions)
4602 next_info = vect_stmt_to_vectorize (next_info);
4603 if ((STMT_VINFO_RELEVANT_P (next_info)
4604 || STMT_VINFO_LIVE_P (next_info))
4605 /* ??? Make sure we didn't skip a conversion around a
4606 reduction path. In that case we'd have to reverse
4607 engineer that conversion stmt following the chain using
4608 reduc_idx and from the PHI using reduc_def. */
4609 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
4611 /* Do not discover SLP reductions combining lane-reducing
4612 ops, that will fail later. */
4613 if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4614 scalar_stmts.quick_push (next_info);
4615 else
4617 /* Do SLP discovery for single-lane reductions. */
4618 vec<stmt_vec_info> stmts;
4619 vec<stmt_vec_info> roots = vNULL;
4620 vec<tree> remain = vNULL;
4621 stmts.create (1);
4622 stmts.quick_push (next_info);
4623 vect_build_slp_instance (vinfo,
4624 slp_inst_kind_reduc_group,
4625 stmts, roots, remain,
4626 max_tree_size, &limit,
4627 bst_map, NULL);
4631 /* Save for re-processing on failure. */
4632 vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4633 vec<stmt_vec_info> roots = vNULL;
4634 vec<tree> remain = vNULL;
4635 if (scalar_stmts.length () <= 1
4636 || !vect_build_slp_instance (loop_vinfo,
4637 slp_inst_kind_reduc_group,
4638 scalar_stmts, roots, remain,
4639 max_tree_size, &limit, bst_map,
4640 NULL))
4642 if (scalar_stmts.length () <= 1)
4643 scalar_stmts.release ();
4644 /* Do SLP discovery for single-lane reductions. */
4645 for (auto stmt_info : saved_stmts)
4647 vec<stmt_vec_info> stmts;
4648 vec<stmt_vec_info> roots = vNULL;
4649 vec<tree> remain = vNULL;
4650 stmts.create (1);
4651 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4652 vect_build_slp_instance (vinfo,
4653 slp_inst_kind_reduc_group,
4654 stmts, roots, remain,
4655 max_tree_size, &limit,
4656 bst_map, NULL);
4658 saved_stmts.release ();
4663 hash_set<slp_tree> visited_patterns;
4664 slp_tree_to_load_perm_map_t perm_cache;
4665 slp_compat_nodes_map_t compat_cache;
4667 /* See if any patterns can be found in the SLP tree. */
4668 bool pattern_found = false;
4669 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4670 pattern_found |= vect_match_slp_patterns (instance, vinfo,
4671 &visited_patterns, &perm_cache,
4672 &compat_cache);
4674 /* If any were found optimize permutations of loads. */
4675 if (pattern_found)
4677 hash_map<slp_tree, slp_tree> load_map;
4678 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4680 slp_tree root = SLP_INSTANCE_TREE (instance);
4681 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
4682 &load_map, root);
4686 /* Check whether we should force some SLP instances to use load/store-lanes
4687 and do so by forcing SLP re-discovery with single lanes. We used
4688 to cancel SLP when this applied to all instances in a loop but now
4689 we decide this per SLP instance. It's important to do this only
4690 after SLP pattern recognition. */
4691 if (is_a <loop_vec_info> (vinfo))
4692 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4693 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
4694 && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
4696 slp_tree slp_root = SLP_INSTANCE_TREE (instance);
4697 int group_size = SLP_TREE_LANES (slp_root);
4698 tree vectype = SLP_TREE_VECTYPE (slp_root);
4700 auto_vec<slp_tree> loads;
4701 hash_set<slp_tree> visited;
4702 vect_gather_slp_loads (loads, slp_root, visited);
4704 /* Check whether any load in the SLP instance is possibly
4705 permuted. */
4706 bool loads_permuted = false;
4707 slp_tree load_node;
4708 unsigned j;
4709 FOR_EACH_VEC_ELT (loads, j, load_node)
4711 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
4712 continue;
4713 unsigned k;
4714 stmt_vec_info load_info;
4715 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
4716 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
4718 loads_permuted = true;
4719 break;
4723 gimple *rep = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (slp_root));
4724 bool masked = (is_gimple_call (rep)
4725 && gimple_call_internal_p (rep)
4726 && internal_fn_mask_index
4727 (gimple_call_internal_fn (rep)) != -1);
4728 /* If the loads and stores can use load/store-lanes force re-discovery
4729 with single lanes. */
4730 if (loads_permuted
4731 && !slp_root->ldst_lanes
4732 && vect_store_lanes_supported (vectype, group_size, masked)
4733 != IFN_LAST)
4735 bool can_use_lanes = true;
4736 FOR_EACH_VEC_ELT (loads, j, load_node)
4737 if (STMT_VINFO_GROUPED_ACCESS
4738 (SLP_TREE_REPRESENTATIVE (load_node)))
4740 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
4741 (SLP_TREE_REPRESENTATIVE (load_node));
4742 rep = STMT_VINFO_STMT (stmt_vinfo);
4743 masked = (is_gimple_call (rep)
4744 && gimple_call_internal_p (rep)
4745 && internal_fn_mask_index
4746 (gimple_call_internal_fn (rep)));
4747 /* Use SLP for strided accesses (or if we can't
4748 load-lanes). */
4749 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
4750 || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
4751 || vect_load_lanes_supported
4752 (STMT_VINFO_VECTYPE (stmt_vinfo),
4753 DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
4754 /* ??? During SLP re-discovery with a single lane
4755 a masked grouped load will appear permuted and
4756 discovery will fail. We have to rework this
4757 on the discovery side - for now avoid ICEing. */
4758 || masked)
4760 can_use_lanes = false;
4761 break;
4765 if (can_use_lanes)
4767 if (dump_enabled_p ())
4768 dump_printf_loc (MSG_NOTE, vect_location,
4769 "SLP instance %p can use load/store-lanes,"
4770 " re-discovering with single-lanes\n",
4771 (void *) instance);
4773 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
4775 vect_free_slp_instance (instance);
4776 limit = max_tree_size;
4777 bool res = vect_analyze_slp_instance (vinfo, bst_map,
4778 stmt_info,
4779 slp_inst_kind_store,
4780 max_tree_size, &limit,
4781 true);
4782 gcc_assert (res);
4783 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
4784 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
4789 /* When we end up with load permutations that we cannot possibly handle,
4790 like those requiring three vector inputs, lower them using
4791 interleaving-like schemes. */
4792 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4794 vect_lower_load_permutations (loop_vinfo, bst_map);
4795 if (dump_enabled_p ())
4797 dump_printf_loc (MSG_NOTE, vect_location,
4798 "SLP graph after lowering permutations:\n");
4799 hash_set<slp_tree> visited;
4800 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4801 vect_print_slp_graph (MSG_NOTE, vect_location,
4802 SLP_INSTANCE_TREE (instance), visited);
4806 release_scalar_stmts_to_slp_tree_map (bst_map);
4808 if (pattern_found && dump_enabled_p ())
4810 dump_printf_loc (MSG_NOTE, vect_location,
4811 "Pattern matched SLP tree\n");
4812 hash_set<slp_tree> visited;
4813 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4814 vect_print_slp_graph (MSG_NOTE, vect_location,
4815 SLP_INSTANCE_TREE (instance), visited);
4818 return opt_result::success ();
4821 /* Estimates the cost of inserting layout changes into the SLP graph.
4822 It can also say that the insertion is impossible. */
4824 struct slpg_layout_cost
4826 slpg_layout_cost () = default;
4827 slpg_layout_cost (sreal, bool);
4829 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
4830 bool is_possible () const { return depth != sreal::max (); }
4832 bool operator== (const slpg_layout_cost &) const;
4833 bool operator!= (const slpg_layout_cost &) const;
4835 bool is_better_than (const slpg_layout_cost &, bool) const;
4837 void add_parallel_cost (const slpg_layout_cost &);
4838 void add_serial_cost (const slpg_layout_cost &);
4839 void split (unsigned int);
4841 /* The longest sequence of layout changes needed during any traversal
4842 of the partition dag, weighted by execution frequency.
4844 This is the most important metric when optimizing for speed, since
4845 it helps to ensure that we keep the number of operations on
4846 critical paths to a minimum. */
4847 sreal depth = 0;
4849 /* An estimate of the total number of operations needed. It is weighted by
4850 execution frequency when optimizing for speed but not when optimizing for
4851 size. In order to avoid double-counting, a node with a fanout of N will
4852 distribute 1/N of its total cost to each successor.
4854 This is the most important metric when optimizing for size, since
4855 it helps to keep the total number of operations to a minimum. */
4856 sreal total = 0;
4859 /* Construct costs for a node with weight WEIGHT. A higher weight
4860 indicates more frequent execution. IS_FOR_SIZE is true if we are
4861 optimizing for size rather than speed. */
4863 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
4864 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
4868 bool
4869 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
4871 return depth == other.depth && total == other.total;
4874 bool
4875 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
4877 return !operator== (other);
4880 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
4881 true if we are optimizing for size rather than speed. */
4883 bool
4884 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
4885 bool is_for_size) const
4887 if (is_for_size)
4889 if (total != other.total)
4890 return total < other.total;
4891 return depth < other.depth;
4893 else
4895 if (depth != other.depth)
4896 return depth < other.depth;
4897 return total < other.total;
4901 /* Increase the costs to account for something with cost INPUT_COST
4902 happening in parallel with the current costs. */
4904 void
4905 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
4907 depth = std::max (depth, input_cost.depth);
4908 total += input_cost.total;
4911 /* Increase the costs to account for something with cost INPUT_COST
4912 happening in series with the current costs. */
4914 void
4915 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
4917 depth += other.depth;
4918 total += other.total;
4921 /* Split the total cost among TIMES successors or predecessors. */
4923 void
4924 slpg_layout_cost::split (unsigned int times)
4926 if (times > 1)
4927 total /= times;
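/* To make the depth/total bookkeeping concrete, a standalone stand-in that
   uses double instead of sreal (the arithmetic mirrors the member functions
   above; the numbers are made up):

     #include <algorithm>
     #include <cstdio>

     struct cost
     {
       double depth, total;
       void add_parallel (const cost &c)
       {
         depth = std::max (depth, c.depth);
         total += c.total;
       }
       void add_serial (const cost &c)
       {
         depth += c.depth;
         total += c.total;
       }
       void split (unsigned times)
       {
         if (times > 1)
           total /= times;
       }
     };

     int main ()
     {
       cost in1 = { 2, 1 }, in2 = { 3, 1 }, own = { 1, 1 };
       cost acc = in1;
       acc.add_parallel (in2);   // depth = max (2, 3) = 3, total = 2
       acc.add_serial (own);     // depth = 4, total = 3
       acc.split (2);            // total = 1.5, depth unchanged
       std::printf ("depth %g total %g\n", acc.depth, acc.total);
       return 0;
     }  */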
4930 /* Information about one node in the SLP graph, for use during
4931 vect_optimize_slp_pass. */
4933 struct slpg_vertex
4935 slpg_vertex (slp_tree node_) : node (node_) {}
4937 /* The node itself. */
4938 slp_tree node;
4940 /* Which partition the node belongs to, or -1 if none. Nodes outside of
4941 partitions are flexible; they can have whichever layout consumers
4942 want them to have. */
4943 int partition = -1;
4945 /* The number of nodes that directly use the result of this one
4946 (i.e. the number of nodes that count this one as a child). */
4947 unsigned int out_degree = 0;
4949 /* The execution frequency of the node. */
4950 sreal weight = 0;
4952 /* The total execution frequency of all nodes that directly use the
4953 result of this one. */
4954 sreal out_weight = 0;
4957 /* Information about one partition of the SLP graph, for use during
4958 vect_optimize_slp_pass. */
4960 struct slpg_partition_info
4962 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
4963 of m_partitioned_nodes. */
4964 unsigned int node_begin = 0;
4965 unsigned int node_end = 0;
4967 /* Which layout we've chosen to use for this partition, or -1 if
4968 we haven't picked one yet. */
4969 int layout = -1;
4971 /* The number of predecessors and successors in the partition dag.
4972 The predecessors always have lower partition numbers and the
4973 successors always have higher partition numbers.
4975 Note that the directions of these edges are not necessarily the
4976 same as in the data flow graph. For example, if an SCC has separate
4977 partitions for an inner loop and an outer loop, the inner loop's
4978 partition will have at least two incoming edges from the outer loop's
4979 partition: one for a live-in value and one for a live-out value.
4980 In data flow terms, one of these edges would also be from the outer loop
4981 to the inner loop, but the other would be in the opposite direction. */
4982 unsigned int in_degree = 0;
4983 unsigned int out_degree = 0;
4986 /* Information about the costs of using a particular layout for a
4987 particular partition. It can also say that the combination is
4988 impossible. */
4990 struct slpg_partition_layout_costs
4992 bool is_possible () const { return internal_cost.is_possible (); }
4993 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
4995 /* The costs inherited from predecessor partitions. */
4996 slpg_layout_cost in_cost;
4998 /* The inherent cost of the layout within the node itself. For example,
4999 this is nonzero for a load if choosing a particular layout would require
5000 the load to permute the loaded elements. It is nonzero for a
5001 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5002 to full-vector moves. */
5003 slpg_layout_cost internal_cost;
5005 /* The costs inherited from successor partitions. */
5006 slpg_layout_cost out_cost;
5009 /* This class tries to optimize the layout of vectors in order to avoid
5010 unnecessary shuffling. At the moment, the set of possible layouts are
5011 restricted to bijective permutations.
5013 The goal of the pass depends on whether we're optimizing for size or
5014 for speed. When optimizing for size, the goal is to reduce the overall
5015 number of layout changes (including layout changes implied by things
5016 like load permutations). When optimizing for speed, the goal is to
5017 reduce the maximum latency attributable to layout changes on any
5018 non-cyclical path through the data flow graph.
5020 For example, when optimizing a loop nest for speed, we will prefer
5021 to make layout changes outside of a loop rather than inside of a loop,
5022 and will prefer to make layout changes in parallel rather than serially,
5023 even if that increases the overall number of layout changes.
5025 The high-level procedure is:
5027 (1) Build a graph in which edges go from uses (parents) to definitions
5028 (children).
5030 (2) Divide the graph into a dag of strongly-connected components (SCCs).
5032 (3) When optimizing for speed, partition the nodes in each SCC based
5033 on their containing cfg loop. When optimizing for size, treat
5034 each SCC as a single partition.
5036 This gives us a dag of partitions. The goal is now to assign a
5037 layout to each partition.
5039 (4) Construct a set of vector layouts that are worth considering.
5040 Record which nodes must keep their current layout.
5042 (5) Perform a forward walk over the partition dag (from loads to stores)
5043 accumulating the "forward" cost of using each layout. When visiting
5044 each partition, assign a tentative choice of layout to the partition
5045 and use that choice when calculating the cost of using a different
5046 layout in successor partitions.
5048 (6) Perform a backward walk over the partition dag (from stores to loads),
5049 accumulating the "backward" cost of using each layout. When visiting
5050 each partition, make a final choice of layout for that partition based
5051 on the accumulated forward costs (from (5)) and backward costs
5052 (from (6)).
5054 (7) Apply the chosen layouts to the SLP graph.
5056 For example, consider the SLP statements:
5058 S1: a_1 = load
5059 loop:
5060 S2: a_2 = PHI<a_1, a_3>
5061 S3: b_1 = load
5062 S4: a_3 = a_2 + b_1
5063 exit:
5064 S5: a_4 = PHI<a_3>
5065 S6: store a_4
5067 S2 and S4 form an SCC and are part of the same loop. Every other
5068 statement is in a singleton SCC. In this example there is a one-to-one
5069 mapping between SCCs and partitions and the partition dag looks like this;
        S1     S3
         \     /
          S2+S4
            |
           S5
            |
           S6
5079 S2, S3 and S4 will have a higher execution frequency than the other
5080 statements, so when optimizing for speed, the goal is to avoid any
5081 layout changes:
5083 - within S3
5084 - within S2+S4
5085 - on the S3->S2+S4 edge
5087 For example, if S3 was originally a reversing load, the goal of the
5088 pass is to make it an unreversed load and change the layout on the
5089 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5090 on S1->S2+S4 and S5->S6 would also be acceptable.)
5092 The difference between SCCs and partitions becomes important if we
5093 add an outer loop:
5095 S1: a_1 = ...
5096 loop1:
5097 S2: a_2 = PHI<a_1, a_6>
5098 S3: b_1 = load
5099 S4: a_3 = a_2 + b_1
5100 loop2:
5101 S5: a_4 = PHI<a_3, a_5>
5102 S6: c_1 = load
5103 S7: a_5 = a_4 + c_1
5104 exit2:
5105 S8: a_6 = PHI<a_5>
5106 S9: store a_6
5107 exit1:
5109 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5110 for speed, we usually do not want restrictions in the outer loop to "infect"
5111 the decision for the inner loop. For example, if an outer-loop node
5112 in the SCC contains a statement with a fixed layout, that should not
5113 prevent the inner loop from using a different layout. Conversely,
5114 the inner loop should not dictate a layout to the outer loop: if the
5115 outer loop does a lot of computation, then it may not be efficient to
5116 do all of that computation in the inner loop's preferred layout.
5118 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5119 and S5+S7 (inner). We also try to arrange partitions so that:
5121 - the partition for an outer loop comes before the partition for
5122 an inner loop
5124 - if a sibling loop A dominates a sibling loop B, A's partition
5125 comes before B's
5127 This gives the following partition dag for the example above:
        S1       S3
         \       /
          S2+S4+S8    S6
           |   \\     /
           |    S5+S7
           |
          S9
5137 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5138 one for a reversal of the edge S7->S8.
5140 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5141 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5142 preferred layout against the cost of changing the layout on entry to the
5143 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5145 Although this works well when optimizing for speed, it has the downside
5146 when optimizing for size that the choice of layout for S5+S7 is completely
5147 independent of S9, which lessens the chance of reducing the overall number
5148 of permutations. We therefore do not partition SCCs when optimizing
5149 for size.
5151 To give a concrete example of the difference between optimizing
5152 for size and speed, consider:
5154 a[0] = (b[1] << c[3]) - d[1];
5155 a[1] = (b[0] << c[2]) - d[0];
5156 a[2] = (b[3] << c[1]) - d[3];
5157 a[3] = (b[2] << c[0]) - d[2];
5159 There are three different layouts here: one for a, one for b and d,
5160 and one for c. When optimizing for speed it is better to permute each
5161 of b, c and d into the order required by a, since those permutations
5162 happen in parallel. But when optimizing for size, it is better to:
5164 - permute c into the same order as b
5165 - do the arithmetic
5166 - permute the result into the order required by a
5168 This gives 2 permutations rather than 3. */
5170 class vect_optimize_slp_pass
5172 public:
5173 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5174 void run ();
5176 private:
5177 /* Graph building. */
5178 struct loop *containing_loop (slp_tree);
5179 bool is_cfg_latch_edge (graph_edge *);
5180 void build_vertices (hash_set<slp_tree> &, slp_tree);
5181 void build_vertices ();
5182 void build_graph ();
5184 /* Partitioning. */
5185 void create_partitions ();
5186 template<typename T> void for_each_partition_edge (unsigned int, T);
5188 /* Layout selection. */
5189 bool is_compatible_layout (slp_tree, unsigned int);
5190 int change_layout_cost (slp_tree, unsigned int, unsigned int);
5191 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5192 unsigned int);
5193 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5194 int, unsigned int);
5195 int internal_node_cost (slp_tree, int, unsigned int);
5196 void start_choosing_layouts ();
5198 /* Cost propagation. */
5199 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5200 unsigned int, unsigned int);
5201 slpg_layout_cost total_in_cost (unsigned int);
5202 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5203 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5204 void forward_pass ();
5205 void backward_pass ();
5207 /* Rematerialization. */
5208 slp_tree get_result_with_layout (slp_tree, unsigned int);
5209 void materialize ();
5211 /* Clean-up. */
5212 void remove_redundant_permutations ();
5214 void dump ();
5216 vec_info *m_vinfo;
5218 /* True if we should optimize the graph for size, false if we should
5219 optimize it for speed. (It wouldn't be easy to make this decision
5220 more locally.) */
5221 bool m_optimize_size;
5223 /* A graph of all SLP nodes, with edges leading from uses to definitions.
5224 In other words, a node's predecessors are its slp_tree parents and
5225 a node's successors are its slp_tree children. */
5226 graph *m_slpg = nullptr;
5228 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5229 auto_vec<slpg_vertex> m_vertices;
5231 /* The list of all leaves of M_SLPG, such as external definitions, constants,
5232 and loads. */
5233 auto_vec<int> m_leafs;
5235 /* This array has one entry for every vector layout that we're considering.
5236 Element 0 is null and indicates "no change". Other entries describe
5237 permutations that are inherent in the current graph and that we would
5238 like to reverse if possible.
5240 For example, a permutation { 1, 2, 3, 0 } means that something has
5241 effectively been permuted in that way, such as a load group
5242 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5243 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5244 in order to put things "back" in order. */
5245 auto_vec<vec<unsigned> > m_perms;
5247 /* A partitioning of the nodes for which a layout must be chosen.
5248 Each partition represents an <SCC, cfg loop> pair; that is,
5249 nodes in different SCCs belong to different partitions, and nodes
5250 within an SCC can be further partitioned according to a containing
5251 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5253 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5254 from leaves (such as loads) to roots (such as stores).
5256 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5257 auto_vec<slpg_partition_info> m_partitions;
5259 /* The list of all nodes for which a layout must be chosen. Nodes for
5260 partition P come before the nodes for partition P+1. Nodes within a
5261 partition are in reverse postorder. */
5262 auto_vec<unsigned int> m_partitioned_nodes;
5264 /* Index P * num-layouts + L contains the cost of using layout L
5265 for partition P. */
5266 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5268 /* Index N * num-layouts + L, if nonnull, is a node that provides the
5269 original output of node N adjusted to have layout L. */
5270 auto_vec<slp_tree> m_node_layouts;
5273 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
5274 Also record whether we should optimize anything for speed rather
5275 than size. */
5277 void
5278 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5279 slp_tree node)
5281 unsigned i;
5282 slp_tree child;
5284 if (visited.add (node))
5285 return;
5287 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5289 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5290 if (optimize_bb_for_speed_p (bb))
5291 m_optimize_size = false;
5294 node->vertex = m_vertices.length ();
5295 m_vertices.safe_push (slpg_vertex (node));
5297 bool leaf = true;
5298 bool force_leaf = false;
5299 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5300 if (child)
5302 leaf = false;
5303 build_vertices (visited, child);
5305 else
5306 force_leaf = true;
5307 /* Since SLP discovery works along use-def edges all cycles have an
5308 entry - but there's the exception of cycles where we do not handle
5309 the entry explicitly (but with a NULL SLP node), like some reductions
5310 and inductions. Force those SLP PHIs to act as leafs to make them
5311 backwards reachable. */
5312 if (leaf || force_leaf)
5313 m_leafs.safe_push (node->vertex);
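/* A standalone sketch of the leaf collection above on a toy graph (made-up
   adjacency; -1 models a NULL SLP child): a node with no children is a
   leaf, and a node with a NULL child is forced to act as a leaf even though
   it has other children.

     #include <cstdio>
     #include <vector>

     struct toy_node { std::vector<int> children; };

     static void
     collect_leaves (const std::vector<toy_node> &g, int n,
                     std::vector<bool> &visited, std::vector<int> &leaves)
     {
       if (visited[n])
         return;
       visited[n] = true;
       bool leaf = true, force_leaf = false;
       for (int c : g[n].children)
         if (c >= 0)
           {
             leaf = false;
             collect_leaves (g, c, visited, leaves);
           }
         else
           force_leaf = true;
       if (leaf || force_leaf)
         leaves.push_back (n);
     }

     int main ()
     {
       // Node 0 uses nodes 1 and 2; node 2 has a real child and a NULL
       // backedge child, so both 1 and 2 are recorded as leaves.
       std::vector<toy_node> g = { { { 1, 2 } }, { { } }, { { -1, 1 } } };
       std::vector<bool> visited (g.size ());
       std::vector<int> leaves;
       collect_leaves (g, 0, visited, leaves);
       for (int l : leaves)
         std::printf ("leaf %d\n", l);
       return 0;
     }  */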
5316 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
5318 void
5319 vect_optimize_slp_pass::build_vertices ()
5321 hash_set<slp_tree> visited;
5322 unsigned i;
5323 slp_instance instance;
5324 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5325 build_vertices (visited, SLP_INSTANCE_TREE (instance));
5328 /* Apply (reverse) bijective PERM to VEC. */
5330 template <class T>
5331 static void
5332 vect_slp_permute (vec<unsigned> perm,
5333 vec<T> &vec, bool reverse)
5335 auto_vec<T, 64> saved;
5336 saved.create (vec.length ());
5337 for (unsigned i = 0; i < vec.length (); ++i)
5338 saved.quick_push (vec[i]);
5340 if (reverse)
5342 for (unsigned i = 0; i < vec.length (); ++i)
5343 vec[perm[i]] = saved[i];
5344 for (unsigned i = 0; i < vec.length (); ++i)
5345 gcc_assert (vec[perm[i]] == saved[i]);
5347 else
5349 for (unsigned i = 0; i < vec.length (); ++i)
5350 vec[i] = saved[perm[i]];
5351 for (unsigned i = 0; i < vec.length (); ++i)
5352 gcc_assert (vec[i] == saved[perm[i]]);
5356 /* Return the cfg loop that contains NODE. */
5358 struct loop *
5359 vect_optimize_slp_pass::containing_loop (slp_tree node)
5361 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5362 if (!rep)
5363 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5364 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5367 /* Return true if UD (an edge from a use to a definition) is associated
5368 with a loop latch edge in the cfg. */
5370 bool
5371 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5373 slp_tree use = m_vertices[ud->src].node;
5374 slp_tree def = m_vertices[ud->dest].node;
5375 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5376 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5377 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5378 return false;
5380 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5381 return (is_a<gphi *> (use_rep->stmt)
5382 && bb_loop_header_p (gimple_bb (use_rep->stmt))
5383 && containing_loop (def) == containing_loop (use));
5386 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5387 a nonnull data field. */
5389 void
5390 vect_optimize_slp_pass::build_graph ()
5392 m_optimize_size = true;
5393 build_vertices ();
5395 m_slpg = new_graph (m_vertices.length ());
5396 for (slpg_vertex &v : m_vertices)
5397 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5398 if (child)
5400 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5401 if (is_cfg_latch_edge (ud))
5402 ud->data = this;
5406 /* Return true if E corresponds to a loop latch edge in the cfg. */
5408 static bool
5409 skip_cfg_latch_edges (graph_edge *e)
5411 return e->data;
5414 /* Create the node partitions. */
5416 void
5417 vect_optimize_slp_pass::create_partitions ()
5419 /* Calculate a postorder of the graph, ignoring edges that correspond
5420 to natural latch edges in the cfg. Reading the vector from the end
5421 to the beginning gives the reverse postorder. */
5422 auto_vec<int> initial_rpo;
5423 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5424 false, NULL, skip_cfg_latch_edges);
5425 gcc_assert (initial_rpo.length () == m_vertices.length ());
5427 /* Calculate the strongly connected components of the graph. */
5428 auto_vec<int> scc_grouping;
5429 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5431 /* Create a new index order in which all nodes from the same SCC are
5432 consecutive. Use scc_pos to record the index of the first node in
5433 each SCC. */
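/* For example (sizes invented): with three SCCs containing 2, 1 and 3
nodes respectively, scc_pos is built as { 0, 2, 3 }; the distribution
loop below then bumps each entry past its SCC's nodes, leaving
{ 2, 3, 6 }, i.e. one past the last node of each SCC.  */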
5434 auto_vec<unsigned int> scc_pos (num_sccs);
5435 int last_component = -1;
5436 unsigned int node_count = 0;
5437 for (unsigned int node_i : scc_grouping)
5439 if (last_component != m_slpg->vertices[node_i].component)
5441 last_component = m_slpg->vertices[node_i].component;
5442 gcc_assert (last_component == int (scc_pos.length ()));
5443 scc_pos.quick_push (node_count);
5445 node_count += 1;
5447 gcc_assert (node_count == initial_rpo.length ()
5448 && last_component + 1 == int (num_sccs));
5450 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5451 inside each SCC following the RPO we calculated above. The fact that
5452 we ignored natural latch edges when calculating the RPO should ensure
5453 that, for natural loop nests:
5455 - the first node that we encounter in a cfg loop is the loop header phi
5456 - the loop header phis are in dominance order
5458 Arranging for this is an optimization (see below) rather than a
5459 correctness issue. Unnatural loops with a tangled mess of backedges
5460 will still work correctly, but might give poorer results.
5462 Also update scc_pos so that it gives 1 + the index of the last node
5463 in the SCC. */
5464 m_partitioned_nodes.safe_grow (node_count);
5465 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5467 unsigned int node_i = initial_rpo[old_i];
5468 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5469 m_partitioned_nodes[new_i] = node_i;
5472 /* When optimizing for speed, partition each SCC based on the containing
5473 cfg loop. The order we constructed above should ensure that, for natural
5474 cfg loops, we'll create sub-SCC partitions for outer loops before
5475 the corresponding sub-SCC partitions for inner loops. Similarly,
5476 when one sibling loop A dominates another sibling loop B, we should
5477 create a sub-SCC partition for A before a sub-SCC partition for B.
5479 As above, nothing depends for correctness on whether this achieves
5480 a natural nesting, but we should get better results when it does. */
5481 m_partitions.reserve (m_vertices.length ());
5482 unsigned int next_partition_i = 0;
5483 hash_map<struct loop *, int> loop_partitions;
5484 unsigned int rpo_begin = 0;
5485 unsigned int num_partitioned_nodes = 0;
5486 for (unsigned int rpo_end : scc_pos)
5488 loop_partitions.empty ();
5489 unsigned int partition_i = next_partition_i;
5490 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5492 /* Handle externals and constants optimistically throughout.
5493 But treat existing vectors as fixed since we do not handle
5494 permuting them. */
5495 unsigned int node_i = m_partitioned_nodes[rpo_i];
5496 auto &vertex = m_vertices[node_i];
5497 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5498 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5499 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5500 vertex.partition = -1;
5501 else
5503 bool existed;
5504 if (m_optimize_size)
5505 existed = next_partition_i > partition_i;
5506 else
5508 struct loop *loop = containing_loop (vertex.node);
5509 auto &entry = loop_partitions.get_or_insert (loop, &existed);
5510 if (!existed)
5511 entry = next_partition_i;
5512 partition_i = entry;
5514 if (!existed)
5516 m_partitions.quick_push (slpg_partition_info ());
5517 next_partition_i += 1;
5519 vertex.partition = partition_i;
5520 num_partitioned_nodes += 1;
5521 m_partitions[partition_i].node_end += 1;
5524 rpo_begin = rpo_end;
5527 /* Assign ranges of consecutive node indices to each partition,
5528 in partition order. Start with node_end being the same as
5529 node_begin so that the next loop can use it as a counter. */
5530 unsigned int node_begin = 0;
5531 for (auto &partition : m_partitions)
5533 partition.node_begin = node_begin;
5534 node_begin += partition.node_end;
5535 partition.node_end = partition.node_begin;
5537 gcc_assert (node_begin == num_partitioned_nodes);
5539 /* Finally build the list of nodes in partition order. */
5540 m_partitioned_nodes.truncate (num_partitioned_nodes);
5541 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5543 int partition_i = m_vertices[node_i].partition;
5544 if (partition_i >= 0)
5546 unsigned int order_i = m_partitions[partition_i].node_end++;
5547 m_partitioned_nodes[order_i] = node_i;
5552 /* Look for edges from earlier partitions into node NODE_I and edges from
5553 node NODE_I into later partitions. Call:
5555 FN (ud, other_node_i)
5557 for each such use-to-def edge ud, where other_node_i is the node at the
5558 other end of the edge. */
5560 template<typename T>
5561 void
5562 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5564 int partition_i = m_vertices[node_i].partition;
5565 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5566 pred; pred = pred->pred_next)
5568 int src_partition_i = m_vertices[pred->src].partition;
5569 if (src_partition_i >= 0 && src_partition_i != partition_i)
5570 fn (pred, pred->src);
5572 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5573 succ; succ = succ->succ_next)
5575 int dest_partition_i = m_vertices[succ->dest].partition;
5576 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
5577 fn (succ, succ->dest);
5581 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
5582 that NODE would operate on. This test is independent of NODE's actual
5583 operation. */
5585 bool
5586 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
5587 unsigned int layout_i)
5589 if (layout_i == 0)
5590 return true;
5592 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
5593 return false;
5595 return true;
5598 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
5599 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
5600 layouts is incompatible with NODE or if the change is not possible for
5601 some other reason.
5603 The properties taken from NODE include the number of lanes and the
5604 vector type. The actual operation doesn't matter. */
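/* For instance (a made-up case): with SLP_TREE_LANES (NODE) == 4,
FROM_LAYOUT_I describing { 1, 0, 3, 2 } and TO_LAYOUT_I == 0, the code
below costs the single-input VEC_PERM_EXPR that selects lanes
{ 1, 0, 3, 2 } of NODE; the result is the number of vector permutations
the target needs for that, clamped to a minimum of 1.  */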
5606 int
5607 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
5608 unsigned int from_layout_i,
5609 unsigned int to_layout_i)
5611 if (!is_compatible_layout (node, from_layout_i)
5612 || !is_compatible_layout (node, to_layout_i))
5613 return -1;
5615 if (from_layout_i == to_layout_i)
5616 return 0;
5618 auto_vec<slp_tree, 1> children (1);
5619 children.quick_push (node);
5620 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
5621 if (from_layout_i > 0)
5622 for (unsigned int i : m_perms[from_layout_i])
5623 perm.quick_push ({ 0, i });
5624 else
5625 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
5626 perm.quick_push ({ 0, i });
5627 if (to_layout_i > 0)
5628 vect_slp_permute (m_perms[to_layout_i], perm, true);
5629 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
5630 children, false);
5631 if (count >= 0)
5632 return MAX (count, 1);
5634 /* ??? In principle we could try changing via layout 0, giving two
5635 layout changes rather than 1. Doing that would require
5636 corresponding support in get_result_with_layout. */
5637 return -1;
5640 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
5642 inline slpg_partition_layout_costs &
5643 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
5644 unsigned int layout_i)
5646 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
5649 /* Change PERM in one of two ways:
5651 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
5652 chosen for child I of NODE.
5654 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
5656 In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
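/* A small worked example (entries invented): suppose PERM is
{ (0, 1), (0, 0) }, i.e. it swaps the two lanes of child 0. If that
child's partition has chosen a layout whose permutation is { 1, 0 }
(the IN_LAYOUT_I < 0 case), each selected lane S is rewritten to
m_perms[layout][S], giving { (0, 0), (0, 1) } - the permutation folds
away. If the permutation for OUT_LAYOUT_I is also { 1, 0 }, the
entries are then reverse-permuted to put the output into that layout,
giving { (0, 1), (0, 0) } again.  */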
5658 void
5659 vect_optimize_slp_pass::
5660 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
5661 int in_layout_i, unsigned int out_layout_i)
5663 for (auto &entry : perm)
5665 int this_in_layout_i = in_layout_i;
5666 if (this_in_layout_i < 0)
5668 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
5669 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
5670 if (in_partition_i == -1u)
5671 continue;
5672 this_in_layout_i = m_partitions[in_partition_i].layout;
5674 if (this_in_layout_i > 0)
5675 entry.second = m_perms[this_in_layout_i][entry.second];
5677 if (out_layout_i > 0)
5678 vect_slp_permute (m_perms[out_layout_i], perm, true);
5681 /* Check whether the target allows NODE to be rearranged so that the node's
5682 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
5683 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
5685 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
5686 NODE can adapt to the layout changes that have (perhaps provisionally)
5687 been chosen for NODE's children, so that no extra permutations are
5688 needed on either the input or the output of NODE.
5690 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
5691 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
5693 IN_LAYOUT_I has no meaning for other types of node.
5695 Keeping the node as-is is always valid. If the target doesn't appear
5696 to support the node as-is, but might realistically support other layouts,
5697 then layout 0 instead has the cost of a worst-case permutation. On the
5698 one hand, this ensures that every node has at least one valid layout,
5699 avoiding what would otherwise be an awkward special case. On the other,
5700 it still encourages the pass to change an invalid pre-existing layout
5701 choice into a valid one. */
5703 int
5704 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
5705 unsigned int out_layout_i)
5707 const int fallback_cost = 1;
5709 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5711 auto_lane_permutation_t tmp_perm;
5712 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5714 /* Check that the child nodes support the chosen layout. Checking
5715 the first child is enough, since any second child would have the
5716 same shape. */
5717 auto first_child = SLP_TREE_CHILDREN (node)[0];
5718 if (in_layout_i > 0
5719 && !is_compatible_layout (first_child, in_layout_i))
5720 return -1;
5722 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
5723 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
5724 node, tmp_perm,
5725 SLP_TREE_CHILDREN (node),
5726 false);
5727 if (count < 0)
5729 if (in_layout_i == 0 && out_layout_i == 0)
5731 /* Use the fallback cost if the node could in principle support
5732 some nonzero layout for both the inputs and the outputs.
5733 Otherwise assume that the node will be rejected later
5734 and rebuilt from scalars. */
5735 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
5736 return fallback_cost;
5737 return 0;
5739 return -1;
5742 /* We currently have no way of telling whether the new layout is cheaper
5743 or more expensive than the old one. But at least in principle,
5744 it should be worth making zero permutations (whole-vector shuffles)
5745 cheaper than real permutations, in case the pass is able to remove
5746 the latter. */
5747 return count == 0 ? 0 : 1;
5750 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5751 if (rep
5752 && STMT_VINFO_DATA_REF (rep)
5753 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
5754 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
5756 auto_load_permutation_t tmp_perm;
5757 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5758 if (out_layout_i > 0)
5759 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
5761 poly_uint64 vf = 1;
5762 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
5763 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5764 unsigned int n_perms;
5765 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
5766 nullptr, vf, true, false, &n_perms))
5768 auto rep = SLP_TREE_REPRESENTATIVE (node);
5769 if (out_layout_i == 0)
5771 /* Use the fallback cost if the load is an N-to-N permutation.
5772 Otherwise assume that the node will be rejected later
5773 and rebuilt from scalars. */
5774 if (STMT_VINFO_GROUPED_ACCESS (rep)
5775 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
5776 == SLP_TREE_LANES (node)))
5777 return fallback_cost;
5778 return 0;
5780 return -1;
5783 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
5784 return n_perms == 0 ? 0 : 1;
5787 return 0;
5790 /* Decide which element layouts we should consider using. Calculate the
5791 weights associated with inserting layout changes on partition edges.
5792 Also mark partitions that cannot change layout, by setting their
5793 layout to zero. */
5795 void
5796 vect_optimize_slp_pass::start_choosing_layouts ()
5798 /* Used to assign unique permutation indices. */
5799 using perm_hash = unbounded_hashmap_traits<
5800 vec_free_hash_base<int_hash_base<unsigned>>,
5801 int_hash<int, -1, -2>
5803 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
5805 /* Layout 0 is "no change". */
5806 m_perms.safe_push (vNULL);
5808 /* Create layouts from existing permutations. */
5809 auto_load_permutation_t tmp_perm;
5810 for (unsigned int node_i : m_partitioned_nodes)
5812 /* Leafs also double as entries to the reverse graph. Allow the
5813 layout of those to be changed. */
5814 auto &vertex = m_vertices[node_i];
5815 auto &partition = m_partitions[vertex.partition];
5816 if (!m_slpg->vertices[node_i].succ)
5817 partition.layout = 0;
5819 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
5820 slp_tree node = vertex.node;
5821 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
5822 slp_tree child;
5823 unsigned HOST_WIDE_INT imin, imax = 0;
5824 bool any_permute = false;
5825 tmp_perm.truncate (0);
5826 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
5828 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
5829 unpermuted, record a layout that reverses this permutation.
5831 We would need more work to cope with loads that are internally
5832 permuted and also have inputs (such as masks for
5833 IFN_MASK_LOADs). */
5834 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
5835 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
5837 partition.layout = -1;
5838 continue;
5840 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
5841 imin = DR_GROUP_SIZE (dr_stmt) + 1;
5842 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5844 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
5845 && SLP_TREE_CHILDREN (node).length () == 1
5846 && (child = SLP_TREE_CHILDREN (node)[0])
5847 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
5848 .is_constant (&imin)))
5850 /* If the child has the same vector size as this node,
5851 reversing the permutation can make the permutation a no-op.
5852 In other cases it can change a true permutation into a
5853 full-vector extract. */
5854 tmp_perm.reserve (SLP_TREE_LANES (node));
5855 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5856 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
5858 else
5859 continue;
5861 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5863 unsigned idx = tmp_perm[j];
5864 imin = MIN (imin, idx);
5865 imax = MAX (imax, idx);
5866 if (idx - tmp_perm[0] != j)
5867 any_permute = true;
5869 /* If the span doesn't match we'd disrupt VF computation; avoid
5870 that for now. */
5871 if (imax - imin + 1 != SLP_TREE_LANES (node))
5872 continue;
5873 /* If there's no permute there's no need to split one out. In this case
5874 we can consider turning a load into a permuted load, if that
5875 turns out to be cheaper than alternatives. */
5876 if (!any_permute)
5878 partition.layout = -1;
5879 continue;
5882 /* For now only handle true permutes, like
5883 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
5884 when permuting constants and invariants, keeping the permute
5885 bijective. */
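/* For instance (hypothetical): a group of four loads accessed as
{ a[2], a[3], a[0], a[1] } gives tmp_perm { 2, 3, 0, 1 }; every index
in [imin, imax] occurs exactly once, so the check below accepts it and
{ 2, 3, 0, 1 } (biased by imin) is recorded as a layout candidate.  */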
5886 auto_sbitmap load_index (SLP_TREE_LANES (node));
5887 bitmap_clear (load_index);
5888 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5889 bitmap_set_bit (load_index, tmp_perm[j] - imin);
5890 unsigned j;
5891 for (j = 0; j < SLP_TREE_LANES (node); ++j)
5892 if (!bitmap_bit_p (load_index, j))
5893 break;
5894 if (j != SLP_TREE_LANES (node))
5895 continue;
5897 vec<unsigned> perm = vNULL;
5898 perm.safe_grow (SLP_TREE_LANES (node), true);
5899 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5900 perm[j] = tmp_perm[j] - imin;
5902 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
5904 /* Continue to use existing layouts, but don't add any more. */
5905 int *entry = layout_ids.get (perm);
5906 partition.layout = entry ? *entry : 0;
5907 perm.release ();
5909 else
5911 bool existed;
5912 int &layout_i = layout_ids.get_or_insert (perm, &existed);
5913 if (existed)
5914 perm.release ();
5915 else
5917 layout_i = m_perms.length ();
5918 m_perms.safe_push (perm);
5920 partition.layout = layout_i;
5924 /* Initially assume that every layout is possible and has zero cost
5925 in every partition. */
5926 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
5927 * m_perms.length ());
5929 /* We have to mark outgoing permutations facing non-associating-reduction
5930 graph entries that are not represented as to be materialized.
5931 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
5932 for (slp_instance instance : m_vinfo->slp_instances)
5933 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
5935 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
5936 m_partitions[m_vertices[node_i].partition].layout = 0;
5938 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
5940 stmt_vec_info stmt_info
5941 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
5942 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
5943 if (needs_fold_left_reduction_p (TREE_TYPE
5944 (gimple_get_lhs (stmt_info->stmt)),
5945 STMT_VINFO_REDUC_CODE (reduc_info)))
5947 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
5948 m_partitions[m_vertices[node_i].partition].layout = 0;
5952 /* Check which layouts each node and partition can handle. Calculate the
5953 weights associated with inserting layout changes on edges. */
5954 for (unsigned int node_i : m_partitioned_nodes)
5956 auto &vertex = m_vertices[node_i];
5957 auto &partition = m_partitions[vertex.partition];
5958 slp_tree node = vertex.node;
5960 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5962 vertex.weight = vect_slp_node_weight (node);
5964 /* We do not handle stores with a permutation, so all
5965 incoming permutations must have been materialized.
5967 We also don't handle masked grouped loads, which lack a
5968 permutation vector. In this case the memory locations
5969 form an implicit second input to the loads, on top of the
5970 explicit mask input, and the memory input's layout cannot
5971 be changed.
5973 On the other hand, we do support permuting gather loads and
5974 masked gather loads, where each scalar load is independent
5975 of the others. This can be useful if the address/index input
5976 benefits from permutation. */
5977 if (STMT_VINFO_DATA_REF (rep)
5978 && STMT_VINFO_GROUPED_ACCESS (rep)
5979 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
5980 partition.layout = 0;
5982 /* We cannot change the layout of an operation that does not
5983 operate independently on each lane. Note this is an explicit
5984 negative list since that's much shorter than the respective
5985 positive one, but it's critical to keep maintaining it. */
5986 if (is_gimple_call (STMT_VINFO_STMT (rep)))
5987 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
5989 case CFN_COMPLEX_ADD_ROT90:
5990 case CFN_COMPLEX_ADD_ROT270:
5991 case CFN_COMPLEX_MUL:
5992 case CFN_COMPLEX_MUL_CONJ:
5993 case CFN_VEC_ADDSUB:
5994 case CFN_VEC_FMADDSUB:
5995 case CFN_VEC_FMSUBADD:
5996 partition.layout = 0;
5997 default:;
6001 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6003 auto &other_vertex = m_vertices[other_node_i];
6005 /* Count the number of edges from earlier partitions and the number
6006 of edges to later partitions. */
6007 if (other_vertex.partition < vertex.partition)
6008 partition.in_degree += 1;
6009 else
6010 partition.out_degree += 1;
6012 /* If the current node uses the result of OTHER_NODE_I, accumulate
6013 the effects of that. */
6014 if (ud->src == int (node_i))
6016 other_vertex.out_weight += vertex.weight;
6017 other_vertex.out_degree += 1;
6020 for_each_partition_edge (node_i, process_edge);
6024 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6025 its current (provisional) choice of layout. The inputs do not necessarily
6026 have the same layout as each other. */
6028 slpg_layout_cost
6029 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6031 auto &vertex = m_vertices[node_i];
6032 slpg_layout_cost cost;
6033 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6035 auto &other_vertex = m_vertices[other_node_i];
6036 if (other_vertex.partition < vertex.partition)
6038 auto &other_partition = m_partitions[other_vertex.partition];
6039 auto &other_costs = partition_layout_costs (other_vertex.partition,
6040 other_partition.layout);
6041 slpg_layout_cost this_cost = other_costs.in_cost;
6042 this_cost.add_serial_cost (other_costs.internal_cost);
6043 this_cost.split (other_partition.out_degree);
6044 cost.add_parallel_cost (this_cost);
6047 for_each_partition_edge (node_i, add_cost);
6048 return cost;
6051 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6052 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6053 slpg_layout_cost::impossible () if the change isn't possible. */
6055 slpg_layout_cost
6056 vect_optimize_slp_pass::
6057 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6058 unsigned int layout2_i)
6060 auto &def_vertex = m_vertices[ud->dest];
6061 auto &use_vertex = m_vertices[ud->src];
6062 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6063 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6064 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6065 use_layout_i);
6066 if (factor < 0)
6067 return slpg_layout_cost::impossible ();
6069 /* We have a choice of putting the layout change at the site of the
6070 definition or at the site of the use. Prefer the former when
6071 optimizing for size or when the execution frequency of the
6072 definition is no greater than the combined execution frequencies of
6073 the uses. When putting the layout change at the site of the definition,
6074 divvy up the cost among all consumers. */
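/* As an illustration (numbers invented): if the definition has weight 2
and out_degree 4, its uses have a combined weight of 5 and the change
has factor 1, then the cost 2 * 1 is charged at the definition and
divided among the 4 outgoing edges. If instead the definition were
hotter than all of its uses combined, the cost charged on this
particular edge would be the use's weight times the factor.  */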
6075 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6077 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6078 cost.split (def_vertex.out_degree);
6079 return cost;
6081 return { use_vertex.weight * factor, m_optimize_size };
6084 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6085 partition; FROM_NODE_I could be the definition node or the use node.
6086 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6087 Return the cost of any necessary fix-ups on edge UD, or return
6088 slpg_layout_cost::impossible () if the change isn't possible.
6090 At this point, FROM_NODE_I's partition has chosen the cheapest
6091 layout based on the information available so far, but this choice
6092 is only provisional. */
6094 slpg_layout_cost
6095 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6096 unsigned int to_layout_i)
6098 auto &from_vertex = m_vertices[from_node_i];
6099 unsigned int from_partition_i = from_vertex.partition;
6100 slpg_partition_info &from_partition = m_partitions[from_partition_i];
6101 gcc_assert (from_partition.layout >= 0);
6103 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6104 with its current layout preference. */
6105 slpg_layout_cost cost = slpg_layout_cost::impossible ();
6106 auto edge_cost = edge_layout_cost (ud, from_node_i,
6107 from_partition.layout, to_layout_i);
6108 if (edge_cost.is_possible ())
6110 auto &from_costs = partition_layout_costs (from_partition_i,
6111 from_partition.layout);
6112 cost = from_costs.in_cost;
6113 cost.add_serial_cost (from_costs.internal_cost);
6114 cost.split (from_partition.out_degree);
6115 cost.add_serial_cost (edge_cost);
6117 else if (from_partition.layout == 0)
6118 /* We must allow the source partition to have layout 0 as a fallback,
6119 in case all other options turn out to be impossible. */
6120 return cost;
6122 /* Take the minimum of that cost and the cost that applies if
6123 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6124 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6125 to_layout_i);
6126 if (direct_layout_costs.is_possible ())
6128 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6129 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6130 direct_cost.split (from_partition.out_degree);
6131 if (!cost.is_possible ()
6132 || direct_cost.is_better_than (cost, m_optimize_size))
6133 cost = direct_cost;
6136 return cost;
6139 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6140 partition; TO_NODE_I could be the definition node or the use node.
6141 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6142 return the cost of any necessary fix-ups on edge UD, or
6143 slpg_layout_cost::impossible () if the choice cannot be made.
6145 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6147 slpg_layout_cost
6148 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6149 unsigned int from_layout_i)
6151 auto &to_vertex = m_vertices[to_node_i];
6152 unsigned int to_partition_i = to_vertex.partition;
6153 slpg_partition_info &to_partition = m_partitions[to_partition_i];
6154 gcc_assert (to_partition.layout >= 0);
6156 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6157 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6158 any other inputs keep their current choice of layout. */
6159 auto &to_costs = partition_layout_costs (to_partition_i,
6160 to_partition.layout);
6161 if (ud->src == int (to_node_i)
6162 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6164 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6165 auto old_layout = from_partition.layout;
6166 from_partition.layout = from_layout_i;
6167 int factor = internal_node_cost (to_vertex.node, -1,
6168 to_partition.layout);
6169 from_partition.layout = old_layout;
6170 if (factor >= 0)
6172 slpg_layout_cost cost = to_costs.out_cost;
6173 cost.add_serial_cost ({ to_vertex.weight * factor,
6174 m_optimize_size });
6175 cost.split (to_partition.in_degree);
6176 return cost;
6180 /* Compute the cost if we insert any necessary layout change on edge UD. */
6181 auto edge_cost = edge_layout_cost (ud, to_node_i,
6182 to_partition.layout, from_layout_i);
6183 if (edge_cost.is_possible ())
6185 slpg_layout_cost cost = to_costs.out_cost;
6186 cost.add_serial_cost (to_costs.internal_cost);
6187 cost.split (to_partition.in_degree);
6188 cost.add_serial_cost (edge_cost);
6189 return cost;
6192 return slpg_layout_cost::impossible ();
6195 /* Make a forward pass through the partitions, accumulating input costs.
6196 Make a tentative (provisional) choice of layout for each partition,
6197 ensuring that this choice still allows later partitions to keep
6198 their original layout. */
6200 void
6201 vect_optimize_slp_pass::forward_pass ()
6203 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6204 ++partition_i)
6206 auto &partition = m_partitions[partition_i];
6208 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6209 the incoming cost that would apply if every predecessor partition
6210 keeps its current layout. This is used within the loop below. */
6211 slpg_layout_cost in_cost;
6212 slp_tree single_node = nullptr;
6213 if (partition.node_end == partition.node_begin + 1)
6215 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6216 single_node = m_vertices[node_i].node;
6217 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6218 in_cost = total_in_cost (node_i);
6221 /* Go through the possible layouts. Decide which ones are valid
6222 for this partition and record which of the valid layouts has
6223 the lowest cost. */
6224 unsigned int min_layout_i = 0;
6225 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6226 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6228 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6229 if (!layout_costs.is_possible ())
6230 continue;
6232 /* If the recorded layout is already 0 then the layout cannot
6233 change. */
6234 if (partition.layout == 0 && layout_i != 0)
6236 layout_costs.mark_impossible ();
6237 continue;
6240 bool is_possible = true;
6241 for (unsigned int order_i = partition.node_begin;
6242 order_i < partition.node_end; ++order_i)
6244 unsigned int node_i = m_partitioned_nodes[order_i];
6245 auto &vertex = m_vertices[node_i];
6247 /* Reject the layout if it is individually incompatible
6248 with any node in the partition. */
6249 if (!is_compatible_layout (vertex.node, layout_i))
6251 is_possible = false;
6252 break;
6255 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6257 auto &other_vertex = m_vertices[other_node_i];
6258 if (other_vertex.partition < vertex.partition)
6260 /* Accumulate the incoming costs from earlier
6261 partitions, plus the cost of any layout changes
6262 on UD itself. */
6263 auto cost = forward_cost (ud, other_node_i, layout_i);
6264 if (!cost.is_possible ())
6265 is_possible = false;
6266 else
6267 layout_costs.in_cost.add_parallel_cost (cost);
6269 else
6270 /* Reject the layout if it would make layout 0 impossible
6271 for later partitions. This amounts to testing that the
6272 target supports reversing the layout change on edges
6273 to later partitions.
6275 In principle, it might be possible to push a layout
6276 change all the way down a graph, so that it never
6277 needs to be reversed and so that the target doesn't
6278 need to support the reverse operation. But it would
6279 be awkward to bail out if we hit a partition that
6280 does not support the new layout, especially since
6281 we are not dealing with a lattice. */
6282 is_possible &= edge_layout_cost (ud, other_node_i, 0,
6283 layout_i).is_possible ();
6285 for_each_partition_edge (node_i, add_cost);
6287 /* Accumulate the cost of using LAYOUT_I within NODE,
6288 both for the inputs and the outputs. */
6289 int factor = internal_node_cost (vertex.node, layout_i,
6290 layout_i);
6291 if (factor < 0)
6293 is_possible = false;
6294 break;
6296 else if (factor)
6297 layout_costs.internal_cost.add_serial_cost
6298 ({ vertex.weight * factor, m_optimize_size });
6300 if (!is_possible)
6302 layout_costs.mark_impossible ();
6303 continue;
6306 /* Combine the incoming and partition-internal costs. */
6307 slpg_layout_cost combined_cost = layout_costs.in_cost;
6308 combined_cost.add_serial_cost (layout_costs.internal_cost);
6310 /* If this partition consists of a single VEC_PERM_EXPR, see
6311 if the VEC_PERM_EXPR can be changed to support output layout
6312 LAYOUT_I while keeping all the provisional choices of input
6313 layout. */
6314 if (single_node
6315 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6317 int factor = internal_node_cost (single_node, -1, layout_i);
6318 if (factor >= 0)
6320 auto weight = m_vertices[single_node->vertex].weight;
6321 slpg_layout_cost internal_cost
6322 = { weight * factor, m_optimize_size };
6324 slpg_layout_cost alt_cost = in_cost;
6325 alt_cost.add_serial_cost (internal_cost);
6326 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6328 combined_cost = alt_cost;
6329 layout_costs.in_cost = in_cost;
6330 layout_costs.internal_cost = internal_cost;
6335 /* Record the layout with the lowest cost. Prefer layout 0 in
6336 the event of a tie between it and another layout. */
6337 if (!min_layout_cost.is_possible ()
6338 || combined_cost.is_better_than (min_layout_cost,
6339 m_optimize_size))
6341 min_layout_i = layout_i;
6342 min_layout_cost = combined_cost;
6346 /* This loop's handling of earlier partitions should ensure that
6347 choosing the original layout for the current partition is no
6348 less valid than it was in the original graph, even with the
6349 provisional layout choices for those earlier partitions. */
6350 gcc_assert (min_layout_cost.is_possible ());
6351 partition.layout = min_layout_i;
6355 /* Make a backward pass through the partitions, accumulating output costs.
6356 Make a final choice of layout for each partition. */
6358 void
6359 vect_optimize_slp_pass::backward_pass ()
6361 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6363 auto &partition = m_partitions[partition_i];
6365 unsigned int min_layout_i = 0;
6366 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6367 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6369 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6370 if (!layout_costs.is_possible ())
6371 continue;
6373 /* Accumulate the costs from successor partitions. */
6374 bool is_possible = true;
6375 for (unsigned int order_i = partition.node_begin;
6376 order_i < partition.node_end; ++order_i)
6378 unsigned int node_i = m_partitioned_nodes[order_i];
6379 auto &vertex = m_vertices[node_i];
6380 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6382 auto &other_vertex = m_vertices[other_node_i];
6383 auto &other_partition = m_partitions[other_vertex.partition];
6384 if (other_vertex.partition > vertex.partition)
6386 /* Accumulate the costs from later
6387 partitions, plus the cost of any layout changes
6388 on UD itself. */
6389 auto cost = backward_cost (ud, other_node_i, layout_i);
6390 if (!cost.is_possible ())
6391 is_possible = false;
6392 else
6393 layout_costs.out_cost.add_parallel_cost (cost);
6395 else
6396 /* Make sure that earlier partitions can (if necessary
6397 or beneficial) keep the layout that they chose in
6398 the forward pass. This ensures that there is at
6399 least one valid choice of layout. */
6400 is_possible &= edge_layout_cost (ud, other_node_i,
6401 other_partition.layout,
6402 layout_i).is_possible ();
6404 for_each_partition_edge (node_i, add_cost);
6406 if (!is_possible)
6408 layout_costs.mark_impossible ();
6409 continue;
6412 /* Locally combine the costs from the forward and backward passes.
6413 (This combined cost is not passed on, since that would lead
6414 to double counting.) */
6415 slpg_layout_cost combined_cost = layout_costs.in_cost;
6416 combined_cost.add_serial_cost (layout_costs.internal_cost);
6417 combined_cost.add_serial_cost (layout_costs.out_cost);
6419 /* Record the layout with the lowest cost. Prefer layout 0 in
6420 the event of a tie between it and another layout. */
6421 if (!min_layout_cost.is_possible ()
6422 || combined_cost.is_better_than (min_layout_cost,
6423 m_optimize_size))
6425 min_layout_i = layout_i;
6426 min_layout_cost = combined_cost;
6430 gcc_assert (min_layout_cost.is_possible ());
6431 partition.layout = min_layout_i;
6435 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6436 NODE already has the layout that was selected for its partition. */
6438 slp_tree
6439 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6440 unsigned int to_layout_i)
6442 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6443 slp_tree result = m_node_layouts[result_i];
6444 if (result)
6445 return result;
6447 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6448 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6449 /* We can't permute vector defs in place. */
6450 && SLP_TREE_VEC_DEFS (node).is_empty ()))
6452 /* If the vector is uniform or unchanged, there's nothing to do. */
6453 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6454 result = node;
6455 else
6457 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6458 result = vect_create_new_slp_node (scalar_ops);
6459 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6462 else
6464 unsigned int partition_i = m_vertices[node->vertex].partition;
6465 unsigned int from_layout_i = m_partitions[partition_i].layout;
6466 if (from_layout_i == to_layout_i)
6467 return node;
6469 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6470 permutation instead of a serial one. Leave the new permutation
6471 in TMP_PERM on success. */
6472 auto_lane_permutation_t tmp_perm;
6473 unsigned int num_inputs = 1;
6474 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6476 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6477 if (from_layout_i != 0)
6478 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6479 if (to_layout_i != 0)
6480 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6481 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6482 tmp_perm,
6483 SLP_TREE_CHILDREN (node),
6484 false) >= 0)
6485 num_inputs = SLP_TREE_CHILDREN (node).length ();
6486 else
6487 tmp_perm.truncate (0);
6490 if (dump_enabled_p ())
6492 if (tmp_perm.length () > 0)
6493 dump_printf_loc (MSG_NOTE, vect_location,
6494 "duplicating permutation node %p with"
6495 " layout %d\n",
6496 (void *) node, to_layout_i);
6497 else
6498 dump_printf_loc (MSG_NOTE, vect_location,
6499 "inserting permutation node in place of %p\n",
6500 (void *) node);
6503 unsigned int num_lanes = SLP_TREE_LANES (node);
6504 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6505 if (SLP_TREE_SCALAR_STMTS (node).length ())
6507 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6508 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6509 if (from_layout_i != 0)
6510 vect_slp_permute (m_perms[from_layout_i], stmts, false);
6511 if (to_layout_i != 0)
6512 vect_slp_permute (m_perms[to_layout_i], stmts, true);
6514 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6515 SLP_TREE_LANES (result) = num_lanes;
6516 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6517 result->vertex = -1;
6519 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6520 if (tmp_perm.length ())
6522 lane_perm.safe_splice (tmp_perm);
6523 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6525 else
6527 lane_perm.create (num_lanes);
6528 for (unsigned j = 0; j < num_lanes; ++j)
6529 lane_perm.quick_push ({ 0, j });
6530 if (from_layout_i != 0)
6531 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6532 if (to_layout_i != 0)
6533 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6534 SLP_TREE_CHILDREN (result).safe_push (node);
6536 for (slp_tree child : SLP_TREE_CHILDREN (result))
6537 child->refcnt++;
6539 m_node_layouts[result_i] = result;
6540 return result;
6543 /* Apply the chosen vector layouts to the SLP graph. */
6545 void
6546 vect_optimize_slp_pass::materialize ()
6548 /* We no longer need the costs, so avoid having two O(N * P) arrays
6549 live at the same time. */
6550 m_partition_layout_costs.release ();
6551 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6553 auto_sbitmap fully_folded (m_vertices.length ());
6554 bitmap_clear (fully_folded);
6555 for (unsigned int node_i : m_partitioned_nodes)
6557 auto &vertex = m_vertices[node_i];
6558 slp_tree node = vertex.node;
6559 int layout_i = m_partitions[vertex.partition].layout;
6560 gcc_assert (layout_i >= 0);
6562 /* Rearrange the scalar statements to match the chosen layout. */
6563 if (layout_i > 0)
6564 vect_slp_permute (m_perms[layout_i],
6565 SLP_TREE_SCALAR_STMTS (node), true);
6567 /* Update load and lane permutations. */
6568 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6570 /* First try to absorb the input vector layouts. If that fails,
6571 force the inputs to have layout LAYOUT_I too. We checked that
6572 that was possible before deciding to use nonzero output layouts.
6573 (Note that at this stage we don't really have any guarantee that
6574 the target supports the original VEC_PERM_EXPR.) */
6575 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6576 auto_lane_permutation_t tmp_perm;
6577 tmp_perm.safe_splice (perm);
6578 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
6579 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6580 tmp_perm,
6581 SLP_TREE_CHILDREN (node),
6582 false) >= 0)
6584 if (dump_enabled_p ()
6585 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
6586 perm.begin ()))
6587 dump_printf_loc (MSG_NOTE, vect_location,
6588 "absorbing input layouts into %p\n",
6589 (void *) node);
6590 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
6591 bitmap_set_bit (fully_folded, node_i);
6593 else
6595 /* Not MSG_MISSED because it would make no sense to users. */
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_NOTE, vect_location,
6598 "failed to absorb input layouts into %p\n",
6599 (void *) node);
6600 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
6603 else
6605 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
6606 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
6607 if (layout_i > 0)
6608 /* ??? When we handle non-bijective permutes the idea
6609 is that we can force the load-permutation to be
6610 { min, min + 1, min + 2, ... max }. But then the
6611 scalar defs might no longer match the lane content
6612 which means wrong-code with live lane vectorization.
6613 So we possibly have to have NULL entries for those. */
6614 vect_slp_permute (m_perms[layout_i], load_perm, true);
6618 /* Do this before any nodes disappear, since it involves a walk
6619 over the leaves. */
6620 remove_redundant_permutations ();
6622 /* Replace each child with a correctly laid-out version. */
6623 for (unsigned int node_i : m_partitioned_nodes)
6625 /* Skip nodes that have already been handled above. */
6626 if (bitmap_bit_p (fully_folded, node_i))
6627 continue;
6629 auto &vertex = m_vertices[node_i];
6630 int in_layout_i = m_partitions[vertex.partition].layout;
6631 gcc_assert (in_layout_i >= 0);
6633 unsigned j;
6634 slp_tree child;
6635 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
6637 if (!child)
6638 continue;
6640 slp_tree new_child = get_result_with_layout (child, in_layout_i);
6641 if (new_child != child)
6643 vect_free_slp_tree (child);
6644 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
6645 new_child->refcnt += 1;
6651 /* Elide load permutations that are not necessary. Such permutations might
6652 be pre-existing, rather than created by the layout optimizations. */
6654 void
6655 vect_optimize_slp_pass::remove_redundant_permutations ()
6657 for (unsigned int node_i : m_leafs)
6659 slp_tree node = m_vertices[node_i].node;
6660 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
6661 continue;
6663 /* In basic block vectorization we allow any subchain of an interleaving
6664 chain.
6665 FORNOW: not in loop SLP because of realignment complications. */
6666 if (is_a <bb_vec_info> (m_vinfo))
6668 bool subchain_p = true;
6669 stmt_vec_info next_load_info = NULL;
6670 stmt_vec_info load_info;
6671 unsigned j;
6672 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6674 if (j != 0
6675 && (next_load_info != load_info
6676 || ! load_info
6677 || DR_GROUP_GAP (load_info) != 1))
6679 subchain_p = false;
6680 break;
6682 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
6684 if (subchain_p)
6686 SLP_TREE_LOAD_PERMUTATION (node).release ();
6687 continue;
6690 else
6692 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
6693 stmt_vec_info load_info;
6694 bool this_load_permuted = false;
6695 unsigned j;
6696 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6697 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
6699 this_load_permuted = true;
6700 break;
6702 /* When this isn't a grouped access we know it's single element
6703 and contiguous. */
6704 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
6706 if (!this_load_permuted
6707 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6708 || SLP_TREE_LANES (node) == 1))
6709 SLP_TREE_LOAD_PERMUTATION (node).release ();
6710 continue;
6712 stmt_vec_info first_stmt_info
6713 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
6714 if (!this_load_permuted
6715 /* The load requires permutation when unrolling exposes
6716 a gap either because the group is larger than the SLP
6717 group-size or because there is a gap between the groups. */
6718 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6719 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
6720 && DR_GROUP_GAP (first_stmt_info) == 0)))
6722 SLP_TREE_LOAD_PERMUTATION (node).release ();
6723 continue;
6729 /* Print the partition graph and layout information to the dump file. */
6731 void
6732 vect_optimize_slp_pass::dump ()
6734 dump_printf_loc (MSG_NOTE, vect_location,
6735 "SLP optimize permutations:\n");
6736 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
6738 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
6739 const char *sep = "";
6740 for (unsigned int idx : m_perms[layout_i])
6742 dump_printf (MSG_NOTE, "%s%d", sep, idx);
6743 sep = ", ";
6745 dump_printf (MSG_NOTE, " }\n");
6747 dump_printf_loc (MSG_NOTE, vect_location,
6748 "SLP optimize partitions:\n");
6749 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6750 ++partition_i)
6752 auto &partition = m_partitions[partition_i];
6753 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
6754 dump_printf_loc (MSG_NOTE, vect_location,
6755 " partition %d (layout %d):\n",
6756 partition_i, partition.layout);
6757 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
6758 for (unsigned int order_i = partition.node_begin;
6759 order_i < partition.node_end; ++order_i)
6761 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
6762 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
6763 (void *) vertex.node);
6764 dump_printf_loc (MSG_NOTE, vect_location,
6765 " weight: %f\n",
6766 vertex.weight.to_double ());
6767 if (vertex.out_degree)
6768 dump_printf_loc (MSG_NOTE, vect_location,
6769 " out weight: %f (degree %d)\n",
6770 vertex.out_weight.to_double (),
6771 vertex.out_degree);
6772 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
6773 dump_printf_loc (MSG_NOTE, vect_location,
6774 " op: VEC_PERM_EXPR\n");
6775 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
6776 dump_printf_loc (MSG_NOTE, vect_location,
6777 " op template: %G", rep->stmt);
6779 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
6780 for (unsigned int order_i = partition.node_begin;
6781 order_i < partition.node_end; ++order_i)
6783 unsigned int node_i = m_partitioned_nodes[order_i];
6784 auto &vertex = m_vertices[node_i];
6785 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
6787 auto &other_vertex = m_vertices[other_node_i];
6788 if (other_vertex.partition < vertex.partition)
6789 dump_printf_loc (MSG_NOTE, vect_location,
6790 " - %p [%d] --> %p\n",
6791 (void *) other_vertex.node,
6792 other_vertex.partition,
6793 (void *) vertex.node);
6794 else
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 " - %p --> [%d] %p\n",
6797 (void *) vertex.node,
6798 other_vertex.partition,
6799 (void *) other_vertex.node);
6801 for_each_partition_edge (node_i, print_edge);
6804 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6806 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6807 if (layout_costs.is_possible ())
6809 dump_printf_loc (MSG_NOTE, vect_location,
6810 " layout %d:%s\n", layout_i,
6811 partition.layout == int (layout_i)
6812 ? " (*)" : "");
6813 slpg_layout_cost combined_cost = layout_costs.in_cost;
6814 combined_cost.add_serial_cost (layout_costs.internal_cost);
6815 combined_cost.add_serial_cost (layout_costs.out_cost);
6816 #define TEMPLATE "{depth: %f, total: %f}"
6817 dump_printf_loc (MSG_NOTE, vect_location,
6818 " " TEMPLATE "\n",
6819 layout_costs.in_cost.depth.to_double (),
6820 layout_costs.in_cost.total.to_double ());
6821 dump_printf_loc (MSG_NOTE, vect_location,
6822 " + " TEMPLATE "\n",
6823 layout_costs.internal_cost.depth.to_double (),
6824 layout_costs.internal_cost.total.to_double ());
6825 dump_printf_loc (MSG_NOTE, vect_location,
6826 " + " TEMPLATE "\n",
6827 layout_costs.out_cost.depth.to_double (),
6828 layout_costs.out_cost.total.to_double ());
6829 dump_printf_loc (MSG_NOTE, vect_location,
6830 " = " TEMPLATE "\n",
6831 combined_cost.depth.to_double (),
6832 combined_cost.total.to_double ());
6833 #undef TEMPLATE
6835 else
6836 dump_printf_loc (MSG_NOTE, vect_location,
6837 " layout %d: rejected\n", layout_i);
6842 /* Main entry point for the SLP graph optimization pass. */
6844 void
6845 vect_optimize_slp_pass::run ()
6847 build_graph ();
6848 create_partitions ();
6849 start_choosing_layouts ();
6850 if (m_perms.length () > 1)
6852 forward_pass ();
6853 backward_pass ();
6854 if (dump_enabled_p ())
6855 dump ();
6856 materialize ();
6857 while (!m_perms.is_empty ())
6858 m_perms.pop ().release ();
6860 else
6861 remove_redundant_permutations ();
6862 free_graph (m_slpg);
6865 /* Apply CSE to NODE and its children using BST_MAP. */
6867 static void
6868 vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
6870 bool put_p = false;
6871 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
6872 /* Besides some VEC_PERM_EXPR, two-operator nodes also
6873 lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
6874 we'd have something that works for all internal and external nodes. */
6875 && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
6877 slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
6878 if (leader)
6880 /* We've visited this node already. */
6881 if (!*leader || *leader == node)
6882 return;
6884 if (dump_enabled_p ())
6885 dump_printf_loc (MSG_NOTE, vect_location,
6886 "re-using SLP tree %p for %p\n",
6887 (void *)*leader, (void *)node);
6888 vect_free_slp_tree (node);
6889 (*leader)->refcnt += 1;
6890 node = *leader;
6891 return;
6894 /* Avoid creating a cycle by populating the map only after recursion. */
6895 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
6896 node->refcnt += 1;
6897 put_p = true;
6898 /* And recurse. */
6901 for (slp_tree &child : SLP_TREE_CHILDREN (node))
6902 if (child)
6903 vect_cse_slp_nodes (bst_map, child);
6905 /* Now record the node for CSE in other siblings. */
6906 if (put_p)
6907 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
6910 /* Optimize the SLP graph of VINFO. */
6912 void
6913 vect_optimize_slp (vec_info *vinfo)
6915 if (vinfo->slp_instances.is_empty ())
6916 return;
6917 vect_optimize_slp_pass (vinfo).run ();
6919 /* Apply CSE again to nodes after permute optimization. */
6920 scalar_stmts_to_slp_tree_map_t *bst_map
6921 = new scalar_stmts_to_slp_tree_map_t ();
6923 for (auto inst : vinfo->slp_instances)
6924 vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
6926 release_scalar_stmts_to_slp_tree_map (bst_map);
6929 /* Gather loads reachable from the individual SLP graph entries. */
6931 void
6932 vect_gather_slp_loads (vec_info *vinfo)
6934 unsigned i;
6935 slp_instance instance;
6936 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
6938 hash_set<slp_tree> visited;
6939 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
6940 SLP_INSTANCE_TREE (instance), visited);
6945 /* For each possible SLP instance decide whether to SLP it and calculate overall
6946 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
6947 least one instance. */
6949 bool
6950 vect_make_slp_decision (loop_vec_info loop_vinfo)
6952 unsigned int i;
6953 poly_uint64 unrolling_factor = 1;
6954 const vec<slp_instance> &slp_instances
6955 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
6956 slp_instance instance;
6957 int decided_to_slp = 0;
6959 DUMP_VECT_SCOPE ("vect_make_slp_decision");
6961 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6963 /* FORNOW: SLP if you can. */
6964 /* All unroll factors have the form:
6966 GET_MODE_SIZE (vinfo->vector_mode) * X
6968 for some rational X, so they must have a common multiple. */
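/* For instance, unrolling factors of 2 and 4 combine to 4, while 2 and 3
would combine to 6 - force_common_multiple computes their least common
multiple.  */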
6969 unrolling_factor
6970 = force_common_multiple (unrolling_factor,
6971 SLP_INSTANCE_UNROLLING_FACTOR (instance));
6973 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
6974 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
6975 loop-based vectorization. Such stmts will be marked as HYBRID. */
6976 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
6977 decided_to_slp++;
6980 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
6982 if (decided_to_slp && dump_enabled_p ())
6984 dump_printf_loc (MSG_NOTE, vect_location,
6985 "Decided to SLP %d instances. Unrolling factor ",
6986 decided_to_slp);
6987 dump_dec (MSG_NOTE, unrolling_factor);
6988 dump_printf (MSG_NOTE, "\n");
6991 return (decided_to_slp > 0);
6994 /* Private data for vect_detect_hybrid_slp. */
6995 struct vdhs_data
6997 loop_vec_info loop_vinfo;
6998 vec<stmt_vec_info> *worklist;
7001 /* Walker for walk_gimple_op. */
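/* That is, a PURE_SLP definition reached while walking the operands of a
statement on the worklist (a loop_vect or hybrid statement) is marked
hybrid here: its value is also needed outside the SLP graph, so it has
to be vectorized by the loop-based path as well as by SLP.  */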
7003 static tree
7004 vect_detect_hybrid_slp (tree *tp, int *, void *data)
7006 walk_stmt_info *wi = (walk_stmt_info *)data;
7007 vdhs_data *dat = (vdhs_data *)wi->info;
7009 if (wi->is_lhs)
7010 return NULL_TREE;
7012 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7013 if (!def_stmt_info)
7014 return NULL_TREE;
7015 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
7016 if (PURE_SLP_STMT (def_stmt_info))
7018 if (dump_enabled_p ())
7019 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7020 def_stmt_info->stmt);
7021 STMT_SLP_TYPE (def_stmt_info) = hybrid;
7022 dat->worklist->safe_push (def_stmt_info);
7025 return NULL_TREE;
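/* Editorial illustration, not part of the pass: a def belonging to an SLP
   node but also consumed by a loop_vect (non-SLP) stmt becomes hybrid:

     a_1 = b_2 + c_3;   <-- pure_slp, lane of an SLP node
     d_4 = a_1 * e_5;   <-- loop_vect use outside any SLP node

   the walker above re-marks a_1's def stmt as STMT_SLP_TYPE == hybrid so
   it is code-generated both as part of a vector stmt and as a scalar. */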
7028 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
7029 if so, otherwise push it to WORKLIST. */
7031 static void
7032 maybe_push_to_hybrid_worklist (vec_info *vinfo,
7033 vec<stmt_vec_info> &worklist,
7034 stmt_vec_info stmt_info)
7036 if (dump_enabled_p ())
7037 dump_printf_loc (MSG_NOTE, vect_location,
7038 "Processing hybrid candidate : %G", stmt_info->stmt);
7039 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7040 imm_use_iterator iter2;
7041 ssa_op_iter iter1;
7042 use_operand_p use_p;
7043 def_operand_p def_p;
7044 bool any_def = false;
7045 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7047 any_def = true;
7048 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7050 if (is_gimple_debug (USE_STMT (use_p)))
7051 continue;
7052 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
7053 /* An out-of-loop use means this is a loop_vect sink. */
7054 if (!use_info)
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location,
7058 "Found loop_vect sink: %G", stmt_info->stmt);
7059 worklist.safe_push (stmt_info);
7060 return;
7062 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7064 if (dump_enabled_p ())
7065 dump_printf_loc (MSG_NOTE, vect_location,
7066 "Found loop_vect use: %G", use_info->stmt);
7067 worklist.safe_push (stmt_info);
7068 return;
7072 /* No def means this is a loop_vect sink. */
7073 if (!any_def)
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_NOTE, vect_location,
7077 "Found loop_vect sink: %G", stmt_info->stmt);
7078 worklist.safe_push (stmt_info);
7079 return;
7081 if (dump_enabled_p ())
7082 dump_printf_loc (MSG_NOTE, vect_location,
7083 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7084 STMT_SLP_TYPE (stmt_info) = pure_slp;
7087 /* Find stmts that must be both vectorized and SLPed. */
7089 void
7090 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7092 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7094 /* All stmts participating in SLP are marked pure_slp; all other
7095 stmts are loop_vect.
7096 First collect all loop_vect stmts into a worklist.
7097 SLP patterns cause not all original scalar stmts to appear in
7098 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
7099 Rectify this here by doing a backward walk over the IL, considering a
7100 stmt loop_vect only when it is used by a loop_vect stmt, and otherwise
7101 marking it pure_slp. */
7102 auto_vec<stmt_vec_info> worklist;
7103 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7105 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7106 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7107 gsi_next (&gsi))
7109 gphi *phi = gsi.phi ();
7110 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7111 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7112 maybe_push_to_hybrid_worklist (loop_vinfo,
7113 worklist, stmt_info);
7115 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7116 gsi_prev (&gsi))
7118 gimple *stmt = gsi_stmt (gsi);
7119 if (is_gimple_debug (stmt))
7120 continue;
7121 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7122 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7124 for (gimple_stmt_iterator gsi2
7125 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7126 !gsi_end_p (gsi2); gsi_next (&gsi2))
7128 stmt_vec_info patt_info
7129 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7130 if (!STMT_SLP_TYPE (patt_info)
7131 && STMT_VINFO_RELEVANT (patt_info))
7132 maybe_push_to_hybrid_worklist (loop_vinfo,
7133 worklist, patt_info);
7135 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7137 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7138 maybe_push_to_hybrid_worklist (loop_vinfo,
7139 worklist, stmt_info);
7143 /* Now we have a worklist of non-SLP stmts; follow use->def chains and
7144 mark any SLP vectorized stmt as hybrid.
7145 ??? We're visiting def stmts N times (once for each non-SLP and
7146 once for each hybrid-SLP use). */
7147 walk_stmt_info wi;
7148 vdhs_data dat;
7149 dat.worklist = &worklist;
7150 dat.loop_vinfo = loop_vinfo;
7151 memset (&wi, 0, sizeof (wi));
7152 wi.info = (void *)&dat;
7153 while (!worklist.is_empty ())
7155 stmt_vec_info stmt_info = worklist.pop ();
7156 /* Since SSA operands are not set up for pattern stmts we need
7157 to use walk_gimple_op. */
7158 wi.is_lhs = 0;
7159 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
7160 /* For gather/scatter make sure to walk the offset operand, which
7161 can be a scaling and conversion away. */
7162 gather_scatter_info gs_info;
7163 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7164 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7166 int dummy;
7167 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7173 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
7175 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7176 : vec_info (vec_info::bb, shared),
7177 roots (vNULL)
7179 /* The region we are operating on. bbs[0] is the entry, excluding
7180 its PHI nodes. In the future we might want to track an explicit
7181 entry edge to cover bbs[0] PHI nodes and have a region entry
7182 insert location. */
7183 bbs = _bbs.address ();
7184 nbbs = _bbs.length ();
7186 for (unsigned i = 0; i < nbbs; ++i)
7188 if (i != 0)
7189 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7190 gsi_next (&si))
7192 gphi *phi = si.phi ();
7193 gimple_set_uid (phi, 0);
7194 add_stmt (phi);
7196 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7197 !gsi_end_p (gsi); gsi_next (&gsi))
7199 gimple *stmt = gsi_stmt (gsi);
7200 gimple_set_uid (stmt, 0);
7201 if (is_gimple_debug (stmt))
7202 continue;
7203 add_stmt (stmt);
7209 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7210 stmts in the basic block. */
7212 _bb_vec_info::~_bb_vec_info ()
7214 /* Reset region marker. */
7215 for (unsigned i = 0; i < nbbs; ++i)
7217 if (i != 0)
7218 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7219 gsi_next (&si))
7221 gphi *phi = si.phi ();
7222 gimple_set_uid (phi, -1);
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7225 !gsi_end_p (gsi); gsi_next (&gsi))
7227 gimple *stmt = gsi_stmt (gsi);
7228 gimple_set_uid (stmt, -1);
7232 for (unsigned i = 0; i < roots.length (); ++i)
7234 roots[i].stmts.release ();
7235 roots[i].roots.release ();
7236 roots[i].remain.release ();
7238 roots.release ();
7241 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
7242 given that child nodes have already been processed, and that
7243 their def types currently match their SLP node's def type. */
7245 static bool
7246 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7247 slp_instance node_instance,
7248 stmt_vector_for_cost *cost_vec)
7250 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7252 /* Calculate the number of vector statements to be created for the scalar
7253 stmts in this node. It is the number of scalar elements in one scalar
7254 iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
7255 elements in a vector. For a single-defuse-cycle, a lane-reducing op, and a
7256 PHI statement that starts a reduction comprised of only lane-reducing ops,
7257 the number is larger than the number of effective vector statements actually required. */
7258 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7260 /* Handle purely internal nodes. */
7261 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7263 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7264 return false;
7266 stmt_vec_info slp_stmt_info;
7267 unsigned int i;
7268 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7270 if (slp_stmt_info
7271 && STMT_VINFO_LIVE_P (slp_stmt_info)
7272 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7273 node_instance, i,
7274 false, cost_vec))
7275 return false;
7277 return true;
7280 bool dummy;
7281 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7282 node, node_instance, cost_vec);
7285 /* Try to build NODE from scalars, returning true on success.
7286 NODE_INSTANCE is the SLP instance that contains NODE. */
7288 static bool
7289 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7290 slp_instance node_instance)
7292 stmt_vec_info stmt_info;
7293 unsigned int i;
7295 if (!is_a <bb_vec_info> (vinfo)
7296 || node == SLP_INSTANCE_TREE (node_instance)
7297 || !SLP_TREE_SCALAR_STMTS (node).exists ()
7298 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7299 /* Force the mask use to be built from scalars instead. */
7300 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
7301 return false;
7303 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7304 if (!stmt_info)
7305 return false;
7307 if (dump_enabled_p ())
7308 dump_printf_loc (MSG_NOTE, vect_location,
7309 "Building vector operands of %p from scalars instead\n",
7310 (void *) node);
7312 /* Don't remove and free the child nodes here, since they could be
7313 referenced by other structures. The analysis and scheduling phases
7314 (need to) ignore child nodes of anything that isn't vect_internal_def. */
7315 unsigned int group_size = SLP_TREE_LANES (node);
7316 SLP_TREE_DEF_TYPE (node) = vect_external_def;
7317 /* Invariants get their vector type from the uses. */
7318 SLP_TREE_VECTYPE (node) = NULL_TREE;
7319 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7320 SLP_TREE_LOAD_PERMUTATION (node).release ();
7321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7323 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7324 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7326 return true;
7329 /* Return true if all elements of the slice are the same. */
7330 bool
7331 vect_scalar_ops_slice::all_same_p () const
7333 for (unsigned int i = 1; i < length; ++i)
7334 if (!operand_equal_p (op (0), op (i)))
7335 return false;
7336 return true;
7339 hashval_t
7340 vect_scalar_ops_slice_hash::hash (const value_type &s)
7342 hashval_t hash = 0;
7343 for (unsigned i = 0; i < s.length; ++i)
7344 hash = iterative_hash_expr (s.op (i), hash);
7345 return hash;
7348 bool
7349 vect_scalar_ops_slice_hash::equal (const value_type &s1,
7350 const compare_type &s2)
7352 if (s1.length != s2.length)
7353 return false;
7354 for (unsigned i = 0; i < s1.length; ++i)
7355 if (!operand_equal_p (s1.op (i), s2.op (i)))
7356 return false;
7357 return true;
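/* Editorial illustration, not part of the pass: a slice denotes LENGTH
   scalar operands of a node starting at START, so with
   ops = { a_1, b_2, a_1, b_2 } the two two-element slices { ops, 0, 2 }
   and { ops, 2, 2 } hash and compare equal; vect_prologue_cost_for_slp
   below uses this to cost such a repeated vector only once. */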
7360 /* Compute the prologue cost for invariant or constant operands represented
7361 by NODE. */
7363 static void
7364 vect_prologue_cost_for_slp (slp_tree node,
7365 stmt_vector_for_cost *cost_vec)
7367 /* There's a special case of an existing vector, which costs nothing. */
7368 if (SLP_TREE_SCALAR_OPS (node).length () == 0
7369 && !SLP_TREE_VEC_DEFS (node).is_empty ())
7370 return;
7371 /* Without looking at the actual initializer a vector of
7372 constants can be implemented as load from the constant pool.
7373 When all elements are the same we can use a splat. */
7374 tree vectype = SLP_TREE_VECTYPE (node);
7375 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7376 unsigned HOST_WIDE_INT const_nunits;
7377 unsigned nelt_limit;
7378 auto ops = &SLP_TREE_SCALAR_OPS (node);
7379 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7380 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7381 && ! multiple_p (const_nunits, group_size))
7383 nelt_limit = const_nunits;
7384 hash_set<vect_scalar_ops_slice_hash> vector_ops;
7385 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7386 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
7387 starts.quick_push (i * const_nunits);
7389 else
7391 /* If either the vector has variable length or the vectors
7392 are composed of repeated whole groups we only need to
7393 cost construction once. All vectors will be the same. */
7394 nelt_limit = group_size;
7395 starts.quick_push (0);
7397 /* ??? We're just tracking whether vectors in a single node are the same.
7398 Ideally we'd do something more global. */
7399 bool passed = false;
7400 for (unsigned int start : starts)
7402 vect_cost_for_stmt kind;
7403 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7404 kind = vector_load;
7405 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7406 kind = scalar_to_vec;
7407 else
7408 kind = vec_construct;
7409 /* The target cost hook has no idea which part of the SLP node
7410 we are costing so avoid passing it down more than once. Pass
7411 it to the first vec_construct or scalar_to_vec part since for those
7412 the x86 backend tries to account for GPR to XMM register moves. */
7413 record_stmt_cost (cost_vec, 1, kind,
7414 (kind != vector_load && !passed) ? node : nullptr,
7415 vectype, 0, vect_prologue);
7416 if (kind != vector_load)
7417 passed = true;
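/* Editorial illustration, not part of the pass: for a V4SI node the
   classification above maps scalar operand vectors to costs roughly as

     { 1, 2, 3, 4 }          constant defs           -> vector_load
     { x_1, x_1, x_1, x_1 }  all-equal external defs -> scalar_to_vec (splat)
     { x_1, y_2, z_3, w_4 }  mixed external defs     -> vec_construct

   with duplicate vector-sized slices counted only once via the
   vect_scalar_ops_slice_hash set. */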
7421 /* Analyze statements contained in SLP tree NODE after recursively analyzing
7422 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7424 Return true if the operations are supported. */
7426 static bool
7427 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7428 slp_instance node_instance,
7429 hash_set<slp_tree> &visited_set,
7430 vec<slp_tree> &visited_vec,
7431 stmt_vector_for_cost *cost_vec)
7433 int i, j;
7434 slp_tree child;
7436 /* Assume we can code-generate all invariants. */
7437 if (!node
7438 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7439 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7440 return true;
7442 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
7444 if (dump_enabled_p ())
7445 dump_printf_loc (MSG_NOTE, vect_location,
7446 "Failed cyclic SLP reference in %p\n", (void *) node);
7447 return false;
7449 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
7451 /* If we already analyzed the exact same set of scalar stmts we're done.
7452 We share the generated vector stmts for those. */
7453 if (visited_set.add (node))
7454 return true;
7455 visited_vec.safe_push (node);
7457 bool res = true;
7458 unsigned visited_rec_start = visited_vec.length ();
7459 unsigned cost_vec_rec_start = cost_vec->length ();
7460 bool seen_non_constant_child = false;
7461 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7463 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
7464 visited_set, visited_vec,
7465 cost_vec);
7466 if (!res)
7467 break;
7468 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
7469 seen_non_constant_child = true;
7471 /* We're having difficulties scheduling nodes with just constant
7472 operands and no scalar stmts since we then cannot compute a stmt
7473 insertion place. */
7474 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
7476 if (dump_enabled_p ())
7477 dump_printf_loc (MSG_NOTE, vect_location,
7478 "Cannot vectorize all-constant op node %p\n",
7479 (void *) node);
7480 res = false;
7483 if (res)
7484 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
7485 cost_vec);
7486 /* If analysis failed we have to pop all recursive visited nodes
7487 plus ourselves. */
7488 if (!res)
7490 while (visited_vec.length () >= visited_rec_start)
7491 visited_set.remove (visited_vec.pop ());
7492 cost_vec->truncate (cost_vec_rec_start);
7495 /* When the node can be vectorized cost invariant nodes it references.
7496 This is not done in DFS order to allow the referring node's
7497 vectorizable_* calls to nail down the invariant node's vector type
7498 and possibly unshare it if it needs a different vector type than
7499 other referrers. */
7500 if (res)
7501 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
7502 if (child
7503 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
7504 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
7505 /* Perform usual caching, note code-generation still
7506 code-gens these nodes multiple times but we expect
7507 to CSE them later. */
7508 && !visited_set.add (child))
7510 visited_vec.safe_push (child);
7511 /* ??? After auditing more code paths make a "default"
7512 and push the vector type from NODE to all children
7513 if it is not already set. */
7514 /* Compute the number of vectors to be generated. */
7515 tree vector_type = SLP_TREE_VECTYPE (child);
7516 if (!vector_type)
7518 /* For shifts with a scalar argument we don't need
7519 to cost or code-generate anything.
7520 ??? Represent this more explicitly. */
7521 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
7522 == shift_vec_info_type)
7523 && j == 1);
7524 continue;
7527 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
7528 = vect_get_num_copies (vinfo, child);
7529 /* And cost them. */
7530 vect_prologue_cost_for_slp (child, cost_vec);
7533 /* If this node or any of its children can't be vectorized, try pruning
7534 the tree here rather than felling the whole thing. */
7535 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
7537 /* We'll need to revisit this for invariant costing and number
7538 of vectorized stmt setting. */
7539 res = true;
7542 return res;
7545 /* Given a definition DEF, analyze if it will have any live scalar use after
7546 performing SLP vectorization whose information is represented by BB_VINFO,
7547 and record the result into hash map SCALAR_USE_MAP as a cache for later
7548 fast checks. If recursion DEPTH exceeds a limit, stop the analysis and
7549 make a conservative assumption. Return 0 if there is no scalar use, 1 if
7550 there is one, and -1 if recursion was limited. */
7552 static int
7553 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
7554 hash_map<tree, int> &scalar_use_map,
7555 int depth = 0)
7557 const int depth_limit = 2;
7558 imm_use_iterator use_iter;
7559 gimple *use_stmt;
7561 if (int *res = scalar_use_map.get (def))
7562 return *res;
7564 int scalar_use = 1;
7566 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
7568 if (is_gimple_debug (use_stmt))
7569 continue;
7571 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
7573 if (!use_stmt_info)
7574 break;
7576 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7577 continue;
7579 /* Do not step forward when encountering a PHI statement, since it may
7580 involve a cyclic reference and cause infinite recursive invocation. */
7581 if (gimple_code (use_stmt) == GIMPLE_PHI)
7582 break;
7584 /* When pattern recognition is involved, a statement whose definition is
7585 consumed in some pattern may not be included in the final replacement
7586 pattern statements, so it would be skipped when building the SLP graph.
7588 * Original
7589 char a_c = *(char *) a;
7590 char b_c = *(char *) b;
7591 unsigned short a_s = (unsigned short) a_c;
7592 int a_i = (int) a_s;
7593 int b_i = (int) b_c;
7594 int r_i = a_i - b_i;
7596 * After pattern replacement
7597 a_s = (unsigned short) a_c;
7598 a_i = (int) a_s;
7600 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
7601 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
7603 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
7604 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
7606 The definitions of a_i (original statement) and b_i (pattern statement)
7607 are related to, but not actually part of, the widen_minus pattern.
7608 Vectorizing the pattern does not cause these definition statements to
7609 be marked as PURE_SLP. For this case, we need to recursively check
7610 whether their uses are all absorbed into vectorized code. But there
7611 is an exception: some use may participate in a vectorized
7612 operation via an external SLP node containing that use as an element.
7613 The parameter "scalar_use_map" tags such SSA names as having a scalar
7614 use in advance. */
7615 tree lhs = gimple_get_lhs (use_stmt);
7617 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
7618 break;
7620 if (depth_limit && depth >= depth_limit)
7621 return -1;
7623 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
7624 depth + 1)))
7625 break;
7628 if (end_imm_use_stmt_p (&use_iter))
7629 scalar_use = 0;
7631 /* If recursion is limited, do not cache result for non-root defs. */
7632 if (!depth || scalar_use >= 0)
7634 bool added = scalar_use_map.put (def, scalar_use);
7635 gcc_assert (!added);
7638 return scalar_use;
7641 /* Mark lanes of NODE that are live outside of the basic-block vectorized
7642 region and that can be vectorized using vectorizable_live_operation
7643 with STMT_VINFO_LIVE_P. Live operations that are not handled cause the
7644 scalar code computing them to be retained. */
7646 static void
7647 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
7648 slp_instance instance,
7649 stmt_vector_for_cost *cost_vec,
7650 hash_map<tree, int> &scalar_use_map,
7651 hash_set<stmt_vec_info> &svisited,
7652 hash_set<slp_tree> &visited)
7654 if (visited.add (node))
7655 return;
7657 unsigned i;
7658 stmt_vec_info stmt_info;
7659 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
7660 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7662 if (!stmt_info || svisited.contains (stmt_info))
7663 continue;
7664 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7665 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
7666 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
7667 /* Only the pattern root stmt computes the original scalar value. */
7668 continue;
7669 bool mark_visited = true;
7670 gimple *orig_stmt = orig_stmt_info->stmt;
7671 ssa_op_iter op_iter;
7672 def_operand_p def_p;
7673 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
7675 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
7676 scalar_use_map))
7678 STMT_VINFO_LIVE_P (stmt_info) = true;
7679 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
7680 instance, i, false, cost_vec))
7681 /* ??? So we know we can vectorize the live stmt from one SLP
7682 node. If we cannot do so from all or none consistently
7683 we'd have to record which SLP node (and lane) we want to
7684 use for the live operation. So make sure we can
7685 code-generate from all nodes. */
7686 mark_visited = false;
7687 else
7688 STMT_VINFO_LIVE_P (stmt_info) = false;
7691 /* We have to verify whether we can insert the lane extract
7692 before all uses. The following is a conservative approximation.
7693 We cannot put this into vectorizable_live_operation because
7694 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
7695 doesn't work.
7696 Note that while the fact that we emit code for loads at the
7697 first load should make this a non-problem, leafs we construct
7698 from scalars are vectorized after the last scalar def.
7699 ??? If we'd actually compute the insert location during
7700 analysis we could use sth less conservative than the last
7701 scalar stmt in the node for the dominance check. */
7702 /* ??? What remains is "live" uses in vector CTORs in the same
7703 SLP graph which is where those uses can end up code-generated
7704 right after their definition instead of close to their original
7705 use. But that would restrict us to code-generate lane-extracts
7706 from the latest stmt in a node. So we compensate for this
7707 during code-generation, simply not replacing uses for those
7708 hopefully rare cases. */
7709 imm_use_iterator use_iter;
7710 gimple *use_stmt;
7711 stmt_vec_info use_stmt_info;
7713 if (STMT_VINFO_LIVE_P (stmt_info))
7714 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
7715 if (!is_gimple_debug (use_stmt)
7716 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
7717 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7718 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
7720 if (dump_enabled_p ())
7721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7722 "Cannot determine insertion place for "
7723 "lane extract\n");
7724 STMT_VINFO_LIVE_P (stmt_info) = false;
7725 mark_visited = true;
7728 if (mark_visited)
7729 svisited.add (stmt_info);
7732 slp_tree child;
7733 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7734 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7735 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
7736 scalar_use_map, svisited, visited);
7739 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
7740 are live outside of the basic-block vectorized region and that can be
7741 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
7743 static void
7744 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
7746 if (bb_vinfo->slp_instances.is_empty ())
7747 return;
7749 hash_set<stmt_vec_info> svisited;
7750 hash_set<slp_tree> visited;
7751 hash_map<tree, int> scalar_use_map;
7752 auto_vec<slp_tree> worklist;
7754 for (slp_instance instance : bb_vinfo->slp_instances)
7756 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
7757 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
7758 if (TREE_CODE (op) == SSA_NAME)
7759 scalar_use_map.put (op, 1);
7760 if (!visited.add (SLP_INSTANCE_TREE (instance)))
7761 worklist.safe_push (SLP_INSTANCE_TREE (instance));
7766 slp_tree node = worklist.pop ();
7768 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
7770 for (tree op : SLP_TREE_SCALAR_OPS (node))
7771 if (TREE_CODE (op) == SSA_NAME)
7772 scalar_use_map.put (op, 1);
7774 else
7776 for (slp_tree child : SLP_TREE_CHILDREN (node))
7777 if (child && !visited.add (child))
7778 worklist.safe_push (child);
7781 while (!worklist.is_empty ());
7783 visited.empty ();
7785 for (slp_instance instance : bb_vinfo->slp_instances)
7787 vect_location = instance->location ();
7788 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
7789 instance, &instance->cost_vec,
7790 scalar_use_map, svisited, visited);
7794 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
7796 static bool
7797 vectorizable_bb_reduc_epilogue (slp_instance instance,
7798 stmt_vector_for_cost *cost_vec)
7800 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
7801 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
7802 if (reduc_code == MINUS_EXPR)
7803 reduc_code = PLUS_EXPR;
7804 internal_fn reduc_fn;
7805 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
7806 if (!vectype
7807 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7808 || reduc_fn == IFN_LAST
7809 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
7810 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
7811 TREE_TYPE (vectype)))
7813 if (dump_enabled_p ())
7814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7815 "not vectorized: basic block reduction epilogue "
7816 "operation unsupported.\n");
7817 return false;
7820 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
7821 cost log2 vector operations plus shuffles and one extraction. */
7822 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
7823 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
7824 vectype, 0, vect_body);
7825 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
7826 vectype, 0, vect_body);
7827 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
7828 vectype, 0, vect_body);
7830 /* Since we replace all stmts of a possibly longer scalar reduction
7831 chain, account for the extra scalar stmts for that. */
7832 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
7833 instance->root_stmts[0], 0, vect_body);
7834 return true;
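/* Editorial illustration, not part of the pass: for a V4SF reduction the
   epilogue cost recorded above amounts to

     steps = floor_log2 (4) = 2
     => 2 * vector_stmt + 2 * vec_perm + 1 * vec_to_scalar

   i.e. a log2-depth shuffle-and-op ladder followed by one lane extract. */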
7837 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
7838 and recurse to children. */
7840 static void
7841 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
7842 hash_set<slp_tree> &visited)
7844 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7845 || visited.add (node))
7846 return;
7848 stmt_vec_info stmt;
7849 unsigned i;
7850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
7851 if (stmt)
7852 roots.remove (vect_orig_stmt (stmt));
7854 slp_tree child;
7855 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7856 if (child)
7857 vect_slp_prune_covered_roots (child, roots, visited);
7860 /* Analyze statements in SLP instances of VINFO. Return true if the
7861 operations are supported. */
7863 bool
7864 vect_slp_analyze_operations (vec_info *vinfo)
7866 slp_instance instance;
7867 int i;
7869 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
7871 hash_set<slp_tree> visited;
7872 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
7874 auto_vec<slp_tree> visited_vec;
7875 stmt_vector_for_cost cost_vec;
7876 cost_vec.create (2);
7877 if (is_a <bb_vec_info> (vinfo))
7878 vect_location = instance->location ();
7879 if (!vect_slp_analyze_node_operations (vinfo,
7880 SLP_INSTANCE_TREE (instance),
7881 instance, visited, visited_vec,
7882 &cost_vec)
7883 /* CTOR instances require vectorized defs for the SLP tree root. */
7884 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
7885 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
7886 != vect_internal_def
7887 /* Make sure we vectorized with the expected type. */
7888 || !useless_type_conversion_p
7889 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
7890 (instance->root_stmts[0]->stmt))),
7891 TREE_TYPE (SLP_TREE_VECTYPE
7892 (SLP_INSTANCE_TREE (instance))))))
7893 /* Check we can vectorize the reduction. */
7894 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
7895 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
7897 slp_tree node = SLP_INSTANCE_TREE (instance);
7898 stmt_vec_info stmt_info;
7899 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7900 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
7901 else
7902 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_NOTE, vect_location,
7905 "removing SLP instance operations starting from: %G",
7906 stmt_info->stmt);
7907 vect_free_slp_instance (instance);
7908 vinfo->slp_instances.ordered_remove (i);
7909 cost_vec.release ();
7910 while (!visited_vec.is_empty ())
7911 visited.remove (visited_vec.pop ());
7913 else
7915 i++;
7916 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
7918 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
7919 cost_vec.release ();
7921 else
7922 /* For BB vectorization remember the SLP graph entry
7923 cost for later. */
7924 instance->cost_vec = cost_vec;
7928 /* Now look for SLP instances with a root that are covered by other
7929 instances and remove them. */
7930 hash_set<stmt_vec_info> roots;
7931 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
7932 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7933 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
7934 if (!roots.is_empty ())
7936 visited.empty ();
7937 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
7938 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
7939 visited);
7940 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
7941 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
7942 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
7944 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
7945 if (dump_enabled_p ())
7946 dump_printf_loc (MSG_NOTE, vect_location,
7947 "removing SLP instance operations starting "
7948 "from: %G", root->stmt);
7949 vect_free_slp_instance (instance);
7950 vinfo->slp_instances.ordered_remove (i);
7952 else
7953 ++i;
7956 /* Compute vectorizable live stmts. */
7957 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
7958 vect_bb_slp_mark_live_stmts (bb_vinfo);
7960 return !vinfo->slp_instances.is_empty ();
7963 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
7964 compressing any leader chain encountered along the way. */
7966 static slp_instance
7967 get_ultimate_leader (slp_instance instance,
7968 hash_map<slp_instance, slp_instance> &instance_leader)
7970 auto_vec<slp_instance *, 8> chain;
7971 slp_instance *tem;
7972 while (*(tem = instance_leader.get (instance)) != instance)
7974 chain.safe_push (tem);
7975 instance = *tem;
7977 while (!chain.is_empty ())
7978 *chain.pop () = instance;
7979 return instance;
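/* Editorial sketch, not part of the pass: the leader chain above behaves
   like union-find with path compression. A standalone analogue over ints
   (hypothetical helper, assuming the map is pre-seeded with self-leaders):

     #include <unordered_map>

     static int
     find_leader (std::unordered_map<int, int> &leader, int x)
     {
       int root = x;
       while (leader[root] != root)  // walk up to the ultimate leader
         root = leader[root];
       while (leader[x] != root)     // compress the visited path
         {
           int next = leader[x];
           leader[x] = root;
           x = next;
         }
       return root;
     }

   get_ultimate_leader does the same keyed on slp_instance, remembering
   pointers to the visited map slots and rewriting them in place. */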
7982 namespace {
7983 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
7984 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
7985 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
7987 INSTANCE_LEADER is as for get_ultimate_leader. */
7989 template<typename T>
7990 bool
7991 vect_map_to_instance (slp_instance instance, T key,
7992 hash_map<T, slp_instance> &key_to_instance,
7993 hash_map<slp_instance, slp_instance> &instance_leader)
7995 bool existed_p;
7996 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
7997 if (!existed_p)
7999 else if (key_instance != instance)
8001 /* If we're running into a previously marked key make us the
8002 leader of the current ultimate leader. This keeps the
8003 leader chain acyclic and works even when the current instance
8004 connects two previously independent graph parts. */
8005 slp_instance key_leader
8006 = get_ultimate_leader (key_instance, instance_leader);
8007 if (key_leader != instance)
8008 instance_leader.put (key_leader, instance);
8010 key_instance = instance;
8011 return existed_p;
8015 /* Worker of vect_bb_partition_graph, recurse on NODE. */
8017 static void
8018 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8019 slp_instance instance, slp_tree node,
8020 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8021 hash_map<slp_tree, slp_instance> &node_to_instance,
8022 hash_map<slp_instance, slp_instance> &instance_leader)
8024 stmt_vec_info stmt_info;
8025 unsigned i;
8027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8028 if (stmt_info)
8029 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8030 instance_leader);
8032 if (vect_map_to_instance (instance, node, node_to_instance,
8033 instance_leader))
8034 return;
8036 slp_tree child;
8037 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8038 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8039 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8040 node_to_instance, instance_leader);
8043 /* Partition the SLP graph into pieces that can be costed independently. */
8045 static void
8046 vect_bb_partition_graph (bb_vec_info bb_vinfo)
8048 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8050 /* First walk the SLP graph assigning each involved scalar stmt a
8051 corresponding SLP graph entry and upon visiting a previously
8052 marked stmt, make the stmt's leader the current SLP graph entry. */
8053 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8054 hash_map<slp_tree, slp_instance> node_to_instance;
8055 hash_map<slp_instance, slp_instance> instance_leader;
8056 slp_instance instance;
8057 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8059 instance_leader.put (instance, instance);
8060 vect_bb_partition_graph_r (bb_vinfo,
8061 instance, SLP_INSTANCE_TREE (instance),
8062 stmt_to_instance, node_to_instance,
8063 instance_leader);
8066 /* Then collect entries to each independent subgraph. */
8067 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8069 slp_instance leader = get_ultimate_leader (instance, instance_leader);
8070 leader->subgraph_entries.safe_push (instance);
8071 if (dump_enabled_p ()
8072 && leader != instance)
8073 dump_printf_loc (MSG_NOTE, vect_location,
8074 "instance %p is leader of %p\n",
8075 (void *) leader, (void *) instance);
8079 /* Compute the set of scalar stmts participating in internal and external
8080 nodes. */
8082 static void
8083 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8084 hash_set<slp_tree> &visited,
8085 hash_set<stmt_vec_info> &vstmts,
8086 hash_set<stmt_vec_info> &estmts)
8088 int i;
8089 stmt_vec_info stmt_info;
8090 slp_tree child;
8092 if (visited.add (node))
8093 return;
8095 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8097 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8098 if (stmt_info)
8099 vstmts.add (stmt_info);
8101 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8102 if (child)
8103 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8104 vstmts, estmts);
8106 else
8107 for (tree def : SLP_TREE_SCALAR_OPS (node))
8109 stmt_vec_info def_stmt = vinfo->lookup_def (def);
8110 if (def_stmt)
8111 estmts.add (def_stmt);
8116 /* Compute the scalar cost of the SLP node NODE and its children
8117 and record it in COST_VEC. Do not account defs that are marked in LIFE and
8118 update LIFE according to uses of NODE. */
8120 static void
8121 vect_bb_slp_scalar_cost (vec_info *vinfo,
8122 slp_tree node, vec<bool, va_heap> *life,
8123 stmt_vector_for_cost *cost_vec,
8124 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8125 hash_set<slp_tree> &visited)
8127 unsigned i;
8128 stmt_vec_info stmt_info;
8129 slp_tree child;
8131 if (visited.add (node))
8132 return;
8134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8136 ssa_op_iter op_iter;
8137 def_operand_p def_p;
8139 if (!stmt_info || (*life)[i])
8140 continue;
8142 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8143 gimple *orig_stmt = orig_stmt_info->stmt;
8145 /* If there is a non-vectorized use of the defs then the scalar
8146 stmt is kept live in which case we do not account it or any
8147 required defs in the SLP children in the scalar cost. This
8148 way we make the vectorization more costly when compared to
8149 the scalar cost. */
8150 if (!STMT_VINFO_LIVE_P (stmt_info))
8152 auto_vec<gimple *, 8> worklist;
8153 hash_set<gimple *> *worklist_visited = NULL;
8154 worklist.quick_push (orig_stmt);
8157 gimple *work_stmt = worklist.pop ();
8158 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8160 imm_use_iterator use_iter;
8161 gimple *use_stmt;
8162 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8163 DEF_FROM_PTR (def_p))
8164 if (!is_gimple_debug (use_stmt))
8166 stmt_vec_info use_stmt_info
8167 = vinfo->lookup_stmt (use_stmt);
8168 if (!use_stmt_info
8169 || !vectorized_scalar_stmts.contains (use_stmt_info))
8171 if (use_stmt_info
8172 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
8174 /* For stmts participating in patterns we have
8175 to check their uses recursively. */
8176 if (!worklist_visited)
8177 worklist_visited = new hash_set<gimple *> ();
8178 if (!worklist_visited->add (use_stmt))
8179 worklist.safe_push (use_stmt);
8180 continue;
8182 (*life)[i] = true;
8183 goto next_lane;
8188 while (!worklist.is_empty ());
8189 next_lane:
8190 if (worklist_visited)
8191 delete worklist_visited;
8192 if ((*life)[i])
8193 continue;
8196 /* Count scalar stmts only once. */
8197 if (gimple_visited_p (orig_stmt))
8198 continue;
8199 gimple_set_visited (orig_stmt, true);
8201 vect_cost_for_stmt kind;
8202 if (STMT_VINFO_DATA_REF (orig_stmt_info))
8204 data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8205 tree base = get_base_address (DR_REF (dr));
8206 /* When the scalar access is to a non-global not address-taken
8207 decl that is not BLKmode assume we can access it with a single
8208 non-load/store instruction. */
8209 if (DECL_P (base)
8210 && !is_global_var (base)
8211 && !TREE_ADDRESSABLE (base)
8212 && DECL_MODE (base) != BLKmode)
8213 kind = scalar_stmt;
8214 else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8215 kind = scalar_load;
8216 else
8217 kind = scalar_store;
8219 else if (vect_nop_conversion_p (orig_stmt_info))
8220 continue;
8221 /* For single-argument PHIs assume coalescing which means zero cost
8222 for the scalar and the vector PHIs. This avoids artificially
8223 favoring the vector path (but may pessimize it in some cases). */
8224 else if (is_a <gphi *> (orig_stmt_info->stmt)
8225 && gimple_phi_num_args
8226 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8227 continue;
8228 else
8229 kind = scalar_stmt;
8230 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8231 SLP_TREE_VECTYPE (node), 0, vect_body);
8234 auto_vec<bool, 20> subtree_life;
8235 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8237 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8239 /* Do not directly pass LIFE to the recursive call, copy it to
8240 confine changes in the callee to the current child/subtree. */
8241 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8243 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8244 for (unsigned j = 0;
8245 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8247 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8248 if (perm.first == i)
8249 subtree_life[perm.second] = (*life)[j];
8252 else
8254 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8255 subtree_life.safe_splice (*life);
8257 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8258 vectorized_scalar_stmts, visited);
8259 subtree_life.truncate (0);
8264 /* Comparator for the loop-index sorted cost vectors. */
8266 static int
8267 li_cost_vec_cmp (const void *a_, const void *b_)
8269 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8270 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8271 if (a->first < b->first)
8272 return -1;
8273 else if (a->first == b->first)
8274 return 0;
8275 return 1;
8278 /* Check if vectorization of the basic block is profitable for the
8279 subgraph denoted by SLP_INSTANCES. */
8281 static bool
8282 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8283 vec<slp_instance> slp_instances,
8284 loop_p orig_loop)
8286 slp_instance instance;
8287 int i;
8288 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8289 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8291 if (dump_enabled_p ())
8293 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8294 hash_set<slp_tree> visited;
8295 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8296 vect_print_slp_graph (MSG_NOTE, vect_location,
8297 SLP_INSTANCE_TREE (instance), visited);
8300 /* Compute the set of scalar stmts we know will go away 'locally' when
8301 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8302 not accurate for nodes promoted extern late or for scalar stmts that
8303 are used both in extern defs and in vectorized defs. */
8304 hash_set<stmt_vec_info> vectorized_scalar_stmts;
8305 hash_set<stmt_vec_info> scalar_stmts_in_externs;
8306 hash_set<slp_tree> visited;
8307 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8309 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8310 SLP_INSTANCE_TREE (instance),
8311 visited,
8312 vectorized_scalar_stmts,
8313 scalar_stmts_in_externs);
8314 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8315 vectorized_scalar_stmts.add (rstmt);
8317 /* Scalar stmts used as defs in external nodes need to be preserved, so
8318 remove them from vectorized_scalar_stmts. */
8319 for (stmt_vec_info stmt : scalar_stmts_in_externs)
8320 vectorized_scalar_stmts.remove (stmt);
8322 /* Calculate scalar cost and sum the cost for the vector stmts
8323 previously collected. */
8324 stmt_vector_for_cost scalar_costs = vNULL;
8325 stmt_vector_for_cost vector_costs = vNULL;
8326 visited.empty ();
8327 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8329 auto_vec<bool, 20> life;
8330 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8331 true);
8332 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8333 record_stmt_cost (&scalar_costs,
8334 SLP_INSTANCE_ROOT_STMTS (instance).length (),
8335 scalar_stmt,
8336 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8337 vect_bb_slp_scalar_cost (bb_vinfo,
8338 SLP_INSTANCE_TREE (instance),
8339 &life, &scalar_costs, vectorized_scalar_stmts,
8340 visited);
8341 vector_costs.safe_splice (instance->cost_vec);
8342 instance->cost_vec.release ();
8345 if (dump_enabled_p ())
8346 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
8348 /* When costing non-loop vectorization we need to consider each covered
8349 loop independently and make sure vectorization is profitable. For
8351 now we assume a loop may not be entered or may be executed an arbitrary
8352 number of iterations (??? static information can provide more
8353 precise info here), which means we can simply cost each containing
8354 loop's stmts separately. */
8355 /* First produce cost vectors sorted by loop index. */
8356 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8357 li_scalar_costs (scalar_costs.length ());
8358 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8359 li_vector_costs (vector_costs.length ());
8360 stmt_info_for_cost *cost;
8361 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8363 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8364 li_scalar_costs.quick_push (std::make_pair (l, cost));
8366 /* Use an arbitrary already-used loop as fallback in case the first
8367 vector_costs entry does not have a stmt_info associated with it. */
8368 unsigned l = li_scalar_costs[0].first;
8369 FOR_EACH_VEC_ELT (vector_costs, i, cost)
8371 /* We inherit the loop from the previous COST; invariants, externals and
8372 extracts immediately follow the cost for the related stmt. */
8373 if (cost->stmt_info)
8374 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8375 li_vector_costs.quick_push (std::make_pair (l, cost));
8377 li_scalar_costs.qsort (li_cost_vec_cmp);
8378 li_vector_costs.qsort (li_cost_vec_cmp);
8380 /* Now cost the portions individually. */
8381 unsigned vi = 0;
8382 unsigned si = 0;
8383 bool profitable = true;
8384 while (si < li_scalar_costs.length ()
8385 && vi < li_vector_costs.length ())
8387 unsigned sl = li_scalar_costs[si].first;
8388 unsigned vl = li_vector_costs[vi].first;
8389 if (sl != vl)
8391 if (dump_enabled_p ())
8392 dump_printf_loc (MSG_NOTE, vect_location,
8393 "Scalar %d and vector %d loop part do not "
8394 "match up, skipping scalar part\n", sl, vl);
8395 /* Skip the scalar part, assuming zero cost on the vector side. */
8398 si++;
8400 while (si < li_scalar_costs.length ()
8401 && li_scalar_costs[si].first == sl);
8402 continue;
8405 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8408 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
8409 si++;
8411 while (si < li_scalar_costs.length ()
8412 && li_scalar_costs[si].first == sl);
8413 unsigned dummy;
8414 finish_cost (scalar_target_cost_data, nullptr,
8415 &dummy, &scalar_cost, &dummy);
8417 /* Complete the target-specific vector cost calculation. */
8418 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
8421 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
8422 vi++;
8424 while (vi < li_vector_costs.length ()
8425 && li_vector_costs[vi].first == vl);
8426 finish_cost (vect_target_cost_data, scalar_target_cost_data,
8427 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
8428 delete scalar_target_cost_data;
8429 delete vect_target_cost_data;
8431 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
8433 if (dump_enabled_p ())
8435 dump_printf_loc (MSG_NOTE, vect_location,
8436 "Cost model analysis for part in loop %d:\n", sl);
8437 dump_printf (MSG_NOTE, " Vector cost: %d\n",
8438 vec_inside_cost + vec_outside_cost);
8439 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
8442 /* Vectorization is profitable if its cost is not more than the cost of the
8443 scalar version. Note that we err on the vector side for equal cost because
8444 the cost estimate is otherwise quite pessimistic (constant uses are
8445 free on the scalar side but cost a load on the vector side for
8446 example). */
8447 if (vec_outside_cost + vec_inside_cost > scalar_cost)
8449 profitable = false;
8450 break;
8453 if (profitable && vi < li_vector_costs.length ())
8455 if (dump_enabled_p ())
8456 dump_printf_loc (MSG_NOTE, vect_location,
8457 "Excess vector cost for part in loop %d:\n",
8458 li_vector_costs[vi].first);
8459 profitable = false;
8462 /* Unset visited flag. This is delayed when the subgraph is profitable
8463 and we process the loop for remaining unvectorized if-converted code. */
8464 if (!orig_loop || !profitable)
8465 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8466 gimple_set_visited (cost->stmt_info->stmt, false);
8468 scalar_costs.release ();
8469 vector_costs.release ();
8471 return profitable;
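/* Editorial illustration, not part of the pass: with one subgraph part
   costed as vec_prologue_cost = 4, vec_inside_cost = 6, vec_epilogue_cost = 0
   and scalar_cost = 10 the check above computes

     vec_outside_cost + vec_inside_cost = 4 + 6 = 10 <= scalar_cost = 10

   so the part stays profitable (ties go to the vector side), whereas a
   vector cost of 11 would mark the whole subgraph unprofitable. */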
8474 /* qsort comparator for lane defs. */
8476 static int
8477 vld_cmp (const void *a_, const void *b_)
8479 auto *a = (const std::pair<unsigned, tree> *)a_;
8480 auto *b = (const std::pair<unsigned, tree> *)b_;
8481 return a->first - b->first;
8484 /* Return true if USE_STMT is a vector lane insert into VEC and set
8485 *THIS_LANE to the lane number that is set. */
8487 static bool
8488 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
8490 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
8491 if (!use_ass
8492 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
8493 || (vec
8494 ? gimple_assign_rhs1 (use_ass) != vec
8495 : ((vec = gimple_assign_rhs1 (use_ass)), false))
8496 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
8497 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
8498 || !constant_multiple_p
8499 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
8500 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
8501 this_lane))
8502 return false;
8503 return true;
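/* Editorial illustration, not part of the pass: the lane is recovered from
   the constant bit position of the insert, e.g. for a V4SI vector

     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>;

   has element size TYPE_SIZE (int) == 32 bits, so 64 / 32 yields
   *THIS_LANE == 2. */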
8506 /* Find any vectorizable constructors and other SLP roots in the region and
8507 record them in BB_VINFO->roots. */
8509 static void
8510 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
8512 for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
8513 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
8514 !gsi_end_p (gsi); gsi_next (&gsi))
8516 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
8517 if (!assign)
8518 continue;
8520 tree rhs = gimple_assign_rhs1 (assign);
8521 enum tree_code code = gimple_assign_rhs_code (assign);
8522 use_operand_p use_p;
8523 gimple *use_stmt;
8524 if (code == CONSTRUCTOR)
8526 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8527 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
8528 CONSTRUCTOR_NELTS (rhs))
8529 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
8530 || uniform_vector_p (rhs))
8531 continue;
8533 unsigned j;
8534 tree val;
8535 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8536 if (TREE_CODE (val) != SSA_NAME
8537 || !bb_vinfo->lookup_def (val))
8538 break;
8539 if (j != CONSTRUCTOR_NELTS (rhs))
8540 continue;
8542 vec<stmt_vec_info> roots = vNULL;
8543 roots.safe_push (bb_vinfo->lookup_stmt (assign));
8544 vec<stmt_vec_info> stmts;
8545 stmts.create (CONSTRUCTOR_NELTS (rhs));
8546 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8547 stmts.quick_push
8548 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
8549 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8550 stmts, roots));
8552 else if (code == BIT_INSERT_EXPR
8553 && VECTOR_TYPE_P (TREE_TYPE (rhs))
8554 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
8555 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
8556 && integer_zerop (gimple_assign_rhs3 (assign))
8557 && useless_type_conversion_p
8558 (TREE_TYPE (TREE_TYPE (rhs)),
8559 TREE_TYPE (gimple_assign_rhs2 (assign)))
8560 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
8562 /* We start matching on an insert to lane zero, but since the
8563 inserts need not be ordered we have to search both
8564 the def and the use chains. */
8565 tree vectype = TREE_TYPE (rhs);
8566 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
8567 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
8568 auto_sbitmap lanes (nlanes);
8569 bitmap_clear (lanes);
8570 bitmap_set_bit (lanes, 0);
8571 tree def = gimple_assign_lhs (assign);
8572 lane_defs.quick_push
8573 (std::make_pair (0, gimple_assign_rhs2 (assign)));
8574 unsigned lanes_found = 1;
8575 /* Start with the use chain; the last stmt will be the root. */
8576 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
8577 vec<stmt_vec_info> roots = vNULL;
8578 roots.safe_push (last);
8581 use_operand_p use_p;
8582 gimple *use_stmt;
8583 if (!single_imm_use (def, &use_p, &use_stmt))
8584 break;
8585 unsigned this_lane;
8586 if (!bb_vinfo->lookup_stmt (use_stmt)
8587 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
8588 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
8589 break;
8590 if (bitmap_bit_p (lanes, this_lane))
8591 break;
8592 lanes_found++;
8593 bitmap_set_bit (lanes, this_lane);
8594 gassign *use_ass = as_a <gassign *> (use_stmt);
8595 lane_defs.quick_push (std::make_pair
8596 (this_lane, gimple_assign_rhs2 (use_ass)));
8597 last = bb_vinfo->lookup_stmt (use_ass);
8598 roots.safe_push (last);
8599 def = gimple_assign_lhs (use_ass);
8601 while (lanes_found < nlanes);
8602 if (roots.length () > 1)
8603 std::swap (roots[0], roots[roots.length () - 1]);
8604 if (lanes_found < nlanes)
8606 /* Now search the def chain. */
8607 def = gimple_assign_rhs1 (assign);
8610 if (TREE_CODE (def) != SSA_NAME
8611 || !has_single_use (def))
8612 break;
8613 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
8614 unsigned this_lane;
8615 if (!bb_vinfo->lookup_stmt (def_stmt)
8616 || !vect_slp_is_lane_insert (def_stmt,
8617 NULL_TREE, &this_lane)
8618 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
8619 break;
8620 if (bitmap_bit_p (lanes, this_lane))
8621 break;
8622 lanes_found++;
8623 bitmap_set_bit (lanes, this_lane);
8624 lane_defs.quick_push (std::make_pair
8625 (this_lane,
8626 gimple_assign_rhs2 (def_stmt)));
8627 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
8628 def = gimple_assign_rhs1 (def_stmt);
8630 while (lanes_found < nlanes);
8632 if (lanes_found == nlanes)
8634 /* Sort lane_defs by the lane index and register the root. */
8635 lane_defs.qsort (vld_cmp);
8636 vec<stmt_vec_info> stmts;
8637 stmts.create (nlanes);
8638 for (unsigned i = 0; i < nlanes; ++i)
8639 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
8640 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8641 stmts, roots));
8643 else
8644 roots.release ();
8646 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8647 && (associative_tree_code (code) || code == MINUS_EXPR)
8648 /* ??? This pessimizes a two-element reduction. PR54400.
8649 ??? In-order reduction could be handled if we only
8650 traverse one operand chain in vect_slp_linearize_chain. */
8651 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
8652 /* Ops with constants at the tail can be stripped here. */
8653 && TREE_CODE (rhs) == SSA_NAME
8654 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
8655 /* Should be the chain end. */
8656 && (!single_imm_use (gimple_assign_lhs (assign),
8657 &use_p, &use_stmt)
8658 || !is_gimple_assign (use_stmt)
8659 || (gimple_assign_rhs_code (use_stmt) != code
8660 && ((code != PLUS_EXPR && code != MINUS_EXPR)
8661 || (gimple_assign_rhs_code (use_stmt)
8662 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
8664 /* We start the match at the end of a possible association
8665 chain. */
8666 auto_vec<chain_op_t> chain;
8667 auto_vec<std::pair<tree_code, gimple *> > worklist;
8668 auto_vec<gimple *> chain_stmts;
8669 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
8670 if (code == MINUS_EXPR)
8671 code = PLUS_EXPR;
8672 internal_fn reduc_fn;
8673 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
8674 || reduc_fn == IFN_LAST)
8675 continue;
8676 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
8677 /* ??? */
8678 code_stmt, alt_code_stmt, &chain_stmts);
8679 if (chain.length () > 1)
8681 /* Sort the chain according to def_type and operation. */
8682 chain.sort (dt_sort_cmp, bb_vinfo);
8683 /* ??? Now we'd want to strip externals and constants
8684 but record those to be handled in the epilogue. */
8685 /* ??? For now do not allow mixing ops or externs/constants. */
8686 bool invalid = false;
8687 unsigned remain_cnt = 0;
8688 unsigned last_idx = 0;
8689 for (unsigned i = 0; i < chain.length (); ++i)
8691 if (chain[i].code != code)
8693 invalid = true;
8694 break;
8696 if (chain[i].dt != vect_internal_def
8697 /* Avoid stmts where the def is not the LHS, like
8698 ASMs. */
8699 || (gimple_get_lhs (bb_vinfo->lookup_def
8700 (chain[i].op)->stmt)
8701 != chain[i].op))
8702 remain_cnt++;
8703 else
8704 last_idx = i;
8706 /* Make sure to have an even number of lanes as we later do
8707 all-or-nothing discovery, not trying to split further. */
8708 if ((chain.length () - remain_cnt) & 1)
8709 remain_cnt++;
8710 if (!invalid && chain.length () - remain_cnt > 1)
8712 vec<stmt_vec_info> stmts;
8713 vec<tree> remain = vNULL;
8714 stmts.create (chain.length ());
8715 if (remain_cnt > 0)
8716 remain.create (remain_cnt);
8717 for (unsigned i = 0; i < chain.length (); ++i)
8719 stmt_vec_info stmt_info;
8720 if (chain[i].dt == vect_internal_def
8721 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
8722 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
8723 && (i != last_idx
8724 || (stmts.length () & 1)))
8725 stmts.quick_push (stmt_info);
8726 else
8727 remain.quick_push (chain[i].op);
8729 vec<stmt_vec_info> roots;
8730 roots.create (chain_stmts.length ());
8731 for (unsigned i = 0; i < chain_stmts.length (); ++i)
8732 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
8733 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
8734 stmts, roots, remain));
8741 /* Walk the grouped store chains and replace entries with their
8742 pattern variant if any. */
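 /* For instance (an illustrative sketch): if the group chain is
    S1 -> S2 -> S3 and S2 was replaced by a recognized pattern stmt P2,
    the chain is rewritten to S1 -> P2 -> S3, with
    DR_GROUP_FIRST_ELEMENT (P2) pointing back at S1 and P2 inheriting
    S2's DR_GROUP_GAP.  The group head is fixed up the same way.  */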
8744 static void
8745 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
8747 stmt_vec_info first_element;
8748 unsigned i;
8750 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
8752 /* We also have CTORs in this array. */
8753 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
8754 continue;
8755 if (STMT_VINFO_IN_PATTERN_P (first_element))
8757 stmt_vec_info orig = first_element;
8758 first_element = STMT_VINFO_RELATED_STMT (first_element);
8759 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
8760 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
8761 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
8762 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
8763 vinfo->grouped_stores[i] = first_element;
8765 stmt_vec_info prev = first_element;
8766 while (DR_GROUP_NEXT_ELEMENT (prev))
8768 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
8769 if (STMT_VINFO_IN_PATTERN_P (elt))
8771 stmt_vec_info orig = elt;
8772 elt = STMT_VINFO_RELATED_STMT (elt);
8773 DR_GROUP_NEXT_ELEMENT (prev) = elt;
8774 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
8775 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
8777 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
8778 prev = elt;
8783 /* Check if the region described by BB_VINFO can be vectorized, returning
8784 true if so. When returning false, set FATAL to true if the same failure
8785 would prevent vectorization at other vector sizes, false if it is still
8786 worth trying other sizes. N_STMTS is the number of statements in the
8787 region. */
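 /* A sketch of the caller protocol (compare vect_slp_region below,
    where FATAL is consumed; the variable names follow that caller):

      bool fatal = false;
      if (!vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
	  && fatal)
	... give up without retrying other vector modes ...  */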
8789 static bool
8790 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
8791 vec<int> *dataref_groups)
8793 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
8795 slp_instance instance;
8796 int i;
8797 poly_uint64 min_vf = 2;
8799 /* The first group of checks is independent of the vector size. */
8800 fatal = true;
8802 /* Analyze the data references. */
8804 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
8806 if (dump_enabled_p ())
8807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8808 "not vectorized: unhandled data-ref in basic "
8809 "block.\n");
8810 return false;
8813 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
8815 if (dump_enabled_p ())
8816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8817 "not vectorized: unhandled data access in "
8818 "basic block.\n");
8819 return false;
8822 vect_slp_check_for_roots (bb_vinfo);
8824 /* If there are no grouped stores and no constructors in the region
8825 there is no need to continue with pattern recog as vect_analyze_slp
8826 will fail anyway. */
8827 if (bb_vinfo->grouped_stores.is_empty ()
8828 && bb_vinfo->roots.is_empty ())
8830 if (dump_enabled_p ())
8831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8832 "not vectorized: no grouped stores in "
8833 "basic block.\n");
8834 return false;
8837 /* The rest of the analysis below depends on the vector size in some way. */
8838 fatal = false;
8840 vect_pattern_recog (bb_vinfo);
8842 /* Update store groups from pattern processing. */
8843 vect_fixup_store_groups_with_patterns (bb_vinfo);
8845 /* Check the SLP opportunities in the basic block, analyze and build SLP
8846 trees. */
8847 if (!vect_analyze_slp (bb_vinfo, n_stmts))
8849 if (dump_enabled_p ())
8851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8852 "Failed to SLP the basic block.\n");
8853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8854 "not vectorized: failed to find SLP opportunities "
8855 "in basic block.\n");
8857 return false;
8860 /* Optimize permutations. */
8861 vect_optimize_slp (bb_vinfo);
8863 /* Gather the loads reachable from the SLP graph entries. */
8864 vect_gather_slp_loads (bb_vinfo);
8866 vect_record_base_alignments (bb_vinfo);
8868 /* Analyze and verify the alignment of data references and the
8869 dependence in the SLP instances. */
8870 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
8872 vect_location = instance->location ();
8873 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
8874 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
8876 slp_tree node = SLP_INSTANCE_TREE (instance);
8877 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8878 if (dump_enabled_p ())
8879 dump_printf_loc (MSG_NOTE, vect_location,
8880 "removing SLP instance operations starting from: %G",
8881 stmt_info->stmt);
8882 vect_free_slp_instance (instance);
8883 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
8884 continue;
8887 /* Mark all the statements that we want to vectorize as pure SLP and
8888 relevant. */
8889 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
8890 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
8891 unsigned j;
8892 stmt_vec_info root;
8893 /* Likewise consider instance root stmts as vectorized. */
8894 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
8895 STMT_SLP_TYPE (root) = pure_slp;
8897 i++;
8899 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
8900 return false;
8902 if (!vect_slp_analyze_operations (bb_vinfo))
8904 if (dump_enabled_p ())
8905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8906 "not vectorized: bad operation in basic block.\n");
8907 return false;
8910 vect_bb_partition_graph (bb_vinfo);
8912 return true;
8915 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
8916 basic blocks in BBS, returning true on success.
8917 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
8919 static bool
8920 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
8921 vec<int> *dataref_groups, unsigned int n_stmts,
8922 loop_p orig_loop)
8924 bb_vec_info bb_vinfo;
8925 auto_vector_modes vector_modes;
8927 /* Autodetect first vector size we try. */
8928 machine_mode next_vector_mode = VOIDmode;
8929 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
8930 unsigned int mode_i = 0;
8932 vec_info_shared shared;
8934 machine_mode autodetected_vector_mode = VOIDmode;
8935 while (1)
8937 bool vectorized = false;
8938 bool fatal = false;
8939 bb_vinfo = new _bb_vec_info (bbs, &shared);
8941 bool first_time_p = shared.datarefs.is_empty ();
8942 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
8943 if (first_time_p)
8944 bb_vinfo->shared->save_datarefs ();
8945 else
8946 bb_vinfo->shared->check_datarefs ();
8947 bb_vinfo->vector_mode = next_vector_mode;
8949 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
8951 if (dump_enabled_p ())
8953 dump_printf_loc (MSG_NOTE, vect_location,
8954 "***** Analysis succeeded with vector mode"
8955 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
8956 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
8959 bb_vinfo->shared->check_datarefs ();
8961 bool force_clear = false;
8962 auto_vec<slp_instance> profitable_subgraphs;
8963 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
8965 if (instance->subgraph_entries.is_empty ())
8966 continue;
8968 dump_user_location_t saved_vect_location = vect_location;
8969 vect_location = instance->location ();
8970 if (!unlimited_cost_model (NULL)
8971 && !vect_bb_vectorization_profitable_p
8972 (bb_vinfo, instance->subgraph_entries, orig_loop))
8974 if (dump_enabled_p ())
8975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8976 "not vectorized: vectorization is not "
8977 "profitable.\n");
8978 vect_location = saved_vect_location;
8979 continue;
8982 vect_location = saved_vect_location;
8983 if (!dbg_cnt (vect_slp))
8985 force_clear = true;
8986 continue;
8989 profitable_subgraphs.safe_push (instance);
8992 /* When we're vectorizing an if-converted loop body make sure
8993 we vectorized all if-converted code. */
8994 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
8996 gcc_assert (bb_vinfo->nbbs == 1);
8997 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
8998 !gsi_end_p (gsi); gsi_next (&gsi))
9000 /* The costing above left us with DCEable vectorized scalar
9001 stmts having the visited flag set on profitable
9002 subgraphs. Do the delayed clearing of the flag here. */
9003 if (gimple_visited_p (gsi_stmt (gsi)))
9005 gimple_set_visited (gsi_stmt (gsi), false);
9006 continue;
9008 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9009 continue;
9011 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9012 if (gimple_assign_rhs_code (ass) == COND_EXPR)
9014 if (!profitable_subgraphs.is_empty ()
9015 && dump_enabled_p ())
9016 dump_printf_loc (MSG_NOTE, vect_location,
9017 "not profitable because of "
9018 "unprofitable if-converted scalar "
9019 "code\n");
9020 profitable_subgraphs.truncate (0);
9025 /* Finally schedule the profitable subgraphs. */
9026 for (slp_instance instance : profitable_subgraphs)
9028 if (!vectorized && dump_enabled_p ())
9029 dump_printf_loc (MSG_NOTE, vect_location,
9030 "Basic block will be vectorized "
9031 "using SLP\n");
9032 vectorized = true;
9034 /* Dump before scheduling as store vectorization will remove
9035 the original stores and mess with the instance tree
9036 so querying its location will eventually ICE. */
9037 if (flag_checking)
9038 for (slp_instance sub : instance->subgraph_entries)
9039 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9040 unsigned HOST_WIDE_INT bytes;
9041 if (dump_enabled_p ())
9042 for (slp_instance sub : instance->subgraph_entries)
9044 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9045 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9046 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9047 sub->location (),
9048 "basic block part vectorized using %wu "
9049 "byte vectors\n", bytes);
9050 else
9051 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9052 sub->location (),
9053 "basic block part vectorized using "
9054 "variable length vectors\n");
9057 dump_user_location_t saved_vect_location = vect_location;
9058 vect_location = instance->location ();
9060 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9062 vect_location = saved_vect_location;
9065 else
9067 if (dump_enabled_p ())
9068 dump_printf_loc (MSG_NOTE, vect_location,
9069 "***** Analysis failed with vector mode %s\n",
9070 GET_MODE_NAME (bb_vinfo->vector_mode));
9073 if (mode_i == 0)
9074 autodetected_vector_mode = bb_vinfo->vector_mode;
9076 if (!fatal)
9077 while (mode_i < vector_modes.length ()
9078 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9080 if (dump_enabled_p ())
9081 dump_printf_loc (MSG_NOTE, vect_location,
9082 "***** The result for vector mode %s would"
9083 " be the same\n",
9084 GET_MODE_NAME (vector_modes[mode_i]));
9085 mode_i += 1;
9088 delete bb_vinfo;
9090 if (mode_i < vector_modes.length ()
9091 && VECTOR_MODE_P (autodetected_vector_mode)
9092 && (related_vector_mode (vector_modes[mode_i],
9093 GET_MODE_INNER (autodetected_vector_mode))
9094 == autodetected_vector_mode)
9095 && (related_vector_mode (autodetected_vector_mode,
9096 GET_MODE_INNER (vector_modes[mode_i]))
9097 == vector_modes[mode_i]))
9099 if (dump_enabled_p ())
9100 dump_printf_loc (MSG_NOTE, vect_location,
9101 "***** Skipping vector mode %s, which would"
9102 " repeat the analysis for %s\n",
9103 GET_MODE_NAME (vector_modes[mode_i]),
9104 GET_MODE_NAME (autodetected_vector_mode));
9105 mode_i += 1;
9108 if (vectorized
9109 || mode_i == vector_modes.length ()
9110 || autodetected_vector_mode == VOIDmode
9111 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9112 vector sizes will fail do not bother iterating. */
9113 || fatal)
9114 return vectorized;
9116 /* Try the next biggest vector size. */
9117 next_vector_mode = vector_modes[mode_i++];
9118 if (dump_enabled_p ())
9119 dump_printf_loc (MSG_NOTE, vect_location,
9120 "***** Re-trying analysis with vector mode %s\n",
9121 GET_MODE_NAME (next_vector_mode));
9126 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
9127 true if anything in the basic-block was vectorized. */
9129 static bool
9130 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9132 vec<data_reference_p> datarefs = vNULL;
9133 auto_vec<int> dataref_groups;
9134 int insns = 0;
9135 int current_group = 0;
9137 for (unsigned i = 0; i < bbs.length (); i++)
9139 basic_block bb = bbs[i];
9140 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9141 gsi_next (&gsi))
9143 gimple *stmt = gsi_stmt (gsi);
9144 if (is_gimple_debug (stmt))
9145 continue;
9147 insns++;
9149 if (gimple_location (stmt) != UNKNOWN_LOCATION)
9150 vect_location = stmt;
9152 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9153 &dataref_groups, current_group))
9154 ++current_group;
9156 /* New BBs always start a new DR group. */
9157 ++current_group;
9160 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9163 /* Special entry for the BB vectorizer. Analyze and transform a single
9164 if-converted BB with ORIG_LOOPs body being the not if-converted
9165 representation. Returns true if anything in the basic-block was
9166 vectorized. */
9168 bool
9169 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9171 auto_vec<basic_block> bbs;
9172 bbs.safe_push (bb);
9173 return vect_slp_bbs (bbs, orig_loop);
9176 /* Main entry for the BB vectorizer. Analyze and transform FUN, returning
9177 true if anything in the function was vectorized. */
9179 bool
9180 vect_slp_function (function *fun)
9182 bool r = false;
9183 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9184 auto_bitmap exit_bbs;
9185 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9186 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9187 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9188 true, rpo, NULL);
9190 /* For the moment split the function into pieces to avoid making
9191 the iteration on the vector mode moot. Split at points we know
9192 to not handle well, which are CFG merges (SLP discovery doesn't
9193 handle non-loop-header PHIs) and loop exits. Since pattern
9194 recog requires reverse iteration to visit uses before defs
9195 simply chop RPO into pieces. */
9196 auto_vec<basic_block> bbs;
9197 for (unsigned i = 0; i < n; i++)
9199 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9200 bool split = false;
9202 /* Split when a BB is not dominated by the first block. */
9203 if (!bbs.is_empty ()
9204 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9208 "splitting region at dominance boundary bb%d\n",
9209 bb->index);
9210 split = true;
9212 /* Split when the loop determined by the first block
9213 is exited. This is because we eventually insert
9214 invariants at region begin. */
9215 else if (!bbs.is_empty ()
9216 && bbs[0]->loop_father != bb->loop_father
9217 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9219 if (dump_enabled_p ())
9220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9221 "splitting region at loop %d exit at bb%d\n",
9222 bbs[0]->loop_father->num, bb->index);
9223 split = true;
9225 else if (!bbs.is_empty ()
9226 && bb->loop_father->header == bb
9227 && bb->loop_father->dont_vectorize)
9229 if (dump_enabled_p ())
9230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9231 "splitting region at dont-vectorize loop %d "
9232 "entry at bb%d\n",
9233 bb->loop_father->num, bb->index);
9234 split = true;
9237 if (split && !bbs.is_empty ())
9239 r |= vect_slp_bbs (bbs, NULL);
9240 bbs.truncate (0);
9243 if (bbs.is_empty ())
9245 /* We need to be able to insert at the head of the region which
9246 we cannot do for a region starting with a returns-twice call. */
9247 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9248 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9250 if (dump_enabled_p ())
9251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9252 "skipping bb%d as start of region as it "
9253 "starts with returns-twice call\n",
9254 bb->index);
9255 continue;
9257 /* If the loop this BB belongs to is marked as not to be vectorized
9258 honor that also for BB vectorization. */
9259 if (bb->loop_father->dont_vectorize)
9260 continue;
9263 bbs.safe_push (bb);
9265 /* When we have a stmt ending this block and defining a
9266 value we have to insert on edges when inserting after it for
9267 a vector containing its definition. Avoid this for now. */
9268 if (gimple *last = *gsi_last_bb (bb))
9269 if (gimple_get_lhs (last)
9270 && is_ctrl_altering_stmt (last))
9272 if (dump_enabled_p ())
9273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9274 "splitting region at control altering "
9275 "definition %G", last);
9276 r |= vect_slp_bbs (bbs, NULL);
9277 bbs.truncate (0);
9281 if (!bbs.is_empty ())
9282 r |= vect_slp_bbs (bbs, NULL);
9284 free (rpo);
9286 return r;
9289 /* Build a variable-length vector in which the elements in ELTS are repeated
9290 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9291 RESULTS and add any new instructions to SEQ.
9293 The approach we use is:
9295 (1) Find a vector mode VM with integer elements of mode IM.
9297 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9298 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9299 from small vectors to IM.
9301 (3) Duplicate each ELTS'[I] into a vector of mode VM.
9303 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9304 correct byte contents.
9306 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9308 We try to find the largest IM for which this sequence works, in order
9309 to cut down on the number of interleaves. */
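 /* A worked example (illustrative; the concrete modes depend on the
    target): take ELTS = { a, b, c, d } of type short and assume
    can_duplicate_and_interleave_p picks an integer mode IM twice the
    size of short, so NELTS' == 2 and NVECTORS == 2.  Step (2)
    view-converts { a, b } and { c, d } into one IM element each,
    step (3) duplicates each of them across a VM vector, a single
    interleave in step (4) yields the byte pattern a b c d a b c d ...
    and step (5) reinterprets that as VECTOR_TYPE.  */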
9311 void
9312 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9313 const vec<tree> &elts, unsigned int nresults,
9314 vec<tree> &results)
9316 unsigned int nelts = elts.length ();
9317 tree element_type = TREE_TYPE (vector_type);
9319 /* (1) Find a vector mode VM with integer elements of mode IM. */
9320 unsigned int nvectors = 1;
9321 tree new_vector_type;
9322 tree permutes[2];
9323 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9324 &nvectors, &new_vector_type,
9325 permutes))
9326 gcc_unreachable ();
9328 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9329 unsigned int partial_nelts = nelts / nvectors;
9330 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9332 tree_vector_builder partial_elts;
9333 auto_vec<tree, 32> pieces (nvectors * 2);
9334 pieces.quick_grow_cleared (nvectors * 2);
9335 for (unsigned int i = 0; i < nvectors; ++i)
9337 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9338 ELTS' has mode IM. */
9339 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9340 for (unsigned int j = 0; j < partial_nelts; ++j)
9341 partial_elts.quick_push (elts[i * partial_nelts + j]);
9342 tree t = gimple_build_vector (seq, &partial_elts);
9343 t = gimple_build (seq, VIEW_CONVERT_EXPR,
9344 TREE_TYPE (new_vector_type), t);
9346 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9347 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9350 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9351 correct byte contents.
9353 Conceptually, we need to repeat the following operation log2(nvectors)
9354 times, where hi_start = nvectors / 2:
9356 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9357 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9359 However, if each input repeats every N elements and the VF is
9360 a multiple of N * 2, the HI result is the same as the LO result.
9361 This will be true for the first N1 iterations of the outer loop,
9362 followed by N2 iterations for which both the LO and HI results
9363 are needed. I.e.:
9365 N1 + N2 = log2(nvectors)
9367 Each "N1 iteration" doubles the number of redundant vectors and the
9368 effect of the process as a whole is to have a sequence of nvectors/2**N1
9369 vectors that repeats 2**N1 times. Rather than generate these redundant
9370 vectors, we halve the number of vectors for each N1 iteration. */
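     /* For instance (an illustrative reading of the above): with
	NVECTORS == 4 the outer loop runs log2(4) == 2 times.  If the
	HI == LO condition holds for the first iteration only
	(N1 == 1, N2 == 1), that iteration keeps just the LO outputs
	and NEW_NVECTORS drops from 4 to 2; the second iteration keeps
	both LO and HI outputs and step (5) below re-uses the resulting
	vectors cyclically to fill the NRESULTS requested results.  */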
9371 unsigned int in_start = 0;
9372 unsigned int out_start = nvectors;
9373 unsigned int new_nvectors = nvectors;
9374 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9376 unsigned int hi_start = new_nvectors / 2;
9377 unsigned int out_i = 0;
9378 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9380 if ((in_i & 1) != 0
9381 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9382 2 * in_repeat))
9383 continue;
9385 tree output = make_ssa_name (new_vector_type);
9386 tree input1 = pieces[in_start + (in_i / 2)];
9387 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9388 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9389 input1, input2,
9390 permutes[in_i & 1]);
9391 gimple_seq_add_stmt (seq, stmt);
9392 pieces[out_start + out_i] = output;
9393 out_i += 1;
9395 std::swap (in_start, out_start);
9396 new_nvectors = out_i;
9399 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9400 results.reserve (nresults);
9401 for (unsigned int i = 0; i < nresults; ++i)
9402 if (i < new_nvectors)
9403 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
9404 pieces[in_start + i]));
9405 else
9406 results.quick_push (results[i - new_nvectors]);
9410 /* For constant and loop invariant defs in OP_NODE this function creates
9411 vector defs that will be used in the vectorized stmts and stores them
9412 to SLP_TREE_VEC_DEFS of OP_NODE. */
9414 static void
9415 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
9417 unsigned HOST_WIDE_INT nunits;
9418 tree vec_cst;
9419 unsigned j, number_of_places_left_in_vector;
9420 tree vector_type;
9421 tree vop;
9422 int group_size = op_node->ops.length ();
9423 unsigned int vec_num, i;
9424 unsigned number_of_copies = 1;
9425 bool constant_p;
9426 gimple_seq ctor_seq = NULL;
9427 auto_vec<tree, 16> permute_results;
9429 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
9430 vector_type = SLP_TREE_VECTYPE (op_node);
9432 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
9433 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
9434 auto_vec<tree> voprnds (number_of_vectors);
9436 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
9437 created vectors. It is greater than 1 if unrolling is performed.
9439 For example, we have two scalar operands, s1 and s2 (e.g., group of
9440 strided accesses of size two), while NUNITS is four (i.e., four scalars
9441 of this type can be packed in a vector). The output vector will contain
9442 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
9443 will be 2).
9445 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
9446 containing the operands.
9448 For example, NUNITS is four as before, and the group size is 8
9449 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
9450 {s5, s6, s7, s8}. */
9452 /* When using duplicate_and_interleave, we just need one element for
9453 each scalar statement. */
9454 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
9455 nunits = group_size;
9457 number_of_copies = nunits * number_of_vectors / group_size;
9459 number_of_places_left_in_vector = nunits;
9460 constant_p = true;
9461 tree uniform_elt = NULL_TREE;
9462 tree_vector_builder elts (vector_type, nunits, 1);
9463 elts.quick_grow (nunits);
9464 stmt_vec_info insert_after = NULL;
9465 for (j = 0; j < number_of_copies; j++)
9467 tree op;
9468 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
9470 /* Create 'vect_ = {op0,op1,...,opn}'. */
9471 tree orig_op = op;
9472 if (number_of_places_left_in_vector == nunits)
9473 uniform_elt = op;
9474 else if (uniform_elt && operand_equal_p (uniform_elt, op))
9475 op = elts[number_of_places_left_in_vector];
9476 else
9477 uniform_elt = NULL_TREE;
9478 number_of_places_left_in_vector--;
9479 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
9481 if (CONSTANT_CLASS_P (op))
9483 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9485 /* Can't use VIEW_CONVERT_EXPR for booleans because
9486 of possibly different sizes of scalar value and
9487 vector element. */
9488 if (integer_zerop (op))
9489 op = build_int_cst (TREE_TYPE (vector_type), 0);
9490 else if (integer_onep (op))
9491 op = build_all_ones_cst (TREE_TYPE (vector_type));
9492 else
9493 gcc_unreachable ();
9495 else
9496 op = fold_unary (VIEW_CONVERT_EXPR,
9497 TREE_TYPE (vector_type), op);
9498 gcc_assert (op && CONSTANT_CLASS_P (op));
9500 else
9502 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
9503 gimple *init_stmt;
9504 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9506 tree true_val
9507 = build_all_ones_cst (TREE_TYPE (vector_type));
9508 tree false_val
9509 = build_zero_cst (TREE_TYPE (vector_type));
9510 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
9511 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
9512 op, true_val,
9513 false_val);
9515 else
9517 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
9518 op);
9519 init_stmt
9520 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
9521 op);
9523 gimple_seq_add_stmt (&ctor_seq, init_stmt);
9524 op = new_temp;
9527 elts[number_of_places_left_in_vector] = op;
9528 if (!CONSTANT_CLASS_P (op))
9529 constant_p = false;
9530 /* For BB vectorization we have to compute an insert location
9531 when a def is inside the analyzed region since we cannot
9532 simply insert at the BB start in this case. */
9533 stmt_vec_info opdef;
9534 if (TREE_CODE (orig_op) == SSA_NAME
9535 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
9536 && is_a <bb_vec_info> (vinfo)
9537 && (opdef = vinfo->lookup_def (orig_op)))
9539 if (!insert_after)
9540 insert_after = opdef;
9541 else
9542 insert_after = get_later_stmt (insert_after, opdef);
9545 if (number_of_places_left_in_vector == 0)
9547 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
9548 if (uniform_elt)
9549 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
9550 elts[0]);
9551 else if (constant_p
9552 ? multiple_p (type_nunits, nunits)
9553 : known_eq (type_nunits, nunits))
9554 vec_cst = gimple_build_vector (&ctor_seq, &elts);
9555 else
9557 if (permute_results.is_empty ())
9558 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
9559 elts, number_of_vectors,
9560 permute_results);
9561 vec_cst = permute_results[number_of_vectors - j - 1];
9563 if (!gimple_seq_empty_p (ctor_seq))
9565 if (insert_after)
9567 gimple_stmt_iterator gsi;
9568 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
9570 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
9571 gsi_insert_seq_before (&gsi, ctor_seq,
9572 GSI_CONTINUE_LINKING);
9574 else if (!stmt_ends_bb_p (insert_after->stmt))
9576 gsi = gsi_for_stmt (insert_after->stmt);
9577 gsi_insert_seq_after (&gsi, ctor_seq,
9578 GSI_CONTINUE_LINKING);
9580 else
9582 /* When we want to insert after a def where the
9583 defining stmt throws then insert on the fallthru
9584 edge. */
9585 edge e = find_fallthru_edge
9586 (gimple_bb (insert_after->stmt)->succs);
9587 basic_block new_bb
9588 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
9589 gcc_assert (!new_bb);
9592 else
9593 vinfo->insert_seq_on_entry (NULL, ctor_seq);
9594 ctor_seq = NULL;
9596 voprnds.quick_push (vec_cst);
9597 insert_after = NULL;
9598 number_of_places_left_in_vector = nunits;
9599 constant_p = true;
9600 elts.new_vector (vector_type, nunits, 1);
9601 elts.quick_grow (nunits);
9606 /* Since the vectors are created in the reverse order, we should invert
9607 them. */
9608 vec_num = voprnds.length ();
9609 for (j = vec_num; j != 0; j--)
9611 vop = voprnds[j - 1];
9612 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9615 /* If VF is greater than the unrolling factor needed for the SLP
9616 group of stmts, NUMBER_OF_VECTORS to be created is greater than
9617 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
9618 to replicate the vectors. */
9619 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
9620 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
9621 i++)
9622 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9625 /* Get the Ith vectorized definition from SLP_NODE. */
9627 tree
9628 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
9630 return SLP_TREE_VEC_DEFS (slp_node)[i];
9633 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
9635 void
9636 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
9638 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
9639 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
9642 /* Get N vectorized definitions for SLP_NODE. */
9644 void
9645 vect_get_slp_defs (vec_info *,
9646 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
9648 if (n == -1U)
9649 n = SLP_TREE_CHILDREN (slp_node).length ();
9651 for (unsigned i = 0; i < n; ++i)
9653 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9654 vec<tree> vec_defs = vNULL;
9655 vect_get_slp_defs (child, &vec_defs);
9656 vec_oprnds->quick_push (vec_defs);
9660 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
9661 - PERM gives the permutation that the caller wants to use for NODE,
9662 which might be different from SLP_LOAD_PERMUTATION.
9663 - DUMP_P controls whether the function dumps information. */
9665 static bool
9666 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
9667 load_permutation_t &perm,
9668 const vec<tree> &dr_chain,
9669 gimple_stmt_iterator *gsi, poly_uint64 vf,
9670 bool analyze_only, bool dump_p,
9671 unsigned *n_perms, unsigned int *n_loads,
9672 bool dce_chain)
9674 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9675 int vec_index = 0;
9676 tree vectype = SLP_TREE_VECTYPE (node);
9677 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
9678 unsigned int mask_element;
9679 unsigned dr_group_size;
9680 machine_mode mode;
9682 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
9683 dr_group_size = 1;
9684 else
9686 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9687 dr_group_size = DR_GROUP_SIZE (stmt_info);
9690 mode = TYPE_MODE (vectype);
9691 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9692 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9694 /* Initialize the vect stmts of NODE to properly insert the generated
9695 stmts later. */
9696 if (! analyze_only)
9697 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
9698 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
9700 /* Generate permutation masks for every NODE. Number of masks for each NODE
9701 is equal to GROUP_SIZE.
9702 E.g., we have a group of three nodes with three loads from the same
9703 location in each node, and the vector size is 4. I.e., we have an
9704 a0b0c0a1b1c1... sequence and we need to create the following vectors:
9705 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
9706 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
9709 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
9710 The last mask is illegal since we assume two operands for permute
9711 operation, and the mask element values can't be outside that range.
9712 Hence, the last mask must be converted into {2,5,5,5}.
9713 For the first two permutations we need the first and the second input
9714 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
9715 we need the second and the third vectors: {b1,c1,a2,b2} and
9716 {c2,a3,b3,c3}. */
9718 int vect_stmts_counter = 0;
9719 unsigned int index = 0;
9720 int first_vec_index = -1;
9721 int second_vec_index = -1;
9722 bool noop_p = true;
9723 *n_perms = 0;
9725 vec_perm_builder mask;
9726 unsigned int nelts_to_build;
9727 unsigned int nvectors_per_build;
9728 unsigned int in_nlanes;
9729 bool repeating_p = (group_size == dr_group_size
9730 && multiple_p (nunits, group_size));
9731 if (repeating_p)
9733 /* A single vector contains a whole number of copies of the node, so:
9734 (a) all permutes can use the same mask; and
9735 (b) the permutes only need a single vector input. */
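      /* A small worked case (illustrative): for PERM = { 1, 0 } with
	 GROUP_SIZE == DR_GROUP_SIZE == 2 the loop below computes
	 i = iter_num * DR_GROUP_SIZE + perm[stmt_num] and so encodes
	 the mask as { 1, 0, 3, 2, 5, 4 }, the stepped pattern that
	 swaps each pair of lanes across the whole vector.  */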
9736 mask.new_vector (nunits, group_size, 3);
9737 nelts_to_build = mask.encoded_nelts ();
9738 /* It's possible to obtain zero nstmts during analyze_only, so make
9739 it at least one to ensure the later computation for n_perms
9740 proceeds. */
9741 nvectors_per_build = nstmts > 0 ? nstmts : 1;
9742 in_nlanes = dr_group_size * 3;
9744 else
9746 /* We need to construct a separate mask for each vector statement. */
9747 unsigned HOST_WIDE_INT const_nunits, const_vf;
9748 if (!nunits.is_constant (&const_nunits)
9749 || !vf.is_constant (&const_vf))
9750 return false;
9751 mask.new_vector (const_nunits, const_nunits, 1);
9752 nelts_to_build = const_vf * group_size;
9753 nvectors_per_build = 1;
9754 in_nlanes = const_vf * dr_group_size;
9756 auto_sbitmap used_in_lanes (in_nlanes);
9757 bitmap_clear (used_in_lanes);
9758 auto_bitmap used_defs;
9760 unsigned int count = mask.encoded_nelts ();
9761 mask.quick_grow (count);
9762 vec_perm_indices indices;
9764 for (unsigned int j = 0; j < nelts_to_build; j++)
9766 unsigned int iter_num = j / group_size;
9767 unsigned int stmt_num = j % group_size;
9768 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
9769 bitmap_set_bit (used_in_lanes, i);
9770 if (repeating_p)
9772 first_vec_index = 0;
9773 mask_element = i;
9775 else
9777 /* Enforced before the loop when !repeating_p. */
9778 unsigned int const_nunits = nunits.to_constant ();
9779 vec_index = i / const_nunits;
9780 mask_element = i % const_nunits;
9781 if (vec_index == first_vec_index
9782 || first_vec_index == -1)
9784 first_vec_index = vec_index;
9786 else if (vec_index == second_vec_index
9787 || second_vec_index == -1)
9789 second_vec_index = vec_index;
9790 mask_element += const_nunits;
9792 else
9794 if (dump_p)
9795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9796 "permutation requires at "
9797 "least three vectors %G",
9798 stmt_info->stmt);
9799 gcc_assert (analyze_only);
9800 return false;
9803 gcc_assert (mask_element < 2 * const_nunits);
9806 if (mask_element != index)
9807 noop_p = false;
9808 mask[index++] = mask_element;
9810 if (index == count)
9812 if (!noop_p)
9814 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
9815 if (!can_vec_perm_const_p (mode, mode, indices))
9817 if (dump_p)
9819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9820 "unsupported vect permute { ");
9821 for (i = 0; i < count; ++i)
9823 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9824 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9826 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9828 gcc_assert (analyze_only);
9829 return false;
9832 tree mask_vec = NULL_TREE;
9833 if (!analyze_only)
9834 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9836 if (second_vec_index == -1)
9837 second_vec_index = first_vec_index;
9839 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9841 ++*n_perms;
9842 if (analyze_only)
9843 continue;
9844 /* Generate the permute statement if necessary. */
9845 tree first_vec = dr_chain[first_vec_index + ri];
9846 tree second_vec = dr_chain[second_vec_index + ri];
9847 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
9848 tree perm_dest
9849 = vect_create_destination_var (gimple_assign_lhs (stmt),
9850 vectype);
9851 perm_dest = make_ssa_name (perm_dest);
9852 gimple *perm_stmt
9853 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
9854 second_vec, mask_vec);
9855 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
9856 gsi);
9857 if (dce_chain)
9859 bitmap_set_bit (used_defs, first_vec_index + ri);
9860 bitmap_set_bit (used_defs, second_vec_index + ri);
9863 /* Store the vector statement in NODE. */
9864 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
9867 else if (!analyze_only)
9869 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9871 tree first_vec = dr_chain[first_vec_index + ri];
9872 /* If mask was NULL_TREE generate the requested
9873 identity transform. */
9874 if (dce_chain)
9875 bitmap_set_bit (used_defs, first_vec_index + ri);
9877 /* Store the vector statement in NODE. */
9878 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
9882 index = 0;
9883 first_vec_index = -1;
9884 second_vec_index = -1;
9885 noop_p = true;
9889 if (n_loads)
9891 if (repeating_p)
9892 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9893 else
9895 /* Enforced above when !repeating_p. */
9896 unsigned int const_nunits = nunits.to_constant ();
9897 *n_loads = 0;
9898 bool load_seen = false;
9899 for (unsigned i = 0; i < in_nlanes; ++i)
9901 if (i % const_nunits == 0)
9903 if (load_seen)
9904 *n_loads += 1;
9905 load_seen = false;
9907 if (bitmap_bit_p (used_in_lanes, i))
9908 load_seen = true;
9910 if (load_seen)
9911 *n_loads += 1;
9915 if (dce_chain)
9916 for (unsigned i = 0; i < dr_chain.length (); ++i)
9917 if (!bitmap_bit_p (used_defs, i))
9919 tree def = dr_chain[i];
9922 gimple *stmt = SSA_NAME_DEF_STMT (def);
9923 if (is_gimple_assign (stmt)
9924 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
9925 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
9926 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
9927 else
9928 def = NULL;
9929 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
9930 gsi_remove (&rgsi, true);
9931 release_defs (stmt);
9933 while (def);
9936 return true;
9939 /* Generate vector permute statements from a list of loads in DR_CHAIN.
9940 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
9941 permute statements for the SLP node NODE. Store the number of vector
9942 permute instructions in *N_PERMS and the number of vector load
9943 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
9944 that were not needed. */
9946 bool
9947 vect_transform_slp_perm_load (vec_info *vinfo,
9948 slp_tree node, const vec<tree> &dr_chain,
9949 gimple_stmt_iterator *gsi, poly_uint64 vf,
9950 bool analyze_only, unsigned *n_perms,
9951 unsigned int *n_loads, bool dce_chain)
9953 return vect_transform_slp_perm_load_1 (vinfo, node,
9954 SLP_TREE_LOAD_PERMUTATION (node),
9955 dr_chain, gsi, vf, analyze_only,
9956 dump_enabled_p (), n_perms, n_loads,
9957 dce_chain);
9960 /* Produce the next vector result for SLP permutation NODE by adding a vector
9961 statement at GSI. If MASK_VEC is nonnull, add:
9963 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
9965 otherwise add:
9967 <new SSA name> = FIRST_DEF. */
9969 static void
9970 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9971 slp_tree node, tree first_def, tree second_def,
9972 tree mask_vec, poly_uint64 identity_offset)
9974 tree vectype = SLP_TREE_VECTYPE (node);
9976 /* ??? We SLP match existing vector element extracts but
9977 allow punning which we need to re-instantiate at uses
9978 but have no good way of explicitly representing. */
9979 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
9980 && !types_compatible_p (TREE_TYPE (first_def), vectype))
9982 gassign *conv_stmt
9983 = gimple_build_assign (make_ssa_name (vectype),
9984 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
9985 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
9986 first_def = gimple_assign_lhs (conv_stmt);
9988 gassign *perm_stmt;
9989 tree perm_dest = make_ssa_name (vectype);
9990 if (mask_vec)
9992 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
9993 TYPE_SIZE (vectype))
9994 && !types_compatible_p (TREE_TYPE (second_def), vectype))
9996 gassign *conv_stmt
9997 = gimple_build_assign (make_ssa_name (vectype),
9998 build1 (VIEW_CONVERT_EXPR,
9999 vectype, second_def));
10000 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10001 second_def = gimple_assign_lhs (conv_stmt);
10003 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10004 first_def, second_def,
10005 mask_vec);
10007 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10009 /* For identity permutes we still need to handle the case
10010 of offsetted extracts or concats. */
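      /* Both cases as hypothetical GIMPLE (the vector types are
	 illustrative only): extracting the high half of a V8SI
	 FIRST_DEF into a V4SI result at IDENTITY_OFFSET 4 becomes

	   perm_dest_1 = BIT_FIELD_REF <first_def_2, 128, 128>;

	 while concatenating two V4SI inputs into a V8SI result becomes

	   perm_dest_1 = {first_def_2, second_def_3};  */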
10011 unsigned HOST_WIDE_INT c;
10012 auto first_def_nunits
10013 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10014 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10016 unsigned HOST_WIDE_INT elsz
10017 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10018 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10019 TYPE_SIZE (vectype),
10020 bitsize_int (identity_offset * elsz));
10021 perm_stmt = gimple_build_assign (perm_dest, lowpart);
10023 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10024 first_def_nunits, &c) && c == 2)
10026 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10027 NULL_TREE, second_def);
10028 perm_stmt = gimple_build_assign (perm_dest, ctor);
10030 else
10031 gcc_unreachable ();
10033 else
10035 /* We need a copy here in case the def was external. */
10036 perm_stmt = gimple_build_assign (perm_dest, first_def);
10038 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10039 /* Store the vector statement in NODE. */
10040 node->push_vec_def (perm_stmt);
10043 /* Subroutine of vectorizable_slp_permutation. Check whether the target
10044 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10045 If GSI is nonnull, emit the permutation there.
10047 When GSI is null, the only purpose of NODE is to give properties
10048 of the result, such as the vector type and number of SLP lanes.
10049 The node does not need to be a VEC_PERM_EXPR.
10051 If the target supports the operation, return the number of individual
10052 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10053 dump file if DUMP_P is true. */
10055 static int
10056 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10057 slp_tree node, lane_permutation_t &perm,
10058 vec<slp_tree> &children, bool dump_p)
10060 tree vectype = SLP_TREE_VECTYPE (node);
10062 /* ??? We currently only support all same vector input types
10063 while the SLP IL should really do a concat + select and thus accept
10064 arbitrary mismatches. */
10065 slp_tree child;
10066 unsigned i;
10067 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10068 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10069 tree op_vectype = NULL_TREE;
10070 FOR_EACH_VEC_ELT (children, i, child)
10071 if (SLP_TREE_VECTYPE (child))
10073 op_vectype = SLP_TREE_VECTYPE (child);
10074 break;
10076 if (!op_vectype)
10077 op_vectype = vectype;
10078 FOR_EACH_VEC_ELT (children, i, child)
10080 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10081 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10082 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10083 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10085 if (dump_p)
10086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10087 "Unsupported vector types in lane permutation\n");
10088 return -1;
10090 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
10091 repeating_p = false;
10094 gcc_assert (perm.length () == SLP_TREE_LANES (node));
10096 /* Load-lanes permute. This permute only acts as a forwarder to
10097 select the correct vector def of the load-lanes load which
10098 has the permuted vectors in its vector defs like
10099 { v0, w0, r0, v1, w1, r1 ... } for a ld3. */
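  /* A worked instance (lane counts illustrative): for the ld3 above
     with SLP_TREE_LANES (node) == 4 and SLP_TREE_LANES (child) == 12,
     a first permute index of 4 gives vec_idx == 1 and vec_num == 3,
     so the loop below forwards defs [i * 3 + 1], i.e. w0, w1, ...  */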
10100 if (node->ldst_lanes)
10102 gcc_assert (children.length () == 1);
10103 if (!gsi)
10104 /* This is a trivial op always supported. */
10105 return 1;
10106 slp_tree child = children[0];
10107 unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10108 / SLP_TREE_LANES (node));
10109 unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
10110 for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10112 tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10113 node->push_vec_def (def);
10115 return 1;
10118 /* REPEATING_P is true if every output vector is guaranteed to use the
10119 same permute vector. We can handle that case for both variable-length
10120 and constant-length vectors, but we only handle other cases for
10121 constant-length vectors.
10123 Set:
10125 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10126 mask vector that we want to build.
10128 - NCOPIES to the number of copies of PERM that we need in order
10129 to build the necessary permute mask vectors.
10131 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
10132 for each permute mask vector. This is only relevant when GSI is
10133 nonnull. */
10134 uint64_t npatterns;
10135 unsigned nelts_per_pattern;
10136 uint64_t ncopies;
10137 unsigned noutputs_per_mask;
10138 if (repeating_p)
10140 /* We need a single permute mask vector that has the form:
10142 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10144 In other words, the original n-element permute in PERM is
10145 "unrolled" to fill a full vector. The stepped vector encoding
10146 that we use for permutes requires 3n elements. */
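      /* E.g. (illustrative): a two-lane reverse { op0[1], op0[0] } is
	 encoded with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 as
	 { 1, 0, 3, 2, 5, 4 }, from which the full swap-adjacent-lanes
	 mask is expanded for whatever the actual vector length is.  */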
10147 npatterns = SLP_TREE_LANES (node);
10148 nelts_per_pattern = ncopies = 3;
10149 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10151 else
10153 /* Calculate every element of every permute mask vector explicitly,
10154 instead of relying on the pattern described above. */
10155 if (!nunits.is_constant (&npatterns)
10156 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10157 return -1;
10158 nelts_per_pattern = ncopies = 1;
10159 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
10160 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10161 return -1;
10162 noutputs_per_mask = 1;
10164 unsigned olanes = ncopies * SLP_TREE_LANES (node);
10165 gcc_assert (repeating_p || multiple_p (olanes, nunits));
10167 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10168 from the { SLP operand, scalar lane } permutation as recorded in the
10169 SLP node as intermediate step. This part should already work
10170 with SLP children with arbitrary number of lanes. */
10171 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
10172 auto_vec<unsigned> active_lane;
10173 vperm.create (olanes);
10174 active_lane.safe_grow_cleared (children.length (), true);
10175 for (unsigned i = 0; i < ncopies; ++i)
10177 for (unsigned pi = 0; pi < perm.length (); ++pi)
10179 std::pair<unsigned, unsigned> p = perm[pi];
10180 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10181 if (repeating_p)
10182 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
10183 else
10185 /* We checked above that the vectors are constant-length. */
10186 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
10187 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
10188 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
10189 vperm.quick_push ({{p.first, vi}, vl});
10192 /* Advance to the next group. */
10193 for (unsigned j = 0; j < children.length (); ++j)
10194 active_lane[j] += SLP_TREE_LANES (children[j]);
10197 if (dump_p)
10199 dump_printf_loc (MSG_NOTE, vect_location,
10200 "vectorizing permutation");
10201 for (unsigned i = 0; i < perm.length (); ++i)
10202 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10203 if (repeating_p)
10204 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
10205 dump_printf (MSG_NOTE, "\n");
10206 dump_printf_loc (MSG_NOTE, vect_location, "as");
10207 for (unsigned i = 0; i < vperm.length (); ++i)
10209 if (i != 0
10210 && (repeating_p
10211 ? multiple_p (i, npatterns)
10212 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10213 dump_printf (MSG_NOTE, ",");
10214 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
10215 vperm[i].first.first, vperm[i].first.second,
10216 vperm[i].second);
10218 dump_printf (MSG_NOTE, "\n");
10221 /* We can only handle two-vector permutes, everything else should
10222 be lowered on the SLP level. The following is closely inspired
10223 by vect_transform_slp_perm_load and is supposed to eventually
10224 replace it.
10225 ??? As intermediate step do code-gen in the SLP tree representation
10226 somehow? */
10227 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10228 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10229 unsigned int index = 0;
10230 poly_uint64 mask_element;
10231 vec_perm_builder mask;
10232 mask.new_vector (nunits, npatterns, nelts_per_pattern);
10233 unsigned int count = mask.encoded_nelts ();
10234 mask.quick_grow (count);
10235 vec_perm_indices indices;
10236 unsigned nperms = 0;
10237 for (unsigned i = 0; i < vperm.length (); ++i)
10239 mask_element = vperm[i].second;
10240 if (first_vec.first == -1U
10241 || first_vec == vperm[i].first)
10242 first_vec = vperm[i].first;
10243 else if (second_vec.first == -1U
10244 || second_vec == vperm[i].first)
10246 second_vec = vperm[i].first;
10247 mask_element += nunits;
10249 else
10251 if (dump_p)
10252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10253 "permutation requires at "
10254 "least three vectors\n");
10255 gcc_assert (!gsi);
10256 return -1;
10259 mask[index++] = mask_element;
10261 if (index == count)
10263 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10264 TYPE_VECTOR_SUBPARTS (op_vectype));
10265 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10266 && constant_multiple_p (mask[0], nunits));
10267 machine_mode vmode = TYPE_MODE (vectype);
10268 machine_mode op_vmode = TYPE_MODE (op_vectype);
10269 unsigned HOST_WIDE_INT c;
10270 if ((!identity_p
10271 && !can_vec_perm_const_p (vmode, op_vmode, indices))
10272 || (identity_p
10273 && !known_le (nunits,
10274 TYPE_VECTOR_SUBPARTS (op_vectype))
10275 && (!constant_multiple_p (nunits,
10276 TYPE_VECTOR_SUBPARTS (op_vectype),
10277 &c) || c != 2)))
10279 if (dump_p)
10281 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10282 vect_location,
10283 "unsupported vect permute { ");
10284 for (i = 0; i < count; ++i)
10286 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10287 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10289 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10291 gcc_assert (!gsi);
10292 return -1;
10295 if (!identity_p)
10296 nperms++;
10297 if (gsi)
10299 if (second_vec.first == -1U)
10300 second_vec = first_vec;
10302 slp_tree
10303 first_node = children[first_vec.first],
10304 second_node = children[second_vec.first];
10306 tree mask_vec = NULL_TREE;
10307 if (!identity_p)
10308 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10310 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
10312 tree first_def
10313 = vect_get_slp_vect_def (first_node,
10314 first_vec.second + vi);
10315 tree second_def
10316 = vect_get_slp_vect_def (second_node,
10317 second_vec.second + vi);
10318 vect_add_slp_permutation (vinfo, gsi, node, first_def,
10319 second_def, mask_vec, mask[0]);
10323 index = 0;
10324 first_vec = std::make_pair (-1U, -1U);
10325 second_vec = std::make_pair (-1U, -1U);
10329 return nperms;
10332 /* Vectorize the SLP permutations in NODE as specified
10333 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
10334 child number and lane number.
10335 Interleaving of two two-lane two-child SLP subtrees (not supported):
10336 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
10337 A blend of two four-lane two-child SLP subtrees:
10338 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
10339 Highpart of a four-lane one-child SLP subtree (not supported):
10340 [ { 0, 2 }, { 0, 3 } ]
10341 Where currently only a subset is supported by the code generated below. */
10343 static bool
10344 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10345 slp_tree node, stmt_vector_for_cost *cost_vec)
10347 tree vectype = SLP_TREE_VECTYPE (node);
10348 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
10349 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
10350 SLP_TREE_CHILDREN (node),
10351 dump_enabled_p ());
10352 if (nperms < 0)
10353 return false;
10355 if (!gsi)
10356 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
10358 return true;
10361 /* Vectorize SLP NODE. */
10363 static void
10364 vect_schedule_slp_node (vec_info *vinfo,
10365 slp_tree node, slp_instance instance)
10367 gimple_stmt_iterator si;
10368 int i;
10369 slp_tree child;
10371 /* Vectorize externals and constants. */
10372 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
10373 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
10375 /* ??? vectorizable_shift can end up using a scalar operand which is
10376 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
10377 node in this case. */
10378 if (!SLP_TREE_VECTYPE (node))
10379 return;
10381 /* There are two reasons vector defs might already exist. The first
10382 is that we are vectorizing an existing vector def. The second is
10383 when performing BB vectorization shared constant/external nodes
10384 are not split apart during partitioning so during the code-gen
10385 DFS walk we can end up visiting them twice. */
10386 if (! SLP_TREE_VEC_DEFS (node).exists ())
10387 vect_create_constant_vectors (vinfo, node);
10388 return;
10391 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
10393 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
10395 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
10396 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
10398 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10399 && STMT_VINFO_DATA_REF (stmt_info))
10401 /* Vectorized loads go before the first scalar load to make it
10402 ready early, vectorized stores go before the last scalar
10403 stmt which is where all uses are ready. */
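      /* A hypothetical GIMPLE picture of the above:

	   x_1 = a[0];    <-- vector loads are emitted before this,
	   x_2 = a[1];        the first scalar load of the group
	   ...
	   b[0] = y_3;
	   b[1] = z_4;    <-- the vector store is emitted before this,
			      the last scalar store, where y_3 and z_4
			      are both available.  */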
10404 stmt_vec_info last_stmt_info = NULL;
10405 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
10406 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
10407 else /* DR_IS_WRITE */
10408 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
10409 si = gsi_for_stmt (last_stmt_info->stmt);
10411 else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10412 && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
10413 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
10414 || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
10416 /* For PHI node vectorization we do not use the insertion iterator. */
10417 si = gsi_none ();
10419 else
10421 /* Emit other stmts after the children vectorized defs which is
10422 earliest possible. */
10423 gimple *last_stmt = NULL;
10424 bool seen_vector_def = false;
10425 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10426 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
10428 /* For fold-left reductions we are retaining the scalar
10429 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
10430 set so the representation isn't perfect. Resort to the
10431 last scalar def here. */
10432 if (SLP_TREE_VEC_DEFS (child).is_empty ())
10434 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
10435 == cycle_phi_info_type);
10436 gphi *phi = as_a <gphi *>
10437 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
10438 if (!last_stmt
10439 || vect_stmt_dominates_stmt_p (last_stmt, phi))
10440 last_stmt = phi;
10442 /* We are emitting all vectorized stmts in the same place and
10443 the last one is the last.
10444 ??? Unless we have a load permutation applied and that
10445 figures to re-use an earlier generated load. */
10446 unsigned j;
10447 tree vdef;
10448 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10450 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10451 if (!last_stmt
10452 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10453 last_stmt = vstmt;
10456 else if (!SLP_TREE_VECTYPE (child))
10458 /* For externals used unvectorized we have to look at all scalar defs. */
10459 unsigned j;
10460 tree def;
10461 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
10462 if (TREE_CODE (def) == SSA_NAME
10463 && !SSA_NAME_IS_DEFAULT_DEF (def))
10465 gimple *stmt = SSA_NAME_DEF_STMT (def);
10466 if (!last_stmt
10467 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
10468 last_stmt = stmt;
10471 else
10473 /* For externals we have to look at all defs since their
10474 insertion place is decided per vector. But beware
10475 of pre-existing vectors where we need to make sure
10476 we do not insert before the region boundary. */
10477 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
10478 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
10479 seen_vector_def = true;
10480 else
10482 unsigned j;
10483 tree vdef;
10484 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10485 if (TREE_CODE (vdef) == SSA_NAME
10486 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
10488 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10489 if (!last_stmt
10490 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10491 last_stmt = vstmt;
10495 /* This can happen when all children are pre-existing vectors or
10496 constants. */
10497 if (!last_stmt)
10498 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
10499 if (!last_stmt)
10501 gcc_assert (seen_vector_def);
10502 si = gsi_after_labels (vinfo->bbs[0]);
10504 else if (is_ctrl_altering_stmt (last_stmt))
10506 /* We split regions to vectorize at control-altering stmts
10507 with a definition, so this must be an external which
10508 we can insert at the start of the region. */
10509 si = gsi_after_labels (vinfo->bbs[0]);
10511 else if (is_a <bb_vec_info> (vinfo)
10512 && SLP_TREE_CODE (node) != VEC_PERM_EXPR
10513 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
10514 && gimple_could_trap_p (stmt_info->stmt))
10516 /* We've constrained possibly trapping operations to all come
10517 from the same basic block; even if vectorized defs would allow
10518 earlier scheduling, still force vectorized stmts to the original block.
10519 This is only necessary for BB vectorization since for loop vect
10520 all operations are in a single BB and scalar-stmt-based
10521 placement doesn't play well with epilogue vectorization. */
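/* Illustrative example (hypothetical): in a basic block containing
     t0 = x0 / y0;  t1 = x1 / y1;
   the division may trap, so even when the vectorized operands would
   be available in an earlier block the vector division is emitted
   after the labels of the original block instead of being hoisted.  */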
10522 gcc_assert (dominated_by_p (CDI_DOMINATORS,
10523 gimple_bb (stmt_info->stmt),
10524 gimple_bb (last_stmt)));
10525 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
10527 else if (is_a <gphi *> (last_stmt))
10528 si = gsi_after_labels (gimple_bb (last_stmt));
10529 else
10531 si = gsi_for_stmt (last_stmt);
10532 gsi_next (&si);
10534 /* Avoid scheduling internal defs outside of the loop when
10535 we might have only implicitly tracked loop mask/len defs. */
10536 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
10537 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10538 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10540 gimple_stmt_iterator si2
10541 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
10542 if ((gsi_end_p (si2)
10543 && (LOOP_VINFO_LOOP (loop_vinfo)->header
10544 != gimple_bb (last_stmt))
10545 && dominated_by_p (CDI_DOMINATORS,
10546 LOOP_VINFO_LOOP (loop_vinfo)->header,
10547 gimple_bb (last_stmt)))
10548 || (!gsi_end_p (si2)
10549 && last_stmt != *si2
10550 && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
10551 si = si2;
10556 /* Handle purely internal nodes. */
10557 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
10559 if (dump_enabled_p ())
10560 dump_printf_loc (MSG_NOTE, vect_location,
10561 "------>vectorizing SLP permutation node\n");
10562 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
10563 be shared with different SLP nodes (but usually it's the same
10564 operation, apart from the case where the stmt is only there to denote
10565 the actual scalar lane defs ...). So do not call vect_transform_stmt
10566 but open-code it here (partly). */
10567 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
10568 gcc_assert (done);
10569 stmt_vec_info slp_stmt_info;
10570 unsigned int i;
10571 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
10572 if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
10574 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
10575 instance, i, true, NULL);
10576 gcc_assert (done);
10579 else
10581 if (dump_enabled_p ())
10582 dump_printf_loc (MSG_NOTE, vect_location,
10583 "------>vectorizing SLP node starting from: %G",
10584 stmt_info->stmt);
10585 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
10589 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
10590 For loop vectorization this is done in vectorizable_call, but for SLP
10591 it needs to be deferred until the end of vect_schedule_slp, because multiple
10592 SLP instances may refer to the same scalar stmt. */
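/* For illustration only (a hypothetical example, not from the sources):
   after a call such as
     x_1 = sqrtf (a_2);
   has been vectorized, the scalar stmt is rewritten to
     x_1 = 0.0f;
   which keeps the SSA form valid until DCE removes the dead assignment;
   a call without a lhs is replaced by a GIMPLE_NOP instead.  */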
10594 static void
10595 vect_remove_slp_scalar_calls (vec_info *vinfo,
10596 slp_tree node, hash_set<slp_tree> &visited)
10598 gimple *new_stmt;
10599 gimple_stmt_iterator gsi;
10600 int i;
10601 slp_tree child;
10602 tree lhs;
10603 stmt_vec_info stmt_info;
10605 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10606 return;
10608 if (visited.add (node))
10609 return;
10611 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10612 vect_remove_slp_scalar_calls (vinfo, child, visited);
10614 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
10616 if (!stmt_info)
10617 continue;
10618 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
10619 if (!stmt || gimple_bb (stmt) == NULL)
10620 continue;
10621 if (is_pattern_stmt_p (stmt_info)
10622 || !PURE_SLP_STMT (stmt_info))
10623 continue;
10624 lhs = gimple_call_lhs (stmt);
10625 if (lhs)
10626 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
10627 else
10629 new_stmt = gimple_build_nop ();
10630 unlink_stmt_vdef (stmt_info->stmt);
10632 gsi = gsi_for_stmt (stmt);
10633 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
10634 if (lhs)
10635 SSA_NAME_DEF_STMT (lhs) = new_stmt;
10639 static void
10640 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
10642 hash_set<slp_tree> visited;
10643 vect_remove_slp_scalar_calls (vinfo, node, visited);
10646 /* Vectorize the instance root. */
10648 void
10649 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
10651 gassign *rstmt = NULL;
10653 if (instance->kind == slp_inst_kind_ctor)
10655 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
10657 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
10658 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10659 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
10660 TREE_TYPE (vect_lhs)))
10661 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
10662 vect_lhs);
10663 rstmt = gimple_build_assign (root_lhs, vect_lhs);
10665 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
10667 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10668 tree child_def;
10669 int j;
10670 vec<constructor_elt, va_gc> *v;
10671 vec_alloc (v, nelts);
10673 /* A CTOR can handle V16HI composition from VNx8HI so we
10674 do not need to convert vector elements if the types
10675 do not match. */
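/* E.g. (illustrative only): two V8HI vector defs can simply be
   appended as CONSTRUCTOR elements of a V16HI result; no per-element
   conversion is performed even when the piece vector types do not
   exactly match the result vector type.  */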
10676 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
10677 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
10678 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10679 tree rtype
10680 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
10681 tree r_constructor = build_constructor (rtype, v);
10682 rstmt = gimple_build_assign (lhs, r_constructor);
10685 else if (instance->kind == slp_inst_kind_bb_reduc)
10687 /* Largely inspired by reduction chain epilogue handling in
10688 vect_create_epilog_for_reduction. */
10689 vec<tree> vec_defs = vNULL;
10690 vect_get_slp_defs (node, &vec_defs);
10691 enum tree_code reduc_code
10692 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
10693 /* ??? We actually have to reflect signs somewhere. */
10694 if (reduc_code == MINUS_EXPR)
10695 reduc_code = PLUS_EXPR;
10696 gimple_seq epilogue = NULL;
10697 /* We may end up with more than one vector result; reduce them
10698 to one vector. */
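/* A rough sketch of the epilogue built below, assuming two vector
   defs and a PLUS_EXPR reduction (illustrative only):
     tmp_1 = vec_def0 + vec_def1;
     res_2 = .REDUC_PLUS (tmp_1);
   with the computation punned to the unsigned vector type first when
   signed overflow would be undefined, and any remaining scalar defs
   accumulated into the reduced result at the end.  */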
10699 tree vec_def = vec_defs[0];
10700 tree vectype = TREE_TYPE (vec_def);
10701 tree compute_vectype = vectype;
10702 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
10703 && TYPE_OVERFLOW_UNDEFINED (vectype)
10704 && operation_can_overflow (reduc_code));
10705 if (pun_for_overflow_p)
10707 compute_vectype = unsigned_type_for (vectype);
10708 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10709 compute_vectype, vec_def);
10711 for (unsigned i = 1; i < vec_defs.length (); ++i)
10713 tree def = vec_defs[i];
10714 if (pun_for_overflow_p)
10715 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10716 compute_vectype, def);
10717 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
10718 vec_def, def);
10720 vec_defs.release ();
10721 /* ??? Support schemes other than a direct internal fn. */
10722 internal_fn reduc_fn;
10723 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
10724 || reduc_fn == IFN_LAST)
10725 gcc_unreachable ();
10726 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
10727 TREE_TYPE (compute_vectype), vec_def);
10728 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
10730 tree rem_def = NULL_TREE;
10731 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
10733 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
10734 if (!rem_def)
10735 rem_def = def;
10736 else
10737 rem_def = gimple_build (&epilogue, reduc_code,
10738 TREE_TYPE (scalar_def),
10739 rem_def, def);
10741 scalar_def = gimple_build (&epilogue, reduc_code,
10742 TREE_TYPE (scalar_def),
10743 scalar_def, rem_def);
10745 scalar_def = gimple_convert (&epilogue,
10746 TREE_TYPE (vectype), scalar_def);
10747 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10748 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
10749 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
10750 update_stmt (gsi_stmt (rgsi));
10751 return;
10753 else
10754 gcc_unreachable ();
10756 gcc_assert (rstmt);
10758 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10759 gsi_replace (&rgsi, rstmt, true);
10762 struct slp_scc_info
10764 bool on_stack;
10765 int dfs;
10766 int lowlink;
10769 /* Schedule the SLP INSTANCE, doing a DFS walk and collecting SCCs. */
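/* The walk below is a variant of Tarjan's SCC algorithm: each node
   gets a DFS number and a lowlink, nodes stay on STACK while their SCC
   is open, and when lowlink == dfs the node closes an SCC which is then
   scheduled as a whole.  Cycles in the SLP graph go through PHI nodes,
   so (as an informal example) for
     x_phi -> x_add -> x_phi
   the PHI can be code generated first and its backedge value is filled
   in afterwards by the PHI fixup at the end of the function.  */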
10771 static void
10772 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
10773 hash_map<slp_tree, slp_scc_info> &scc_info,
10774 int &maxdfs, vec<slp_tree> &stack)
10776 bool existed_p;
10777 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
10778 gcc_assert (!existed_p);
10779 info->dfs = maxdfs;
10780 info->lowlink = maxdfs;
10781 maxdfs++;
10783 /* Leaf. */
10784 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10786 info->on_stack = false;
10787 vect_schedule_slp_node (vinfo, node, instance);
10788 return;
10791 info->on_stack = true;
10792 stack.safe_push (node);
10794 unsigned i;
10795 slp_tree child;
10796 /* DFS recurse. */
10797 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10799 if (!child)
10800 continue;
10801 slp_scc_info *child_info = scc_info.get (child);
10802 if (!child_info)
10804 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
10805 /* Recursion might have re-allocated the hash map entries, invalidating INFO. */
10806 info = scc_info.get (node);
10807 child_info = scc_info.get (child);
10808 info->lowlink = MIN (info->lowlink, child_info->lowlink);
10810 else if (child_info->on_stack)
10811 info->lowlink = MIN (info->lowlink, child_info->dfs);
10813 if (info->lowlink != info->dfs)
10814 return;
10816 auto_vec<slp_tree, 4> phis_to_fixup;
10818 /* Singleton. */
10819 if (stack.last () == node)
10821 stack.pop ();
10822 info->on_stack = false;
10823 vect_schedule_slp_node (vinfo, node, instance);
10824 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10825 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
10826 phis_to_fixup.quick_push (node);
10828 else
10830 /* SCC. */
10831 int last_idx = stack.length () - 1;
10832 while (stack[last_idx] != node)
10833 last_idx--;
10834 /* We can break the cycle at PHIs which have at least one
10835 code-generated child. Then we could re-start the DFS walk until
10836 all nodes in the SCC are covered (we might have new entries
10837 for only back-reachable nodes). But it's simpler to just
10838 iterate and schedule those that are ready. */
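/* Informal example: for an SCC { x_phi, x_add } where x_add uses x_phi
   and x_phi's backedge argument is x_add, the PHI already has its
   initial-value child code generated, so it counts as ready and is
   scheduled first; x_add then becomes ready in the following iteration
   and TODO drops to zero.  */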
10839 unsigned todo = stack.length () - last_idx;
10842 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
10844 slp_tree entry = stack[idx];
10845 if (!entry)
10846 continue;
10847 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
10848 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
10849 bool ready = !phi;
10850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
10851 if (!child)
10853 gcc_assert (phi);
10854 ready = true;
10855 break;
10857 else if (scc_info.get (child)->on_stack)
10859 if (!phi)
10861 ready = false;
10862 break;
10865 else
10867 if (phi)
10869 ready = true;
10870 break;
10873 if (ready)
10875 vect_schedule_slp_node (vinfo, entry, instance);
10876 scc_info.get (entry)->on_stack = false;
10877 stack[idx] = NULL;
10878 todo--;
10879 if (phi)
10880 phis_to_fixup.safe_push (entry);
10884 while (todo != 0);
10886 /* Pop the SCC. */
10887 stack.truncate (last_idx);
10890 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
10891 slp_tree phi_node;
10892 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
10894 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
10895 edge_iterator ei;
10896 edge e;
10897 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
10899 unsigned dest_idx = e->dest_idx;
10900 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
10901 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
10902 continue;
10903 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
10904 /* Simply fill all args. */
10905 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
10906 != vect_first_order_recurrence)
10907 for (unsigned i = 0; i < n; ++i)
10909 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
10910 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10911 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
10912 e, gimple_phi_arg_location (phi, dest_idx));
10914 else
10916 /* Unless it is a first-order recurrence, which needs
10917 args filled in for both the PHI node and the permutes. */
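/* Illustrative sketch: with two vector defs the recurrence is realized
   as a PHI feeding a chain of VEC_PERM_EXPRs, roughly
     vphi   = PHI <init, vdef_1>
     perm_0 = VEC_PERM <vphi,   vdef_0, sel>
     perm_1 = VEC_PERM <vdef_0, vdef_1, sel>
   so besides the PHI backedge also the permutes' operands have to be
   filled in with the child's vector defs here.  */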
10918 gimple *perm
10919 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
10920 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
10921 add_phi_arg (as_a <gphi *> (rphi),
10922 vect_get_slp_vect_def (child, n - 1),
10923 e, gimple_phi_arg_location (phi, dest_idx));
10924 for (unsigned i = 0; i < n; ++i)
10926 gimple *perm
10927 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
10928 if (i > 0)
10929 gimple_assign_set_rhs1 (perm,
10930 vect_get_slp_vect_def (child, i - 1));
10931 gimple_assign_set_rhs2 (perm,
10932 vect_get_slp_vect_def (child, i));
10933 update_stmt (perm);
10940 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
10942 void
10943 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
10945 slp_instance instance;
10946 unsigned int i;
10948 hash_map<slp_tree, slp_scc_info> scc_info;
10949 int maxdfs = 0;
10950 FOR_EACH_VEC_ELT (slp_instances, i, instance)
10952 slp_tree node = SLP_INSTANCE_TREE (instance);
10953 if (dump_enabled_p ())
10955 dump_printf_loc (MSG_NOTE, vect_location,
10956 "Vectorizing SLP tree:\n");
10957 /* ??? Dump all? */
10958 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
10959 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
10960 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
10961 vect_print_slp_graph (MSG_NOTE, vect_location,
10962 SLP_INSTANCE_TREE (instance));
10964 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
10965 a PHI is the node breaking the cycle. */
10966 auto_vec<slp_tree> stack;
10967 if (!scc_info.get (node))
10968 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
10970 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
10971 vectorize_slp_instance_root_stmt (node, instance);
10973 if (dump_enabled_p ())
10974 dump_printf_loc (MSG_NOTE, vect_location,
10975 "vectorizing stmts using SLP.\n");
10978 FOR_EACH_VEC_ELT (slp_instances, i, instance)
10980 slp_tree root = SLP_INSTANCE_TREE (instance);
10981 stmt_vec_info store_info;
10982 unsigned int j;
10984 /* Remove scalar call stmts. Do not do this for basic-block
10985 vectorization as not all uses may be vectorized.
10986 ??? Why should this be necessary? DCE should be able to
10987 remove the stmts itself.
10988 ??? For BB vectorization we can as well remove scalar
10989 stmts starting from the SLP tree root if they have no
10990 uses. */
10991 if (is_a <loop_vec_info> (vinfo))
10992 vect_remove_slp_scalar_calls (vinfo, root);
10994 /* Remove the vectorized stores' original scalar stmts. */
10995 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
10997 if (!STMT_VINFO_DATA_REF (store_info)
10998 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
10999 break;
11001 store_info = vect_orig_stmt (store_info);
11002 /* Free the attached stmt_vec_info and remove the stmt. */
11003 vinfo->remove_stmt (store_info);
11005 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
11006 so we do not crash in vect_free_slp_tree later. */
11007 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11008 SLP_TREE_REPRESENTATIVE (root) = NULL;