gcc/tree-vect-slp.cc
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #define INCLUDE_MEMORY
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "gimple.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "insn-config.h"
36 #include "recog.h" /* FIXME: for insn_data */
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "cfgloop.h"
41 #include "tree-vectorizer.h"
42 #include "langhooks.h"
43 #include "gimple-walk.h"
44 #include "dbgcnt.h"
45 #include "tree-vector-builder.h"
46 #include "vec-perm-indices.h"
47 #include "gimple-fold.h"
48 #include "internal-fn.h"
49 #include "dump-context.h"
50 #include "cfganal.h"
51 #include "tree-eh.h"
52 #include "tree-cfg.h"
53 #include "alloc-pool.h"
54 #include "sreal.h"
55 #include "predict.h"
57 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
58 load_permutation_t &,
59 const vec<tree> &,
60 gimple_stmt_iterator *,
61 poly_uint64, bool, bool,
62 unsigned *,
63 unsigned * = nullptr,
64 bool = false);
65 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
66 slp_tree, lane_permutation_t &,
67 vec<slp_tree> &, bool);
68 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
69 slp_tree, stmt_vector_for_cost *);
70 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
72 static object_allocator<_slp_tree> *slp_tree_pool;
73 static slp_tree slp_first_node;
75 void
76 vect_slp_init (void)
78 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
81 void
82 vect_slp_fini (void)
84 while (slp_first_node)
85 delete slp_first_node;
86 delete slp_tree_pool;
87 slp_tree_pool = NULL;
90 void *
91 _slp_tree::operator new (size_t n)
93 gcc_assert (n == sizeof (_slp_tree));
94 return slp_tree_pool->allocate_raw ();
97 void
98 _slp_tree::operator delete (void *node, size_t n)
100 gcc_assert (n == sizeof (_slp_tree));
101 slp_tree_pool->remove_raw (node);
105 /* Initialize a SLP node. */
107 _slp_tree::_slp_tree ()
109 this->prev_node = NULL;
110 if (slp_first_node)
111 slp_first_node->prev_node = this;
112 this->next_node = slp_first_node;
113 slp_first_node = this;
114 SLP_TREE_SCALAR_STMTS (this) = vNULL;
115 SLP_TREE_SCALAR_OPS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
122 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 SLP_TREE_CODE (this) = ERROR_MARK;
124 this->ldst_lanes = false;
125 SLP_TREE_VECTYPE (this) = NULL_TREE;
126 SLP_TREE_REPRESENTATIVE (this) = NULL;
127 SLP_TREE_REF_COUNT (this) = 1;
128 this->failed = NULL;
129 this->max_nunits = 1;
130 this->lanes = 0;
133 /* Tear down a SLP node. */
135 _slp_tree::~_slp_tree ()
137 if (this->prev_node)
138 this->prev_node->next_node = this->next_node;
139 else
140 slp_first_node = this->next_node;
141 if (this->next_node)
142 this->next_node->prev_node = this->prev_node;
143 SLP_TREE_CHILDREN (this).release ();
144 SLP_TREE_SCALAR_STMTS (this).release ();
145 SLP_TREE_SCALAR_OPS (this).release ();
146 SLP_TREE_VEC_DEFS (this).release ();
147 SLP_TREE_LOAD_PERMUTATION (this).release ();
148 SLP_TREE_LANE_PERMUTATION (this).release ();
149 SLP_TREE_SIMD_CLONE_INFO (this).release ();
150 if (this->failed)
151 free (failed);
154 /* Push the single SSA definition in DEF to the vector of vector defs. */
156 void
157 _slp_tree::push_vec_def (gimple *def)
159 if (gphi *phi = dyn_cast <gphi *> (def))
160 vec_defs.quick_push (gimple_phi_result (phi));
161 else
163 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
164 vec_defs.quick_push (get_def_from_ptr (defop));
168 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
170 void
171 vect_free_slp_tree (slp_tree node)
173 int i;
174 slp_tree child;
176 if (--SLP_TREE_REF_COUNT (node) != 0)
177 return;
179 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
180 if (child)
181 vect_free_slp_tree (child);
183 /* If the node defines any SLP only patterns then those patterns are no
184 longer valid and should be removed. */
185 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
186 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
188 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
189 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
190 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
193 delete node;
196 /* Return a location suitable for dumps related to the SLP instance. */
198 dump_user_location_t
199 _slp_instance::location () const
201 if (!root_stmts.is_empty ())
202 return root_stmts[0]->stmt;
203 else
204 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
208 /* Free the memory allocated for the SLP instance. */
210 void
211 vect_free_slp_instance (slp_instance instance)
213 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
214 SLP_INSTANCE_LOADS (instance).release ();
215 SLP_INSTANCE_ROOT_STMTS (instance).release ();
216 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
217 instance->subgraph_entries.release ();
218 instance->cost_vec.release ();
219 free (instance);
223 /* Create an SLP node with NOPS children and operation CODE. */
225 slp_tree
226 vect_create_new_slp_node (unsigned nops, tree_code code)
228 slp_tree node = new _slp_tree;
229 SLP_TREE_SCALAR_STMTS (node) = vNULL;
230 SLP_TREE_CHILDREN (node).create (nops);
231 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
232 SLP_TREE_CODE (node) = code;
233 return node;
235 /* Create an SLP node for SCALAR_STMTS. */
237 static slp_tree
238 vect_create_new_slp_node (slp_tree node,
239 vec<stmt_vec_info> scalar_stmts, unsigned nops)
241 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
242 SLP_TREE_CHILDREN (node).create (nops);
243 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
244 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
245 SLP_TREE_LANES (node) = scalar_stmts.length ();
246 return node;
249 /* Create an SLP node for SCALAR_STMTS. */
251 static slp_tree
252 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
254 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
257 /* Create an SLP node for OPS. */
259 static slp_tree
260 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
262 SLP_TREE_SCALAR_OPS (node) = ops;
263 SLP_TREE_DEF_TYPE (node) = vect_external_def;
264 SLP_TREE_LANES (node) = ops.length ();
265 return node;
268 /* Create an SLP node for OPS. */
270 static slp_tree
271 vect_create_new_slp_node (vec<tree> ops)
273 return vect_create_new_slp_node (new _slp_tree, ops);
277 /* This structure is used in creation of an SLP tree. Each instance
278 corresponds to the same operand in a group of scalar stmts in an SLP
279 node. */
280 typedef struct _slp_oprnd_info
282 /* Def-stmts for the operands. */
283 vec<stmt_vec_info> def_stmts;
284 /* Operands. */
285 vec<tree> ops;
286 /* Information about the first statement, its vector def-type, type, the
287 operand itself in case it's constant, and an indication if it's a pattern
288 stmt and gather/scatter info. */
289 tree first_op_type;
290 enum vect_def_type first_dt;
291 bool any_pattern;
292 bool first_gs_p;
293 gather_scatter_info first_gs_info;
294 } *slp_oprnd_info;
297 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
298 operand. */
299 static vec<slp_oprnd_info>
300 vect_create_oprnd_info (int nops, int group_size)
302 int i;
303 slp_oprnd_info oprnd_info;
304 vec<slp_oprnd_info> oprnds_info;
306 oprnds_info.create (nops);
307 for (i = 0; i < nops; i++)
309 oprnd_info = XNEW (struct _slp_oprnd_info);
310 oprnd_info->def_stmts.create (group_size);
311 oprnd_info->ops.create (group_size);
312 oprnd_info->first_dt = vect_uninitialized_def;
313 oprnd_info->first_op_type = NULL_TREE;
314 oprnd_info->any_pattern = false;
315 oprnd_info->first_gs_p = false;
316 oprnds_info.quick_push (oprnd_info);
319 return oprnds_info;
323 /* Free operands info. */
325 static void
326 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
328 int i;
329 slp_oprnd_info oprnd_info;
331 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
333 oprnd_info->def_stmts.release ();
334 oprnd_info->ops.release ();
335 XDELETE (oprnd_info);
338 oprnds_info.release ();
341 /* Return the execution frequency of NODE (so that a higher value indicates
342 a "more important" node when optimizing for speed). */
344 static sreal
345 vect_slp_node_weight (slp_tree node)
347 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
348 basic_block bb = gimple_bb (stmt_info->stmt);
349 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
352 /* Return true if STMTS contains a pattern statement. */
354 static bool
355 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
357 stmt_vec_info stmt_info;
358 unsigned int i;
359 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
360 if (stmt_info && is_pattern_stmt_p (stmt_info))
361 return true;
362 return false;
365 /* Return true when all lanes in the external or constant NODE have
366 the same value. */
368 static bool
369 vect_slp_tree_uniform_p (slp_tree node)
371 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
372 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
374 /* Pre-existing vectors. */
375 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
376 return false;
378 unsigned i;
379 tree op, first = NULL_TREE;
380 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
381 if (!first)
382 first = op;
383 else if (!operand_equal_p (first, op, 0))
384 return false;
386 return true;
389 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
390 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
391 of the chain. */
394 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
395 stmt_vec_info first_stmt_info)
397 stmt_vec_info next_stmt_info = first_stmt_info;
398 int result = 0;
400 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
401 return -1;
405 if (next_stmt_info == stmt_info)
406 return result;
407 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
408 if (next_stmt_info)
409 result += DR_GROUP_GAP (next_stmt_info);
411 while (next_stmt_info);
413 return -1;
416 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
417 using the method implemented by duplicate_and_interleave. Return true
418 if so, returning the number of intermediate vectors in *NVECTORS_OUT
419 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
420 (if nonnull). */
422 bool
423 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
424 tree elt_type, unsigned int *nvectors_out,
425 tree *vector_type_out,
426 tree *permutes)
428 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
429 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
430 return false;
432 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
433 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
434 unsigned int nvectors = 1;
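/* The loop below first tries to fuse all COUNT elements into a single
   integer element (NVECTORS == 1); each failed attempt halves ELT_BYTES
   and doubles NVECTORS so that progressively shorter runs of
   COUNT / NVECTORS elements are fused, giving up once ELT_BYTES can no
   longer be halved.  */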
435 for (;;)
437 scalar_int_mode int_mode;
438 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
439 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
441 /* Get the natural vector type for this SLP group size. */
442 tree int_type = build_nonstandard_integer_type
443 (GET_MODE_BITSIZE (int_mode), 1);
444 tree vector_type
445 = get_vectype_for_scalar_type (vinfo, int_type, count);
446 poly_int64 half_nelts;
447 if (vector_type
448 && VECTOR_MODE_P (TYPE_MODE (vector_type))
449 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
450 GET_MODE_SIZE (base_vector_mode))
451 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
452 2, &half_nelts))
454 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
455 together into elements of type INT_TYPE and using the result
456 to build NVECTORS vectors. */
457 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
458 vec_perm_builder sel1 (nelts, 2, 3);
459 vec_perm_builder sel2 (nelts, 2, 3);
461 for (unsigned int i = 0; i < 3; ++i)
463 sel1.quick_push (i);
464 sel1.quick_push (i + nelts);
465 sel2.quick_push (half_nelts + i);
466 sel2.quick_push (half_nelts + i + nelts);
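/* At this point SEL1 selects { 0, NELTS, 1, NELTS + 1, ... }, interleaving
   the low halves of the two permute inputs, and SEL2 interleaves the high
   halves starting at HALF_NELTS.  */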
468 vec_perm_indices indices1 (sel1, 2, nelts);
469 vec_perm_indices indices2 (sel2, 2, nelts);
470 machine_mode vmode = TYPE_MODE (vector_type);
471 if (can_vec_perm_const_p (vmode, vmode, indices1)
472 && can_vec_perm_const_p (vmode, vmode, indices2))
474 if (nvectors_out)
475 *nvectors_out = nvectors;
476 if (vector_type_out)
477 *vector_type_out = vector_type;
478 if (permutes)
480 permutes[0] = vect_gen_perm_mask_checked (vector_type,
481 indices1);
482 permutes[1] = vect_gen_perm_mask_checked (vector_type,
483 indices2);
485 return true;
489 if (!multiple_p (elt_bytes, 2, &elt_bytes))
490 return false;
491 nvectors *= 2;
495 /* Return true if DTA and DTB match. */
497 static bool
498 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
500 return (dta == dtb
501 || ((dta == vect_external_def || dta == vect_constant_def)
502 && (dtb == vect_external_def || dtb == vect_constant_def)));
505 static const int cond_expr_maps[3][5] = {
506 { 4, -1, -2, 1, 2 },
507 { 4, -2, -1, 1, 2 },
508 { 4, -1, -2, 2, 1 }
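/* The three rows above correspond to SWAP values 0, 1 and 2: row 0 maps the
   children to the embedded comparison operands and the two arms unchanged,
   row 1 swaps the comparison operands, and row 2 swaps the THEN and ELSE
   arms (used when the comparison code is inverted).  */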
510 static const int arg0_map[] = { 1, 0 };
511 static const int arg1_map[] = { 1, 1 };
512 static const int arg2_map[] = { 1, 2 };
513 static const int arg1_arg4_map[] = { 2, 1, 4 };
514 static const int arg3_arg2_map[] = { 2, 3, 2 };
515 static const int op1_op0_map[] = { 2, 1, 0 };
516 static const int off_map[] = { 1, -3 };
517 static const int off_op0_map[] = { 2, -3, 0 };
518 static const int off_arg2_map[] = { 2, -3, 2 };
519 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
520 static const int mask_call_maps[6][7] = {
521 { 1, 1, },
522 { 2, 1, 2, },
523 { 3, 1, 2, 3, },
524 { 4, 1, 2, 3, 4, },
525 { 5, 1, 2, 3, 4, 5, },
526 { 6, 1, 2, 3, 4, 5, 6 },
529 /* For most SLP statements, there is a one-to-one mapping between
530 gimple arguments and child nodes. If that is not true for STMT,
531 return an array that contains:
533 - the number of child nodes, followed by
534 - for each child node, the index of the argument associated with that node.
535 The special index -1 is the first operand of an embedded comparison and
536 the special index -2 is the second operand of an embedded comparison.
537 The special index -3 is the offset of a gather as analyzed by
538 vect_check_gather_scatter.
540 SWAP is as for vect_get_and_check_slp_defs. */
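/* For example, arg1_arg4_map above is { 2, 1, 4 }: the statement gets two
   SLP children, mapped to call arguments 1 and 4; it is used below for
   IFN_MASK_GATHER_LOAD and IFN_MASK_LEN_GATHER_LOAD, where those arguments
   are the gather offset and the mask.  */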
542 static const int *
543 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
544 unsigned char swap = 0)
546 if (auto assign = dyn_cast<const gassign *> (stmt))
548 if (gimple_assign_rhs_code (assign) == COND_EXPR
549 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
550 return cond_expr_maps[swap];
551 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
552 && swap)
553 return op1_op0_map;
554 if (gather_scatter_p)
555 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
556 ? off_op0_map : off_map);
558 gcc_assert (!swap);
559 if (auto call = dyn_cast<const gcall *> (stmt))
561 if (gimple_call_internal_p (call))
562 switch (gimple_call_internal_fn (call))
564 case IFN_MASK_LOAD:
565 return gather_scatter_p ? off_arg2_map : arg2_map;
567 case IFN_GATHER_LOAD:
568 return arg1_map;
570 case IFN_MASK_GATHER_LOAD:
571 case IFN_MASK_LEN_GATHER_LOAD:
572 return arg1_arg4_map;
574 case IFN_MASK_STORE:
575 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
577 case IFN_MASK_CALL:
579 unsigned nargs = gimple_call_num_args (call);
580 if (nargs >= 2 && nargs <= 7)
581 return mask_call_maps[nargs-2];
582 else
583 return nullptr;
586 case IFN_CLZ:
587 case IFN_CTZ:
588 return arg0_map;
590 default:
591 break;
594 return nullptr;
597 /* Return the SLP node child index for operand OP of STMT. */
600 vect_slp_child_index_for_operand (const gimple *stmt, int op,
601 bool gather_scatter_p)
603 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
604 if (!opmap)
605 return op;
606 for (int i = 1; i < 1 + opmap[0]; ++i)
607 if (opmap[i] == op)
608 return i - 1;
609 gcc_unreachable ();
612 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
613 they are of a valid type and that they match the defs of the first stmt of
614 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
615 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
616 indicates swap is required for cond_expr stmts. Specifically, SWAP
617 is 1 if STMT is cond and operands of comparison need to be swapped;
618 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
620 If there was a fatal error return -1; if the error could be corrected by
621 swapping operands of the father node of this one, return 1; if everything is
622 ok return 0. */
623 static int
624 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
625 bool *skip_args,
626 vec<stmt_vec_info> stmts, unsigned stmt_num,
627 vec<slp_oprnd_info> *oprnds_info)
629 stmt_vec_info stmt_info = stmts[stmt_num];
630 tree oprnd;
631 unsigned int i, number_of_oprnds;
632 enum vect_def_type dt = vect_uninitialized_def;
633 slp_oprnd_info oprnd_info;
634 gather_scatter_info gs_info;
635 unsigned int gs_op = -1u;
636 unsigned int commutative_op = -1U;
637 bool first = stmt_num == 0;
639 if (!is_a<gcall *> (stmt_info->stmt)
640 && !is_a<gassign *> (stmt_info->stmt)
641 && !is_a<gphi *> (stmt_info->stmt))
642 return -1;
644 number_of_oprnds = gimple_num_args (stmt_info->stmt);
645 const int *map
646 = vect_get_operand_map (stmt_info->stmt,
647 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
648 if (map)
649 number_of_oprnds = *map++;
650 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
652 if (gimple_call_internal_p (stmt))
654 internal_fn ifn = gimple_call_internal_fn (stmt);
655 commutative_op = first_commutative_argument (ifn);
658 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
660 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
661 commutative_op = 0;
664 bool swapped = (swap != 0);
665 bool backedge = false;
666 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
667 for (i = 0; i < number_of_oprnds; i++)
669 oprnd_info = (*oprnds_info)[i];
670 int opno = map ? map[i] : int (i);
671 if (opno == -3)
673 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
674 if (!is_a <loop_vec_info> (vinfo)
675 || !vect_check_gather_scatter (stmt_info,
676 as_a <loop_vec_info> (vinfo),
677 first ? &oprnd_info->first_gs_info
678 : &gs_info))
679 return -1;
681 if (first)
683 oprnd_info->first_gs_p = true;
684 oprnd = oprnd_info->first_gs_info.offset;
686 else
688 gs_op = i;
689 oprnd = gs_info.offset;
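/* A negative OPNO (-1 or -2, see cond_expr_maps above) selects the first
   or second operand of the embedded comparison of a COND_EXPR.  */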
692 else if (opno < 0)
693 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
694 else
696 oprnd = gimple_arg (stmt_info->stmt, opno);
697 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
699 edge e = gimple_phi_arg_edge (stmt, opno);
700 backedge = (is_a <bb_vec_info> (vinfo)
701 ? e->flags & EDGE_DFS_BACK
702 : dominated_by_p (CDI_DOMINATORS, e->src,
703 gimple_bb (stmt_info->stmt)));
706 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
707 oprnd = TREE_OPERAND (oprnd, 0);
709 stmt_vec_info def_stmt_info;
710 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Build SLP failed: can't analyze def for %T\n",
715 oprnd);
717 return -1;
720 if (skip_args[i])
722 oprnd_info->def_stmts.quick_push (NULL);
723 oprnd_info->ops.quick_push (NULL_TREE);
724 oprnd_info->first_dt = vect_uninitialized_def;
725 continue;
728 oprnd_info->def_stmts.quick_push (def_stmt_info);
729 oprnd_info->ops.quick_push (oprnd);
731 if (def_stmt_info
732 && is_pattern_stmt_p (def_stmt_info))
734 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
735 != def_stmt_info)
736 oprnd_info->any_pattern = true;
737 else
738 /* If we promote this to external use the original stmt def. */
739 oprnd_info->ops.last ()
740 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
743 /* If there's an extern def on a backedge make sure we can
744 code-generate at the region start.
745 ??? This is another case that could be fixed by adjusting
746 how we split the function but at the moment we'd have conflicting
747 goals there. */
748 if (backedge
749 && dts[i] == vect_external_def
750 && is_a <bb_vec_info> (vinfo)
751 && TREE_CODE (oprnd) == SSA_NAME
752 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
753 && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
754 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: extern def %T only defined "
759 "on backedge\n", oprnd);
760 return -1;
763 if (first)
765 tree type = TREE_TYPE (oprnd);
766 dt = dts[i];
768 /* For the swapping logic below force vect_reduction_def
769 for the reduction op in a SLP reduction group. */
770 if (!STMT_VINFO_DATA_REF (stmt_info)
771 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
772 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
773 && def_stmt_info)
774 dts[i] = dt = vect_reduction_def;
776 /* Check the types of the definition. */
777 switch (dt)
779 case vect_external_def:
780 case vect_constant_def:
781 case vect_internal_def:
782 case vect_reduction_def:
783 case vect_double_reduction_def:
784 case vect_induction_def:
785 case vect_nested_cycle:
786 case vect_first_order_recurrence:
787 break;
789 default:
790 /* FORNOW: Not supported. */
791 if (dump_enabled_p ())
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "Build SLP failed: illegal type of def %T\n",
794 oprnd);
795 return -1;
798 oprnd_info->first_dt = dt;
799 oprnd_info->first_op_type = type;
802 if (first)
803 return 0;
805 /* Now match the operand definition types to that of the first stmt. */
806 for (i = 0; i < number_of_oprnds;)
808 if (skip_args[i])
810 ++i;
811 continue;
814 oprnd_info = (*oprnds_info)[i];
815 dt = dts[i];
816 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
817 oprnd = oprnd_info->ops[stmt_num];
818 tree type = TREE_TYPE (oprnd);
820 if (!types_compatible_p (oprnd_info->first_op_type, type))
822 if (dump_enabled_p ())
823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
824 "Build SLP failed: different operand types\n");
825 return 1;
828 if ((gs_op == i) != oprnd_info->first_gs_p)
830 if (dump_enabled_p ())
831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
832 "Build SLP failed: mixed gather and non-gather\n");
833 return 1;
835 else if (gs_op == i)
837 if (!operand_equal_p (oprnd_info->first_gs_info.base,
838 gs_info.base))
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
842 "Build SLP failed: different gather base\n");
843 return 1;
845 if (oprnd_info->first_gs_info.scale != gs_info.scale)
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
849 "Build SLP failed: different gather scale\n");
850 return 1;
854 /* Not first stmt of the group, check that the def-stmt/s match
855 the def-stmt/s of the first stmt. Allow different definition
856 types for reduction chains: the first stmt must be a
857 vect_reduction_def (a phi node), and the rest
858 end in the reduction chain. */
859 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
860 && !(oprnd_info->first_dt == vect_reduction_def
861 && !STMT_VINFO_DATA_REF (stmt_info)
862 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
863 && def_stmt_info
864 && !STMT_VINFO_DATA_REF (def_stmt_info)
865 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
866 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
867 || (!STMT_VINFO_DATA_REF (stmt_info)
868 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
869 && ((!def_stmt_info
870 || STMT_VINFO_DATA_REF (def_stmt_info)
871 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
872 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
873 != (oprnd_info->first_dt != vect_reduction_def))))
875 /* Try swapping operands if we got a mismatch. For BB
876 vectorization only in case it will clearly improve things. */
877 if (i == commutative_op && !swapped
878 && (!is_a <bb_vec_info> (vinfo)
879 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
880 dts[i+1])
881 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
882 || vect_def_types_match
883 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "trying swapped operands\n");
888 std::swap (dts[i], dts[i+1]);
889 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
890 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
891 std::swap ((*oprnds_info)[i]->ops[stmt_num],
892 (*oprnds_info)[i+1]->ops[stmt_num]);
893 /* After swapping some operands we lost track of whether an
894 operand has any pattern defs so be conservative here. */
895 if ((*oprnds_info)[i]->any_pattern
896 || (*oprnds_info)[i+1]->any_pattern)
897 (*oprnds_info)[i]->any_pattern
898 = (*oprnds_info)[i+1]->any_pattern = true;
899 swapped = true;
900 continue;
903 if (is_a <bb_vec_info> (vinfo)
904 && !oprnd_info->any_pattern)
906 /* Now for commutative ops we should see whether we can
907 make the other operand matching. */
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "treating operand as external\n");
911 oprnd_info->first_dt = dt = vect_external_def;
913 else
915 if (dump_enabled_p ())
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "Build SLP failed: different types\n");
918 return 1;
922 /* Make sure to demote the overall operand to external. */
923 if (dt == vect_external_def)
924 oprnd_info->first_dt = vect_external_def;
925 /* For a SLP reduction chain we want to duplicate the reduction to
926 each of the chain members. That gets us a sane SLP graph (still
927 the stmts are not 100% correct wrt the initial values). */
928 else if ((dt == vect_internal_def
929 || dt == vect_reduction_def)
930 && oprnd_info->first_dt == vect_reduction_def
931 && !STMT_VINFO_DATA_REF (stmt_info)
932 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
933 && !STMT_VINFO_DATA_REF (def_stmt_info)
934 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
935 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
937 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
938 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
941 ++i;
944 /* Swap operands. */
945 if (swapped)
947 if (dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "swapped operands to match def types in %G",
950 stmt_info->stmt);
953 return 0;
956 /* Return true if call statements CALL1 and CALL2 are similar enough
957 to be combined into the same SLP group. */
959 bool
960 compatible_calls_p (gcall *call1, gcall *call2)
962 unsigned int nargs = gimple_call_num_args (call1);
963 if (nargs != gimple_call_num_args (call2))
964 return false;
966 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
967 return false;
969 if (gimple_call_internal_p (call1))
971 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
972 TREE_TYPE (gimple_call_lhs (call2))))
973 return false;
974 for (unsigned int i = 0; i < nargs; ++i)
975 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
976 TREE_TYPE (gimple_call_arg (call2, i))))
977 return false;
979 else
981 if (!operand_equal_p (gimple_call_fn (call1),
982 gimple_call_fn (call2), 0))
983 return false;
985 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
986 return false;
989 /* Check that any unvectorized arguments are equal. */
990 if (const int *map = vect_get_operand_map (call1))
992 unsigned int nkept = *map++;
993 unsigned int mapi = 0;
994 for (unsigned int i = 0; i < nargs; ++i)
995 if (mapi < nkept && map[mapi] == int (i))
996 mapi += 1;
997 else if (!operand_equal_p (gimple_call_arg (call1, i),
998 gimple_call_arg (call2, i)))
999 return false;
1002 return true;
1005 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1006 caller's attempt to find the vector type in STMT_INFO with the narrowest
1007 element type. Return true if VECTYPE is nonnull and if it is valid
1008 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1009 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1010 vect_build_slp_tree. */
1012 static bool
1013 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1014 unsigned int group_size,
1015 tree vectype, poly_uint64 *max_nunits)
1017 if (!vectype)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Build SLP failed: unsupported data-type in %G\n",
1022 stmt_info->stmt);
1023 /* Fatal mismatch. */
1024 return false;
1027 /* If populating the vector type requires unrolling then fail
1028 before adjusting *max_nunits for basic-block vectorization. */
1029 if (is_a <bb_vec_info> (vinfo)
1030 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Build SLP failed: unrolling required "
1035 "in basic block SLP\n");
1036 /* Fatal mismatch. */
1037 return false;
1040 /* In case of multiple types we need to detect the smallest type. */
1041 vect_update_max_nunits (max_nunits, vectype);
1042 return true;
1045 /* Verify that the scalar stmts STMTS are isomorphic, do not require a data
1046 permutation and are not of unsupported types of operation. Return
1047 true if so, otherwise return false and indicate in *MATCHES
1048 which stmts are not isomorphic to the first one. If MATCHES[0]
1049 is false then this indicates the comparison could not be
1050 carried out or the stmts will never be vectorized by SLP.
1052 Note COND_EXPR is possibly isomorphic to another one after swapping its
1053 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1054 the first stmt by swapping the two operands of comparison; set SWAP[i]
1055 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1056 of comparison. Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
1057 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1059 static bool
1060 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1061 vec<stmt_vec_info> stmts, unsigned int group_size,
1062 poly_uint64 *max_nunits, bool *matches,
1063 bool *two_operators, tree *node_vectype)
1065 unsigned int i;
1066 stmt_vec_info first_stmt_info = stmts[0];
1067 code_helper first_stmt_code = ERROR_MARK;
1068 code_helper alt_stmt_code = ERROR_MARK;
1069 code_helper rhs_code = ERROR_MARK;
1070 code_helper first_cond_code = ERROR_MARK;
1071 tree lhs;
1072 bool need_same_oprnds = false;
1073 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1074 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1075 bool first_stmt_ldst_p = false, ldst_p = false;
1076 bool first_stmt_phi_p = false, phi_p = false;
1077 int first_reduc_idx = -1;
1078 bool maybe_soft_fail = false;
1079 tree soft_fail_nunits_vectype = NULL_TREE;
1081 /* For every stmt in NODE find its def stmt/s. */
1082 stmt_vec_info stmt_info;
1083 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1085 swap[i] = 0;
1086 matches[i] = false;
1087 if (!stmt_info)
1089 matches[i] = true;
1090 continue;
1093 gimple *stmt = stmt_info->stmt;
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1097 /* Fail to vectorize statements marked as unvectorizable, statements
1098 that can throw, or statements with volatile operands. */
1099 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1100 || stmt_can_throw_internal (cfun, stmt)
1101 || gimple_has_volatile_ops (stmt))
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1105 "Build SLP failed: unvectorizable statement %G",
1106 stmt);
1107 /* ??? For BB vectorization we want to commute operands in a way
1108 that shuffles all unvectorizable defs into one operand and keeps
1109 the other vectorizable. The following doesn't reliably
1110 work for this, but it's the easiest we can do here. */
1111 if (is_a <bb_vec_info> (vinfo) && i != 0)
1112 continue;
1113 /* Fatal mismatch. */
1114 matches[0] = false;
1115 return false;
1118 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1119 lhs = gimple_get_lhs (stmt);
1120 if (lhs == NULL_TREE
1121 && (!call_stmt
1122 || !gimple_call_internal_p (stmt)
1123 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Build SLP failed: not GIMPLE_ASSIGN nor "
1128 "GIMPLE_CALL %G", stmt);
1129 if (is_a <bb_vec_info> (vinfo) && i != 0)
1130 continue;
1131 /* Fatal mismatch. */
1132 matches[0] = false;
1133 return false;
1136 tree nunits_vectype;
1137 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1138 &nunits_vectype, group_size))
1140 if (is_a <bb_vec_info> (vinfo) && i != 0)
1141 continue;
1142 /* Fatal mismatch. */
1143 matches[0] = false;
1144 return false;
1146 /* Record nunits required but continue analysis, producing matches[]
1147 as if nunits was not an issue. This allows splitting of groups
1148 to happen. */
1149 if (nunits_vectype
1150 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1151 nunits_vectype, max_nunits))
1153 gcc_assert (is_a <bb_vec_info> (vinfo));
1154 maybe_soft_fail = true;
1155 soft_fail_nunits_vectype = nunits_vectype;
1158 gcc_assert (vectype);
1160 if (call_stmt)
1162 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1163 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1164 rhs_code = cfn;
1165 else
1166 rhs_code = CALL_EXPR;
1168 if (cfn == CFN_MASK_LOAD
1169 || cfn == CFN_GATHER_LOAD
1170 || cfn == CFN_MASK_GATHER_LOAD
1171 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1172 ldst_p = true;
1173 else if (cfn == CFN_MASK_STORE)
1175 ldst_p = true;
1176 rhs_code = CFN_MASK_STORE;
1178 else if ((cfn != CFN_LAST
1179 && cfn != CFN_MASK_CALL
1180 && internal_fn_p (cfn)
1181 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1182 || gimple_call_tail_p (call_stmt)
1183 || gimple_call_noreturn_p (call_stmt)
1184 || gimple_call_chain (call_stmt))
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "Build SLP failed: unsupported call type %G",
1189 (gimple *) call_stmt);
1190 if (is_a <bb_vec_info> (vinfo) && i != 0)
1191 continue;
1192 /* Fatal mismatch. */
1193 matches[0] = false;
1194 return false;
1197 else if (gimple_code (stmt) == GIMPLE_PHI)
1199 rhs_code = ERROR_MARK;
1200 phi_p = true;
1202 else
1204 rhs_code = gimple_assign_rhs_code (stmt);
1205 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1208 /* Check the operation. */
1209 if (i == 0)
1211 *node_vectype = vectype;
1212 first_stmt_code = rhs_code;
1213 first_stmt_ldst_p = ldst_p;
1214 first_stmt_phi_p = phi_p;
1215 first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1217 /* Shift arguments should be equal in all the packed stmts for a
1218 vector shift with scalar shift operand. */
1219 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1220 || rhs_code == LROTATE_EXPR
1221 || rhs_code == RROTATE_EXPR)
1223 /* First see if we have a vector/vector shift. */
1224 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1226 /* No vector/vector shift, try for a vector/scalar shift. */
1227 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 "Build SLP failed: "
1232 "op not supported by target.\n");
1233 if (is_a <bb_vec_info> (vinfo) && i != 0)
1234 continue;
1235 /* Fatal mismatch. */
1236 matches[0] = false;
1237 return false;
1239 need_same_oprnds = true;
1240 first_op1 = gimple_assign_rhs2 (stmt);
1243 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1245 need_same_oprnds = true;
1246 first_op1 = gimple_assign_rhs2 (stmt);
1248 else if (!ldst_p
1249 && rhs_code == BIT_FIELD_REF)
1251 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1252 if (!is_a <bb_vec_info> (vinfo)
1253 || TREE_CODE (vec) != SSA_NAME
1254 /* When the element types are not compatible we pun the
1255 source to the target vectype which requires equal size. */
1256 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1257 || !types_compatible_p (TREE_TYPE (vectype),
1258 TREE_TYPE (TREE_TYPE (vec))))
1259 && !operand_equal_p (TYPE_SIZE (vectype),
1260 TYPE_SIZE (TREE_TYPE (vec)))))
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "Build SLP failed: "
1265 "BIT_FIELD_REF not supported\n");
1266 /* Fatal mismatch. */
1267 matches[0] = false;
1268 return false;
1271 else if (rhs_code == CFN_DIV_POW2)
1273 need_same_oprnds = true;
1274 first_op1 = gimple_call_arg (call_stmt, 1);
1277 else
1279 if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1280 /* For SLP reduction groups the index isn't necessarily
1281 uniform but only that of the first stmt matters. */
1282 && !(first_reduc_idx != -1
1283 && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1284 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1286 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "Build SLP failed: different reduc_idx "
1290 "%d instead of %d in %G",
1291 STMT_VINFO_REDUC_IDX (stmt_info),
1292 first_reduc_idx, stmt);
1294 /* Mismatch. */
1295 continue;
1297 if (first_stmt_code != rhs_code
1298 && alt_stmt_code == ERROR_MARK)
1299 alt_stmt_code = rhs_code;
1300 if ((first_stmt_code != rhs_code
1301 && (first_stmt_code != IMAGPART_EXPR
1302 || rhs_code != REALPART_EXPR)
1303 && (first_stmt_code != REALPART_EXPR
1304 || rhs_code != IMAGPART_EXPR)
1305 /* Handle mismatches in plus/minus by computing both
1306 and merging the results. */
1307 && !((first_stmt_code == PLUS_EXPR
1308 || first_stmt_code == MINUS_EXPR)
1309 && (alt_stmt_code == PLUS_EXPR
1310 || alt_stmt_code == MINUS_EXPR)
1311 && rhs_code == alt_stmt_code)
1312 && !(first_stmt_code.is_tree_code ()
1313 && rhs_code.is_tree_code ()
1314 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1315 == tcc_comparison)
1316 && (swap_tree_comparison (tree_code (first_stmt_code))
1317 == tree_code (rhs_code)))
1318 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1319 && (first_stmt_code == ARRAY_REF
1320 || first_stmt_code == BIT_FIELD_REF
1321 || first_stmt_code == COMPONENT_REF
1322 || first_stmt_code == REALPART_EXPR
1323 || first_stmt_code == IMAGPART_EXPR
1324 || first_stmt_code == MEM_REF)
1325 && (rhs_code == ARRAY_REF
1326 || rhs_code == BIT_FIELD_REF
1327 || rhs_code == COMPONENT_REF
1328 || rhs_code == REALPART_EXPR
1329 || rhs_code == IMAGPART_EXPR
1330 || rhs_code == MEM_REF)))
1331 || (ldst_p
1332 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1333 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1334 || (ldst_p
1335 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1336 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1337 || first_stmt_ldst_p != ldst_p
1338 || first_stmt_phi_p != phi_p)
1340 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "Build SLP failed: different operation "
1344 "in stmt %G", stmt);
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "original stmt %G", first_stmt_info->stmt);
1348 /* Mismatch. */
1349 continue;
1352 if (!ldst_p
1353 && first_stmt_code == BIT_FIELD_REF
1354 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1355 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1359 "Build SLP failed: different BIT_FIELD_REF "
1360 "arguments in %G", stmt);
1361 /* Mismatch. */
1362 continue;
1365 if (call_stmt
1366 && first_stmt_code != CFN_MASK_LOAD
1367 && first_stmt_code != CFN_MASK_STORE)
1369 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1370 call_stmt))
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "Build SLP failed: different calls in %G",
1375 stmt);
1376 /* Mismatch. */
1377 continue;
1381 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1382 && (gimple_bb (first_stmt_info->stmt)
1383 != gimple_bb (stmt_info->stmt)))
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "Build SLP failed: different BB for PHI "
1388 "or possibly trapping operation in %G", stmt);
1389 /* Mismatch. */
1390 continue;
1393 if (need_same_oprnds)
1395 tree other_op1 = gimple_arg (stmt, 1);
1396 if (!operand_equal_p (first_op1, other_op1, 0))
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "Build SLP failed: different shift "
1401 "arguments in %G", stmt);
1402 /* Mismatch. */
1403 continue;
1407 if (!types_compatible_p (vectype, *node_vectype))
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 "Build SLP failed: different vector type "
1412 "in %G", stmt);
1413 /* Mismatch. */
1414 continue;
1418 /* Grouped store or load. */
1419 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1421 gcc_assert (ldst_p);
1422 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1424 /* Store. */
1425 gcc_assert (rhs_code == CFN_MASK_STORE
1426 || REFERENCE_CLASS_P (lhs)
1427 || DECL_P (lhs));
1429 else
1431 /* Load. */
1432 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1433 if (prev_first_load)
1435 /* Check that there are no loads from different interleaving
1436 chains in the same node. */
1437 if (prev_first_load != first_load)
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1441 vect_location,
1442 "Build SLP failed: different "
1443 "interleaving chains in one node %G",
1444 stmt);
1445 /* Mismatch. */
1446 continue;
1449 else
1450 prev_first_load = first_load;
1453 /* Non-grouped store or load. */
1454 else if (ldst_p)
1456 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1457 && rhs_code != CFN_GATHER_LOAD
1458 && rhs_code != CFN_MASK_GATHER_LOAD
1459 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1460 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1461 /* Non-grouped loads are handled as externals for BB
1462 vectorization. For loop vectorization we can handle
1463 splats the same way we handle single-element interleaving. */
1464 && (is_a <bb_vec_info> (vinfo)
1465 || stmt_info != first_stmt_info))
1467 /* Non-grouped load. */
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "Build SLP failed: not grouped load %G", stmt);
1472 if (i != 0)
1473 continue;
1474 /* Fatal mismatch. */
1475 matches[0] = false;
1476 return false;
1479 /* Not a memory operation. */
1480 else
1482 if (!phi_p
1483 && rhs_code.is_tree_code ()
1484 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1485 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1486 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1487 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1488 && rhs_code != VIEW_CONVERT_EXPR
1489 && rhs_code != CALL_EXPR
1490 && rhs_code != BIT_FIELD_REF)
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "Build SLP failed: operation unsupported %G",
1495 stmt);
1496 if (is_a <bb_vec_info> (vinfo) && i != 0)
1497 continue;
1498 /* Fatal mismatch. */
1499 matches[0] = false;
1500 return false;
1503 if (rhs_code == COND_EXPR)
1505 tree cond_expr = gimple_assign_rhs1 (stmt);
1506 enum tree_code cond_code = TREE_CODE (cond_expr);
1507 enum tree_code swap_code = ERROR_MARK;
1508 enum tree_code invert_code = ERROR_MARK;
1510 if (i == 0)
1511 first_cond_code = TREE_CODE (cond_expr);
1512 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1514 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1515 swap_code = swap_tree_comparison (cond_code);
1516 invert_code = invert_tree_comparison (cond_code, honor_nans);
1519 if (first_cond_code == cond_code)
1521 /* Isomorphism can be achieved by swapping. */
1522 else if (first_cond_code == swap_code)
1523 swap[i] = 1;
1524 /* Isomorphism can be achieved by inverting. */
1525 else if (first_cond_code == invert_code)
1526 swap[i] = 2;
1527 else
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531 "Build SLP failed: different"
1532 " operation %G", stmt);
1533 /* Mismatch. */
1534 continue;
1538 if (rhs_code.is_tree_code ()
1539 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1540 && (swap_tree_comparison ((tree_code)first_stmt_code)
1541 == (tree_code)rhs_code))
1542 swap[i] = 1;
1545 matches[i] = true;
1548 for (i = 0; i < group_size; ++i)
1549 if (!matches[i])
1550 return false;
1552 /* If we allowed a two-operation SLP node verify the target can cope
1553 with the permute we are going to use. */
1554 if (alt_stmt_code != ERROR_MARK
1555 && (!alt_stmt_code.is_tree_code ()
1556 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1557 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1559 *two_operators = true;
1562 if (maybe_soft_fail)
1564 unsigned HOST_WIDE_INT const_nunits;
1565 if (!TYPE_VECTOR_SUBPARTS
1566 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1567 || const_nunits > group_size)
1568 matches[0] = false;
1569 else
1571 /* With a constant number of vector elements, simulate a mismatch at the
1572 point we need to split. */
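/* GROUP_SIZE & (CONST_NUNITS - 1) is GROUP_SIZE % CONST_NUNITS since the
   number of vector elements is a power of two; clearing the matches for
   the trailing partial group makes the caller split the group there.  */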
1573 unsigned tail = group_size & (const_nunits - 1);
1574 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1576 return false;
1579 return true;
1582 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1583 Note we never remove apart from at destruction time so we do not
1584 need a special value for deleted that differs from empty. */
1585 struct bst_traits
1587 typedef vec <stmt_vec_info> value_type;
1588 typedef vec <stmt_vec_info> compare_type;
1589 static inline hashval_t hash (value_type);
1590 static inline bool equal (value_type existing, value_type candidate);
1591 static inline bool is_empty (value_type x) { return !x.exists (); }
1592 static inline bool is_deleted (value_type x) { return !x.exists (); }
1593 static const bool empty_zero_p = true;
1594 static inline void mark_empty (value_type &x) { x.release (); }
1595 static inline void mark_deleted (value_type &x) { x.release (); }
1596 static inline void remove (value_type &x) { x.release (); }
1598 inline hashval_t
1599 bst_traits::hash (value_type x)
1601 inchash::hash h;
1602 for (unsigned i = 0; i < x.length (); ++i)
1603 h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1604 return h.end ();
1606 inline bool
1607 bst_traits::equal (value_type existing, value_type candidate)
1609 if (existing.length () != candidate.length ())
1610 return false;
1611 for (unsigned i = 0; i < existing.length (); ++i)
1612 if (existing[i] != candidate[i])
1613 return false;
1614 return true;
1617 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1618 simple_hashmap_traits <bst_traits, slp_tree> >
1619 scalar_stmts_to_slp_tree_map_t;
1621 /* Release BST_MAP. */
1623 static void
1624 release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1626 /* The map keeps a reference on SLP nodes built, release that. */
1627 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1628 it != bst_map->end (); ++it)
1629 if ((*it).second)
1630 vect_free_slp_tree ((*it).second);
1631 delete bst_map;
1634 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1635 but then vec::insert does memmove and that's not compatible with
1636 std::pair. */
1637 struct chain_op_t
1639 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1640 : code (code_), dt (dt_), op (op_) {}
1641 tree_code code;
1642 vect_def_type dt;
1643 tree op;
1646 /* Comparator for sorting associatable chains. */
1648 static int
1649 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1651 auto *op1 = (const chain_op_t *) op1_;
1652 auto *op2 = (const chain_op_t *) op2_;
1653 if (op1->dt != op2->dt)
1654 return (int)op1->dt - (int)op2->dt;
1655 return (int)op1->code - (int)op2->code;
1658 /* Linearize the associatable expression chain at START with the
1659 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1660 filling CHAIN with the result and using WORKLIST as intermediate storage.
1661 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1662 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1663 stmts, starting with START. */
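/* For example, for a lane computing x = (a + b) - c and CODE == PLUS_EXPR
   the chain ends up containing { +a, +b, -c } (in some order), with
   CODE_STMT pointing at the PLUS_EXPR statement and ALT_CODE_STMT at the
   MINUS_EXPR one.  */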
1665 static void
1666 vect_slp_linearize_chain (vec_info *vinfo,
1667 vec<std::pair<tree_code, gimple *> > &worklist,
1668 vec<chain_op_t> &chain,
1669 enum tree_code code, gimple *start,
1670 gimple *&code_stmt, gimple *&alt_code_stmt,
1671 vec<gimple *> *chain_stmts)
1673 /* For each lane linearize the addition/subtraction (or other
1674 uniform associatable operation) expression tree. */
1675 worklist.safe_push (std::make_pair (code, start));
1676 while (!worklist.is_empty ())
1678 auto entry = worklist.pop ();
1679 gassign *stmt = as_a <gassign *> (entry.second);
1680 enum tree_code in_code = entry.first;
1681 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1682 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1683 if (!code_stmt
1684 && gimple_assign_rhs_code (stmt) == code)
1685 code_stmt = stmt;
1686 else if (!alt_code_stmt
1687 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1688 alt_code_stmt = stmt;
1689 if (chain_stmts)
1690 chain_stmts->safe_push (stmt);
1691 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1693 tree op = gimple_op (stmt, opnum);
1694 vect_def_type dt;
1695 stmt_vec_info def_stmt_info;
1696 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1697 gcc_assert (res);
1698 if (dt == vect_internal_def
1699 && is_pattern_stmt_p (def_stmt_info))
1700 op = gimple_get_lhs (def_stmt_info->stmt);
1701 gimple *use_stmt;
1702 use_operand_p use_p;
1703 if (dt == vect_internal_def
1704 && single_imm_use (op, &use_p, &use_stmt)
1705 && is_gimple_assign (def_stmt_info->stmt)
1706 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1707 || (code == PLUS_EXPR
1708 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1709 == MINUS_EXPR))))
1711 tree_code op_def_code = this_code;
1712 if (op_def_code == MINUS_EXPR && opnum == 1)
1713 op_def_code = PLUS_EXPR;
1714 if (in_code == MINUS_EXPR)
1715 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1716 worklist.safe_push (std::make_pair (op_def_code,
1717 def_stmt_info->stmt));
1719 else
1721 tree_code op_def_code = this_code;
1722 if (op_def_code == MINUS_EXPR && opnum == 1)
1723 op_def_code = PLUS_EXPR;
1724 if (in_code == MINUS_EXPR)
1725 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1726 chain.safe_push (chain_op_t (op_def_code, dt, op));
1732 static slp_tree
1733 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1734 vec<stmt_vec_info> stmts, unsigned int group_size,
1735 poly_uint64 *max_nunits,
1736 bool *matches, unsigned *limit, unsigned *tree_size,
1737 scalar_stmts_to_slp_tree_map_t *bst_map);
1739 static slp_tree
1740 vect_build_slp_tree (vec_info *vinfo,
1741 vec<stmt_vec_info> stmts, unsigned int group_size,
1742 poly_uint64 *max_nunits,
1743 bool *matches, unsigned *limit, unsigned *tree_size,
1744 scalar_stmts_to_slp_tree_map_t *bst_map)
1746 if (slp_tree *leader = bst_map->get (stmts))
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1750 !(*leader)->failed ? "" : "failed ",
1751 (void *) *leader);
1752 if (!(*leader)->failed)
1754 SLP_TREE_REF_COUNT (*leader)++;
1755 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1756 stmts.release ();
1757 return *leader;
1759 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1760 return NULL;
1763 /* Single-lane SLP discovery cannot run away, so do not count it
1764 against the limit. */
1765 if (stmts.length () > 1)
1767 if (*limit == 0)
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "SLP discovery limit exceeded\n");
1772 memset (matches, 0, sizeof (bool) * group_size);
1773 return NULL;
1775 --*limit;
1778 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1779 so we can pick up backedge destinations during discovery. */
1780 slp_tree res = new _slp_tree;
1781 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1782 SLP_TREE_SCALAR_STMTS (res) = stmts;
1783 bst_map->put (stmts.copy (), res);
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_NOTE, vect_location,
1787 "starting SLP discovery for node %p\n", (void *) res);
1789 poly_uint64 this_max_nunits = 1;
1790 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1791 &this_max_nunits,
1792 matches, limit, tree_size, bst_map);
1793 if (!res_)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "SLP discovery for node %p failed\n", (void *) res);
1798 /* Mark the node invalid so we can detect those when still in use
1799 as backedge destinations. */
1800 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1801 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1802 res->failed = XNEWVEC (bool, group_size);
1803 if (flag_checking)
1805 unsigned i;
1806 for (i = 0; i < group_size; ++i)
1807 if (!matches[i])
1808 break;
1809 gcc_assert (i < group_size);
1811 memcpy (res->failed, matches, sizeof (bool) * group_size);
1813 else
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_NOTE, vect_location,
1817 "SLP discovery for node %p succeeded\n",
1818 (void *) res);
1819 gcc_assert (res_ == res);
1820 res->max_nunits = this_max_nunits;
1821 vect_update_max_nunits (max_nunits, this_max_nunits);
1822 /* Keep a reference for the bst_map use. */
1823 SLP_TREE_REF_COUNT (res)++;
1825 return res_;
1828 /* Helper for building an associated SLP node chain. */
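/* The built PERM becomes a VEC_PERM_EXPR node over two internal nodes that
   both have OP0 and OP1 as children, one represented by OPER1 and the
   other by OPER2; LPERM then selects, lane by lane, whether a lane comes
   from the OPER1 or the OPER2 computation.  */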
1830 static void
1831 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1832 slp_tree op0, slp_tree op1,
1833 stmt_vec_info oper1, stmt_vec_info oper2,
1834 vec<std::pair<unsigned, unsigned> > lperm)
1836 unsigned group_size = SLP_TREE_LANES (op1);
1838 slp_tree child1 = new _slp_tree;
1839 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1840 SLP_TREE_VECTYPE (child1) = vectype;
1841 SLP_TREE_LANES (child1) = group_size;
1842 SLP_TREE_CHILDREN (child1).create (2);
1843 SLP_TREE_CHILDREN (child1).quick_push (op0);
1844 SLP_TREE_CHILDREN (child1).quick_push (op1);
1845 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1847 slp_tree child2 = new _slp_tree;
1848 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1849 SLP_TREE_VECTYPE (child2) = vectype;
1850 SLP_TREE_LANES (child2) = group_size;
1851 SLP_TREE_CHILDREN (child2).create (2);
1852 SLP_TREE_CHILDREN (child2).quick_push (op0);
1853 SLP_TREE_REF_COUNT (op0)++;
1854 SLP_TREE_CHILDREN (child2).quick_push (op1);
1855 SLP_TREE_REF_COUNT (op1)++;
1856 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1858 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1859 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1860 SLP_TREE_VECTYPE (perm) = vectype;
1861 SLP_TREE_LANES (perm) = group_size;
1862 /* ??? We should set this NULL but that's not expected. */
1863 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1864 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1865 SLP_TREE_CHILDREN (perm).quick_push (child1);
1866 SLP_TREE_CHILDREN (perm).quick_push (child2);
1869 /* Recursively build an SLP tree starting from NODE.
1870 Fail (and return NULL) if def-stmts are not
1871 isomorphic, require data permutation or are of unsupported types of
1872 operation. Otherwise, return the built SLP node.
1873 On failure MATCHES records which lanes could not be matched
1874 to the first one. */
1876 static slp_tree
1877 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1878 vec<stmt_vec_info> stmts, unsigned int group_size,
1879 poly_uint64 *max_nunits,
1880 bool *matches, unsigned *limit, unsigned *tree_size,
1881 scalar_stmts_to_slp_tree_map_t *bst_map)
1883 unsigned nops, i, this_tree_size = 0;
1884 poly_uint64 this_max_nunits = *max_nunits;
1886 matches[0] = false;
1888 stmt_vec_info stmt_info = stmts[0];
1889 if (!is_a<gcall *> (stmt_info->stmt)
1890 && !is_a<gassign *> (stmt_info->stmt)
1891 && !is_a<gphi *> (stmt_info->stmt))
1892 return NULL;
1894 nops = gimple_num_args (stmt_info->stmt);
1895 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1896 STMT_VINFO_GATHER_SCATTER_P
1897 (stmt_info)))
1898 nops = map[0];
1900 /* If the SLP node is a PHI (induction or reduction), terminate
1901 the recursion. */
1902 bool *skip_args = XALLOCAVEC (bool, nops);
1903 memset (skip_args, 0, sizeof (bool) * nops);
1904 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1905 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1907 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1908 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1909 group_size);
1910 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1911 max_nunits))
1912 return NULL;
1914 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1915 if (def_type == vect_induction_def)
 1917 /* Induction PHIs are not cycles but walk the initial
 1918    value.  Only for inner loops though; for outer loops
 1919    we need to pick up the value from the actual PHIs
 1920    to more easily support peeling and epilogue vectorization. */
1921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922 if (!nested_in_vect_loop_p (loop, stmt_info))
1923 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1924 else
1925 loop = loop->inner;
1926 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1928 else if (def_type == vect_reduction_def
1929 || def_type == vect_double_reduction_def
1930 || def_type == vect_nested_cycle
1931 || def_type == vect_first_order_recurrence)
1933 /* Else def types have to match. */
1934 stmt_vec_info other_info;
1935 bool all_same = true;
1936 FOR_EACH_VEC_ELT (stmts, i, other_info)
1938 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1939 return NULL;
1940 if (other_info != stmt_info)
1941 all_same = false;
1943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 1944 /* Reduction initial values are not explicitly represented. */
1945 if (def_type != vect_first_order_recurrence
1946 && gimple_bb (stmt_info->stmt) == loop->header)
1947 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1948 /* Reduction chain backedge defs are filled manually.
 1949 ??? Need a better way to identify an SLP reduction chain PHI.
1950 Or a better overall way to SLP match those. */
1951 if (stmts.length () > 1
1952 && all_same && def_type == vect_reduction_def)
1953 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1955 else if (def_type != vect_internal_def)
1956 return NULL;
1960 bool two_operators = false;
1961 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1962 tree vectype = NULL_TREE;
1963 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1964 &this_max_nunits, matches, &two_operators,
1965 &vectype))
1966 return NULL;
1968 /* If the SLP node is a load, terminate the recursion unless masked. */
1969 if (STMT_VINFO_DATA_REF (stmt_info)
1970 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1972 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1973 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1974 else
1976 *max_nunits = this_max_nunits;
1977 (*tree_size)++;
1978 node = vect_create_new_slp_node (node, stmts, 0);
1979 SLP_TREE_VECTYPE (node) = vectype;
1980 /* And compute the load permutation. Whether it is actually
1981 a permutation depends on the unrolling factor which is
1982 decided later. */
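	  /* E.g. for an interleaving group a[0], a[1], a[2], a[3] whose SLP
	     lanes reference a[2], a[0], a[3], a[1] this records the load
	     permutation { 2, 0, 3, 1 }; the identity { 0, 1, 2, 3 } means no
	     permutation is required once the unrolling factor is known.  */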
1983 vec<unsigned> load_permutation;
1984 int j;
1985 stmt_vec_info load_info;
1986 load_permutation.create (group_size);
1987 stmt_vec_info first_stmt_info
1988 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1989 bool any_permute = false;
1990 bool any_null = false;
1991 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1993 int load_place;
1994 if (! load_info)
1996 load_place = j;
1997 any_null = true;
1999 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000 load_place = vect_get_place_in_interleaving_chain
2001 (load_info, first_stmt_info);
2002 else
2003 load_place = 0;
2004 gcc_assert (load_place != -1);
2005 any_permute |= load_place != j;
2006 load_permutation.quick_push (load_place);
2008 if (any_null)
2010 gcc_assert (!any_permute);
2011 load_permutation.release ();
2014 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2016 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
2017 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
2018 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
2019 || gimple_call_internal_p (stmt,
2020 IFN_MASK_LEN_GATHER_LOAD));
2021 load_permutation.release ();
2022 /* We cannot handle permuted masked loads, see PR114375. */
2023 if (any_permute
2024 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 && DR_GROUP_SIZE (first_stmt_info) != group_size)
2026 || STMT_VINFO_STRIDED_P (stmt_info))
2028 matches[0] = false;
2029 return NULL;
2032 else
2034 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2035 return node;
2039 else if (gimple_assign_single_p (stmt_info->stmt)
2040 && !gimple_vuse (stmt_info->stmt)
2041 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2043 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
 2044 the same SSA name vector of a type compatible with vectype. */
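      /* E.g. a BIT_FIELD_REF <v, 32, 64> extracting 32 bits at bit offset 64
	 from a vector of 32-bit elements selects lane 64/32 == 2 and is
	 recorded as the lane permutation entry (0, 2) below.  */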
2045 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2046 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2047 stmt_vec_info estmt_info;
2048 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2050 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2051 tree bfref = gimple_assign_rhs1 (estmt);
2052 HOST_WIDE_INT lane;
2053 if (!known_eq (bit_field_size (bfref),
2054 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2055 || !constant_multiple_p (bit_field_offset (bfref),
2056 bit_field_size (bfref), &lane))
2058 lperm.release ();
2059 matches[0] = false;
2060 return NULL;
2062 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2064 slp_tree vnode = vect_create_new_slp_node (vNULL);
2065 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2066 /* ??? We record vectype here but we hide eventually necessary
2067 punning and instead rely on code generation to materialize
2068 VIEW_CONVERT_EXPRs as necessary. We instead should make
2069 this explicit somehow. */
2070 SLP_TREE_VECTYPE (vnode) = vectype;
2071 else
2073 /* For different size but compatible elements we can still
2074 use VEC_PERM_EXPR without punning. */
2075 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2076 && types_compatible_p (TREE_TYPE (vectype),
2077 TREE_TYPE (TREE_TYPE (vec))));
2078 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2080 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2081 unsigned HOST_WIDE_INT const_nunits;
2082 if (nunits.is_constant (&const_nunits))
2083 SLP_TREE_LANES (vnode) = const_nunits;
2084 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2085 /* We are always building a permutation node even if it is an identity
2086 permute to shield the rest of the vectorizer from the odd node
2087 representing an actual vector without any scalar ops.
 2088 ??? We could hide it completely by making the permute node
2089 external? */
2090 node = vect_create_new_slp_node (node, stmts, 1);
2091 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2092 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2093 SLP_TREE_VECTYPE (node) = vectype;
2094 SLP_TREE_CHILDREN (node).quick_push (vnode);
2095 return node;
2097 /* When discovery reaches an associatable operation see whether we can
2098 improve that to match up lanes in a way superior to the operand
2099 swapping code which at most looks at two defs.
2100 ??? For BB vectorization we cannot do the brute-force search
2101 for matching as we can succeed by means of builds from scalars
2102 and have no good way to "cost" one build against another. */
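      /* For instance, lane 0 computing a0 + b0 + c0 and lane 1 computing
	 c1 + a1 + b1 both linearize to three-element chains below; after
	 sorting and the per-lane swapping, the children built for the chain
	 positions can end up as { a0, a1 }, { b0, b1 } and { c0, c1 } even
	 though plain operand swapping could not match them up.  */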
2103 else if (is_a <loop_vec_info> (vinfo)
2104 /* Do not bother for single-lane SLP. */
2105 && group_size > 1
2106 /* ??? We don't handle !vect_internal_def defs below. */
2107 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2108 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2109 mapping as long as that exists on the stmt_info level. */
2110 && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2111 && is_gimple_assign (stmt_info->stmt)
2112 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2113 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2114 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2115 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2116 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2118 /* See if we have a chain of (mixed) adds or subtracts or other
2119 associatable ops. */
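      /* E.g. the lane stmt x = ((a - b) + c) - d linearizes with CODE
	 PLUS_EXPR to the chain { (PLUS, a), (MINUS, b), (PLUS, c),
	 (MINUS, d) }, recording a per-element operation code.  */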
2120 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2121 if (code == MINUS_EXPR)
2122 code = PLUS_EXPR;
2123 stmt_vec_info other_op_stmt_info = NULL;
2124 stmt_vec_info op_stmt_info = NULL;
2125 unsigned chain_len = 0;
2126 auto_vec<chain_op_t> chain;
2127 auto_vec<std::pair<tree_code, gimple *> > worklist;
2128 auto_vec<vec<chain_op_t> > chains (group_size);
2129 auto_vec<slp_tree, 4> children;
2130 bool hard_fail = true;
2131 for (unsigned lane = 0; lane < group_size; ++lane)
2133 /* For each lane linearize the addition/subtraction (or other
2134 uniform associatable operation) expression tree. */
2135 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2136 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2137 stmts[lane]->stmt, op_stmt, other_op_stmt,
2138 NULL);
2139 if (!op_stmt_info && op_stmt)
2140 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2141 if (!other_op_stmt_info && other_op_stmt)
2142 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2143 if (chain.length () == 2)
 2145 /* In a chain of just two elements resort to the regular
 2146    operand swapping scheme.  Likewise, if we run into a
 2147    length mismatch, process regularly; since we did not
 2148    process the other lanes we cannot report a good hint
 2149    as to which lanes to try swapping in the parent. */
2150 hard_fail = false;
2151 break;
2153 else if (chain_len == 0)
2154 chain_len = chain.length ();
2155 else if (chain.length () != chain_len)
2157 /* ??? Here we could slip in magic to compensate with
2158 neutral operands. */
2159 matches[lane] = false;
2160 if (lane != group_size - 1)
2161 matches[0] = false;
2162 break;
2164 chains.quick_push (chain.copy ());
2165 chain.truncate (0);
2167 if (chains.length () == group_size)
2169 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2170 if (!op_stmt_info)
2172 hard_fail = false;
2173 goto out;
2175 /* Now we have a set of chains with the same length. */
2176 /* 1. pre-sort according to def_type and operation. */
2177 for (unsigned lane = 0; lane < group_size; ++lane)
2178 chains[lane].stablesort (dt_sort_cmp, vinfo);
2179 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_NOTE, vect_location,
2182 "pre-sorted chains of %s\n",
2183 get_tree_code_name (code));
2184 for (unsigned lane = 0; lane < group_size; ++lane)
2186 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2187 dump_printf (MSG_NOTE, "%s %T ",
2188 get_tree_code_name (chains[lane][opnum].code),
2189 chains[lane][opnum].op);
2190 dump_printf (MSG_NOTE, "\n");
2193 /* 2. try to build children nodes, associating as necessary. */
2194 for (unsigned n = 0; n < chain_len; ++n)
2196 vect_def_type dt = chains[0][n].dt;
2197 unsigned lane;
2198 for (lane = 0; lane < group_size; ++lane)
2199 if (chains[lane][n].dt != dt)
2201 if (dt == vect_constant_def
2202 && chains[lane][n].dt == vect_external_def)
2203 dt = vect_external_def;
2204 else if (dt == vect_external_def
2205 && chains[lane][n].dt == vect_constant_def)
2207 else
2208 break;
2210 if (lane != group_size)
2212 if (dump_enabled_p ())
2213 dump_printf_loc (MSG_NOTE, vect_location,
2214 "giving up on chain due to mismatched "
2215 "def types\n");
2216 matches[lane] = false;
2217 if (lane != group_size - 1)
2218 matches[0] = false;
2219 goto out;
2221 if (dt == vect_constant_def
2222 || dt == vect_external_def)
2224 /* Check whether we can build the invariant. If we can't
2225 we never will be able to. */
2226 tree type = TREE_TYPE (chains[0][n].op);
2227 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2228 && (TREE_CODE (type) == BOOLEAN_TYPE
2229 || !can_duplicate_and_interleave_p (vinfo, group_size,
2230 type)))
2232 matches[0] = false;
2233 goto out;
2235 vec<tree> ops;
2236 ops.create (group_size);
2237 for (lane = 0; lane < group_size; ++lane)
2238 ops.quick_push (chains[lane][n].op);
2239 slp_tree child = vect_create_new_slp_node (ops);
2240 SLP_TREE_DEF_TYPE (child) = dt;
2241 children.safe_push (child);
2243 else if (dt != vect_internal_def)
2245 /* Not sure, we might need sth special.
2246 gcc.dg/vect/pr96854.c,
2247 gfortran.dg/vect/fast-math-pr37021.f90
2248 and gfortran.dg/vect/pr61171.f trigger. */
2249 /* Soft-fail for now. */
2250 hard_fail = false;
2251 goto out;
2253 else
2255 vec<stmt_vec_info> op_stmts;
2256 op_stmts.create (group_size);
2257 slp_tree child = NULL;
2258 /* Brute-force our way. We have to consider a lane
2259 failing after fixing an earlier fail up in the
2260 SLP discovery recursion. So track the current
2261 permute per lane. */
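	      /* For a mismatch of lane L at position N we first try swapping
		 chains[L][N] with chains[L][N + 1], then with chains[L][N + 2]
		 and so on (PERMS[L] counts the attempts); once a lane runs out
		 of candidates the whole search terminates.  */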
2262 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2263 memset (perms, 0, sizeof (unsigned) * group_size);
2266 op_stmts.truncate (0);
2267 for (lane = 0; lane < group_size; ++lane)
2268 op_stmts.quick_push
2269 (vinfo->lookup_def (chains[lane][n].op));
2270 child = vect_build_slp_tree (vinfo, op_stmts,
2271 group_size, &this_max_nunits,
2272 matches, limit,
2273 &this_tree_size, bst_map);
2274 /* ??? We're likely getting too many fatal mismatches
2275 here so maybe we want to ignore them (but then we
2276 have no idea which lanes fatally mismatched). */
2277 if (child || !matches[0])
2278 break;
2279 /* Swap another lane we have not yet matched up into
2280 lanes that did not match. If we run out of
2281 permute possibilities for a lane terminate the
2282 search. */
2283 bool term = false;
2284 for (lane = 1; lane < group_size; ++lane)
2285 if (!matches[lane])
2287 if (n + perms[lane] + 1 == chain_len)
2289 term = true;
2290 break;
2292 std::swap (chains[lane][n],
2293 chains[lane][n + perms[lane] + 1]);
2294 perms[lane]++;
2296 if (term)
2297 break;
2299 while (1);
2300 if (!child)
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_NOTE, vect_location,
2304 "failed to match up op %d\n", n);
2305 op_stmts.release ();
2306 if (lane != group_size - 1)
2307 matches[0] = false;
2308 else
2309 matches[lane] = false;
2310 goto out;
2312 if (dump_enabled_p ())
2314 dump_printf_loc (MSG_NOTE, vect_location,
2315 "matched up op %d to\n", n);
2316 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2318 children.safe_push (child);
2321 /* 3. build SLP nodes to combine the chain. */
2322 for (unsigned lane = 0; lane < group_size; ++lane)
2323 if (chains[lane][0].code != code)
2325 /* See if there's any alternate all-PLUS entry. */
2326 unsigned n;
2327 for (n = 1; n < chain_len; ++n)
2329 for (lane = 0; lane < group_size; ++lane)
2330 if (chains[lane][n].code != code)
2331 break;
2332 if (lane == group_size)
2333 break;
2335 if (n != chain_len)
2337 /* Swap that in at first position. */
2338 std::swap (children[0], children[n]);
2339 for (lane = 0; lane < group_size; ++lane)
2340 std::swap (chains[lane][0], chains[lane][n]);
2342 else
2344 /* ??? When this triggers and we end up with two
 2345 vect_constant/external_def up-front, things break (ICE)
 2346 spectacularly when finding an insertion place for the
2347 all-constant op. We should have a fully
2348 vect_internal_def operand though(?) so we can swap
2349 that into first place and then prepend the all-zero
2350 constant. */
2351 if (dump_enabled_p ())
2352 dump_printf_loc (MSG_NOTE, vect_location,
2353 "inserting constant zero to compensate "
2354 "for (partially) negated first "
2355 "operand\n");
2356 chain_len++;
2357 for (lane = 0; lane < group_size; ++lane)
2358 chains[lane].safe_insert
2359 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2360 vec<tree> zero_ops;
2361 zero_ops.create (group_size);
2362 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2363 for (lane = 1; lane < group_size; ++lane)
2364 zero_ops.quick_push (zero_ops[0]);
2365 slp_tree zero = vect_create_new_slp_node (zero_ops);
2366 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2367 children.safe_insert (0, zero);
2369 break;
2371 for (unsigned i = 1; i < children.length (); ++i)
2373 slp_tree op0 = children[i - 1];
2374 slp_tree op1 = children[i];
2375 bool this_two_op = false;
2376 for (unsigned lane = 0; lane < group_size; ++lane)
2377 if (chains[lane][i].code != chains[0][i].code)
2379 this_two_op = true;
2380 break;
2382 slp_tree child;
2383 if (i == children.length () - 1)
2384 child = vect_create_new_slp_node (node, stmts, 2);
2385 else
2386 child = vect_create_new_slp_node (2, ERROR_MARK);
2387 if (this_two_op)
2389 vec<std::pair<unsigned, unsigned> > lperm;
2390 lperm.create (group_size);
2391 for (unsigned lane = 0; lane < group_size; ++lane)
2392 lperm.quick_push (std::make_pair
2393 (chains[lane][i].code != chains[0][i].code, lane));
2394 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2395 (chains[0][i].code == code
2396 ? op_stmt_info
2397 : other_op_stmt_info),
2398 (chains[0][i].code == code
2399 ? other_op_stmt_info
2400 : op_stmt_info),
2401 lperm);
2403 else
2405 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2406 SLP_TREE_VECTYPE (child) = vectype;
2407 SLP_TREE_LANES (child) = group_size;
2408 SLP_TREE_CHILDREN (child).quick_push (op0);
2409 SLP_TREE_CHILDREN (child).quick_push (op1);
2410 SLP_TREE_REPRESENTATIVE (child)
2411 = (chains[0][i].code == code
2412 ? op_stmt_info : other_op_stmt_info);
2414 children[i] = child;
2416 *tree_size += this_tree_size + 1;
2417 *max_nunits = this_max_nunits;
2418 while (!chains.is_empty ())
2419 chains.pop ().release ();
2420 return node;
2422 out:
2423 if (dump_enabled_p ())
2424 dump_printf_loc (MSG_NOTE, vect_location,
2425 "failed to line up SLP graph by re-associating "
2426 "operations in lanes%s\n",
2427 !hard_fail ? " trying regular discovery" : "");
2428 while (!children.is_empty ())
2429 vect_free_slp_tree (children.pop ());
2430 while (!chains.is_empty ())
2431 chains.pop ().release ();
2432 /* Hard-fail, otherwise we might run into quadratic processing of the
2433 chains starting one stmt into the chain again. */
2434 if (hard_fail)
2435 return NULL;
2436 /* Fall thru to normal processing. */
2439 /* Get at the operands, verifying they are compatible. */
2440 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2441 slp_oprnd_info oprnd_info;
2442 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2444 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2445 stmts, i, &oprnds_info);
2446 if (res != 0)
2447 matches[(res == -1) ? 0 : i] = false;
2448 if (!matches[0])
2449 break;
2451 for (i = 0; i < group_size; ++i)
2452 if (!matches[i])
2454 vect_free_oprnd_info (oprnds_info);
2455 return NULL;
2457 swap = NULL;
2459 bool has_two_operators_perm = false;
2460 auto_vec<unsigned> two_op_perm_indices[2];
2461 vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2463 if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2465 unsigned idx = 0;
2466 hash_map<gimple *, unsigned> seen;
2467 vec<slp_oprnd_info> new_oprnds_info
2468 = vect_create_oprnd_info (1, group_size);
2469 bool success = true;
2471 enum tree_code code = ERROR_MARK;
2472 if (oprnds_info[0]->def_stmts[0]
2473 && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2474 code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2476 for (unsigned j = 0; j < group_size; ++j)
2478 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2480 stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2481 if (!stmt_info || !stmt_info->stmt
2482 || !is_a<gassign *> (stmt_info->stmt)
2483 || gimple_assign_rhs_code (stmt_info->stmt) != code
2484 || skip_args[i])
2486 success = false;
2487 break;
2490 bool exists;
2491 unsigned &stmt_idx
2492 = seen.get_or_insert (stmt_info->stmt, &exists);
2494 if (!exists)
2496 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2497 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2498 stmt_idx = idx;
2499 idx++;
2502 two_op_perm_indices[i].safe_push (stmt_idx);
2505 if (!success)
2506 break;
2509 if (success && idx == group_size)
2511 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "Replace two_operators operands:\n");
2516 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2518 dump_printf_loc (MSG_NOTE, vect_location,
2519 "Operand %u:\n", i);
2520 for (unsigned j = 0; j < group_size; j++)
2521 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2522 j, oprnd_info->def_stmts[j]->stmt);
2525 dump_printf_loc (MSG_NOTE, vect_location,
2526 "With a single operand:\n");
2527 for (unsigned j = 0; j < group_size; j++)
2528 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2529 j, new_oprnds_info[0]->def_stmts[j]->stmt);
2532 two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2533 two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2535 new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2536 new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2537 new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2538 new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2539 new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2541 vect_free_oprnd_info (oprnds_info);
2542 oprnds_info = new_oprnds_info;
2543 nops = 1;
2544 has_two_operators_perm = true;
2548 auto_vec<slp_tree, 4> children;
2550 stmt_info = stmts[0];
2552 /* Create SLP_TREE nodes for the definition node/s. */
2553 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2555 slp_tree child = nullptr;
2556 unsigned int j;
2558 /* We're skipping certain operands from processing, for example
2559 outer loop reduction initial defs. */
2560 if (skip_args[i])
2562 children.safe_push (NULL);
2563 continue;
2566 if (oprnd_info->first_dt == vect_uninitialized_def)
 2568 /* COND_EXPRs eventually have one operand too many if the
 2569 condition is an SSA name. */
2570 gcc_assert (i == 3 && nops == 4);
2571 continue;
2574 if (is_a <bb_vec_info> (vinfo)
2575 && oprnd_info->first_dt == vect_internal_def
2576 && !oprnd_info->any_pattern)
2578 /* For BB vectorization, if all defs are the same do not
2579 bother to continue the build along the single-lane
2580 graph but use a splat of the scalar value. */
2581 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2582 for (j = 1; j < group_size; ++j)
2583 if (oprnd_info->def_stmts[j] != first_def)
2584 break;
2585 if (j == group_size
2586 /* But avoid doing this for loads where we may be
2587 able to CSE things, unless the stmt is not
2588 vectorizable. */
2589 && (!STMT_VINFO_VECTORIZABLE (first_def)
2590 || !gimple_vuse (first_def->stmt)))
2592 if (dump_enabled_p ())
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "Using a splat of the uniform operand %G",
2595 first_def->stmt);
2596 oprnd_info->first_dt = vect_external_def;
2600 if (oprnd_info->first_dt == vect_external_def
2601 || oprnd_info->first_dt == vect_constant_def)
2603 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2605 tree op0;
2606 tree uniform_val = op0 = oprnd_info->ops[0];
2607 for (j = 1; j < oprnd_info->ops.length (); ++j)
2608 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2610 uniform_val = NULL_TREE;
2611 break;
2613 if (!uniform_val
2614 && !can_duplicate_and_interleave_p (vinfo,
2615 oprnd_info->ops.length (),
2616 TREE_TYPE (op0)))
2618 matches[j] = false;
2619 if (dump_enabled_p ())
2620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2621 "Build SLP failed: invalid type of def "
2622 "for variable-length SLP %T\n", op0);
2623 goto fail;
2626 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2627 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2628 oprnd_info->ops = vNULL;
2629 children.safe_push (invnode);
2630 continue;
2633 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2634 group_size, &this_max_nunits,
2635 matches, limit,
2636 &this_tree_size, bst_map)) != NULL)
2638 oprnd_info->def_stmts = vNULL;
2639 children.safe_push (child);
2640 continue;
 2643 /* If the SLP build for operand zero failed and operands zero
 2644 and one can be commuted, try that for the scalar stmts
 2645 that failed the match. */
2646 if (i == 0
2647 /* A first scalar stmt mismatch signals a fatal mismatch. */
2648 && matches[0]
2649 /* ??? For COND_EXPRs we can swap the comparison operands
2650 as well as the arms under some constraints. */
2651 && nops == 2
2652 && oprnds_info[1]->first_dt == vect_internal_def
2653 && is_gimple_assign (stmt_info->stmt)
2654 /* Swapping operands for reductions breaks assumptions later on. */
2655 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2657 /* See whether we can swap the matching or the non-matching
2658 stmt operands. */
2659 bool swap_not_matching = true;
2662 for (j = 0; j < group_size; ++j)
2664 if (matches[j] != !swap_not_matching)
2665 continue;
2666 stmt_vec_info stmt_info = stmts[j];
2667 /* Verify if we can swap operands of this stmt. */
2668 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2669 if (!stmt
2670 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2672 if (!swap_not_matching)
2673 goto fail;
2674 swap_not_matching = false;
2675 break;
2679 while (j != group_size);
2681 /* Swap mismatched definition stmts. */
2682 if (dump_enabled_p ())
2683 dump_printf_loc (MSG_NOTE, vect_location,
2684 "Re-trying with swapped operands of stmts ");
2685 for (j = 0; j < group_size; ++j)
2686 if (matches[j] == !swap_not_matching)
2688 std::swap (oprnds_info[0]->def_stmts[j],
2689 oprnds_info[1]->def_stmts[j]);
2690 std::swap (oprnds_info[0]->ops[j],
2691 oprnds_info[1]->ops[j]);
2692 if (dump_enabled_p ())
2693 dump_printf (MSG_NOTE, "%d ", j);
2695 if (dump_enabled_p ())
2696 dump_printf (MSG_NOTE, "\n");
2697 /* After swapping some operands we lost track whether an
2698 operand has any pattern defs so be conservative here. */
2699 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2700 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2701 /* And try again with scratch 'matches' ... */
2702 bool *tem = XALLOCAVEC (bool, group_size);
2703 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2704 group_size, &this_max_nunits,
2705 tem, limit,
2706 &this_tree_size, bst_map)) != NULL)
2708 oprnd_info->def_stmts = vNULL;
2709 children.safe_push (child);
2710 continue;
2713 fail:
2715 /* If the SLP build failed and we analyze a basic-block
2716 simply treat nodes we fail to build as externally defined
2717 (and thus build vectors from the scalar defs).
2718 The cost model will reject outright expensive cases.
 2719 ??? This doesn't treat cases where permutation ultimately
2720 fails (or we don't try permutation below). Ideally we'd
2721 even compute a permutation that will end up with the maximum
2722 SLP tree size... */
2723 if (is_a <bb_vec_info> (vinfo)
2724 /* ??? Rejecting patterns this way doesn't work. We'd have to
2725 do extra work to cancel the pattern so the uses see the
2726 scalar version. */
2727 && !is_pattern_stmt_p (stmt_info)
2728 && !oprnd_info->any_pattern)
2730 /* But if there's a leading vector sized set of matching stmts
2731 fail here so we can split the group. This matches the condition
2732 vect_analyze_slp_instance uses. */
2733 /* ??? We might want to split here and combine the results to support
2734 multiple vector sizes better. */
2735 for (j = 0; j < group_size; ++j)
2736 if (!matches[j])
2737 break;
2738 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2740 if (dump_enabled_p ())
2741 dump_printf_loc (MSG_NOTE, vect_location,
2742 "Building vector operands from scalars\n");
2743 this_tree_size++;
2744 child = vect_create_new_slp_node (oprnd_info->ops);
2745 children.safe_push (child);
2746 oprnd_info->ops = vNULL;
2747 continue;
2751 gcc_assert (child == NULL);
2752 FOR_EACH_VEC_ELT (children, j, child)
2753 if (child)
2754 vect_free_slp_tree (child);
2755 vect_free_oprnd_info (oprnds_info);
2756 return NULL;
2759 vect_free_oprnd_info (oprnds_info);
 2761 /* If all children of a node are built up from uniform scalars, or
 2762 the node does more than one possibly expensive vector construction,
 2763 then just throw it away, causing it to be built up from scalars.
 2764 The exception is the SLP node for the vector store. */
2765 if (is_a <bb_vec_info> (vinfo)
2766 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2767 /* ??? Rejecting patterns this way doesn't work. We'd have to
2768 do extra work to cancel the pattern so the uses see the
2769 scalar version. */
2770 && !is_pattern_stmt_p (stmt_info))
2772 slp_tree child;
2773 unsigned j;
2774 bool all_uniform_p = true;
2775 unsigned n_vector_builds = 0;
2776 FOR_EACH_VEC_ELT (children, j, child)
2778 if (!child)
2780 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2781 all_uniform_p = false;
2782 else if (!vect_slp_tree_uniform_p (child))
2784 all_uniform_p = false;
2785 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2786 n_vector_builds++;
2789 if (all_uniform_p
2790 || n_vector_builds > 1
2791 || (n_vector_builds == children.length ()
2792 && is_a <gphi *> (stmt_info->stmt)))
2794 /* Roll back. */
2795 matches[0] = false;
2796 FOR_EACH_VEC_ELT (children, j, child)
2797 if (child)
2798 vect_free_slp_tree (child);
2800 if (dump_enabled_p ())
2801 dump_printf_loc (MSG_NOTE, vect_location,
2802 "Building parent vector operands from "
2803 "scalars instead\n");
2804 return NULL;
2808 *tree_size += this_tree_size + 1;
2809 *max_nunits = this_max_nunits;
2811 if (two_operators)
2813 /* ??? We'd likely want to either cache in bst_map sth like
2814 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2815 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2816 explicit stmts to put in so the keying on 'stmts' doesn't
2817 work (but we have the same issue with nodes that use 'ops'). */
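      /* E.g. for the group { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the code
	 below builds node ONE computing all-PLUS, node TWO computing
	 all-MINUS and blends them with the lane permutation
	 { (0,0), (1,1), (0,2), (1,3) }.  */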
2819 if (has_two_operators_perm)
2821 slp_tree child = children[0];
2822 children.truncate (0);
2823 for (i = 0; i < 2; i++)
2825 slp_tree pnode
2826 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
2827 SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
2828 SLP_TREE_VECTYPE (pnode) = vectype;
2829 SLP_TREE_CHILDREN (pnode).quick_push (child);
2830 SLP_TREE_CHILDREN (pnode).quick_push (child);
2831 lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
2832 children.safe_push (pnode);
2834 for (unsigned j = 0; j < stmts.length (); j++)
2835 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
2838 SLP_TREE_REF_COUNT (child) += 4;
2841 slp_tree one = new _slp_tree;
2842 slp_tree two = new _slp_tree;
2843 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2844 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2845 SLP_TREE_VECTYPE (one) = vectype;
2846 SLP_TREE_VECTYPE (two) = vectype;
2847 SLP_TREE_CHILDREN (one).safe_splice (children);
2848 SLP_TREE_CHILDREN (two).safe_splice (children);
2849 slp_tree child;
2850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2851 SLP_TREE_REF_COUNT (child)++;
2853 /* Here we record the original defs since this
2854 node represents the final lane configuration. */
2855 node = vect_create_new_slp_node (node, stmts, 2);
2856 SLP_TREE_VECTYPE (node) = vectype;
2857 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2858 SLP_TREE_CHILDREN (node).quick_push (one);
2859 SLP_TREE_CHILDREN (node).quick_push (two);
2860 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2861 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2862 enum tree_code ocode = ERROR_MARK;
2863 stmt_vec_info ostmt_info;
2864 unsigned j = 0;
2865 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2867 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2868 if (gimple_assign_rhs_code (ostmt) != code0)
2870 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2871 ocode = gimple_assign_rhs_code (ostmt);
2872 j = i;
2874 else
2875 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2878 SLP_TREE_CODE (one) = code0;
2879 SLP_TREE_CODE (two) = ocode;
2880 SLP_TREE_LANES (one) = stmts.length ();
2881 SLP_TREE_LANES (two) = stmts.length ();
2882 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2883 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2885 return node;
2888 node = vect_create_new_slp_node (node, stmts, nops);
2889 SLP_TREE_VECTYPE (node) = vectype;
2890 SLP_TREE_CHILDREN (node).splice (children);
2891 return node;
2894 /* Dump a single SLP tree NODE. */
2896 static void
2897 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2898 slp_tree node)
2900 unsigned i, j;
2901 slp_tree child;
2902 stmt_vec_info stmt_info;
2903 tree op;
2905 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2906 dump_user_location_t user_loc = loc.get_user_location ();
2907 dump_printf_loc (metadata, user_loc,
2908 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2909 ", refcnt=%u)",
2910 SLP_TREE_DEF_TYPE (node) == vect_external_def
2911 ? " (external)"
2912 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2913 ? " (constant)"
2914 : ""), (void *) node,
2915 estimated_poly_value (node->max_nunits),
2916 SLP_TREE_REF_COUNT (node));
2917 if (SLP_TREE_VECTYPE (node))
2918 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2919 dump_printf (metadata, "\n");
2920 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2922 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2923 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2924 else
2925 dump_printf_loc (metadata, user_loc, "op template: %G",
2926 SLP_TREE_REPRESENTATIVE (node)->stmt);
2928 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2929 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2930 if (stmt_info)
2931 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
2932 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
2933 i, stmt_info->stmt);
2934 else
2935 dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
2936 else
2938 dump_printf_loc (metadata, user_loc, "\t{ ");
2939 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2940 dump_printf (metadata, "%T%s ", op,
2941 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2942 dump_printf (metadata, "}\n");
2944 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2946 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2947 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2948 dump_printf (dump_kind, " %u", j);
2949 dump_printf (dump_kind, " }\n");
2951 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2953 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2954 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2955 dump_printf (dump_kind, " %u[%u]",
2956 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2957 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2958 dump_printf (dump_kind, " }%s\n",
2959 node->ldst_lanes ? " (load-lanes)" : "");
2961 if (SLP_TREE_CHILDREN (node).is_empty ())
2962 return;
2963 dump_printf_loc (metadata, user_loc, "\tchildren");
2964 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2965 dump_printf (dump_kind, " %p", (void *)child);
2966 dump_printf (dump_kind, "%s\n",
2967 node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
2968 ? " (store-lanes)" : "");
2971 DEBUG_FUNCTION void
2972 debug (slp_tree node)
2974 debug_dump_context ctx;
2975 vect_print_slp_tree (MSG_NOTE,
2976 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2977 node);
2980 /* Recursive helper for the dot producer below. */
2982 static void
2983 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2985 if (visited.add (node))
2986 return;
2988 fprintf (f, "\"%p\" [label=\"", (void *)node);
2989 vect_print_slp_tree (MSG_NOTE,
2990 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2991 node);
2992 fprintf (f, "\"];\n");
2995 for (slp_tree child : SLP_TREE_CHILDREN (node))
2996 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2998 for (slp_tree child : SLP_TREE_CHILDREN (node))
2999 if (child)
3000 dot_slp_tree (f, child, visited);
3003 DEBUG_FUNCTION void
3004 dot_slp_tree (const char *fname, slp_tree node)
3006 FILE *f = fopen (fname, "w");
3007 fprintf (f, "digraph {\n");
3008 fflush (f);
3010 debug_dump_context ctx (f);
3011 hash_set<slp_tree> visited;
3012 dot_slp_tree (f, node, visited);
3014 fflush (f);
3015 fprintf (f, "}\n");
3016 fclose (f);
3019 DEBUG_FUNCTION void
3020 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3022 FILE *f = fopen (fname, "w");
3023 fprintf (f, "digraph {\n");
3024 fflush (f);
3026 debug_dump_context ctx (f);
3027 hash_set<slp_tree> visited;
3028 for (auto inst : slp_instances)
3029 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3031 fflush (f);
3032 fprintf (f, "}\n");
3033 fclose (f);
 3036 /* Dump an SLP tree NODE using flags specified in DUMP_KIND. */
3038 static void
3039 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3040 slp_tree node, hash_set<slp_tree> &visited)
3042 unsigned i;
3043 slp_tree child;
3045 if (visited.add (node))
3046 return;
3048 vect_print_slp_tree (dump_kind, loc, node);
3050 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3051 if (child)
3052 vect_print_slp_graph (dump_kind, loc, child, visited);
3055 static void
3056 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3057 slp_tree entry)
3059 hash_set<slp_tree> visited;
3060 vect_print_slp_graph (dump_kind, loc, entry, visited);
3063 DEBUG_FUNCTION void
3064 debug (slp_instance instance)
3066 debug_dump_context ctx;
3067 vect_print_slp_graph (MSG_NOTE,
3068 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3069 SLP_INSTANCE_TREE (instance));
3072 /* Mark the tree rooted at NODE with PURE_SLP. */
3074 static void
3075 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
3077 int i;
3078 stmt_vec_info stmt_info;
3079 slp_tree child;
3081 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3082 return;
3084 if (visited.add (node))
3085 return;
3087 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3088 if (stmt_info)
3089 STMT_SLP_TYPE (stmt_info) = pure_slp;
3091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3092 if (child)
3093 vect_mark_slp_stmts (child, visited);
3096 static void
3097 vect_mark_slp_stmts (slp_tree node)
3099 hash_set<slp_tree> visited;
3100 vect_mark_slp_stmts (node, visited);
3103 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3105 static void
3106 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3108 int i;
3109 stmt_vec_info stmt_info;
3110 slp_tree child;
3112 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3113 return;
3115 if (visited.add (node))
3116 return;
3118 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3119 if (stmt_info)
3121 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3122 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3123 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3126 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3127 if (child)
3128 vect_mark_slp_stmts_relevant (child, visited);
3131 static void
3132 vect_mark_slp_stmts_relevant (slp_tree node)
3134 hash_set<slp_tree> visited;
3135 vect_mark_slp_stmts_relevant (node, visited);
 3139 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
3141 static void
3142 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3143 hash_set<slp_tree> &visited)
3145 if (!node || visited.add (node))
3146 return;
3148 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3149 return;
3151 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3153 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3154 if (STMT_VINFO_DATA_REF (stmt_info)
3155 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3156 loads.safe_push (node);
3159 unsigned i;
3160 slp_tree child;
3161 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3162 vect_gather_slp_loads (loads, child, visited);
 3166 /* Find the last scalar stmt in NODE. */
3168 stmt_vec_info
3169 vect_find_last_scalar_stmt_in_slp (slp_tree node)
3171 stmt_vec_info last = NULL;
3172 stmt_vec_info stmt_vinfo;
3174 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3175 if (stmt_vinfo)
3177 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3178 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3181 return last;
3184 /* Find the first stmt in NODE. */
3186 stmt_vec_info
3187 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3189 stmt_vec_info first = NULL;
3190 stmt_vec_info stmt_vinfo;
3192 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3193 if (stmt_vinfo)
3195 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3196 if (!first
3197 || get_later_stmt (stmt_vinfo, first) == first)
3198 first = stmt_vinfo;
3201 return first;
3204 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3205 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3206 (also containing the first GROUP1_SIZE stmts, since stores are
3207 consecutive), the second containing the remainder.
3208 Return the first stmt in the second group. */
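/* E.g. splitting a group of seven stores with GROUP1_SIZE == 4 leaves the
   first group with elements 0..3 and DR_GROUP_GAP increased by 3, and the
   second group with elements 4..6 and DR_GROUP_GAP equal to the original
   gap plus 4.  */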
3210 static stmt_vec_info
3211 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3213 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3214 gcc_assert (group1_size > 0);
3215 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3216 gcc_assert (group2_size > 0);
3217 DR_GROUP_SIZE (first_vinfo) = group1_size;
3219 stmt_vec_info stmt_info = first_vinfo;
3220 for (unsigned i = group1_size; i > 1; i--)
3222 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3223 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3225 /* STMT is now the last element of the first group. */
3226 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3227 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3229 DR_GROUP_SIZE (group2) = group2_size;
3230 for (stmt_info = group2; stmt_info;
3231 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3233 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3234 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3237 /* For the second group, the DR_GROUP_GAP is that before the original group,
3238 plus skipping over the first vector. */
3239 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3241 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3242 DR_GROUP_GAP (first_vinfo) += group2_size;
3244 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3246 group1_size, group2_size);
3248 return group2;
3251 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3252 statements and a vector of NUNITS elements. */
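/* E.g. NUNITS == 4 and GROUP_SIZE == 6 gives common_multiple == 12 and an
   unrolling factor of 2, while GROUP_SIZE == 8 with NUNITS == 4 gives a
   factor of 1.  */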
3254 static poly_uint64
3255 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3257 return exact_div (common_multiple (nunits, group_size), group_size);
3260 /* Helper that checks to see if a node is a load node. */
3262 static inline bool
3263 vect_is_slp_load_node (slp_tree root)
3265 return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
3266 && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3267 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3268 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3272 /* Helper function of optimize_load_redistribution that performs the operation
3273 recursively. */
3275 static slp_tree
3276 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3277 vec_info *vinfo, unsigned int group_size,
3278 hash_map<slp_tree, slp_tree> *load_map,
3279 slp_tree root)
3281 if (slp_tree *leader = load_map->get (root))
3282 return *leader;
3284 slp_tree node;
3285 unsigned i;
3287 /* For now, we don't know anything about externals so do not do anything. */
3288 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3289 return NULL;
3290 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
 3292 /* First convert this node into a load node and add it to the leaves
 3293 list, flattening the lane permutation into a load permutation. If it's
 3294 unneeded it will be elided later. */
3295 vec<stmt_vec_info> stmts;
3296 stmts.create (SLP_TREE_LANES (root));
3297 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3298 for (unsigned j = 0; j < lane_perm.length (); j++)
3300 std::pair<unsigned, unsigned> perm = lane_perm[j];
3301 node = SLP_TREE_CHILDREN (root)[perm.first];
3303 if (!vect_is_slp_load_node (node)
3304 || SLP_TREE_CHILDREN (node).exists ())
3306 stmts.release ();
3307 goto next;
3310 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3313 if (dump_enabled_p ())
3314 dump_printf_loc (MSG_NOTE, vect_location,
3315 "converting stmts on permute node %p\n",
3316 (void *) root);
3318 bool *matches = XALLOCAVEC (bool, group_size);
3319 poly_uint64 max_nunits = 1;
3320 unsigned tree_size = 0, limit = 1;
3321 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3322 matches, &limit, &tree_size, bst_map);
3323 if (!node)
3324 stmts.release ();
3326 load_map->put (root, node);
3327 return node;
3330 next:
3331 load_map->put (root, NULL);
3333 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3335 slp_tree value
3336 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3337 node);
3338 if (value)
3340 SLP_TREE_REF_COUNT (value)++;
3341 SLP_TREE_CHILDREN (root)[i] = value;
 3342 /* ??? We know the original leaves of the replaced nodes will
3343 be referenced by bst_map, only the permutes created by
3344 pattern matching are not. */
3345 if (SLP_TREE_REF_COUNT (node) == 1)
3346 load_map->remove (node);
3347 vect_free_slp_tree (node);
3351 return NULL;
 3354 /* Temporary workaround for loads not being CSEd during SLP build. This
 3355 function will traverse the SLP tree rooted in ROOT and find
 3356 VEC_PERM nodes that blend vectors from multiple nodes that all read from
 3357 the same DR such that the final operation is equal to a permuted load.
 3358 Such nodes are then directly converted into loads themselves. The nodes
 3359 are CSEd using BST_MAP. */
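/* E.g. a VEC_PERM node whose lanes each select element PERM.SECOND from a
   plain load child is re-discovered via vect_build_slp_tree over the
   gathered scalar stmts, turning the blend of loads into a single
   (possibly permuted) load node that is CSEd through BST_MAP.  */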
3361 static void
3362 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3363 vec_info *vinfo, unsigned int group_size,
3364 hash_map<slp_tree, slp_tree> *load_map,
3365 slp_tree root)
3367 slp_tree node;
3368 unsigned i;
3370 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3372 slp_tree value
3373 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3374 node);
3375 if (value)
3377 SLP_TREE_REF_COUNT (value)++;
3378 SLP_TREE_CHILDREN (root)[i] = value;
 3379 /* ??? We know the original leaves of the replaced nodes will
3380 be referenced by bst_map, only the permutes created by
3381 pattern matching are not. */
3382 if (SLP_TREE_REF_COUNT (node) == 1)
3383 load_map->remove (node);
3384 vect_free_slp_tree (node);
 3389 /* Helper function of vect_match_slp_patterns.
 3391 Attempts to match patterns against the SLP tree rooted in REF_NODE using
 3392 VINFO. Patterns are matched in post-order traversal.
 3394 If matching is successful the tree in REF_NODE is updated in place and
 3395 true is returned, otherwise it is left unchanged and false is returned. */
3397 static bool
3398 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3399 slp_tree_to_load_perm_map_t *perm_cache,
3400 slp_compat_nodes_map_t *compat_cache,
3401 hash_set<slp_tree> *visited)
3403 unsigned i;
3404 slp_tree node = *ref_node;
3405 bool found_p = false;
3406 if (!node || visited->add (node))
3407 return false;
3409 slp_tree child;
3410 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3411 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3412 vinfo, perm_cache, compat_cache,
3413 visited);
3415 for (unsigned x = 0; x < num__slp_patterns; x++)
3417 vect_pattern *pattern
3418 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3419 if (pattern)
3421 pattern->build (vinfo);
3422 delete pattern;
3423 found_p = true;
3427 return found_p;
 3430 /* Applies pattern matching to the SLP tree of INSTANCE using
 3431 vec_info VINFO.
 3433 The tree is modified in place and true is returned if any pattern
 3434 matched. Patterns are tried in order and multiple patterns may match. */
3436 static bool
3437 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3438 hash_set<slp_tree> *visited,
3439 slp_tree_to_load_perm_map_t *perm_cache,
3440 slp_compat_nodes_map_t *compat_cache)
3442 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3443 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3445 if (dump_enabled_p ())
3446 dump_printf_loc (MSG_NOTE, vect_location,
3447 "Analyzing SLP tree %p for patterns\n",
3448 (void *) SLP_INSTANCE_TREE (instance));
3450 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3451 visited);
3454 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3455 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3456 Return true if we could use IFN_STORE_LANES instead and if that appears
3457 to be the better approach. */
3459 static bool
3460 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3461 unsigned int group_size,
3462 unsigned int new_group_size)
3464 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3465 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3466 if (!vectype)
3467 return false;
3468 /* Allow the split if one of the two new groups would operate on full
3469 vectors *within* rather than across one scalar loop iteration.
3470 This is purely a heuristic, but it should work well for group
3471 sizes of 3 and 4, where the possible splits are:
3473 3->2+1: OK if the vector has exactly two elements
3474 4->2+2: Likewise
3475 4->3+1: Less clear-cut. */
3476 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3477 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3478 return false;
3479 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3482 /* Analyze an SLP instance starting from a group of grouped stores. Call
3483 vect_build_slp_tree to build a tree of packed stmts if possible.
3484 Return FALSE if it's impossible to SLP any stmt in the loop. */
3486 static bool
3487 vect_analyze_slp_instance (vec_info *vinfo,
3488 scalar_stmts_to_slp_tree_map_t *bst_map,
3489 stmt_vec_info stmt_info, slp_instance_kind kind,
3490 unsigned max_tree_size, unsigned *limit,
3491 bool force_single_lane = false);
3493 /* Build an interleaving scheme for the store sources RHS_NODES from
3494 SCALAR_STMTS. */
3496 static slp_tree
3497 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3498 vec<stmt_vec_info> &scalar_stmts)
3500 unsigned int group_size = scalar_stmts.length ();
3501 slp_tree node = vect_create_new_slp_node (scalar_stmts,
3502 SLP_TREE_CHILDREN
3503 (rhs_nodes[0]).length ());
3504 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3505 for (unsigned l = 0;
3506 l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3508 /* And a permute merging all RHS SLP trees. */
3509 slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3510 VEC_PERM_EXPR);
3511 SLP_TREE_CHILDREN (node).quick_push (perm);
3512 SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3513 SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3514 SLP_TREE_LANES (perm) = group_size;
3515 /* ??? We should set this NULL but that's not expected. */
3516 SLP_TREE_REPRESENTATIVE (perm)
3517 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3518 for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3520 SLP_TREE_CHILDREN (perm)
3521 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3522 SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3523 for (unsigned k = 0;
3524 k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3526 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3527 or SLP_TREE_SCALAR_OPS but then we might have
3528 a mix of both in our children. */
3529 SLP_TREE_LANE_PERMUTATION (perm)
3530 .quick_push (std::make_pair (j, k));
3534 /* Now we have a single permute node but we cannot code-generate
3535 the case with more than two inputs.
3536 Perform pairwise reduction, reducing the two inputs
3537 with the least number of lanes to one and then repeat until
3538 we end up with two inputs. That scheme makes sure we end
3539 up with permutes satisfying the restriction of requiring at
3540 most two vector inputs to produce a single vector output
3541 when the number of lanes is even. */
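      /* E.g. with four RHS groups A, B, C, D of two lanes each this first
	 merges A and B into a four-lane permute, then C and D, leaving the
	 final permute with the required two inputs.  */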
3542 while (SLP_TREE_CHILDREN (perm).length () > 2)
3544 /* When we have three equal sized groups left the pairwise
3545 reduction does not result in a scheme that avoids using
3546 three vectors. Instead merge the first two groups
3547 to the final size with do-not-care elements (chosen
3548 from the first group) and then merge with the third.
3549 { A0, B0, x, A1, B1, x, ... }
3550 -> { A0, B0, C0, A1, B1, C1, ... }
3551 This handles group size of three (and at least
3552 power-of-two multiples of that). */
3553 if (SLP_TREE_CHILDREN (perm).length () == 3
3554 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3555 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3556 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3557 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3559 int ai = 0;
3560 int bi = 1;
3561 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3562 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3563 unsigned n = SLP_TREE_LANES (perm);
3565 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3566 SLP_TREE_LANES (permab) = n;
3567 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3568 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3569 /* ??? Should be NULL but that's not expected. */
3570 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3571 SLP_TREE_CHILDREN (permab).quick_push (a);
3572 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3573 SLP_TREE_LANE_PERMUTATION (permab)
3574 .quick_push (std::make_pair (0, k));
3575 SLP_TREE_CHILDREN (permab).quick_push (b);
3576 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3577 SLP_TREE_LANE_PERMUTATION (permab)
3578 .quick_push (std::make_pair (1, k));
3579 /* Push the do-not-care lanes. */
3580 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3581 SLP_TREE_LANE_PERMUTATION (permab)
3582 .quick_push (std::make_pair (0, k));
3584 /* Put the merged node into 'perm', in place of a. */
3585 SLP_TREE_CHILDREN (perm)[ai] = permab;
3586 /* Adjust the references to b in the permutation
3587 of perm and to the later children which we'll
3588 remove. */
3589 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3591 std::pair<unsigned, unsigned> &p
3592 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3593 if (p.first == (unsigned) bi)
3595 p.first = ai;
3596 p.second += SLP_TREE_LANES (a);
3598 else if (p.first > (unsigned) bi)
3599 p.first--;
3601 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3602 break;
3605 /* Pick the two nodes with the least number of lanes,
3606 prefer the earliest candidate and maintain ai < bi. */
3607 int ai = -1;
3608 int bi = -1;
3609 for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3611 if (ai == -1)
3612 ai = ci;
3613 else if (bi == -1)
3614 bi = ci;
3615 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3616 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3617 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3618 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3620 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3621 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3622 bi = ci;
3623 else
3625 ai = bi;
3626 bi = ci;
3631 /* Produce a merge of nodes ai and bi. */
3632 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3633 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3634 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3635 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3636 SLP_TREE_LANES (permab) = n;
3637 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3638 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3639 /* ??? Should be NULL but that's not expected. */
3640 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3641 SLP_TREE_CHILDREN (permab).quick_push (a);
3642 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3643 SLP_TREE_LANE_PERMUTATION (permab)
3644 .quick_push (std::make_pair (0, k));
3645 SLP_TREE_CHILDREN (permab).quick_push (b);
3646 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3647 SLP_TREE_LANE_PERMUTATION (permab)
3648 .quick_push (std::make_pair (1, k));
3650 /* Put the merged node into 'perm', in place of a. */
3651 SLP_TREE_CHILDREN (perm)[ai] = permab;
3652 /* Adjust the references to b in the permutation
3653 of perm and to the later children which we'll
3654 remove. */
3655 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3657 std::pair<unsigned, unsigned> &p
3658 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3659 if (p.first == (unsigned) bi)
3661 p.first = ai;
3662 p.second += SLP_TREE_LANES (a);
3664 else if (p.first > (unsigned) bi)
3665 p.first--;
3667 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3671 return node;
3674 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3675 of KIND. Return true if successful. */
3677 static bool
3678 vect_build_slp_instance (vec_info *vinfo,
3679 slp_instance_kind kind,
3680 vec<stmt_vec_info> &scalar_stmts,
3681 vec<stmt_vec_info> &root_stmt_infos,
3682 vec<tree> &remain,
3683 unsigned max_tree_size, unsigned *limit,
3684 scalar_stmts_to_slp_tree_map_t *bst_map,
3685 /* ??? We need stmt_info for group splitting. */
3686 stmt_vec_info stmt_info_,
3687 bool force_single_lane = false)
3689 /* If there's no budget left bail out early. */
3690 if (*limit == 0)
3691 return false;
3693 if (kind == slp_inst_kind_ctor)
3695 if (dump_enabled_p ())
3696 dump_printf_loc (MSG_NOTE, vect_location,
3697 "Analyzing vectorizable constructor: %G\n",
3698 root_stmt_infos[0]->stmt);
3701 if (dump_enabled_p ())
3703 dump_printf_loc (MSG_NOTE, vect_location,
3704 "Starting SLP discovery for\n");
3705 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3706 dump_printf_loc (MSG_NOTE, vect_location,
3707 " %G", scalar_stmts[i]->stmt);
3710 /* Build the tree for the SLP instance. */
3711 unsigned int group_size = scalar_stmts.length ();
3712 bool *matches = XALLOCAVEC (bool, group_size);
3713 poly_uint64 max_nunits = 1;
3714 unsigned tree_size = 0;
3715 unsigned i;
3717 slp_tree node = NULL;
3718 if (force_single_lane)
3720 matches[0] = true;
3721 matches[1] = false;
3723 else
3724 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3725 &max_nunits, matches, limit,
3726 &tree_size, bst_map);
3727 if (node != NULL)
3729 /* Calculate the unrolling factor based on the smallest type. */
3730 poly_uint64 unrolling_factor
3731 = calculate_unrolling_factor (max_nunits, group_size);
3733 if (maybe_ne (unrolling_factor, 1U)
3734 && is_a <bb_vec_info> (vinfo))
3736 unsigned HOST_WIDE_INT const_max_nunits;
3737 if (!max_nunits.is_constant (&const_max_nunits)
3738 || const_max_nunits > group_size)
3740 if (dump_enabled_p ())
3741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3742 "Build SLP failed: store group "
3743 "size not a multiple of the vector size "
3744 "in basic block SLP\n");
3745 vect_free_slp_tree (node);
3746 return false;
3748 /* Fatal mismatch. */
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_NOTE, vect_location,
3751 "SLP discovery succeeded but node needs "
3752 "splitting\n");
3753 memset (matches, true, group_size);
3754 matches[group_size / const_max_nunits * const_max_nunits] = false;
3755 vect_free_slp_tree (node);
3757 else
3759 /* Create a new SLP instance. */
3760 slp_instance new_instance = XNEW (class _slp_instance);
3761 SLP_INSTANCE_TREE (new_instance) = node;
3762 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3763 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3764 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3765 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3766 SLP_INSTANCE_KIND (new_instance) = kind;
3767 new_instance->reduc_phis = NULL;
3768 new_instance->cost_vec = vNULL;
3769 new_instance->subgraph_entries = vNULL;
3771 if (dump_enabled_p ())
3772 dump_printf_loc (MSG_NOTE, vect_location,
3773 "SLP size %u vs. limit %u.\n",
3774 tree_size, max_tree_size);
3776 /* Fixup SLP reduction chains. */
3777 if (kind == slp_inst_kind_reduc_chain)
3779 /* If this is a reduction chain with a conversion in front
3780 amend the SLP tree with a node for that. */
3781 gimple *scalar_def
3782 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3783 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3785 /* Get at the conversion stmt - we know it's the single use
3786 of the last stmt of the reduction chain. */
3787 use_operand_p use_p;
3788 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3789 &use_p, &scalar_def);
3790 gcc_assert (r);
3791 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3792 next_info = vect_stmt_to_vectorize (next_info);
3793 scalar_stmts = vNULL;
3794 scalar_stmts.create (group_size);
3795 for (unsigned i = 0; i < group_size; ++i)
3796 scalar_stmts.quick_push (next_info);
3797 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3798 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3799 SLP_TREE_CHILDREN (conv).quick_push (node);
3800 SLP_INSTANCE_TREE (new_instance) = conv;
3801 /* We also have to fake this conversion stmt as SLP reduction
3802 group so we don't have to mess with too much code
3803 elsewhere. */
3804 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3805 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3807 /* Fill the backedge child of the PHI SLP node. The
3808 general matching code cannot find it because the
3809 scalar code does not reflect how we vectorize the
3810 reduction. */
3811 use_operand_p use_p;
3812 imm_use_iterator imm_iter;
3813 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3814 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3815 gimple_get_lhs (scalar_def))
3816 /* There are exactly two non-debug uses, the reduction
3817 PHI and the loop-closed PHI node. */
3818 if (!is_gimple_debug (USE_STMT (use_p))
3819 && gimple_bb (USE_STMT (use_p)) == loop->header)
3821 auto_vec<stmt_vec_info, 64> phis (group_size);
3822 stmt_vec_info phi_info
3823 = vinfo->lookup_stmt (USE_STMT (use_p));
3824 for (unsigned i = 0; i < group_size; ++i)
3825 phis.quick_push (phi_info);
3826 slp_tree *phi_node = bst_map->get (phis);
3827 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3828 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3829 = SLP_INSTANCE_TREE (new_instance);
3830 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3834 vinfo->slp_instances.safe_push (new_instance);
3836 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3837 the number of scalar stmts in the root in a few places.
3838 Verify that assumption holds. */
3839 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3840 .length () == group_size);
3842 if (dump_enabled_p ())
3844 dump_printf_loc (MSG_NOTE, vect_location,
3845 "Final SLP tree for instance %p:\n",
3846 (void *) new_instance);
3847 vect_print_slp_graph (MSG_NOTE, vect_location,
3848 SLP_INSTANCE_TREE (new_instance));
3851 return true;
3854 /* Failed to SLP. */
3856 stmt_vec_info stmt_info = stmt_info_;
3857 /* Try to break the group up into pieces. */
3858 if (*limit > 0 && kind == slp_inst_kind_store)
3860 /* ??? We could delay all the actual splitting of store-groups
3861 until after SLP discovery of the original group completed.
3862 Then we can recurse to vect_build_slp_instance directly. */
3863 for (i = 0; i < group_size; i++)
3864 if (!matches[i])
3865 break;
3867 /* For basic block SLP, try to break the group up into multiples of
3868 a vector size. */
3869 if (is_a <bb_vec_info> (vinfo)
3870 && (i > 1 && i < group_size))
3872 /* Free the allocated memory. */
3873 scalar_stmts.release ();
3875 tree scalar_type
3876 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3877 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3878 1 << floor_log2 (i));
3879 unsigned HOST_WIDE_INT const_nunits;
3880 if (vectype
3881 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3883 /* Split into two groups at the first vector boundary. */
3884 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3885 unsigned group1_size = i & ~(const_nunits - 1);
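/* For instance, with const_nunits == 4 and the first mismatch at
   i == 6, group1_size becomes 4: the first four stores are
   re-analyzed as one group, stmts 4..5 form a second group that is
   re-analyzed below, and the mismatching tail is split off
   separately.  */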
3887 if (dump_enabled_p ())
3888 dump_printf_loc (MSG_NOTE, vect_location,
3889 "Splitting SLP group at stmt %u\n", i);
3890 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3891 group1_size);
3892 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3893 kind, max_tree_size,
3894 limit);
3895 /* Split the rest at the failure point and possibly
3896 re-analyze the remaining matching part if it has
3897 at least two lanes. */
3898 if (group1_size < i
3899 && (i + 1 < group_size
3900 || i - group1_size > 1))
3902 stmt_vec_info rest2 = rest;
3903 rest = vect_split_slp_store_group (rest, i - group1_size);
3904 if (i - group1_size > 1)
3905 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3906 kind, max_tree_size,
3907 limit);
3909 /* Re-analyze the non-matching tail if it has at least
3910 two lanes. */
3911 if (i + 1 < group_size)
3912 res |= vect_analyze_slp_instance (vinfo, bst_map,
3913 rest, kind, max_tree_size,
3914 limit);
3915 return res;
3919 /* For loop vectorization split the RHS into arbitrary pieces of
3920 size >= 1. */
3921 else if (is_a <loop_vec_info> (vinfo)
3922 && (group_size != 1 && i < group_size))
3924 /* There are targets that cannot do even/odd interleaving schemes
3925 so they absolutely need to use load/store-lanes. For now
3926 force single-lane SLP for them - they would be happy with
3927 uniform power-of-two lanes (but depending on element size),
3928 but even if we can use 'i' as indicator we would need to
3929 backtrack when later lanes fail to discover with the same
3930 granularity. We cannot turn strided or scatter stores
3931 into store-lanes. */
3932 /* ??? If this is not in sync with what get_load_store_type
3933 later decides the SLP representation is not good for other
3934 store vectorization methods. */
3935 bool want_store_lanes
3936 = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
3937 && ! STMT_VINFO_STRIDED_P (stmt_info)
3938 && compare_step_with_zero (vinfo, stmt_info) > 0
3939 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
3940 group_size, 1));
3941 if (want_store_lanes || force_single_lane)
3942 i = 1;
3944 /* A fatal discovery fail doesn't always mean single-lane SLP
3945 isn't a possibility, so try. */
3946 if (i == 0)
3947 i = 1;
3949 if (dump_enabled_p ())
3950 dump_printf_loc (MSG_NOTE, vect_location,
3951 "Splitting SLP group at stmt %u\n", i);
3953 /* Analyze the stored values and pinch them together with
3954 a permute node so we can preserve the whole store group. */
3955 auto_vec<slp_tree> rhs_nodes;
3957 /* Calculate the unrolling factor based on the smallest type. */
3958 poly_uint64 unrolling_factor = 1;
3960 unsigned int start = 0, end = i;
3961 while (start < group_size)
3963 gcc_assert (end - start >= 1);
3964 vec<stmt_vec_info> substmts;
3965 substmts.create (end - start);
3966 for (unsigned j = start; j < end; ++j)
3967 substmts.quick_push (scalar_stmts[j]);
3968 max_nunits = 1;
3969 node = vect_build_slp_tree (vinfo, substmts, end - start,
3970 &max_nunits,
3971 matches, limit, &tree_size, bst_map);
3972 if (node)
3974 /* ??? Possibly not safe, but not sure how to check
3975 and fail SLP build? */
3976 unrolling_factor
3977 = force_common_multiple (unrolling_factor,
3978 calculate_unrolling_factor
3979 (max_nunits, end - start));
3980 rhs_nodes.safe_push (node);
3981 start = end;
3982 if (want_store_lanes || force_single_lane)
3983 end = start + 1;
3984 else
3985 end = group_size;
3987 else
3989 substmts.release ();
3990 if (end - start == 1)
3992 /* Single-lane discovery failed. Free resources. */
3993 for (auto node : rhs_nodes)
3994 vect_free_slp_tree (node);
3995 scalar_stmts.release ();
3996 if (dump_enabled_p ())
3997 dump_printf_loc (MSG_NOTE, vect_location,
3998 "SLP discovery failed\n");
3999 return false;
4002 /* ??? It really happens that we soft-fail SLP
4003 build at a mismatch but the matching part hard-fails
4004 later. As we know we arrived here with a group
4005 larger than one, try a group of size one! */
4006 if (!matches[0])
4007 end = start + 1;
4008 else
4009 for (unsigned j = start; j < end; j++)
4010 if (!matches[j - start])
4012 end = j;
4013 break;
4018 /* Now we assume we can build the root SLP node from all stores. */
4019 if (want_store_lanes)
4021 /* For store-lanes feed the store node with all RHS nodes
4022 in order. */
4023 node = vect_create_new_slp_node (scalar_stmts,
4024 SLP_TREE_CHILDREN
4025 (rhs_nodes[0]).length ());
4026 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4027 node->ldst_lanes = true;
4028 SLP_TREE_CHILDREN (node)
4029 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4030 + rhs_nodes.length () - 1);
4031 /* First store value and possibly mask. */
4032 SLP_TREE_CHILDREN (node)
4033 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4034 /* Rest of the store values. All mask nodes are the same,
4035 this should be guaranteed by dataref group discovery. */
4036 for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4037 SLP_TREE_CHILDREN (node)
4038 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4039 for (slp_tree child : SLP_TREE_CHILDREN (node))
4040 child->refcnt++;
4042 else
4043 node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
4045 while (!rhs_nodes.is_empty ())
4046 vect_free_slp_tree (rhs_nodes.pop ());
4048 /* Create a new SLP instance. */
4049 slp_instance new_instance = XNEW (class _slp_instance);
4050 SLP_INSTANCE_TREE (new_instance) = node;
4051 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
4052 SLP_INSTANCE_LOADS (new_instance) = vNULL;
4053 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4054 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4055 SLP_INSTANCE_KIND (new_instance) = kind;
4056 new_instance->reduc_phis = NULL;
4057 new_instance->cost_vec = vNULL;
4058 new_instance->subgraph_entries = vNULL;
4060 if (dump_enabled_p ())
4061 dump_printf_loc (MSG_NOTE, vect_location,
4062 "SLP size %u vs. limit %u.\n",
4063 tree_size, max_tree_size);
4065 vinfo->slp_instances.safe_push (new_instance);
4067 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4068 the number of scalar stmts in the root in a few places.
4069 Verify that assumption holds. */
4070 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4071 .length () == group_size);
4073 if (dump_enabled_p ())
4075 dump_printf_loc (MSG_NOTE, vect_location,
4076 "Final SLP tree for instance %p:\n",
4077 (void *) new_instance);
4078 vect_print_slp_graph (MSG_NOTE, vect_location,
4079 SLP_INSTANCE_TREE (new_instance));
4081 return true;
4083 else
4084 /* Free the allocated memory. */
4085 scalar_stmts.release ();
4087 /* Even though the first vector did not all match, we might be able to SLP
4088 (some) of the remainder. FORNOW ignore this possibility. */
4090 else
4091 /* Free the allocated memory. */
4092 scalar_stmts.release ();
4094 /* Failed to SLP. */
4095 if (dump_enabled_p ())
4096 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4097 return false;
4101 /* Analyze an SLP instance starting from a group of grouped stores. Call
4102 vect_build_slp_tree to build a tree of packed stmts if possible.
4103 Return FALSE if it's impossible to SLP any stmt in the loop. */
4105 static bool
4106 vect_analyze_slp_instance (vec_info *vinfo,
4107 scalar_stmts_to_slp_tree_map_t *bst_map,
4108 stmt_vec_info stmt_info,
4109 slp_instance_kind kind,
4110 unsigned max_tree_size, unsigned *limit,
4111 bool force_single_lane)
4113 vec<stmt_vec_info> scalar_stmts;
4115 if (is_a <bb_vec_info> (vinfo))
4116 vect_location = stmt_info->stmt;
4118 stmt_vec_info next_info = stmt_info;
4119 if (kind == slp_inst_kind_store)
4121 /* Collect the stores and store them in scalar_stmts. */
4122 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4123 while (next_info)
4125 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4126 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4129 else if (kind == slp_inst_kind_reduc_chain)
4131 /* Collect the reduction stmts and store them in scalar_stmts. */
4132 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4133 while (next_info)
4135 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4136 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4138 /* Mark the first element of the reduction chain as reduction to properly
4139 transform the node. In the reduction analysis phase only the last
4140 element of the chain is marked as reduction. */
4141 STMT_VINFO_DEF_TYPE (stmt_info)
4142 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4143 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4144 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4146 else
4147 gcc_unreachable ();
4149 vec<stmt_vec_info> roots = vNULL;
4150 vec<tree> remain = vNULL;
4151 /* Build the tree for the SLP instance. */
4152 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4153 roots, remain,
4154 max_tree_size, limit, bst_map,
4155 kind == slp_inst_kind_store
4156 ? stmt_info : NULL, force_single_lane);
4158 /* ??? If this is slp_inst_kind_store and the above succeeded here's
4159 where we should do store group splitting. */
4161 return res;
4164 /* qsort comparator ordering SLP load nodes. */
4166 static int
4167 vllp_cmp (const void *a_, const void *b_)
4169 const slp_tree a = *(const slp_tree *)a_;
4170 const slp_tree b = *(const slp_tree *)b_;
4171 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4172 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4173 if (STMT_VINFO_GROUPED_ACCESS (a0)
4174 && STMT_VINFO_GROUPED_ACCESS (b0)
4175 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4177 /* Same group, order after lanes used. */
4178 if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4179 return 1;
4180 else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4181 return -1;
4182 else
4184 /* Try to order loads using the same lanes together, breaking
4185 the tie with the lane number that first differs. */
4186 if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4187 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4188 return 0;
4189 else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4190 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4191 return 1;
4192 else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4193 && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4194 return -1;
4195 else
4197 for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4198 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4199 != SLP_TREE_LOAD_PERMUTATION (b)[i])
4201 /* In-order lane first, that's what the above case for
4202 no permutation does. */
4203 if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4204 return -1;
4205 else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4206 return 1;
4207 else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4208 < SLP_TREE_LOAD_PERMUTATION (b)[i])
4209 return -1;
4210 else
4211 return 1;
4213 return 0;
4217 else /* Different groups or non-groups. */
4220 /* Order groups by their first element to keep them together. */
4220 if (STMT_VINFO_GROUPED_ACCESS (a0))
4221 a0 = DR_GROUP_FIRST_ELEMENT (a0);
4222 if (STMT_VINFO_GROUPED_ACCESS (b0))
4223 b0 = DR_GROUP_FIRST_ELEMENT (b0);
4224 if (a0 == b0)
4225 return 0;
4226 /* Tie using UID. */
4227 else if (gimple_uid (STMT_VINFO_STMT (a0))
4228 < gimple_uid (STMT_VINFO_STMT (b0)))
4229 return -1;
4230 else
4232 gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4233 != gimple_uid (STMT_VINFO_STMT (b0)));
4234 return 1;
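/* As an illustration of the resulting order: loads from the same
   dataref group sort with the higher lane count first; among equal
   lane counts an unpermuted load sorts before a permuted one, and
   otherwise the load whose first differing lane is in order comes
   first - e.g. of two four-lane loads with permutations
   { 0, 1, 2, 3 } and { 2, 3, 0, 1 } the former sorts first.  Loads
   from different groups are ordered by the UID of the group's first
   statement.  */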
4239 /* Process the set of LOADS that are all from the same dataref group. */
4241 static void
4242 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4243 scalar_stmts_to_slp_tree_map_t *bst_map,
4244 const array_slice<slp_tree> &loads)
4246 /* At this point we want to lower without a fixed VF or vector
4247 size in mind which means we cannot actually compute whether we
4248 need three or more vectors for a load permutation yet. So always
4249 lower. */
4250 stmt_vec_info first
4251 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4252 unsigned group_lanes = DR_GROUP_SIZE (first);
4254 /* Verify if all load permutations can be implemented with a suitably
4255 large element load-lanes operation. */
4256 unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4257 if (STMT_VINFO_STRIDED_P (first)
4258 || compare_step_with_zero (loop_vinfo, first) <= 0
4259 || exact_log2 (ld_lanes_lanes) == -1
4260 /* ??? For now only support the single-lane case as there is
4261 missing support on the store-lane side and code generation
4262 isn't up to the task yet. */
4263 || ld_lanes_lanes != 1
4264 || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4265 group_lanes / ld_lanes_lanes,
4266 false) == IFN_LAST)
4267 ld_lanes_lanes = 0;
4268 else
4269 /* Verify the loads access the same number of lanes aligned to
4270 ld_lanes_lanes. */
4271 for (slp_tree load : loads)
4273 if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4275 ld_lanes_lanes = 0;
4276 break;
4278 unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4279 if (first % ld_lanes_lanes != 0)
4281 ld_lanes_lanes = 0;
4282 break;
4284 for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4285 if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4287 ld_lanes_lanes = 0;
4288 break;
4292 /* Only a power-of-two number of lanes matches interleaving with N levels.
4293 ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4294 at each step. */
4295 if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4296 return;
4298 for (slp_tree load : loads)
4300 /* Leave masked or gather loads alone for now. */
4301 if (!SLP_TREE_CHILDREN (load).is_empty ())
4302 continue;
4304 /* We want to pattern-match special cases here and keep those
4305 alone. Candidates are splats and load-lane. */
4307 /* We need to lower only loads of less than half of the group's
4308 lanes, including duplicate lanes. Note this leaves nodes
4309 with a non-1:1 load permutation around instead of canonicalizing
4310 those into a load and a permute node. Removing this early
4311 check would do such canonicalization. */
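/* For instance, with a six-lane group loads of one or two lanes are
   candidates for lowering here, while loads of three or more lanes
   keep their load permutation.  */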
4312 if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4313 && ld_lanes_lanes == 0)
4314 continue;
4316 /* Build the permute to get the original load permutation order. */
4317 bool contiguous = true;
4318 lane_permutation_t final_perm;
4319 final_perm.create (SLP_TREE_LANES (load));
4320 for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4322 final_perm.quick_push
4323 (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4324 if (i != 0
4325 && (SLP_TREE_LOAD_PERMUTATION (load)[i]
4326 != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
4327 contiguous = false;
4330 /* When the load permutation accesses a contiguous unpermuted,
4331 power-of-two aligned and sized chunk, leave the load alone.
4332 We can likely (re-)load it more efficiently rather than
4333 extracting it from the larger load.
4334 ??? Long-term some of the lowering should move to where
4335 the vector types involved are fixed. */
4336 if (ld_lanes_lanes == 0
4337 && contiguous
4338 && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
4339 && pow2p_hwi (SLP_TREE_LANES (load))
4340 && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
4341 && group_lanes % SLP_TREE_LANES (load) == 0)
4343 final_perm.release ();
4344 continue;
4347 /* First build (and possibly re-use) a load node for the
4348 unpermuted group. Gaps in the middle and on the end are
4349 represented with NULL stmts. */
4350 vec<stmt_vec_info> stmts;
4351 stmts.create (group_lanes);
4352 for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4354 if (s != first)
4355 for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4356 stmts.quick_push (NULL);
4357 stmts.quick_push (s);
4359 for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4360 stmts.quick_push (NULL);
4361 poly_uint64 max_nunits = 1;
4362 bool *matches = XALLOCAVEC (bool, group_lanes);
4363 unsigned limit = 1;
4364 unsigned tree_size = 0;
4365 slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4366 group_lanes,
4367 &max_nunits, matches, &limit,
4368 &tree_size, bst_map);
4370 if (ld_lanes_lanes != 0)
4372 /* ??? If this is not in sync with what get_load_store_type
4373 later decides the SLP representation is not good for other
4374 store vectorization methods. */
4375 l0->ldst_lanes = true;
4376 load->ldst_lanes = true;
4379 while (1)
4381 unsigned group_lanes = SLP_TREE_LANES (l0);
4382 if (ld_lanes_lanes != 0
4383 || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4384 break;
4386 /* Try to lower by reducing the group to half its size using an
4387 interleaving scheme. For this try to compute whether all
4388 elements needed for this load are in even or odd elements of
4389 an even/odd decomposition with N consecutive elements.
4390 Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4391 with N == 2. */
4392 /* ??? Only an even number of lanes can be handled this way, but the
4393 fallback below could work for any number. We have to make sure
4394 to round up in that case. */
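/* A worked example of the masks computed below, assuming an
   eight-lane group where the load only needs lanes { 2, 3 }:
   EVEN ends up as 4 and ODD as 2, so the even extraction is taken
   with LEVEL == 4 and keeps lanes { 0, 1, 2, 3 }, halving the group
   in one step.  */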
4395 gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4396 unsigned even = 0, odd = 0;
4397 if ((group_lanes & 1) == 0)
4399 even = (1 << ceil_log2 (group_lanes)) - 1;
4400 odd = even;
4401 for (auto l : final_perm)
4403 even &= ~l.second;
4404 odd &= l.second;
4408 /* Now build an even or odd extraction from the unpermuted load. */
4409 lane_permutation_t perm;
4410 perm.create ((group_lanes + 1) / 2);
4411 unsigned level;
4412 if (even
4413 && ((level = 1 << ctz_hwi (even)), true)
4414 && group_lanes % (2 * level) == 0)
4416 /* { 0, 1, ... 4, 5 ..., } */
4417 unsigned level = 1 << ctz_hwi (even);
4418 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4419 for (unsigned j = 0; j < level; ++j)
4420 perm.quick_push (std::make_pair (0, 2 * i * level + j));
4422 else if (odd)
4424 /* { ..., 2, 3, ... 6, 7 } */
4425 unsigned level = 1 << ctz_hwi (odd);
4426 gcc_assert (group_lanes % (2 * level) == 0);
4427 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4428 for (unsigned j = 0; j < level; ++j)
4429 perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
4431 else
4433 /* As fallback extract all used lanes and fill to half the
4434 group size by repeating the last element.
4435 ??? This is quite a bad strategy for re-use - we could
4436 brute force our way to find more optimal filling lanes to
4437 maximize re-use when looking at all loads from the group. */
4438 auto_bitmap l;
4439 for (auto p : final_perm)
4440 bitmap_set_bit (l, p.second);
4441 unsigned i = 0;
4442 bitmap_iterator bi;
4443 EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4444 perm.quick_push (std::make_pair (0, i));
4445 while (perm.length () < (group_lanes + 1) / 2)
4446 perm.quick_push (perm.last ());
4449 /* Update final_perm with the intermediate permute. */
4450 for (unsigned i = 0; i < final_perm.length (); ++i)
4452 unsigned l = final_perm[i].second;
4453 unsigned j;
4454 for (j = 0; j < perm.length (); ++j)
4455 if (perm[j].second == l)
4457 final_perm[i].second = j;
4458 break;
4460 gcc_assert (j < perm.length ());
4463 /* And create scalar stmts. */
4464 vec<stmt_vec_info> perm_stmts;
4465 perm_stmts.create (perm.length ());
4466 for (unsigned i = 0; i < perm.length (); ++i)
4467 perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4469 slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4470 SLP_TREE_CHILDREN (p).quick_push (l0);
4471 SLP_TREE_LANE_PERMUTATION (p) = perm;
4472 SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4473 SLP_TREE_LANES (p) = perm.length ();
4474 SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4475 /* ??? As we have scalar stmts for this intermediate permute we
4476 could CSE it via bst_map but we do not want to pick up
4477 another SLP node with a load permutation. We instead should
4478 have a "local" CSE map here. */
4479 SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4481 /* We now have a node for (group_lanes + 1) / 2 lanes. */
4482 l0 = p;
4485 /* And finally from the ordered reduction node create the
4486 permute to shuffle the lanes into the original load-permutation
4487 order. We replace the original load node with this. */
4488 SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4489 SLP_TREE_LOAD_PERMUTATION (load).release ();
4490 SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4491 SLP_TREE_CHILDREN (load).create (1);
4492 SLP_TREE_CHILDREN (load).quick_push (l0);
4496 /* Transform SLP loads in the SLP graph created by SLP discovery to
4497 group loads from the same group and lower load permutations that
4498 are unlikely to be supported into a series of permutes.
4499 In the degenerate case of having only single-lane SLP instances
4500 this should result in a series of permute nodes emulating an
4501 interleaving scheme. */
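/* For instance, four single-lane loads of a four-lane group end up
   sharing one unpermuted group load, with each original load
   rewritten into a halving even/odd extraction followed by a final
   single-lane permute node.  */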
4503 static void
4504 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4505 scalar_stmts_to_slp_tree_map_t *bst_map)
4507 /* Gather and sort loads across all instances. */
4508 hash_set<slp_tree> visited;
4509 auto_vec<slp_tree> loads;
4510 for (auto inst : loop_vinfo->slp_instances)
4511 vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4512 if (loads.is_empty ())
4513 return;
4514 loads.qsort (vllp_cmp);
4516 /* Now process each dataref group separately. */
4517 unsigned firsti = 0;
4518 for (unsigned i = 1; i < loads.length (); ++i)
4520 slp_tree first = loads[firsti];
4521 slp_tree next = loads[i];
4522 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4523 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4524 if (STMT_VINFO_GROUPED_ACCESS (a0)
4525 && STMT_VINFO_GROUPED_ACCESS (b0)
4526 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4527 continue;
4528 /* Now we have one or multiple SLP loads of the same group from
4529 firsti to i - 1. */
4530 if (STMT_VINFO_GROUPED_ACCESS (a0))
4531 vect_lower_load_permutations (loop_vinfo, bst_map,
4532 make_array_slice (&loads[firsti],
4533 i - firsti));
4534 firsti = i;
4536 if (firsti < loads.length ()
4537 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
4538 vect_lower_load_permutations (loop_vinfo, bst_map,
4539 make_array_slice (&loads[firsti],
4540 loads.length () - firsti));
4543 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
4544 trees of packed scalar stmts if SLP is possible. */
4546 opt_result
4547 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
4549 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4550 unsigned int i;
4551 stmt_vec_info first_element;
4552 slp_instance instance;
4554 DUMP_VECT_SCOPE ("vect_analyze_slp");
4556 unsigned limit = max_tree_size;
4558 scalar_stmts_to_slp_tree_map_t *bst_map
4559 = new scalar_stmts_to_slp_tree_map_t ();
4561 /* Find SLP sequences starting from groups of grouped stores. */
4562 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4563 vect_analyze_slp_instance (vinfo, bst_map, first_element,
4564 slp_inst_kind_store, max_tree_size, &limit);
4566 /* For loops also start SLP discovery from non-grouped stores. */
4567 if (loop_vinfo)
4569 data_reference_p dr;
4570 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
4571 if (DR_IS_WRITE (dr))
4573 stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
4574 /* Grouped stores are already handled above. */
4575 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
4576 continue;
4577 vec<stmt_vec_info> stmts;
4578 vec<stmt_vec_info> roots = vNULL;
4579 vec<tree> remain = vNULL;
4580 stmts.create (1);
4581 stmts.quick_push (stmt_info);
4582 vect_build_slp_instance (vinfo, slp_inst_kind_store,
4583 stmts, roots, remain, max_tree_size,
4584 &limit, bst_map, NULL);
4588 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4590 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4592 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4593 /* Apply patterns. */
4594 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4595 bb_vinfo->roots[i].stmts[j]
4596 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4597 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4598 bb_vinfo->roots[i].stmts,
4599 bb_vinfo->roots[i].roots,
4600 bb_vinfo->roots[i].remain,
4601 max_tree_size, &limit, bst_map, NULL))
4603 bb_vinfo->roots[i].stmts = vNULL;
4604 bb_vinfo->roots[i].roots = vNULL;
4605 bb_vinfo->roots[i].remain = vNULL;
4610 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4612 /* Find SLP sequences starting from reduction chains. */
4613 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4614 if (! STMT_VINFO_RELEVANT_P (first_element)
4615 && ! STMT_VINFO_LIVE_P (first_element))
4617 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4618 slp_inst_kind_reduc_chain,
4619 max_tree_size, &limit))
4621 /* Dissolve reduction chain group. */
4622 stmt_vec_info vinfo = first_element;
4623 stmt_vec_info last = NULL;
4624 while (vinfo)
4626 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4627 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4628 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4629 last = vinfo;
4630 vinfo = next;
4632 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4633 /* It can still be vectorized as part of an SLP reduction. */
4634 loop_vinfo->reductions.safe_push (last);
4637 /* Find SLP sequences starting from groups of reductions. */
4638 if (loop_vinfo->reductions.length () > 0)
4640 /* Collect reduction statements we can combine into
4641 a SLP reduction. */
4642 vec<stmt_vec_info> scalar_stmts;
4643 scalar_stmts.create (loop_vinfo->reductions.length ());
4644 for (auto next_info : loop_vinfo->reductions)
4646 next_info = vect_stmt_to_vectorize (next_info);
4647 if ((STMT_VINFO_RELEVANT_P (next_info)
4648 || STMT_VINFO_LIVE_P (next_info))
4649 /* ??? Make sure we didn't skip a conversion around a
4650 reduction path. In that case we'd have to reverse
4651 engineer that conversion stmt following the chain using
4652 reduc_idx and from the PHI using reduc_def. */
4653 && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4654 || (STMT_VINFO_DEF_TYPE (next_info)
4655 == vect_double_reduction_def)))
4657 /* Do not discover SLP reductions combining lane-reducing
4658 ops, that will fail later. */
4659 if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4660 scalar_stmts.quick_push (next_info);
4661 else
4663 /* Do SLP discovery for single-lane reductions. */
4664 vec<stmt_vec_info> stmts;
4665 vec<stmt_vec_info> roots = vNULL;
4666 vec<tree> remain = vNULL;
4667 stmts.create (1);
4668 stmts.quick_push (next_info);
4669 vect_build_slp_instance (vinfo,
4670 slp_inst_kind_reduc_group,
4671 stmts, roots, remain,
4672 max_tree_size, &limit,
4673 bst_map, NULL);
4677 /* Save for re-processing on failure. */
4678 vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4679 vec<stmt_vec_info> roots = vNULL;
4680 vec<tree> remain = vNULL;
4681 if (scalar_stmts.length () <= 1
4682 || !vect_build_slp_instance (loop_vinfo,
4683 slp_inst_kind_reduc_group,
4684 scalar_stmts, roots, remain,
4685 max_tree_size, &limit, bst_map,
4686 NULL))
4688 if (scalar_stmts.length () <= 1)
4689 scalar_stmts.release ();
4690 /* Do SLP discovery for single-lane reductions. */
4691 for (auto stmt_info : saved_stmts)
4693 vec<stmt_vec_info> stmts;
4694 vec<stmt_vec_info> roots = vNULL;
4695 vec<tree> remain = vNULL;
4696 stmts.create (1);
4697 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4698 vect_build_slp_instance (vinfo,
4699 slp_inst_kind_reduc_group,
4700 stmts, roots, remain,
4701 max_tree_size, &limit,
4702 bst_map, NULL);
4704 saved_stmts.release ();
4708 /* Make sure to vectorize only-live stmts, usually inductions. */
4709 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
4710 for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
4711 gsi_next (&gsi))
4713 gphi *lc_phi = *gsi;
4714 tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
4715 stmt_vec_info stmt_info;
4716 if (TREE_CODE (def) == SSA_NAME
4717 && !virtual_operand_p (def)
4718 && (stmt_info = loop_vinfo->lookup_def (def))
4719 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
4720 && STMT_VINFO_LIVE_P (stmt_info)
4721 && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
4722 || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
4723 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
4725 vec<stmt_vec_info> stmts;
4726 vec<stmt_vec_info> roots = vNULL;
4727 vec<tree> remain = vNULL;
4728 stmts.create (1);
4729 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4730 vect_build_slp_instance (vinfo,
4731 slp_inst_kind_reduc_group,
4732 stmts, roots, remain,
4733 max_tree_size, &limit,
4734 bst_map, NULL);
4739 hash_set<slp_tree> visited_patterns;
4740 slp_tree_to_load_perm_map_t perm_cache;
4741 slp_compat_nodes_map_t compat_cache;
4743 /* See if any patterns can be found in the SLP tree. */
4744 bool pattern_found = false;
4745 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4746 pattern_found |= vect_match_slp_patterns (instance, vinfo,
4747 &visited_patterns, &perm_cache,
4748 &compat_cache);
4750 /* If any were found optimize permutations of loads. */
4751 if (pattern_found)
4753 hash_map<slp_tree, slp_tree> load_map;
4754 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4756 slp_tree root = SLP_INSTANCE_TREE (instance);
4757 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
4758 &load_map, root);
4762 /* Check whether we should force some SLP instances to use load/store-lanes
4763 and do so by forcing SLP re-discovery with single lanes. We used
4764 to cancel SLP when this applied to all instances in a loop but now
4765 we decide this per SLP instance. It's important to do this only
4766 after SLP pattern recognition. */
4767 if (is_a <loop_vec_info> (vinfo))
4768 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4769 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
4770 && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
4772 slp_tree slp_root = SLP_INSTANCE_TREE (instance);
4773 int group_size = SLP_TREE_LANES (slp_root);
4774 tree vectype = SLP_TREE_VECTYPE (slp_root);
4776 stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
4777 gimple *rep = STMT_VINFO_STMT (rep_info);
4778 bool masked = (is_gimple_call (rep)
4779 && gimple_call_internal_p (rep)
4780 && internal_fn_mask_index
4781 (gimple_call_internal_fn (rep)) != -1);
4782 if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
4783 || slp_root->ldst_lanes
4784 || (vect_store_lanes_supported (vectype, group_size, masked)
4785 == IFN_LAST))
4786 continue;
4788 auto_vec<slp_tree> loads;
4789 hash_set<slp_tree> visited;
4790 vect_gather_slp_loads (loads, slp_root, visited);
4792 /* Check whether any load in the SLP instance is possibly
4793 permuted. */
4794 bool loads_permuted = false;
4795 slp_tree load_node;
4796 unsigned j;
4797 FOR_EACH_VEC_ELT (loads, j, load_node)
4799 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
4800 continue;
4801 unsigned k;
4802 stmt_vec_info load_info;
4803 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
4804 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
4806 loads_permuted = true;
4807 break;
4811 /* If the loads and stores can use load/store-lanes force re-discovery
4812 with single lanes. */
4813 if (loads_permuted)
4815 bool can_use_lanes = true;
4816 FOR_EACH_VEC_ELT (loads, j, load_node)
4817 if (STMT_VINFO_GROUPED_ACCESS
4818 (SLP_TREE_REPRESENTATIVE (load_node)))
4820 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
4821 (SLP_TREE_REPRESENTATIVE (load_node));
4822 rep = STMT_VINFO_STMT (stmt_vinfo);
4823 masked = (is_gimple_call (rep)
4824 && gimple_call_internal_p (rep)
4825 && internal_fn_mask_index
4826 (gimple_call_internal_fn (rep)) != -1);
4827 /* Use SLP for strided accesses (or if we can't
4828 load-lanes). */
4829 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
4830 || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
4831 || vect_load_lanes_supported
4832 (STMT_VINFO_VECTYPE (stmt_vinfo),
4833 DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
4834 /* ??? During SLP re-discovery with a single lane
4835 a masked grouped load will appear permuted and
4836 discovery will fail. We have to rework this
4837 on the discovery side - for now avoid ICEing. */
4838 || masked)
4840 can_use_lanes = false;
4841 break;
4845 if (can_use_lanes)
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_NOTE, vect_location,
4849 "SLP instance %p can use load/store-lanes,"
4850 " re-discovering with single-lanes\n",
4851 (void *) instance);
4853 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
4855 vect_free_slp_instance (instance);
4856 limit = max_tree_size;
4857 bool res = vect_analyze_slp_instance (vinfo, bst_map,
4858 stmt_info,
4859 slp_inst_kind_store,
4860 max_tree_size, &limit,
4861 true);
4862 gcc_assert (res);
4863 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
4864 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
4869 /* When we end up with load permutations that we cannot possibly handle,
4870 like those requiring three vector inputs, lower them using interleaving
4871 like schemes. */
4872 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4874 vect_lower_load_permutations (loop_vinfo, bst_map);
4875 if (dump_enabled_p ())
4877 dump_printf_loc (MSG_NOTE, vect_location,
4878 "SLP graph after lowering permutations:\n");
4879 hash_set<slp_tree> visited;
4880 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4881 vect_print_slp_graph (MSG_NOTE, vect_location,
4882 SLP_INSTANCE_TREE (instance), visited);
4886 release_scalar_stmts_to_slp_tree_map (bst_map);
4888 if (pattern_found && dump_enabled_p ())
4890 dump_printf_loc (MSG_NOTE, vect_location,
4891 "Pattern matched SLP tree\n");
4892 hash_set<slp_tree> visited;
4893 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4894 vect_print_slp_graph (MSG_NOTE, vect_location,
4895 SLP_INSTANCE_TREE (instance), visited);
4898 return opt_result::success ();
4901 /* Estimates the cost of inserting layout changes into the SLP graph.
4902 It can also say that the insertion is impossible. */
4904 struct slpg_layout_cost
4906 slpg_layout_cost () = default;
4907 slpg_layout_cost (sreal, bool);
4909 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
4910 bool is_possible () const { return depth != sreal::max (); }
4912 bool operator== (const slpg_layout_cost &) const;
4913 bool operator!= (const slpg_layout_cost &) const;
4915 bool is_better_than (const slpg_layout_cost &, bool) const;
4917 void add_parallel_cost (const slpg_layout_cost &);
4918 void add_serial_cost (const slpg_layout_cost &);
4919 void split (unsigned int);
4921 /* The longest sequence of layout changes needed during any traversal
4922 of the partition dag, weighted by execution frequency.
4924 This is the most important metric when optimizing for speed, since
4925 it helps to ensure that we keep the number of operations on
4926 critical paths to a minimum. */
4927 sreal depth = 0;
4929 /* An estimate of the total number of operations needed. It is weighted by
4930 execution frequency when optimizing for speed but not when optimizing for
4931 size. In order to avoid double-counting, a node with a fanout of N will
4932 distribute 1/N of its total cost to each successor.
4934 This is the most important metric when optimizing for size, since
4935 it helps to keep the total number of operations to a minimum. */
4936 sreal total = 0;
4939 /* Construct costs for a node with weight WEIGHT. A higher weight
4940 indicates more frequent execution. IS_FOR_SIZE is true if we are
4941 optimizing for size rather than speed. */
4943 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
4944 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
4948 bool
4949 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
4951 return depth == other.depth && total == other.total;
4954 bool
4955 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
4957 return !operator== (other);
4960 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
4961 true if we are optimizing for size rather than speed. */
4963 bool
4964 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
4965 bool is_for_size) const
4967 if (is_for_size)
4969 if (total != other.total)
4970 return total < other.total;
4971 return depth < other.depth;
4973 else
4975 if (depth != other.depth)
4976 return depth < other.depth;
4977 return total < other.total;
4981 /* Increase the costs to account for something with cost INPUT_COST
4982 happening in parallel with the current costs. */
4984 void
4985 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
4987 depth = std::max (depth, input_cost.depth);
4988 total += input_cost.total;
4991 /* Increase the costs to account for something with cost INPUT_COST
4992 happening in series with the current costs. */
4994 void
4995 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
4997 depth += other.depth;
4998 total += other.total;
5001 /* Split the total cost among TIMES successors or predecessors. */
5003 void
5004 slpg_layout_cost::split (unsigned int times)
5006 if (times > 1)
5007 total /= times;
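/* A small usage sketch of the combinators above: starting from a cost
   of { depth 1, total 1 } and an input that also costs { 1, 1 },
   add_parallel_cost gives { 1, 2 }; adding a serial layout change of
   cost { 1, 1 } with add_serial_cost gives { 2, 3 }; and split (2)
   then charges { 2, 1.5 } towards each of two successors.  */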
5010 /* Information about one node in the SLP graph, for use during
5011 vect_optimize_slp_pass. */
5013 struct slpg_vertex
5015 slpg_vertex (slp_tree node_) : node (node_) {}
5017 /* The node itself. */
5018 slp_tree node;
5020 /* Which partition the node belongs to, or -1 if none. Nodes outside of
5021 partitions are flexible; they can have whichever layout consumers
5022 want them to have. */
5023 int partition = -1;
5025 /* The number of nodes that directly use the result of this one
5026 (i.e. the number of nodes that count this one as a child). */
5027 unsigned int out_degree = 0;
5029 /* The execution frequency of the node. */
5030 sreal weight = 0;
5032 /* The total execution frequency of all nodes that directly use the
5033 result of this one. */
5034 sreal out_weight = 0;
5037 /* Information about one partition of the SLP graph, for use during
5038 vect_optimize_slp_pass. */
5040 struct slpg_partition_info
5042 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
5043 of m_partitioned_nodes. */
5044 unsigned int node_begin = 0;
5045 unsigned int node_end = 0;
5047 /* Which layout we've chosen to use for this partition, or -1 if
5048 we haven't picked one yet. */
5049 int layout = -1;
5051 /* The number of predecessors and successors in the partition dag.
5052 The predecessors always have lower partition numbers and the
5053 successors always have higher partition numbers.
5055 Note that the directions of these edges are not necessarily the
5056 same as in the data flow graph. For example, if an SCC has separate
5057 partitions for an inner loop and an outer loop, the inner loop's
5058 partition will have at least two incoming edges from the outer loop's
5059 partition: one for a live-in value and one for a live-out value.
5060 In data flow terms, one of these edges would also be from the outer loop
5061 to the inner loop, but the other would be in the opposite direction. */
5062 unsigned int in_degree = 0;
5063 unsigned int out_degree = 0;
5066 /* Information about the costs of using a particular layout for a
5067 particular partition. It can also say that the combination is
5068 impossible. */
5070 struct slpg_partition_layout_costs
5072 bool is_possible () const { return internal_cost.is_possible (); }
5073 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
5075 /* The costs inherited from predecessor partitions. */
5076 slpg_layout_cost in_cost;
5078 /* The inherent cost of the layout within the node itself. For example,
5079 this is nonzero for a load if choosing a particular layout would require
5080 the load to permute the loaded elements. It is nonzero for a
5081 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5082 to full-vector moves. */
5083 slpg_layout_cost internal_cost;
5085 /* The costs inherited from successor partitions. */
5086 slpg_layout_cost out_cost;
5089 /* This class tries to optimize the layout of vectors in order to avoid
5090 unnecessary shuffling. At the moment, the set of possible layouts are
5091 restricted to bijective permutations.
5093 The goal of the pass depends on whether we're optimizing for size or
5094 for speed. When optimizing for size, the goal is to reduce the overall
5095 number of layout changes (including layout changes implied by things
5096 like load permutations). When optimizing for speed, the goal is to
5097 reduce the maximum latency attributable to layout changes on any
5098 non-cyclical path through the data flow graph.
5100 For example, when optimizing a loop nest for speed, we will prefer
5101 to make layout changes outside of a loop rather than inside of a loop,
5102 and will prefer to make layout changes in parallel rather than serially,
5103 even if that increases the overall number of layout changes.
5105 The high-level procedure is:
5107 (1) Build a graph in which edges go from uses (parents) to definitions
5108 (children).
5110 (2) Divide the graph into a dag of strongly-connected components (SCCs).
5112 (3) When optimizing for speed, partition the nodes in each SCC based
5113 on their containing cfg loop. When optimizing for size, treat
5114 each SCC as a single partition.
5116 This gives us a dag of partitions. The goal is now to assign a
5117 layout to each partition.
5119 (4) Construct a set of vector layouts that are worth considering.
5120 Record which nodes must keep their current layout.
5122 (5) Perform a forward walk over the partition dag (from loads to stores)
5123 accumulating the "forward" cost of using each layout. When visiting
5124 each partition, assign a tentative choice of layout to the partition
5125 and use that choice when calculating the cost of using a different
5126 layout in successor partitions.
5128 (6) Perform a backward walk over the partition dag (from stores to loads),
5129 accumulating the "backward" cost of using each layout. When visiting
5130 each partition, make a final choice of layout for that partition based
5131 on the accumulated forward costs (from (5)) and backward costs
5132 (from (6)).
5134 (7) Apply the chosen layouts to the SLP graph.
5136 For example, consider the SLP statements:
5138 S1: a_1 = load
5139 loop:
5140 S2: a_2 = PHI<a_1, a_3>
5141 S3: b_1 = load
5142 S4: a_3 = a_2 + b_1
5143 exit:
5144 S5: a_4 = PHI<a_3>
5145 S6: store a_4
5147 S2 and S4 form an SCC and are part of the same loop. Every other
5148 statement is in a singleton SCC. In this example there is a one-to-one
5149 mapping between SCCs and partitions and the partition dag looks like this:
5151 S1 S3
5153 S2+S4
5159 S2, S3 and S4 will have a higher execution frequency than the other
5160 statements, so when optimizing for speed, the goal is to avoid any
5161 layout changes:
5163 - within S3
5164 - within S2+S4
5165 - on the S3->S2+S4 edge
5167 For example, if S3 was originally a reversing load, the goal of the
5168 pass is to make it an unreversed load and change the layout on the
5169 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5170 on S1->S2+S4 and S5->S6 would also be acceptable.)
5172 The difference between SCCs and partitions becomes important if we
5173 add an outer loop:
5175 S1: a_1 = ...
5176 loop1:
5177 S2: a_2 = PHI<a_1, a_6>
5178 S3: b_1 = load
5179 S4: a_3 = a_2 + b_1
5180 loop2:
5181 S5: a_4 = PHI<a_3, a_5>
5182 S6: c_1 = load
5183 S7: a_5 = a_4 + c_1
5184 exit2:
5185 S8: a_6 = PHI<a_5>
5186 S9: store a_6
5187 exit1:
5189 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5190 for speed, we usually do not want restrictions in the outer loop to "infect"
5191 the decision for the inner loop. For example, if an outer-loop node
5192 in the SCC contains a statement with a fixed layout, that should not
5193 prevent the inner loop from using a different layout. Conversely,
5194 the inner loop should not dictate a layout to the outer loop: if the
5195 outer loop does a lot of computation, then it may not be efficient to
5196 do all of that computation in the inner loop's preferred layout.
5198 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5199 and S5+S7 (inner). We also try to arrange partitions so that:
5201 - the partition for an outer loop comes before the partition for
5202 an inner loop
5204 - if a sibling loop A dominates a sibling loop B, A's partition
5205 comes before B's
5207 This gives the following partition dag for the example above:
5209 S1 S3
5211 S2+S4+S8 S6
5212 | \\ /
5213 | S5+S7
5217 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5218 one for a reversal of the edge S7->S8.
5220 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5221 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5222 preferred layout against the cost of changing the layout on entry to the
5223 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5225 Although this works well when optimizing for speed, it has the downside
5226 when optimizing for size that the choice of layout for S5+S7 is completely
5227 independent of S9, which lessens the chance of reducing the overall number
5228 of permutations. We therefore do not partition SCCs when optimizing
5229 for size.
5231 To give a concrete example of the difference between optimizing
5232 for size and speed, consider:
5234 a[0] = (b[1] << c[3]) - d[1];
5235 a[1] = (b[0] << c[2]) - d[0];
5236 a[2] = (b[3] << c[1]) - d[3];
5237 a[3] = (b[2] << c[0]) - d[2];
5239 There are three different layouts here: one for a, one for b and d,
5240 and one for c. When optimizing for speed it is better to permute each
5241 of b, c and d into the order required by a, since those permutations
5242 happen in parallel. But when optimizing for size, it is better to:
5244 - permute c into the same order as b
5245 - do the arithmetic
5246 - permute the result into the order required by a
5248 This gives 2 permutations rather than 3. */
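/* In the terms of slpg_layout_cost below, the speed metric prefers the
   first option for the example above because the three input
   permutations happen in parallel (roughly depth 1, total 3), whereas
   the second option puts two layout changes in series on the path
   through c (roughly depth 2, total 2); the size metric compares the
   totals and so prefers the second.  */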
5250 class vect_optimize_slp_pass
5252 public:
5253 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5254 void run ();
5256 private:
5257 /* Graph building. */
5258 struct loop *containing_loop (slp_tree);
5259 bool is_cfg_latch_edge (graph_edge *);
5260 void build_vertices (hash_set<slp_tree> &, slp_tree);
5261 void build_vertices ();
5262 void build_graph ();
5264 /* Partitioning. */
5265 void create_partitions ();
5266 template<typename T> void for_each_partition_edge (unsigned int, T);
5268 /* Layout selection. */
5269 bool is_compatible_layout (slp_tree, unsigned int);
5270 int change_layout_cost (slp_tree, unsigned int, unsigned int);
5271 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5272 unsigned int);
5273 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5274 int, unsigned int);
5275 int internal_node_cost (slp_tree, int, unsigned int);
5276 void start_choosing_layouts ();
5278 /* Cost propagation. */
5279 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5280 unsigned int, unsigned int);
5281 slpg_layout_cost total_in_cost (unsigned int);
5282 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5283 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5284 void forward_pass ();
5285 void backward_pass ();
5287 /* Rematerialization. */
5288 slp_tree get_result_with_layout (slp_tree, unsigned int);
5289 void materialize ();
5291 /* Clean-up. */
5292 void remove_redundant_permutations ();
5294 void dump ();
5296 vec_info *m_vinfo;
5298 /* True if we should optimize the graph for size, false if we should
5299 optimize it for speed. (It wouldn't be easy to make this decision
5300 more locally.) */
5301 bool m_optimize_size;
5303 /* A graph of all SLP nodes, with edges leading from uses to definitions.
5304 In other words, a node's predecessors are its slp_tree parents and
5305 a node's successors are its slp_tree children. */
5306 graph *m_slpg = nullptr;
5308 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5309 auto_vec<slpg_vertex> m_vertices;
5311 /* The list of all leaves of M_SLPG, such as external definitions, constants,
5312 and loads. */
5313 auto_vec<int> m_leafs;
5315 /* This array has one entry for every vector layout that we're considering.
5316 Element 0 is null and indicates "no change". Other entries describe
5317 permutations that are inherent in the current graph and that we would
5318 like to reverse if possible.
5320 For example, a permutation { 1, 2, 3, 0 } means that something has
5321 effectively been permuted in that way, such as a load group
5322 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5323 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5324 in order to put things "back" in order. */
5325 auto_vec<vec<unsigned> > m_perms;
5327 /* A partitioning of the nodes for which a layout must be chosen.
5328 Each partition represents an <SCC, cfg loop> pair; that is,
5329 nodes in different SCCs belong to different partitions, and nodes
5330 within an SCC can be further partitioned according to a containing
5331 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5333 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5334 from leaves (such as loads) to roots (such as stores).
5336 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5337 auto_vec<slpg_partition_info> m_partitions;
5339 /* The list of all nodes for which a layout must be chosen. Nodes for
5340 partition P come before the nodes for partition P+1. Nodes within a
5341 partition are in reverse postorder. */
5342 auto_vec<unsigned int> m_partitioned_nodes;
5344 /* Index P * num-layouts + L contains the cost of using layout L
5345 for partition P. */
5346 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5348 /* Index N * num-layouts + L, if nonnull, is a node that provides the
5349 original output of node N adjusted to have layout L. */
5350 auto_vec<slp_tree> m_node_layouts;
5353 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
5354 Also record whether we should optimize anything for speed rather
5355 than size. */
5357 void
5358 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5359 slp_tree node)
5361 unsigned i;
5362 slp_tree child;
5364 if (visited.add (node))
5365 return;
5367 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5369 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5370 if (optimize_bb_for_speed_p (bb))
5371 m_optimize_size = false;
5374 node->vertex = m_vertices.length ();
5375 m_vertices.safe_push (slpg_vertex (node));
5377 bool leaf = true;
5378 bool force_leaf = false;
5379 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5380 if (child)
5382 leaf = false;
5383 build_vertices (visited, child);
5385 else
5386 force_leaf = true;
5387 /* Since SLP discovery works along use-def edges all cycles have an
5388 entry - but there's the exception of cycles where we do not handle
5389 the entry explicitly (but with a NULL SLP node), like some reductions
5390 and inductions. Force those SLP PHIs to act as leaves to make them
5391 backwards reachable. */
5392 if (leaf || force_leaf)
5393 m_leafs.safe_push (node->vertex);
5396 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
5398 void
5399 vect_optimize_slp_pass::build_vertices ()
5401 hash_set<slp_tree> visited;
5402 unsigned i;
5403 slp_instance instance;
5404 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5405 build_vertices (visited, SLP_INSTANCE_TREE (instance));
5408 /* Apply the bijective permutation PERM to VEC, or its inverse if REVERSE. */
5410 template <class T>
5411 static void
5412 vect_slp_permute (vec<unsigned> perm,
5413 vec<T> &vec, bool reverse)
5415 auto_vec<T, 64> saved;
5416 saved.create (vec.length ());
5417 for (unsigned i = 0; i < vec.length (); ++i)
5418 saved.quick_push (vec[i]);
5420 if (reverse)
5422 for (unsigned i = 0; i < vec.length (); ++i)
5423 vec[perm[i]] = saved[i];
5424 for (unsigned i = 0; i < vec.length (); ++i)
5425 gcc_assert (vec[perm[i]] == saved[i]);
5427 else
5429 for (unsigned i = 0; i < vec.length (); ++i)
5430 vec[i] = saved[perm[i]];
5431 for (unsigned i = 0; i < vec.length (); ++i)
5432 gcc_assert (vec[i] == saved[perm[i]]);
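/* For example (illustrative only): with PERM = { 1, 2, 3, 0 } and
   VEC = { a, b, c, d },

     vect_slp_permute (perm, vec, false) yields { b, c, d, a }
       (vec[i] = saved[perm[i]], the forward application)

     vect_slp_permute (perm, vec, true) yields { d, a, b, c }
       (vec[perm[i]] = saved[i], the inverse application)

   so applying one after the other restores the original order.  */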
5436 /* Return the cfg loop that contains NODE. */
5438 struct loop *
5439 vect_optimize_slp_pass::containing_loop (slp_tree node)
5441 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5442 if (!rep)
5443 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5444 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5447 /* Return true if UD (an edge from a use to a definition) is associated
5448 with a loop latch edge in the cfg. */
5450 bool
5451 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5453 slp_tree use = m_vertices[ud->src].node;
5454 slp_tree def = m_vertices[ud->dest].node;
5455 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5456 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5457 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5458 return false;
5460 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5461 return (is_a<gphi *> (use_rep->stmt)
5462 && bb_loop_header_p (gimple_bb (use_rep->stmt))
5463 && containing_loop (def) == containing_loop (use));
5466 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5467 a nonnull data field. */
5469 void
5470 vect_optimize_slp_pass::build_graph ()
5472 m_optimize_size = true;
5473 build_vertices ();
5475 m_slpg = new_graph (m_vertices.length ());
5476 for (slpg_vertex &v : m_vertices)
5477 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5478 if (child)
5480 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5481 if (is_cfg_latch_edge (ud))
5482 ud->data = this;
5486 /* Return true if E corresponds to a loop latch edge in the cfg. */
5488 static bool
5489 skip_cfg_latch_edges (graph_edge *e)
5491 return e->data;
5494 /* Create the node partitions. */
5496 void
5497 vect_optimize_slp_pass::create_partitions ()
5499 /* Calculate a postorder of the graph, ignoring edges that correspond
5500 to natural latch edges in the cfg. Reading the vector from the end
5501 to the beginning gives the reverse postorder. */
5502 auto_vec<int> initial_rpo;
5503 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5504 false, NULL, skip_cfg_latch_edges);
5505 gcc_assert (initial_rpo.length () == m_vertices.length ());
5507 /* Calculate the strongly connected components of the graph. */
5508 auto_vec<int> scc_grouping;
5509 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5511 /* Create a new index order in which all nodes from the same SCC are
5512 consecutive. Use scc_pos to record the index of the first node in
5513 each SCC. */
5514 auto_vec<unsigned int> scc_pos (num_sccs);
5515 int last_component = -1;
5516 unsigned int node_count = 0;
5517 for (unsigned int node_i : scc_grouping)
5519 if (last_component != m_slpg->vertices[node_i].component)
5521 last_component = m_slpg->vertices[node_i].component;
5522 gcc_assert (last_component == int (scc_pos.length ()));
5523 scc_pos.quick_push (node_count);
5525 node_count += 1;
5527 gcc_assert (node_count == initial_rpo.length ()
5528 && last_component + 1 == int (num_sccs));
5530 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5531 inside each SCC following the RPO we calculated above. The fact that
5532 we ignored natural latch edges when calculating the RPO should ensure
5533 that, for natural loop nests:
5535 - the first node that we encounter in a cfg loop is the loop header phi
5536 - the loop header phis are in dominance order
5538 Arranging for this is an optimization (see below) rather than a
5539 correctness issue. Unnatural loops with a tangled mess of backedges
5540 will still work correctly, but might give poorer results.
5542 Also update scc_pos so that it gives 1 + the index of the last node
5543 in the SCC. */
5544 m_partitioned_nodes.safe_grow (node_count);
5545 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5547 unsigned int node_i = initial_rpo[old_i];
5548 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5549 m_partitioned_nodes[new_i] = node_i;
5552 /* When optimizing for speed, partition each SCC based on the containing
5553 cfg loop. The order we constructed above should ensure that, for natural
5554 cfg loops, we'll create sub-SCC partitions for outer loops before
5555 the corresponding sub-SCC partitions for inner loops. Similarly,
5556 when one sibling loop A dominates another sibling loop B, we should
5557 create a sub-SCC partition for A before a sub-SCC partition for B.
5559 As above, nothing depends for correctness on whether this achieves
5560 a natural nesting, but we should get better results when it does. */
5561 m_partitions.reserve (m_vertices.length ());
5562 unsigned int next_partition_i = 0;
5563 hash_map<struct loop *, int> loop_partitions;
5564 unsigned int rpo_begin = 0;
5565 unsigned int num_partitioned_nodes = 0;
5566 for (unsigned int rpo_end : scc_pos)
5568 loop_partitions.empty ();
5569 unsigned int partition_i = next_partition_i;
5570 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5572 /* Handle externals and constants optimistically throughout.
5573 But treat existing vectors as fixed since we do not handle
5574 permuting them. */
5575 unsigned int node_i = m_partitioned_nodes[rpo_i];
5576 auto &vertex = m_vertices[node_i];
5577 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5578 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5579 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5580 vertex.partition = -1;
5581 else
5583 bool existed;
5584 if (m_optimize_size)
5585 existed = next_partition_i > partition_i;
5586 else
5588 struct loop *loop = containing_loop (vertex.node);
5589 auto &entry = loop_partitions.get_or_insert (loop, &existed);
5590 if (!existed)
5591 entry = next_partition_i;
5592 partition_i = entry;
5594 if (!existed)
5596 m_partitions.quick_push (slpg_partition_info ());
5597 next_partition_i += 1;
5599 vertex.partition = partition_i;
5600 num_partitioned_nodes += 1;
5601 m_partitions[partition_i].node_end += 1;
5604 rpo_begin = rpo_end;
5607 /* Assign ranges of consecutive node indices to each partition,
5608 in partition order. Start with node_end being the same as
5609 node_begin so that the next loop can use it as a counter. */
5610 unsigned int node_begin = 0;
5611 for (auto &partition : m_partitions)
5613 partition.node_begin = node_begin;
5614 node_begin += partition.node_end;
5615 partition.node_end = partition.node_begin;
5617 gcc_assert (node_begin == num_partitioned_nodes);
5619 /* Finally build the list of nodes in partition order. */
5620 m_partitioned_nodes.truncate (num_partitioned_nodes);
5621 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5623 int partition_i = m_vertices[node_i].partition;
5624 if (partition_i >= 0)
5626 unsigned int order_i = m_partitions[partition_i].node_end++;
5627 m_partitioned_nodes[order_i] = node_i;
5632 /* Look for edges from earlier partitions into node NODE_I and edges from
5633 node NODE_I into later partitions. Call:
5635 FN (ud, other_node_i)
5637 for each such use-to-def edge ud, where other_node_i is the node at the
5638 other end of the edge. */
5640 template<typename T>
5641 void
5642 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5644 int partition_i = m_vertices[node_i].partition;
5645 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5646 pred; pred = pred->pred_next)
5648 int src_partition_i = m_vertices[pred->src].partition;
5649 if (src_partition_i >= 0 && src_partition_i != partition_i)
5650 fn (pred, pred->src);
5652 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5653 succ; succ = succ->succ_next)
5655 int dest_partition_i = m_vertices[succ->dest].partition;
5656 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
5657 fn (succ, succ->dest);
5661 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
5662 that NODE would operate on. This test is independent of NODE's actual
5663 operation. */
5665 bool
5666 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
5667 unsigned int layout_i)
5669 if (layout_i == 0)
5670 return true;
5672 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
5673 return false;
5675 return true;
5678 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
5679 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
5680 layouts is incompatible with NODE or if the change is not possible for
5681 some other reason.
5683 The properties taken from NODE include the number of lanes and the
5684 vector type. The actual operation doesn't matter. */
5687 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
5688 unsigned int from_layout_i,
5689 unsigned int to_layout_i)
5691 if (!is_compatible_layout (node, from_layout_i)
5692 || !is_compatible_layout (node, to_layout_i))
5693 return -1;
5695 if (from_layout_i == to_layout_i)
5696 return 0;
5698 auto_vec<slp_tree, 1> children (1);
5699 children.quick_push (node);
5700 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
5701 if (from_layout_i > 0)
5702 for (unsigned int i : m_perms[from_layout_i])
5703 perm.quick_push ({ 0, i });
5704 else
5705 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
5706 perm.quick_push ({ 0, i });
5707 if (to_layout_i > 0)
5708 vect_slp_permute (m_perms[to_layout_i], perm, true);
5709 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
5710 children, false);
5711 if (count >= 0)
5712 return MAX (count, 1);
5714 /* ??? In principle we could try changing via layout 0, giving two
5715 layout changes rather than 1. Doing that would require
5716 corresponding support in get_result_with_layout. */
5717 return -1;
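/* For example (illustrative only): changing a 4-lane node from layout 0
   to layout { 1, 0, 3, 2 } is modelled as the single-input permutation
   { (0,1), (0,0), (0,3), (0,2) } applied to NODE itself; the result is
   the number of vector permutations the target would need for that,
   clamped to a minimum of 1.  */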
5720 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
5722 inline slpg_partition_layout_costs &
5723 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
5724 unsigned int layout_i)
5726 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
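/* For example (illustrative only): with three layouts, the costs for
   partition 2 occupy indices 6, 7 and 8 of m_partition_layout_costs.  */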
5729 /* Change PERM in one of two ways:
5731 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
5732 chosen for child I of NODE.
5734 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
5736 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
5738 void
5739 vect_optimize_slp_pass::
5740 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
5741 int in_layout_i, unsigned int out_layout_i)
5743 for (auto &entry : perm)
5745 int this_in_layout_i = in_layout_i;
5746 if (this_in_layout_i < 0)
5748 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
5749 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
5750 if (in_partition_i == -1u)
5751 continue;
5752 this_in_layout_i = m_partitions[in_partition_i].layout;
5754 if (this_in_layout_i > 0)
5755 entry.second = m_perms[this_in_layout_i][entry.second];
5757 if (out_layout_i > 0)
5758 vect_slp_permute (m_perms[out_layout_i], perm, true);
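/* For example (illustrative only): if an entry (1, 0) selects lane 0 of
   child 1, and that child's partition has chosen layout { 1, 3, 2, 0 },
   then the child's old lane 0 will sit in position 1 once that layout is
   materialized, so the entry is rewritten to (1, 1). A nonzero
   OUT_LAYOUT_I then reorders the whole permutation vector so that the
   node's output appears in that layout.  */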
5761 /* Check whether the target allows NODE to be rearranged so that the node's
5762 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
5763 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
5765 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
5766 NODE can adapt to the layout changes that have (perhaps provisionally)
5767 been chosen for NODE's children, so that no extra permutations are
5768 needed on either the input or the output of NODE.
5770 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
5771 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
5773 IN_LAYOUT_I has no meaning for other types of node.
5775 Keeping the node as-is is always valid. If the target doesn't appear
5776 to support the node as-is, but might realistically support other layouts,
5777 then layout 0 instead has the cost of a worst-case permutation. On the
5778 one hand, this ensures that every node has at least one valid layout,
5779 avoiding what would otherwise be an awkward special case. On the other,
5780 it still encourages the pass to change an invalid pre-existing layout
5781 choice into a valid one. */
5784 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
5785 unsigned int out_layout_i)
5787 const int fallback_cost = 1;
5789 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5791 auto_lane_permutation_t tmp_perm;
5792 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5794 /* Check that the child nodes support the chosen layout. Checking
5795 the first child is enough, since any second child would have the
5796 same shape. */
5797 auto first_child = SLP_TREE_CHILDREN (node)[0];
5798 if (in_layout_i > 0
5799 && !is_compatible_layout (first_child, in_layout_i))
5800 return -1;
5802 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
5803 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
5804 node, tmp_perm,
5805 SLP_TREE_CHILDREN (node),
5806 false);
5807 if (count < 0)
5809 if (in_layout_i == 0 && out_layout_i == 0)
5811 /* Use the fallback cost if the node could in principle support
5812 some nonzero layout for both the inputs and the outputs.
5813 Otherwise assume that the node will be rejected later
5814 and rebuilt from scalars. */
5815 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
5816 return fallback_cost;
5817 return 0;
5819 return -1;
5822 /* We currently have no way of telling whether the new layout is cheaper
5823 or more expensive than the old one. But at least in principle,
5824 it should be worth making zero permutations (whole-vector shuffles)
5825 cheaper than real permutations, in case the pass is able to remove
5826 the latter. */
5827 return count == 0 ? 0 : 1;
5830 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5831 if (rep
5832 && STMT_VINFO_DATA_REF (rep)
5833 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
5834 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
5836 auto_load_permutation_t tmp_perm;
5837 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5838 if (out_layout_i > 0)
5839 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
5841 poly_uint64 vf = 1;
5842 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
5843 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5844 unsigned int n_perms;
5845 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
5846 nullptr, vf, true, false, &n_perms))
5848 auto rep = SLP_TREE_REPRESENTATIVE (node);
5849 if (out_layout_i == 0)
5851 /* Use the fallback cost if the load is an N-to-N permutation.
5852 Otherwise assume that the node will be rejected later
5853 and rebuilt from scalars. */
5854 if (STMT_VINFO_GROUPED_ACCESS (rep)
5855 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
5856 == SLP_TREE_LANES (node)))
5857 return fallback_cost;
5858 return 0;
5860 return -1;
5863 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
5864 return n_perms == 0 ? 0 : 1;
5867 return 0;
5870 /* Decide which element layouts we should consider using. Calculate the
5871 weights associated with inserting layout changes on partition edges.
5872 Also mark partitions that cannot change layout, by setting their
5873 layout to zero. */
5875 void
5876 vect_optimize_slp_pass::start_choosing_layouts ()
5878 /* Used to assign unique permutation indices. */
5879 using perm_hash = unbounded_hashmap_traits<
5880 vec_free_hash_base<int_hash_base<unsigned>>,
5881 int_hash<int, -1, -2>
5883 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
5885 /* Layout 0 is "no change". */
5886 m_perms.safe_push (vNULL);
5888 /* Create layouts from existing permutations. */
5889 auto_load_permutation_t tmp_perm;
5890 for (unsigned int node_i : m_partitioned_nodes)
5892 /* Leaves also double as entries to the reverse graph. Allow the
5893 layout of those to be changed. */
5894 auto &vertex = m_vertices[node_i];
5895 auto &partition = m_partitions[vertex.partition];
5896 if (!m_slpg->vertices[node_i].succ)
5897 partition.layout = 0;
5899 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
5900 slp_tree node = vertex.node;
5901 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
5902 slp_tree child;
5903 unsigned HOST_WIDE_INT imin, imax = 0;
5904 bool any_permute = false;
5905 tmp_perm.truncate (0);
5906 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
5908 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
5909 unpermuted, record a layout that reverses this permutation.
5911 We would need more work to cope with loads that are internally
5912 permuted and also have inputs (such as masks for
5913 IFN_MASK_LOADs). */
5914 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
5915 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
5917 partition.layout = -1;
5918 continue;
5920 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
5921 imin = DR_GROUP_SIZE (dr_stmt) + 1;
5922 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5924 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
5925 && SLP_TREE_CHILDREN (node).length () == 1
5926 && (child = SLP_TREE_CHILDREN (node)[0])
5927 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
5928 .is_constant (&imin)))
5930 /* If the child has the same vector size as this node,
5931 reversing the permutation can make the permutation a no-op.
5932 In other cases it can change a true permutation into a
5933 full-vector extract. */
5934 tmp_perm.reserve (SLP_TREE_LANES (node));
5935 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5936 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
5938 else
5939 continue;
5941 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5943 unsigned idx = tmp_perm[j];
5944 imin = MIN (imin, idx);
5945 imax = MAX (imax, idx);
5946 if (idx - tmp_perm[0] != j)
5947 any_permute = true;
5949 /* If the span doesn't match we'd disrupt VF computation; avoid
5950 that for now. */
5951 if (imax - imin + 1 != SLP_TREE_LANES (node))
5952 continue;
5953 /* If there's no permute, there's no need to split one out. In this case
5954 we can consider turning a load into a permuted load, if that
5955 turns out to be cheaper than alternatives. */
5956 if (!any_permute)
5958 partition.layout = -1;
5959 continue;
5962 /* For now only handle true permutes, like
5963 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
5964 when permuting constants and invariants while keeping the permute
5965 bijective. */
5966 auto_sbitmap load_index (SLP_TREE_LANES (node));
5967 bitmap_clear (load_index);
5968 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5969 bitmap_set_bit (load_index, tmp_perm[j] - imin);
5970 unsigned j;
5971 for (j = 0; j < SLP_TREE_LANES (node); ++j)
5972 if (!bitmap_bit_p (load_index, j))
5973 break;
5974 if (j != SLP_TREE_LANES (node))
5975 continue;
5977 vec<unsigned> perm = vNULL;
5978 perm.safe_grow (SLP_TREE_LANES (node), true);
5979 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5980 perm[j] = tmp_perm[j] - imin;
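/* For example (illustrative only): a load permutation { 5, 7, 6, 4 }
   from a group of 8 elements has imin == 4, covers a contiguous span of
   SLP_TREE_LANES (node) == 4 elements and is bijective, so it records
   the candidate layout { 1, 3, 2, 0 }.  */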
5982 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
5984 /* Continue to use existing layouts, but don't add any more. */
5985 int *entry = layout_ids.get (perm);
5986 partition.layout = entry ? *entry : 0;
5987 perm.release ();
5989 else
5991 bool existed;
5992 int &layout_i = layout_ids.get_or_insert (perm, &existed);
5993 if (existed)
5994 perm.release ();
5995 else
5997 layout_i = m_perms.length ();
5998 m_perms.safe_push (perm);
6000 partition.layout = layout_i;
6004 /* Initially assume that every layout is possible and has zero cost
6005 in every partition. */
6006 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
6007 * m_perms.length ());
6009 /* We have to mark outgoing permutations facing non-associating-reduction
6010 graph entries that are not represented as to be materialized.
6011 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
6012 for (slp_instance instance : m_vinfo->slp_instances)
6013 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
6015 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6016 m_partitions[m_vertices[node_i].partition].layout = 0;
6018 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
6020 stmt_vec_info stmt_info
6021 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
6022 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
6023 if (needs_fold_left_reduction_p (TREE_TYPE
6024 (gimple_get_lhs (stmt_info->stmt)),
6025 STMT_VINFO_REDUC_CODE (reduc_info)))
6027 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6028 m_partitions[m_vertices[node_i].partition].layout = 0;
6032 /* Check which layouts each node and partition can handle. Calculate the
6033 weights associated with inserting layout changes on edges. */
6034 for (unsigned int node_i : m_partitioned_nodes)
6036 auto &vertex = m_vertices[node_i];
6037 auto &partition = m_partitions[vertex.partition];
6038 slp_tree node = vertex.node;
6040 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6042 vertex.weight = vect_slp_node_weight (node);
6044 /* We do not handle stores with a permutation, so all
6045 incoming permutations must have been materialized.
6047 We also don't handle masked grouped loads, which lack a
6048 permutation vector. In this case the memory locations
6049 form an implicit second input to the loads, on top of the
6050 explicit mask input, and the memory input's layout cannot
6051 be changed.
6053 On the other hand, we do support permuting gather loads and
6054 masked gather loads, where each scalar load is independent
6055 of the others. This can be useful if the address/index input
6056 benefits from permutation. */
6057 if (STMT_VINFO_DATA_REF (rep)
6058 && STMT_VINFO_GROUPED_ACCESS (rep)
6059 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
6060 partition.layout = 0;
6062 /* We cannot change the layout of an operation that is
6063 not lane-independent. Note this is an explicit
6064 negative list since that's much shorter than the respective
6065 positive one but it's critical to keep maintaining it. */
6066 if (is_gimple_call (STMT_VINFO_STMT (rep)))
6067 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
6069 case CFN_COMPLEX_ADD_ROT90:
6070 case CFN_COMPLEX_ADD_ROT270:
6071 case CFN_COMPLEX_MUL:
6072 case CFN_COMPLEX_MUL_CONJ:
6073 case CFN_VEC_ADDSUB:
6074 case CFN_VEC_FMADDSUB:
6075 case CFN_VEC_FMSUBADD:
6076 partition.layout = 0;
6077 default:;
6081 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6083 auto &other_vertex = m_vertices[other_node_i];
6085 /* Count the number of edges from earlier partitions and the number
6086 of edges to later partitions. */
6087 if (other_vertex.partition < vertex.partition)
6088 partition.in_degree += 1;
6089 else
6090 partition.out_degree += 1;
6092 /* If the current node uses the result of OTHER_NODE_I, accumulate
6093 the effects of that. */
6094 if (ud->src == int (node_i))
6096 other_vertex.out_weight += vertex.weight;
6097 other_vertex.out_degree += 1;
6100 for_each_partition_edge (node_i, process_edge);
6104 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6105 its current (provisional) choice of layout. The inputs do not necessarily
6106 have the same layout as each other. */
6108 slpg_layout_cost
6109 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6111 auto &vertex = m_vertices[node_i];
6112 slpg_layout_cost cost;
6113 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6115 auto &other_vertex = m_vertices[other_node_i];
6116 if (other_vertex.partition < vertex.partition)
6118 auto &other_partition = m_partitions[other_vertex.partition];
6119 auto &other_costs = partition_layout_costs (other_vertex.partition,
6120 other_partition.layout);
6121 slpg_layout_cost this_cost = other_costs.in_cost;
6122 this_cost.add_serial_cost (other_costs.internal_cost);
6123 this_cost.split (other_partition.out_degree);
6124 cost.add_parallel_cost (this_cost);
6127 for_each_partition_edge (node_i, add_cost);
6128 return cost;
6131 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6132 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6133 slpg_layout_cost::impossible () if the change isn't possible. */
6135 slpg_layout_cost
6136 vect_optimize_slp_pass::
6137 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6138 unsigned int layout2_i)
6140 auto &def_vertex = m_vertices[ud->dest];
6141 auto &use_vertex = m_vertices[ud->src];
6142 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6143 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6144 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6145 use_layout_i);
6146 if (factor < 0)
6147 return slpg_layout_cost::impossible ();
6149 /* We have a choice of putting the layout change at the site of the
6150 definition or at the site of the use. Prefer the former when
6151 optimizing for size or when the execution frequency of the
6152 definition is no greater than the combined execution frequencies of
6153 the uses. When putting the layout change at the site of the definition,
6154 divvy up the cost among all consumers. */
6155 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6157 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6158 cost.split (def_vertex.out_degree);
6159 return cost;
6161 return { use_vertex.weight * factor, m_optimize_size };
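/* For example (illustrative only): if the definition runs once per outer
   loop iteration (weight 1) and feeds two uses in an inner loop (weight
   10 each, so out_weight 20 and out_degree 2), the change is placed at
   the definition and each use-def edge is charged (1 * factor) / 2.
   When optimizing for speed and the definition is hotter than its
   combined uses, the full use-site cost is charged instead.  */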
6164 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6165 partition; FROM_NODE_I could be the definition node or the use node.
6166 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6167 Return the cost of any necessary fix-ups on edge UD, or return
6168 slpg_layout_cost::impossible () if the change isn't possible.
6170 At this point, FROM_NODE_I's partition has chosen the cheapest
6171 layout based on the information available so far, but this choice
6172 is only provisional. */
6174 slpg_layout_cost
6175 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6176 unsigned int to_layout_i)
6178 auto &from_vertex = m_vertices[from_node_i];
6179 unsigned int from_partition_i = from_vertex.partition;
6180 slpg_partition_info &from_partition = m_partitions[from_partition_i];
6181 gcc_assert (from_partition.layout >= 0);
6183 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6184 with its current layout preference. */
6185 slpg_layout_cost cost = slpg_layout_cost::impossible ();
6186 auto edge_cost = edge_layout_cost (ud, from_node_i,
6187 from_partition.layout, to_layout_i);
6188 if (edge_cost.is_possible ())
6190 auto &from_costs = partition_layout_costs (from_partition_i,
6191 from_partition.layout);
6192 cost = from_costs.in_cost;
6193 cost.add_serial_cost (from_costs.internal_cost);
6194 cost.split (from_partition.out_degree);
6195 cost.add_serial_cost (edge_cost);
6197 else if (from_partition.layout == 0)
6198 /* We must allow the source partition to have layout 0 as a fallback,
6199 in case all other options turn out to be impossible. */
6200 return cost;
6202 /* Take the minimum of that cost and the cost that applies if
6203 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6204 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6205 to_layout_i);
6206 if (direct_layout_costs.is_possible ())
6208 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6209 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6210 direct_cost.split (from_partition.out_degree);
6211 if (!cost.is_possible ()
6212 || direct_cost.is_better_than (cost, m_optimize_size))
6213 cost = direct_cost;
6216 return cost;
6219 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6220 partition; TO_NODE_I could be the definition node or the use node.
6221 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6222 return the cost of any necessary fix-ups on edge UD, or
6223 slpg_layout_cost::impossible () if the choice cannot be made.
6225 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6227 slpg_layout_cost
6228 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6229 unsigned int from_layout_i)
6231 auto &to_vertex = m_vertices[to_node_i];
6232 unsigned int to_partition_i = to_vertex.partition;
6233 slpg_partition_info &to_partition = m_partitions[to_partition_i];
6234 gcc_assert (to_partition.layout >= 0);
6236 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6237 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6238 any other inputs keep their current choice of layout. */
6239 auto &to_costs = partition_layout_costs (to_partition_i,
6240 to_partition.layout);
6241 if (ud->src == int (to_node_i)
6242 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6244 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6245 auto old_layout = from_partition.layout;
6246 from_partition.layout = from_layout_i;
6247 int factor = internal_node_cost (to_vertex.node, -1,
6248 to_partition.layout);
6249 from_partition.layout = old_layout;
6250 if (factor >= 0)
6252 slpg_layout_cost cost = to_costs.out_cost;
6253 cost.add_serial_cost ({ to_vertex.weight * factor,
6254 m_optimize_size });
6255 cost.split (to_partition.in_degree);
6256 return cost;
6260 /* Compute the cost if we insert any necessary layout change on edge UD. */
6261 auto edge_cost = edge_layout_cost (ud, to_node_i,
6262 to_partition.layout, from_layout_i);
6263 if (edge_cost.is_possible ())
6265 slpg_layout_cost cost = to_costs.out_cost;
6266 cost.add_serial_cost (to_costs.internal_cost);
6267 cost.split (to_partition.in_degree);
6268 cost.add_serial_cost (edge_cost);
6269 return cost;
6272 return slpg_layout_cost::impossible ();
6275 /* Make a forward pass through the partitions, accumulating input costs.
6276 Make a tentative (provisional) choice of layout for each partition,
6277 ensuring that this choice still allows later partitions to keep
6278 their original layout. */
6280 void
6281 vect_optimize_slp_pass::forward_pass ()
6283 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6284 ++partition_i)
6286 auto &partition = m_partitions[partition_i];
6288 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6289 the incoming cost that would apply if every predecessor partition
6290 keeps its current layout. This is used within the loop below. */
6291 slpg_layout_cost in_cost;
6292 slp_tree single_node = nullptr;
6293 if (partition.node_end == partition.node_begin + 1)
6295 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6296 single_node = m_vertices[node_i].node;
6297 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6298 in_cost = total_in_cost (node_i);
6301 /* Go through the possible layouts. Decide which ones are valid
6302 for this partition and record which of the valid layouts has
6303 the lowest cost. */
6304 unsigned int min_layout_i = 0;
6305 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6306 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6308 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6309 if (!layout_costs.is_possible ())
6310 continue;
6312 /* If the recorded layout is already 0 then the layout cannot
6313 change. */
6314 if (partition.layout == 0 && layout_i != 0)
6316 layout_costs.mark_impossible ();
6317 continue;
6320 bool is_possible = true;
6321 for (unsigned int order_i = partition.node_begin;
6322 order_i < partition.node_end; ++order_i)
6324 unsigned int node_i = m_partitioned_nodes[order_i];
6325 auto &vertex = m_vertices[node_i];
6327 /* Reject the layout if it is individually incompatible
6328 with any node in the partition. */
6329 if (!is_compatible_layout (vertex.node, layout_i))
6331 is_possible = false;
6332 break;
6335 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6337 auto &other_vertex = m_vertices[other_node_i];
6338 if (other_vertex.partition < vertex.partition)
6340 /* Accumulate the incoming costs from earlier
6341 partitions, plus the cost of any layout changes
6342 on UD itself. */
6343 auto cost = forward_cost (ud, other_node_i, layout_i);
6344 if (!cost.is_possible ())
6345 is_possible = false;
6346 else
6347 layout_costs.in_cost.add_parallel_cost (cost);
6349 else
6350 /* Reject the layout if it would make layout 0 impossible
6351 for later partitions. This amounts to testing that the
6352 target supports reversing the layout change on edges
6353 to later partitions.
6355 In principle, it might be possible to push a layout
6356 change all the way down a graph, so that it never
6357 needs to be reversed and so that the target doesn't
6358 need to support the reverse operation. But it would
6359 be awkward to bail out if we hit a partition that
6360 does not support the new layout, especially since
6361 we are not dealing with a lattice. */
6362 is_possible &= edge_layout_cost (ud, other_node_i, 0,
6363 layout_i).is_possible ();
6365 for_each_partition_edge (node_i, add_cost);
6367 /* Accumulate the cost of using LAYOUT_I within NODE,
6368 both for the inputs and the outputs. */
6369 int factor = internal_node_cost (vertex.node, layout_i,
6370 layout_i);
6371 if (factor < 0)
6373 is_possible = false;
6374 break;
6376 else if (factor)
6377 layout_costs.internal_cost.add_serial_cost
6378 ({ vertex.weight * factor, m_optimize_size });
6380 if (!is_possible)
6382 layout_costs.mark_impossible ();
6383 continue;
6386 /* Combine the incoming and partition-internal costs. */
6387 slpg_layout_cost combined_cost = layout_costs.in_cost;
6388 combined_cost.add_serial_cost (layout_costs.internal_cost);
6390 /* If this partition consists of a single VEC_PERM_EXPR, see
6391 if the VEC_PERM_EXPR can be changed to support output layout
6392 LAYOUT_I while keeping all the provisional choices of input
6393 layout. */
6394 if (single_node
6395 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6397 int factor = internal_node_cost (single_node, -1, layout_i);
6398 if (factor >= 0)
6400 auto weight = m_vertices[single_node->vertex].weight;
6401 slpg_layout_cost internal_cost
6402 = { weight * factor, m_optimize_size };
6404 slpg_layout_cost alt_cost = in_cost;
6405 alt_cost.add_serial_cost (internal_cost);
6406 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6408 combined_cost = alt_cost;
6409 layout_costs.in_cost = in_cost;
6410 layout_costs.internal_cost = internal_cost;
6415 /* Record the layout with the lowest cost. Prefer layout 0 in
6416 the event of a tie between it and another layout. */
6417 if (!min_layout_cost.is_possible ()
6418 || combined_cost.is_better_than (min_layout_cost,
6419 m_optimize_size))
6421 min_layout_i = layout_i;
6422 min_layout_cost = combined_cost;
6426 /* This loop's handling of earlier partitions should ensure that
6427 choosing the original layout for the current partition is no
6428 less valid than it was in the original graph, even with the
6429 provisional layout choices for those earlier partitions. */
6430 gcc_assert (min_layout_cost.is_possible ());
6431 partition.layout = min_layout_i;
6435 /* Make a backward pass through the partitions, accumulating output costs.
6436 Make a final choice of layout for each partition. */
6438 void
6439 vect_optimize_slp_pass::backward_pass ()
6441 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6443 auto &partition = m_partitions[partition_i];
6445 unsigned int min_layout_i = 0;
6446 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6447 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6449 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6450 if (!layout_costs.is_possible ())
6451 continue;
6453 /* Accumulate the costs from successor partitions. */
6454 bool is_possible = true;
6455 for (unsigned int order_i = partition.node_begin;
6456 order_i < partition.node_end; ++order_i)
6458 unsigned int node_i = m_partitioned_nodes[order_i];
6459 auto &vertex = m_vertices[node_i];
6460 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6462 auto &other_vertex = m_vertices[other_node_i];
6463 auto &other_partition = m_partitions[other_vertex.partition];
6464 if (other_vertex.partition > vertex.partition)
6466 /* Accumulate the incoming costs from later
6467 partitions, plus the cost of any layout changes
6468 on UD itself. */
6469 auto cost = backward_cost (ud, other_node_i, layout_i);
6470 if (!cost.is_possible ())
6471 is_possible = false;
6472 else
6473 layout_costs.out_cost.add_parallel_cost (cost);
6475 else
6476 /* Make sure that earlier partitions can (if necessary
6477 or beneficial) keep the layout that they chose in
6478 the forward pass. This ensures that there is at
6479 least one valid choice of layout. */
6480 is_possible &= edge_layout_cost (ud, other_node_i,
6481 other_partition.layout,
6482 layout_i).is_possible ();
6484 for_each_partition_edge (node_i, add_cost);
6486 if (!is_possible)
6488 layout_costs.mark_impossible ();
6489 continue;
6492 /* Locally combine the costs from the forward and backward passes.
6493 (This combined cost is not passed on, since that would lead
6494 to double counting.) */
6495 slpg_layout_cost combined_cost = layout_costs.in_cost;
6496 combined_cost.add_serial_cost (layout_costs.internal_cost);
6497 combined_cost.add_serial_cost (layout_costs.out_cost);
6499 /* Record the layout with the lowest cost. Prefer layout 0 in
6500 the event of a tie between it and another layout. */
6501 if (!min_layout_cost.is_possible ()
6502 || combined_cost.is_better_than (min_layout_cost,
6503 m_optimize_size))
6505 min_layout_i = layout_i;
6506 min_layout_cost = combined_cost;
6510 gcc_assert (min_layout_cost.is_possible ());
6511 partition.layout = min_layout_i;
6515 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6516 NODE already has the layout that was selected for its partition. */
6518 slp_tree
6519 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6520 unsigned int to_layout_i)
6522 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6523 slp_tree result = m_node_layouts[result_i];
6524 if (result)
6525 return result;
6527 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6528 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6529 /* We can't permute vector defs in place. */
6530 && SLP_TREE_VEC_DEFS (node).is_empty ()))
6532 /* If the vector is uniform or unchanged, there's nothing to do. */
6533 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6534 result = node;
6535 else
6537 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6538 result = vect_create_new_slp_node (scalar_ops);
6539 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6542 else
6544 unsigned int partition_i = m_vertices[node->vertex].partition;
6545 unsigned int from_layout_i = m_partitions[partition_i].layout;
6546 if (from_layout_i == to_layout_i)
6547 return node;
6549 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6550 permutation instead of a serial one. Leave the new permutation
6551 in TMP_PERM on success. */
6552 auto_lane_permutation_t tmp_perm;
6553 unsigned int num_inputs = 1;
6554 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6556 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6557 if (from_layout_i != 0)
6558 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6559 if (to_layout_i != 0)
6560 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6561 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6562 tmp_perm,
6563 SLP_TREE_CHILDREN (node),
6564 false) >= 0)
6565 num_inputs = SLP_TREE_CHILDREN (node).length ();
6566 else
6567 tmp_perm.truncate (0);
6570 if (dump_enabled_p ())
6572 if (tmp_perm.length () > 0)
6573 dump_printf_loc (MSG_NOTE, vect_location,
6574 "duplicating permutation node %p with"
6575 " layout %d\n",
6576 (void *) node, to_layout_i);
6577 else
6578 dump_printf_loc (MSG_NOTE, vect_location,
6579 "inserting permutation node in place of %p\n",
6580 (void *) node);
6583 unsigned int num_lanes = SLP_TREE_LANES (node);
6584 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6585 if (SLP_TREE_SCALAR_STMTS (node).length ())
6587 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6588 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6589 if (from_layout_i != 0)
6590 vect_slp_permute (m_perms[from_layout_i], stmts, false);
6591 if (to_layout_i != 0)
6592 vect_slp_permute (m_perms[to_layout_i], stmts, true);
6594 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6595 SLP_TREE_LANES (result) = num_lanes;
6596 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6597 result->vertex = -1;
6599 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6600 if (tmp_perm.length ())
6602 lane_perm.safe_splice (tmp_perm);
6603 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6605 else
6607 lane_perm.create (num_lanes);
6608 for (unsigned j = 0; j < num_lanes; ++j)
6609 lane_perm.quick_push ({ 0, j });
6610 if (from_layout_i != 0)
6611 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6612 if (to_layout_i != 0)
6613 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6614 SLP_TREE_CHILDREN (result).safe_push (node);
6616 for (slp_tree child : SLP_TREE_CHILDREN (result))
6617 child->refcnt++;
6619 m_node_layouts[result_i] = result;
6620 return result;
6623 /* Apply the chosen vector layouts to the SLP graph. */
6625 void
6626 vect_optimize_slp_pass::materialize ()
6628 /* We no longer need the costs, so avoid having two O(N * P) arrays
6629 live at the same time. */
6630 m_partition_layout_costs.release ();
6631 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6633 auto_sbitmap fully_folded (m_vertices.length ());
6634 bitmap_clear (fully_folded);
6635 for (unsigned int node_i : m_partitioned_nodes)
6637 auto &vertex = m_vertices[node_i];
6638 slp_tree node = vertex.node;
6639 int layout_i = m_partitions[vertex.partition].layout;
6640 gcc_assert (layout_i >= 0);
6642 /* Rearrange the scalar statements to match the chosen layout. */
6643 if (layout_i > 0)
6644 vect_slp_permute (m_perms[layout_i],
6645 SLP_TREE_SCALAR_STMTS (node), true);
6647 /* Update load and lane permutations. */
6648 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6650 /* First try to absorb the input vector layouts. If that fails,
6651 force the inputs to have layout LAYOUT_I too. We checked that
6652 that was possible before deciding to use nonzero output layouts.
6653 (Note that at this stage we don't really have any guarantee that
6654 the target supports the original VEC_PERM_EXPR.) */
6655 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6656 auto_lane_permutation_t tmp_perm;
6657 tmp_perm.safe_splice (perm);
6658 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
6659 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6660 tmp_perm,
6661 SLP_TREE_CHILDREN (node),
6662 false) >= 0)
6664 if (dump_enabled_p ()
6665 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
6666 perm.begin ()))
6667 dump_printf_loc (MSG_NOTE, vect_location,
6668 "absorbing input layouts into %p\n",
6669 (void *) node);
6670 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
6671 bitmap_set_bit (fully_folded, node_i);
6673 else
6675 /* Not MSG_MISSED because it would make no sense to users. */
6676 if (dump_enabled_p ())
6677 dump_printf_loc (MSG_NOTE, vect_location,
6678 "failed to absorb input layouts into %p\n",
6679 (void *) node);
6680 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
6683 else
6685 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
6686 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
6687 if (layout_i > 0)
6688 /* ??? When we handle non-bijective permutes the idea
6689 is that we can force the load-permutation to be
6690 { min, min + 1, min + 2, ... max }. But then the
6691 scalar defs might no longer match the lane content
6692 which means wrong-code with live lane vectorization.
6693 So we possibly have to have NULL entries for those. */
6694 vect_slp_permute (m_perms[layout_i], load_perm, true);
6698 /* Do this before any nodes disappear, since it involves a walk
6699 over the leaves. */
6700 remove_redundant_permutations ();
6702 /* Replace each child with a correctly laid-out version. */
6703 for (unsigned int node_i : m_partitioned_nodes)
6705 /* Skip nodes that have already been handled above. */
6706 if (bitmap_bit_p (fully_folded, node_i))
6707 continue;
6709 auto &vertex = m_vertices[node_i];
6710 int in_layout_i = m_partitions[vertex.partition].layout;
6711 gcc_assert (in_layout_i >= 0);
6713 unsigned j;
6714 slp_tree child;
6715 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
6717 if (!child)
6718 continue;
6720 slp_tree new_child = get_result_with_layout (child, in_layout_i);
6721 if (new_child != child)
6723 vect_free_slp_tree (child);
6724 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
6725 new_child->refcnt += 1;
6731 /* Elide load permutations that are not necessary. Such permutations might
6732 be pre-existing, rather than created by the layout optimizations. */
6734 void
6735 vect_optimize_slp_pass::remove_redundant_permutations ()
6737 for (unsigned int node_i : m_leafs)
6739 slp_tree node = m_vertices[node_i].node;
6740 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
6741 continue;
6743 /* In basic block vectorization we allow any subchain of an interleaving
6744 chain.
6745 FORNOW: not in loop SLP because of realignment complications. */
6746 if (is_a <bb_vec_info> (m_vinfo))
6748 bool subchain_p = true;
6749 stmt_vec_info next_load_info = NULL;
6750 stmt_vec_info load_info;
6751 unsigned j;
6752 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6754 if (j != 0
6755 && (next_load_info != load_info
6756 || ! load_info
6757 || DR_GROUP_GAP (load_info) != 1))
6759 subchain_p = false;
6760 break;
6762 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
6764 if (subchain_p)
6766 SLP_TREE_LOAD_PERMUTATION (node).release ();
6767 continue;
6770 else
6772 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
6773 stmt_vec_info load_info;
6774 bool this_load_permuted = false;
6775 unsigned j;
6776 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6777 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
6779 this_load_permuted = true;
6780 break;
6782 /* When this isn't a grouped access we know it's a single element
6783 and contiguous. */
6784 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
6786 if (!this_load_permuted
6787 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6788 || SLP_TREE_LANES (node) == 1))
6789 SLP_TREE_LOAD_PERMUTATION (node).release ();
6790 continue;
6792 stmt_vec_info first_stmt_info
6793 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
6794 if (!this_load_permuted
6795 /* The load requires permutation when unrolling exposes
6796 a gap either because the group is larger than the SLP
6797 group-size or because there is a gap between the groups. */
6798 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6799 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
6800 && DR_GROUP_GAP (first_stmt_info) == 0)))
6802 SLP_TREE_LOAD_PERMUTATION (node).release ();
6803 continue;
6809 /* Print the partition graph and layout information to the dump file. */
6811 void
6812 vect_optimize_slp_pass::dump ()
6814 dump_printf_loc (MSG_NOTE, vect_location,
6815 "SLP optimize permutations:\n");
6816 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
6818 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
6819 const char *sep = "";
6820 for (unsigned int idx : m_perms[layout_i])
6822 dump_printf (MSG_NOTE, "%s%d", sep, idx);
6823 sep = ", ";
6825 dump_printf (MSG_NOTE, " }\n");
6827 dump_printf_loc (MSG_NOTE, vect_location,
6828 "SLP optimize partitions:\n");
6829 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6830 ++partition_i)
6832 auto &partition = m_partitions[partition_i];
6833 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
6834 dump_printf_loc (MSG_NOTE, vect_location,
6835 " partition %d (layout %d):\n",
6836 partition_i, partition.layout);
6837 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
6838 for (unsigned int order_i = partition.node_begin;
6839 order_i < partition.node_end; ++order_i)
6841 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
6842 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
6843 (void *) vertex.node);
6844 dump_printf_loc (MSG_NOTE, vect_location,
6845 " weight: %f\n",
6846 vertex.weight.to_double ());
6847 if (vertex.out_degree)
6848 dump_printf_loc (MSG_NOTE, vect_location,
6849 " out weight: %f (degree %d)\n",
6850 vertex.out_weight.to_double (),
6851 vertex.out_degree);
6852 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
6853 dump_printf_loc (MSG_NOTE, vect_location,
6854 " op: VEC_PERM_EXPR\n");
6855 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
6856 dump_printf_loc (MSG_NOTE, vect_location,
6857 " op template: %G", rep->stmt);
6859 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
6860 for (unsigned int order_i = partition.node_begin;
6861 order_i < partition.node_end; ++order_i)
6863 unsigned int node_i = m_partitioned_nodes[order_i];
6864 auto &vertex = m_vertices[node_i];
6865 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
6867 auto &other_vertex = m_vertices[other_node_i];
6868 if (other_vertex.partition < vertex.partition)
6869 dump_printf_loc (MSG_NOTE, vect_location,
6870 " - %p [%d] --> %p\n",
6871 (void *) other_vertex.node,
6872 other_vertex.partition,
6873 (void *) vertex.node);
6874 else
6875 dump_printf_loc (MSG_NOTE, vect_location,
6876 " - %p --> [%d] %p\n",
6877 (void *) vertex.node,
6878 other_vertex.partition,
6879 (void *) other_vertex.node);
6881 for_each_partition_edge (node_i, print_edge);
6884 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6886 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6887 if (layout_costs.is_possible ())
6889 dump_printf_loc (MSG_NOTE, vect_location,
6890 " layout %d:%s\n", layout_i,
6891 partition.layout == int (layout_i)
6892 ? " (*)" : "");
6893 slpg_layout_cost combined_cost = layout_costs.in_cost;
6894 combined_cost.add_serial_cost (layout_costs.internal_cost);
6895 combined_cost.add_serial_cost (layout_costs.out_cost);
6896 #define TEMPLATE "{depth: %f, total: %f}"
6897 dump_printf_loc (MSG_NOTE, vect_location,
6898 " " TEMPLATE "\n",
6899 layout_costs.in_cost.depth.to_double (),
6900 layout_costs.in_cost.total.to_double ());
6901 dump_printf_loc (MSG_NOTE, vect_location,
6902 " + " TEMPLATE "\n",
6903 layout_costs.internal_cost.depth.to_double (),
6904 layout_costs.internal_cost.total.to_double ());
6905 dump_printf_loc (MSG_NOTE, vect_location,
6906 " + " TEMPLATE "\n",
6907 layout_costs.out_cost.depth.to_double (),
6908 layout_costs.out_cost.total.to_double ());
6909 dump_printf_loc (MSG_NOTE, vect_location,
6910 " = " TEMPLATE "\n",
6911 combined_cost.depth.to_double (),
6912 combined_cost.total.to_double ());
6913 #undef TEMPLATE
6915 else
6916 dump_printf_loc (MSG_NOTE, vect_location,
6917 " layout %d: rejected\n", layout_i);
6922 /* Main entry point for the SLP graph optimization pass. */
6924 void
6925 vect_optimize_slp_pass::run ()
6927 build_graph ();
6928 create_partitions ();
6929 start_choosing_layouts ();
6930 if (m_perms.length () > 1)
6932 forward_pass ();
6933 backward_pass ();
6934 if (dump_enabled_p ())
6935 dump ();
6936 materialize ();
6937 while (!m_perms.is_empty ())
6938 m_perms.pop ().release ();
6940 else
6941 remove_redundant_permutations ();
6942 free_graph (m_slpg);
6945 /* Apply CSE to NODE and its children using BST_MAP. */
6947 static void
6948 vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
6950 bool put_p = false;
6951 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
6952 /* Besides some VEC_PERM_EXPR, two-operator nodes also
6953 lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
6954 we'd have something that works for all internal and external nodes. */
6955 && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
6957 slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
6958 if (leader)
6960 /* We've visited this node already. */
6961 if (!*leader || *leader == node)
6962 return;
6964 if (dump_enabled_p ())
6965 dump_printf_loc (MSG_NOTE, vect_location,
6966 "re-using SLP tree %p for %p\n",
6967 (void *)*leader, (void *)node);
6968 vect_free_slp_tree (node);
6969 (*leader)->refcnt += 1;
6970 node = *leader;
6971 return;
6974 /* Avoid creating a cycle by populating the map only after recursion. */
6975 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
6976 node->refcnt += 1;
6977 put_p = true;
6978 /* And recurse. */
6981 for (slp_tree &child : SLP_TREE_CHILDREN (node))
6982 if (child)
6983 vect_cse_slp_nodes (bst_map, child);
6985 /* Now record the node for CSE in other siblings. */
6986 if (put_p)
6987 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
6990 /* Optimize the SLP graph of VINFO. */
6992 void
6993 vect_optimize_slp (vec_info *vinfo)
6995 if (vinfo->slp_instances.is_empty ())
6996 return;
6997 vect_optimize_slp_pass (vinfo).run ();
6999 /* Apply CSE again to nodes after permute optimization. */
7000 scalar_stmts_to_slp_tree_map_t *bst_map
7001 = new scalar_stmts_to_slp_tree_map_t ();
7003 for (auto inst : vinfo->slp_instances)
7004 vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
7006 release_scalar_stmts_to_slp_tree_map (bst_map);
7009 /* Gather loads reachable from the individual SLP graph entries. */
7011 void
7012 vect_gather_slp_loads (vec_info *vinfo)
7014 unsigned i;
7015 slp_instance instance;
7016 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
7018 hash_set<slp_tree> visited;
7019 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
7020 SLP_INSTANCE_TREE (instance), visited);
7025 /* For each possible SLP instance decide whether to SLP it and calculate overall
7026 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
7027 least one instance. */
7029 bool
7030 vect_make_slp_decision (loop_vec_info loop_vinfo)
7032 unsigned int i;
7033 poly_uint64 unrolling_factor = 1;
7034 const vec<slp_instance> &slp_instances
7035 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7036 slp_instance instance;
7037 int decided_to_slp = 0;
7039 DUMP_VECT_SCOPE ("vect_make_slp_decision");
7041 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7043 /* FORNOW: SLP if you can. */
7044 /* All unroll factors have the form:
7046 GET_MODE_SIZE (vinfo->vector_mode) * X
7048 for some rational X, so they must have a common multiple. */
7049 unrolling_factor
7050 = force_common_multiple (unrolling_factor,
7051 SLP_INSTANCE_UNROLLING_FACTOR (instance));
7053 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
7054 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
7055 loop-based vectorization. Such stmts will be marked as HYBRID. */
7056 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
7057 decided_to_slp++;
7060 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
7062 if (decided_to_slp && dump_enabled_p ())
7064 dump_printf_loc (MSG_NOTE, vect_location,
7065 "Decided to SLP %d instances. Unrolling factor ",
7066 decided_to_slp);
7067 dump_dec (MSG_NOTE, unrolling_factor);
7068 dump_printf (MSG_NOTE, "\n");
7071 return (decided_to_slp > 0);
7074 /* Private data for vect_detect_hybrid_slp. */
7075 struct vdhs_data
7077 loop_vec_info loop_vinfo;
7078 vec<stmt_vec_info> *worklist;
7081 /* Walker for walk_gimple_op. */
7083 static tree
7084 vect_detect_hybrid_slp (tree *tp, int *, void *data)
7086 walk_stmt_info *wi = (walk_stmt_info *)data;
7087 vdhs_data *dat = (vdhs_data *)wi->info;
7089 if (wi->is_lhs)
7090 return NULL_TREE;
7092 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7093 if (!def_stmt_info)
7094 return NULL_TREE;
7095 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
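 /* The stmt being walked needs loop-based vectorization, so a pure-SLP
    def it uses must also be available as a scalar; mark that def hybrid
    and queue it so its own operands get processed as well.  */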
7096 if (PURE_SLP_STMT (def_stmt_info))
7098 if (dump_enabled_p ())
7099 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7100 def_stmt_info->stmt);
7101 STMT_SLP_TYPE (def_stmt_info) = hybrid;
7102 dat->worklist->safe_push (def_stmt_info);
7105 return NULL_TREE;
 7108 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
 7109 if so, otherwise push it to WORKLIST. */
7111 static void
7112 maybe_push_to_hybrid_worklist (vec_info *vinfo,
7113 vec<stmt_vec_info> &worklist,
7114 stmt_vec_info stmt_info)
7116 if (dump_enabled_p ())
7117 dump_printf_loc (MSG_NOTE, vect_location,
7118 "Processing hybrid candidate : %G", stmt_info->stmt);
7119 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7120 imm_use_iterator iter2;
7121 ssa_op_iter iter1;
7122 use_operand_p use_p;
7123 def_operand_p def_p;
7124 bool any_def = false;
7125 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7127 any_def = true;
7128 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7130 if (is_gimple_debug (USE_STMT (use_p)))
7131 continue;
7132 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
 7133 /* An out-of-loop use means this is a loop_vect sink. */
7134 if (!use_info)
7136 if (dump_enabled_p ())
7137 dump_printf_loc (MSG_NOTE, vect_location,
7138 "Found loop_vect sink: %G", stmt_info->stmt);
7139 worklist.safe_push (stmt_info);
7140 return;
7142 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7144 if (dump_enabled_p ())
7145 dump_printf_loc (MSG_NOTE, vect_location,
7146 "Found loop_vect use: %G", use_info->stmt);
7147 worklist.safe_push (stmt_info);
7148 return;
 7152 /* No def means this is a loop_vect sink. */
7153 if (!any_def)
7155 if (dump_enabled_p ())
7156 dump_printf_loc (MSG_NOTE, vect_location,
7157 "Found loop_vect sink: %G", stmt_info->stmt);
7158 worklist.safe_push (stmt_info);
7159 return;
7161 if (dump_enabled_p ())
7162 dump_printf_loc (MSG_NOTE, vect_location,
7163 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7164 STMT_SLP_TYPE (stmt_info) = pure_slp;
7167 /* Find stmts that must be both vectorized and SLPed. */
7169 void
7170 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7172 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7174 /* All stmts participating in SLP are marked pure_slp, all other
7175 stmts are loop_vect.
7176 First collect all loop_vect stmts into a worklist.
7177 SLP patterns cause not all original scalar stmts to appear in
7178 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
 7179 Rectify this here and do a backward walk over the IL, only considering
 7180 stmts as loop_vect when they are used by a loop_vect stmt and otherwise
 7181 marking them as pure_slp. */
7182 auto_vec<stmt_vec_info> worklist;
7183 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7185 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7186 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7187 gsi_next (&gsi))
7189 gphi *phi = gsi.phi ();
7190 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7191 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7192 maybe_push_to_hybrid_worklist (loop_vinfo,
7193 worklist, stmt_info);
7195 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7196 gsi_prev (&gsi))
7198 gimple *stmt = gsi_stmt (gsi);
7199 if (is_gimple_debug (stmt))
7200 continue;
7201 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7202 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7204 for (gimple_stmt_iterator gsi2
7205 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7206 !gsi_end_p (gsi2); gsi_next (&gsi2))
7208 stmt_vec_info patt_info
7209 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7210 if (!STMT_SLP_TYPE (patt_info)
7211 && STMT_VINFO_RELEVANT (patt_info))
7212 maybe_push_to_hybrid_worklist (loop_vinfo,
7213 worklist, patt_info);
7215 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7217 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7218 maybe_push_to_hybrid_worklist (loop_vinfo,
7219 worklist, stmt_info);
7223 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
7224 mark any SLP vectorized stmt as hybrid.
7225 ??? We're visiting def stmts N times (once for each non-SLP and
7226 once for each hybrid-SLP use). */
7227 walk_stmt_info wi;
7228 vdhs_data dat;
7229 dat.worklist = &worklist;
7230 dat.loop_vinfo = loop_vinfo;
7231 memset (&wi, 0, sizeof (wi));
7232 wi.info = (void *)&dat;
7233 while (!worklist.is_empty ())
7235 stmt_vec_info stmt_info = worklist.pop ();
7236 /* Since SSA operands are not set up for pattern stmts we need
7237 to use walk_gimple_op. */
7238 wi.is_lhs = 0;
7239 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
 7240 /* For gather/scatter make sure to walk the offset operand, which
 7241 can be a scaling and conversion away. */
7242 gather_scatter_info gs_info;
7243 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7244 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7246 int dummy;
7247 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7253 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
7255 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7256 : vec_info (vec_info::bb, shared),
7257 roots (vNULL)
7259 /* The region we are operating on. bbs[0] is the entry, excluding
7260 its PHI nodes. In the future we might want to track an explicit
7261 entry edge to cover bbs[0] PHI nodes and have a region entry
7262 insert location. */
7263 bbs = _bbs.address ();
7264 nbbs = _bbs.length ();
7266 for (unsigned i = 0; i < nbbs; ++i)
7268 if (i != 0)
7269 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7270 gsi_next (&si))
7272 gphi *phi = si.phi ();
7273 gimple_set_uid (phi, 0);
7274 add_stmt (phi);
7276 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7277 !gsi_end_p (gsi); gsi_next (&gsi))
7279 gimple *stmt = gsi_stmt (gsi);
7280 gimple_set_uid (stmt, 0);
7281 if (is_gimple_debug (stmt))
7282 continue;
7283 add_stmt (stmt);
7289 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7290 stmts in the basic block. */
7292 _bb_vec_info::~_bb_vec_info ()
7294 /* Reset region marker. */
7295 for (unsigned i = 0; i < nbbs; ++i)
7297 if (i != 0)
7298 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7299 gsi_next (&si))
7301 gphi *phi = si.phi ();
7302 gimple_set_uid (phi, -1);
7304 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7305 !gsi_end_p (gsi); gsi_next (&gsi))
7307 gimple *stmt = gsi_stmt (gsi);
7308 gimple_set_uid (stmt, -1);
7312 for (unsigned i = 0; i < roots.length (); ++i)
7314 roots[i].stmts.release ();
7315 roots[i].roots.release ();
7316 roots[i].remain.release ();
7318 roots.release ();
7321 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
 7322 given that child nodes have already been processed, and that
7323 their def types currently match their SLP node's def type. */
7325 static bool
7326 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7327 slp_instance node_instance,
7328 stmt_vector_for_cost *cost_vec)
7330 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7332 /* Calculate the number of vector statements to be created for the scalar
7333 stmts in this node. It is the number of scalar elements in one scalar
7334 iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
 7335 elements in a vector. For a single-defuse-cycle, a lane-reducing op, and a
 7336 PHI statement that starts a reduction comprised of only lane-reducing ops,
 7337 the number is larger than the number of vector statements actually required. */
7338 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7340 /* Handle purely internal nodes. */
7341 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7343 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7344 return false;
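 /* The permutation itself has been costed above; also verify that any
    live lanes can be extracted via vectorizable_live_operation.  */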
7346 stmt_vec_info slp_stmt_info;
7347 unsigned int i;
7348 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7350 if (slp_stmt_info
7351 && STMT_VINFO_LIVE_P (slp_stmt_info)
7352 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7353 node_instance, i,
7354 false, cost_vec))
7355 return false;
7357 return true;
7360 bool dummy;
7361 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7362 node, node_instance, cost_vec);
7365 /* Try to build NODE from scalars, returning true on success.
7366 NODE_INSTANCE is the SLP instance that contains NODE. */
7368 static bool
7369 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7370 slp_instance node_instance)
7372 stmt_vec_info stmt_info;
7373 unsigned int i;
7375 if (!is_a <bb_vec_info> (vinfo)
7376 || node == SLP_INSTANCE_TREE (node_instance)
7377 || !SLP_TREE_SCALAR_STMTS (node).exists ()
7378 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7379 /* Force the mask use to be built from scalars instead. */
7380 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
7381 return false;
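 /* Building from scalars requires a scalar stmt for every lane.  */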
7383 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7384 if (!stmt_info)
7385 return false;
7387 if (dump_enabled_p ())
7388 dump_printf_loc (MSG_NOTE, vect_location,
7389 "Building vector operands of %p from scalars instead\n",
7390 (void *) node);
7392 /* Don't remove and free the child nodes here, since they could be
7393 referenced by other structures. The analysis and scheduling phases
7394 (need to) ignore child nodes of anything that isn't vect_internal_def. */
7395 unsigned int group_size = SLP_TREE_LANES (node);
7396 SLP_TREE_DEF_TYPE (node) = vect_external_def;
7397 /* Invariants get their vector type from the uses. */
7398 SLP_TREE_VECTYPE (node) = NULL_TREE;
7399 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7400 SLP_TREE_LOAD_PERMUTATION (node).release ();
7401 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7403 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7404 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7406 return true;
7409 /* Return true if all elements of the slice are the same. */
7410 bool
7411 vect_scalar_ops_slice::all_same_p () const
7413 for (unsigned int i = 1; i < length; ++i)
7414 if (!operand_equal_p (op (0), op (i)))
7415 return false;
7416 return true;
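 /* Hash a slice of scalar operands by iteratively hashing each operand.  */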
7419 hashval_t
7420 vect_scalar_ops_slice_hash::hash (const value_type &s)
7422 hashval_t hash = 0;
7423 for (unsigned i = 0; i < s.length; ++i)
7424 hash = iterative_hash_expr (s.op (i), hash);
7425 return hash;
7428 bool
7429 vect_scalar_ops_slice_hash::equal (const value_type &s1,
7430 const compare_type &s2)
7432 if (s1.length != s2.length)
7433 return false;
7434 for (unsigned i = 0; i < s1.length; ++i)
7435 if (!operand_equal_p (s1.op (i), s2.op (i)))
7436 return false;
7437 return true;
7440 /* Compute the prologue cost for invariant or constant operands represented
7441 by NODE. */
7443 static void
7444 vect_prologue_cost_for_slp (slp_tree node,
7445 stmt_vector_for_cost *cost_vec)
7447 /* There's a special case of an existing vector, that costs nothing. */
7448 if (SLP_TREE_SCALAR_OPS (node).length () == 0
7449 && !SLP_TREE_VEC_DEFS (node).is_empty ())
7450 return;
 7451 /* Without looking at the actual initializer a vector of
 7452 constants can be implemented as a load from the constant pool.
7453 When all elements are the same we can use a splat. */
7454 tree vectype = SLP_TREE_VECTYPE (node);
7455 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7456 unsigned HOST_WIDE_INT const_nunits;
7457 unsigned nelt_limit;
7458 auto ops = &SLP_TREE_SCALAR_OPS (node);
7459 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7460 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7461 && ! multiple_p (const_nunits, group_size))
7463 nelt_limit = const_nunits;
7464 hash_set<vect_scalar_ops_slice_hash> vector_ops;
7465 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7466 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
7467 starts.quick_push (i * const_nunits);
7469 else
7471 /* If either the vector has variable length or the vectors
7472 are composed of repeated whole groups we only need to
7473 cost construction once. All vectors will be the same. */
7474 nelt_limit = group_size;
7475 starts.quick_push (0);
7477 /* ??? We're just tracking whether vectors in a single node are the same.
7478 Ideally we'd do something more global. */
7479 bool passed = false;
7480 for (unsigned int start : starts)
7482 vect_cost_for_stmt kind;
7483 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7484 kind = vector_load;
7485 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7486 kind = scalar_to_vec;
7487 else
7488 kind = vec_construct;
7489 /* The target cost hook has no idea which part of the SLP node
7490 we are costing so avoid passing it down more than once. Pass
7491 it to the first vec_construct or scalar_to_vec part since for those
7492 the x86 backend tries to account for GPR to XMM register moves. */
7493 record_stmt_cost (cost_vec, 1, kind,
7494 (kind != vector_load && !passed) ? node : nullptr,
7495 vectype, 0, vect_prologue);
7496 if (kind != vector_load)
7497 passed = true;
7501 /* Analyze statements contained in SLP tree NODE after recursively analyzing
7502 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7504 Return true if the operations are supported. */
7506 static bool
7507 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7508 slp_instance node_instance,
7509 hash_set<slp_tree> &visited_set,
7510 vec<slp_tree> &visited_vec,
7511 stmt_vector_for_cost *cost_vec)
7513 int i, j;
7514 slp_tree child;
7516 /* Assume we can code-generate all invariants. */
7517 if (!node
7518 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7519 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7520 return true;
7522 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
7524 if (dump_enabled_p ())
7525 dump_printf_loc (MSG_NOTE, vect_location,
7526 "Failed cyclic SLP reference in %p\n", (void *) node);
7527 return false;
7529 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
7531 /* If we already analyzed the exact same set of scalar stmts we're done.
7532 We share the generated vector stmts for those. */
7533 if (visited_set.add (node))
7534 return true;
7535 visited_vec.safe_push (node);
7537 bool res = true;
7538 unsigned visited_rec_start = visited_vec.length ();
7539 unsigned cost_vec_rec_start = cost_vec->length ();
7540 bool seen_non_constant_child = false;
7541 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7543 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
7544 visited_set, visited_vec,
7545 cost_vec);
7546 if (!res)
7547 break;
7548 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
7549 seen_non_constant_child = true;
7551 /* We're having difficulties scheduling nodes with just constant
7552 operands and no scalar stmts since we then cannot compute a stmt
7553 insertion place. */
7554 if (res
7555 && !seen_non_constant_child
7556 && SLP_TREE_SCALAR_STMTS (node).is_empty ())
7558 if (dump_enabled_p ())
7559 dump_printf_loc (MSG_NOTE, vect_location,
7560 "Cannot vectorize all-constant op node %p\n",
7561 (void *) node);
7562 res = false;
7565 if (res)
7566 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
7567 cost_vec);
 7568 /* If analysis failed we have to pop all recursively visited nodes
 7569 plus ourselves. */
7570 if (!res)
7572 while (visited_vec.length () >= visited_rec_start)
7573 visited_set.remove (visited_vec.pop ());
7574 cost_vec->truncate (cost_vec_rec_start);
7577 /* When the node can be vectorized cost invariant nodes it references.
 7578 This is not done in DFS order to allow the referring node's
 7579 vectorizable_* calls to nail down the invariant nodes' vector type
 7580 and possibly unshare it if it needs a different vector type than
 7581 other referrers. */
7582 if (res)
7583 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
7584 if (child
7585 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
7586 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
7587 /* Perform usual caching, note code-generation still
7588 code-gens these nodes multiple times but we expect
7589 to CSE them later. */
7590 && !visited_set.add (child))
7592 visited_vec.safe_push (child);
7593 /* ??? After auditing more code paths make a "default"
7594 and push the vector type from NODE to all children
7595 if it is not already set. */
7596 /* Compute the number of vectors to be generated. */
7597 tree vector_type = SLP_TREE_VECTYPE (child);
7598 if (!vector_type)
7600 /* For shifts with a scalar argument we don't need
7601 to cost or code-generate anything.
 7602 ??? Represent this more explicitly. */
7603 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
7604 == shift_vec_info_type)
7605 && j == 1);
7606 continue;
7609 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
7610 = vect_get_num_copies (vinfo, child);
7611 /* And cost them. */
7612 vect_prologue_cost_for_slp (child, cost_vec);
7615 /* If this node or any of its children can't be vectorized, try pruning
7616 the tree here rather than felling the whole thing. */
7617 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
 7619 /* We'll need to revisit this for invariant costing and for setting
 7620 the number of vectorized stmts. */
7621 res = true;
7624 return res;
 7627 /* Given a definition DEF, analyze whether it will have any live scalar use
 7628 after performing the SLP vectorization described by BB_VINFO, and record
 7629 the result in hash map SCALAR_USE_MAP as a cache for later fast checks.
 7630 If recursion DEPTH exceeds a limit, stop the analysis and make a
 7631 conservative assumption. Return 0 if there is no scalar use, 1 if there
 7632 is one, and -1 if recursion was limited. */
7634 static int
7635 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
7636 hash_map<tree, int> &scalar_use_map,
7637 int depth = 0)
7639 const int depth_limit = 2;
7640 imm_use_iterator use_iter;
7641 gimple *use_stmt;
7643 if (int *res = scalar_use_map.get (def))
7644 return *res;
7646 int scalar_use = 1;
7648 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
7650 if (is_gimple_debug (use_stmt))
7651 continue;
7653 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
7655 if (!use_stmt_info)
7656 break;
7658 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7659 continue;
 7661 /* Do not step forward when encountering a PHI statement, since it may
 7662 involve a cyclic reference and cause infinite recursion. */
7663 if (gimple_code (use_stmt) == GIMPLE_PHI)
7664 break;
 7666 /* When pattern recognition is involved, a statement whose definition is
 7667 consumed in some pattern may not be included in the final replacement
 7668 pattern statements, and so would be skipped when building the SLP graph.
7670 * Original
7671 char a_c = *(char *) a;
7672 char b_c = *(char *) b;
7673 unsigned short a_s = (unsigned short) a_c;
7674 int a_i = (int) a_s;
7675 int b_i = (int) b_c;
7676 int r_i = a_i - b_i;
7678 * After pattern replacement
7679 a_s = (unsigned short) a_c;
7680 a_i = (int) a_s;
7682 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
7683 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
7685 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
7686 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
 7688 The definitions of a_i (original statement) and b_i (pattern statement)
 7689 are related to, but not actually part of, the widen_minus pattern.
 7690 Vectorizing the pattern does not cause these definition statements to
 7691 be marked as PURE_SLP. For this case, we need to recursively check
 7692 whether their uses are all absorbed into vectorized code. There is an
 7693 exception, though: some use may participate in a vectorized
 7694 operation via an external SLP node containing that use as an element.
 7695 The parameter "scalar_use_map" tags such SSA names as having a scalar
 7696 use in advance. */
7697 tree lhs = gimple_get_lhs (use_stmt);
7699 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
7700 break;
7702 if (depth_limit && depth >= depth_limit)
7703 return -1;
7705 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
7706 depth + 1)))
7707 break;
7710 if (end_imm_use_stmt_p (&use_iter))
7711 scalar_use = 0;
7713 /* If recursion is limited, do not cache result for non-root defs. */
7714 if (!depth || scalar_use >= 0)
7716 bool added = scalar_use_map.put (def, scalar_use);
7717 gcc_assert (!added);
7720 return scalar_use;
7723 /* Mark lanes of NODE that are live outside of the basic-block vectorized
7724 region and that can be vectorized using vectorizable_live_operation
 7725 with STMT_VINFO_LIVE_P. Unhandled live operations will cause the
 7726 scalar code computing them to be retained. */
7728 static void
7729 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
7730 slp_instance instance,
7731 stmt_vector_for_cost *cost_vec,
7732 hash_map<tree, int> &scalar_use_map,
7733 hash_set<stmt_vec_info> &svisited,
7734 hash_set<slp_tree> &visited)
7736 if (visited.add (node))
7737 return;
7739 unsigned i;
7740 stmt_vec_info stmt_info;
7741 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
7742 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7744 if (!stmt_info || svisited.contains (stmt_info))
7745 continue;
7746 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7747 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
7748 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
7749 /* Only the pattern root stmt computes the original scalar value. */
7750 continue;
7751 bool mark_visited = true;
7752 gimple *orig_stmt = orig_stmt_info->stmt;
7753 ssa_op_iter op_iter;
7754 def_operand_p def_p;
7755 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
7757 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
7758 scalar_use_map))
7760 STMT_VINFO_LIVE_P (stmt_info) = true;
7761 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
7762 instance, i, false, cost_vec))
7763 /* ??? So we know we can vectorize the live stmt from one SLP
7764 node. If we cannot do so from all or none consistently
7765 we'd have to record which SLP node (and lane) we want to
7766 use for the live operation. So make sure we can
7767 code-generate from all nodes. */
7768 mark_visited = false;
7769 else
7770 STMT_VINFO_LIVE_P (stmt_info) = false;
7773 /* We have to verify whether we can insert the lane extract
7774 before all uses. The following is a conservative approximation.
7775 We cannot put this into vectorizable_live_operation because
7776 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
7777 doesn't work.
 7778 Note that while the fact that we emit code for loads at the
 7779 first load should make this a non-problem, leaves we construct
 7780 from scalars are vectorized after the last scalar def.
7781 ??? If we'd actually compute the insert location during
7782 analysis we could use sth less conservative than the last
7783 scalar stmt in the node for the dominance check. */
7784 /* ??? What remains is "live" uses in vector CTORs in the same
7785 SLP graph which is where those uses can end up code-generated
7786 right after their definition instead of close to their original
7787 use. But that would restrict us to code-generate lane-extracts
7788 from the latest stmt in a node. So we compensate for this
7789 during code-generation, simply not replacing uses for those
7790 hopefully rare cases. */
7791 imm_use_iterator use_iter;
7792 gimple *use_stmt;
7793 stmt_vec_info use_stmt_info;
7795 if (STMT_VINFO_LIVE_P (stmt_info))
7796 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
7797 if (!is_gimple_debug (use_stmt)
7798 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
7799 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7800 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
7802 if (dump_enabled_p ())
7803 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7804 "Cannot determine insertion place for "
7805 "lane extract\n");
7806 STMT_VINFO_LIVE_P (stmt_info) = false;
7807 mark_visited = true;
7810 if (mark_visited)
7811 svisited.add (stmt_info);
7814 slp_tree child;
7815 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7816 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7817 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
7818 scalar_use_map, svisited, visited);
7821 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
7822 are live outside of the basic-block vectorized region and that can be
7823 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
7825 static void
7826 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
7828 if (bb_vinfo->slp_instances.is_empty ())
7829 return;
7831 hash_set<stmt_vec_info> svisited;
7832 hash_set<slp_tree> visited;
7833 hash_map<tree, int> scalar_use_map;
7834 auto_vec<slp_tree> worklist;
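 /* Pre-seed SCALAR_USE_MAP: SSA names consumed as remaining defs of a BB
    reduction or as scalar operands of external SLP nodes keep a scalar
    use even when their definition is vectorized.  */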
7836 for (slp_instance instance : bb_vinfo->slp_instances)
7838 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
7839 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
7840 if (TREE_CODE (op) == SSA_NAME)
7841 scalar_use_map.put (op, 1);
7842 if (!visited.add (SLP_INSTANCE_TREE (instance)))
7843 worklist.safe_push (SLP_INSTANCE_TREE (instance));
7848 slp_tree node = worklist.pop ();
7850 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
7852 for (tree op : SLP_TREE_SCALAR_OPS (node))
7853 if (TREE_CODE (op) == SSA_NAME)
7854 scalar_use_map.put (op, 1);
7856 else
7858 for (slp_tree child : SLP_TREE_CHILDREN (node))
7859 if (child && !visited.add (child))
7860 worklist.safe_push (child);
7863 while (!worklist.is_empty ());
7865 visited.empty ();
7867 for (slp_instance instance : bb_vinfo->slp_instances)
7869 vect_location = instance->location ();
7870 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
7871 instance, &instance->cost_vec,
7872 scalar_use_map, svisited, visited);
7876 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
7878 static bool
7879 vectorizable_bb_reduc_epilogue (slp_instance instance,
7880 stmt_vector_for_cost *cost_vec)
7882 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
7883 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
7884 if (reduc_code == MINUS_EXPR)
7885 reduc_code = PLUS_EXPR;
7886 internal_fn reduc_fn;
7887 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
7888 if (!vectype
7889 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7890 || reduc_fn == IFN_LAST
7891 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
7892 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
7893 TREE_TYPE (vectype)))
7895 if (dump_enabled_p ())
7896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7897 "not vectorized: basic block reduction epilogue "
7898 "operation unsupported.\n");
7899 return false;
7902 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
7903 cost log2 vector operations plus shuffles and one extraction. */
7904 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
7905 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
7906 vectype, 0, vect_body);
7907 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
7908 vectype, 0, vect_body);
7909 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
7910 vectype, 0, vect_body);
7912 /* Since we replace all stmts of a possibly longer scalar reduction
7913 chain account for the extra scalar stmts for that. */
7914 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
7915 instance->root_stmts[0], 0, vect_body);
7916 return true;
7919 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
7920 and recurse to children. */
7922 static void
7923 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
7924 hash_set<slp_tree> &visited)
7926 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7927 || visited.add (node))
7928 return;
7930 stmt_vec_info stmt;
7931 unsigned i;
7932 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
7933 if (stmt)
7934 roots.remove (vect_orig_stmt (stmt));
7936 slp_tree child;
7937 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7938 if (child)
7939 vect_slp_prune_covered_roots (child, roots, visited);
7942 /* Analyze statements in SLP instances of VINFO. Return true if the
7943 operations are supported. */
7945 bool
7946 vect_slp_analyze_operations (vec_info *vinfo)
7948 slp_instance instance;
7949 int i;
7951 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
7953 hash_set<slp_tree> visited;
7954 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
7956 auto_vec<slp_tree> visited_vec;
7957 stmt_vector_for_cost cost_vec;
7958 cost_vec.create (2);
7959 if (is_a <bb_vec_info> (vinfo))
7960 vect_location = instance->location ();
7961 if (!vect_slp_analyze_node_operations (vinfo,
7962 SLP_INSTANCE_TREE (instance),
7963 instance, visited, visited_vec,
7964 &cost_vec)
7965 /* CTOR instances require vectorized defs for the SLP tree root. */
7966 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
7967 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
7968 != vect_internal_def
7969 /* Make sure we vectorized with the expected type. */
7970 || !useless_type_conversion_p
7971 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
7972 (instance->root_stmts[0]->stmt))),
7973 TREE_TYPE (SLP_TREE_VECTYPE
7974 (SLP_INSTANCE_TREE (instance))))))
7975 /* Check we can vectorize the reduction. */
7976 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
7977 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
7979 cost_vec.release ();
7980 slp_tree node = SLP_INSTANCE_TREE (instance);
7981 stmt_vec_info stmt_info;
7982 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7983 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
7984 else
7985 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7986 if (is_a <loop_vec_info> (vinfo))
7988 if (dump_enabled_p ())
7989 dump_printf_loc (MSG_NOTE, vect_location,
7990 "unsupported SLP instance starting from: %G",
7991 stmt_info->stmt);
7992 return false;
7994 if (dump_enabled_p ())
7995 dump_printf_loc (MSG_NOTE, vect_location,
7996 "removing SLP instance operations starting from: %G",
7997 stmt_info->stmt);
7998 vect_free_slp_instance (instance);
7999 vinfo->slp_instances.ordered_remove (i);
8000 while (!visited_vec.is_empty ())
8001 visited.remove (visited_vec.pop ());
8003 else
8005 i++;
8006 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
8008 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
8009 cost_vec.release ();
8011 else
8012 /* For BB vectorization remember the SLP graph entry
8013 cost for later. */
8014 instance->cost_vec = cost_vec;
8018 /* Now look for SLP instances with a root that are covered by other
8019 instances and remove them. */
8020 hash_set<stmt_vec_info> roots;
8021 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8022 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8023 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
8024 if (!roots.is_empty ())
8026 visited.empty ();
8027 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8028 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
8029 visited);
8030 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8031 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
8032 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
8034 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8035 if (dump_enabled_p ())
8036 dump_printf_loc (MSG_NOTE, vect_location,
8037 "removing SLP instance operations starting "
8038 "from: %G", root->stmt);
8039 vect_free_slp_instance (instance);
8040 vinfo->slp_instances.ordered_remove (i);
8042 else
8043 ++i;
8046 /* Compute vectorizable live stmts. */
8047 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
8048 vect_bb_slp_mark_live_stmts (bb_vinfo);
8050 return !vinfo->slp_instances.is_empty ();
 8053 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
 8054 transitively compressing the leader chain along the way. */
8056 static slp_instance
8057 get_ultimate_leader (slp_instance instance,
8058 hash_map<slp_instance, slp_instance> &instance_leader)
8060 auto_vec<slp_instance *, 8> chain;
8061 slp_instance *tem;
8062 while (*(tem = instance_leader.get (instance)) != instance)
8064 chain.safe_push (tem);
8065 instance = *tem;
8067 while (!chain.is_empty ())
8068 *chain.pop () = instance;
8069 return instance;
8072 namespace {
8073 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
8074 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
8075 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
8077 INSTANCE_LEADER is as for get_ultimate_leader. */
8079 template<typename T>
8080 bool
8081 vect_map_to_instance (slp_instance instance, T key,
8082 hash_map<T, slp_instance> &key_to_instance,
8083 hash_map<slp_instance, slp_instance> &instance_leader)
8085 bool existed_p;
8086 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
8087 if (!existed_p)
8089 else if (key_instance != instance)
8091 /* If we're running into a previously marked key make us the
8092 leader of the current ultimate leader. This keeps the
8093 leader chain acyclic and works even when the current instance
8094 connects two previously independent graph parts. */
8095 slp_instance key_leader
8096 = get_ultimate_leader (key_instance, instance_leader);
8097 if (key_leader != instance)
8098 instance_leader.put (key_leader, instance);
8100 key_instance = instance;
8101 return existed_p;
8105 /* Worker of vect_bb_partition_graph, recurse on NODE. */
8107 static void
8108 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8109 slp_instance instance, slp_tree node,
8110 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8111 hash_map<slp_tree, slp_instance> &node_to_instance,
8112 hash_map<slp_instance, slp_instance> &instance_leader)
8114 stmt_vec_info stmt_info;
8115 unsigned i;
8117 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8118 if (stmt_info)
8119 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8120 instance_leader);
8122 if (vect_map_to_instance (instance, node, node_to_instance,
8123 instance_leader))
8124 return;
8126 slp_tree child;
8127 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8128 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8129 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8130 node_to_instance, instance_leader);
8133 /* Partition the SLP graph into pieces that can be costed independently. */
8135 static void
8136 vect_bb_partition_graph (bb_vec_info bb_vinfo)
8138 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8140 /* First walk the SLP graph assigning each involved scalar stmt a
8141 corresponding SLP graph entry and upon visiting a previously
 8142 marked stmt, make the stmt's leader the current SLP graph entry. */
8143 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8144 hash_map<slp_tree, slp_instance> node_to_instance;
8145 hash_map<slp_instance, slp_instance> instance_leader;
8146 slp_instance instance;
8147 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8149 instance_leader.put (instance, instance);
8150 vect_bb_partition_graph_r (bb_vinfo,
8151 instance, SLP_INSTANCE_TREE (instance),
8152 stmt_to_instance, node_to_instance,
8153 instance_leader);
8156 /* Then collect entries to each independent subgraph. */
8157 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8159 slp_instance leader = get_ultimate_leader (instance, instance_leader);
8160 leader->subgraph_entries.safe_push (instance);
8161 if (dump_enabled_p ()
8162 && leader != instance)
8163 dump_printf_loc (MSG_NOTE, vect_location,
8164 "instance %p is leader of %p\n",
8165 (void *) leader, (void *) instance);
8169 /* Compute the set of scalar stmts participating in internal and external
8170 nodes. */
8172 static void
8173 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8174 hash_set<slp_tree> &visited,
8175 hash_set<stmt_vec_info> &vstmts,
8176 hash_set<stmt_vec_info> &estmts)
8178 int i;
8179 stmt_vec_info stmt_info;
8180 slp_tree child;
8182 if (visited.add (node))
8183 return;
8185 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8187 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8188 if (stmt_info)
8189 vstmts.add (stmt_info);
8191 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8192 if (child)
8193 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8194 vstmts, estmts);
8196 else
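 /* For external nodes the scalar definitions of the operands have to be
    preserved, so collect them separately.  */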
8197 for (tree def : SLP_TREE_SCALAR_OPS (node))
8199 stmt_vec_info def_stmt = vinfo->lookup_def (def);
8200 if (def_stmt)
8201 estmts.add (def_stmt);
 8206 /* Compute the scalar cost of the SLP node NODE and its children
 8207 and record it in COST_VEC. Do not account defs that are marked in LIFE and
 8208 update LIFE according to uses of NODE. */
8210 static void
8211 vect_bb_slp_scalar_cost (vec_info *vinfo,
8212 slp_tree node, vec<bool, va_heap> *life,
8213 stmt_vector_for_cost *cost_vec,
8214 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8215 hash_set<slp_tree> &visited)
8217 unsigned i;
8218 stmt_vec_info stmt_info;
8219 slp_tree child;
8221 if (visited.add (node))
8222 return;
8224 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8226 ssa_op_iter op_iter;
8227 def_operand_p def_p;
8229 if (!stmt_info || (*life)[i])
8230 continue;
8232 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8233 gimple *orig_stmt = orig_stmt_info->stmt;
8235 /* If there is a non-vectorized use of the defs then the scalar
8236 stmt is kept live in which case we do not account it or any
8237 required defs in the SLP children in the scalar cost. This
8238 way we make the vectorization more costly when compared to
8239 the scalar cost. */
8240 if (!STMT_VINFO_LIVE_P (stmt_info))
8242 auto_vec<gimple *, 8> worklist;
8243 hash_set<gimple *> *worklist_visited = NULL;
8244 worklist.quick_push (orig_stmt);
8247 gimple *work_stmt = worklist.pop ();
8248 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8250 imm_use_iterator use_iter;
8251 gimple *use_stmt;
8252 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8253 DEF_FROM_PTR (def_p))
8254 if (!is_gimple_debug (use_stmt))
8256 stmt_vec_info use_stmt_info
8257 = vinfo->lookup_stmt (use_stmt);
8258 if (!use_stmt_info
8259 || !vectorized_scalar_stmts.contains (use_stmt_info))
8261 if (use_stmt_info
8262 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
 8264 /* For stmts participating in patterns we have
 8265 to check their uses recursively. */
8266 if (!worklist_visited)
8267 worklist_visited = new hash_set<gimple *> ();
8268 if (!worklist_visited->add (use_stmt))
8269 worklist.safe_push (use_stmt);
8270 continue;
8272 (*life)[i] = true;
8273 goto next_lane;
8278 while (!worklist.is_empty ());
8279 next_lane:
8280 if (worklist_visited)
8281 delete worklist_visited;
8282 if ((*life)[i])
8283 continue;
8286 /* Count scalar stmts only once. */
8287 if (gimple_visited_p (orig_stmt))
8288 continue;
8289 gimple_set_visited (orig_stmt, true);
8291 vect_cost_for_stmt kind;
8292 if (STMT_VINFO_DATA_REF (orig_stmt_info))
8294 data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8295 tree base = get_base_address (DR_REF (dr));
8296 /* When the scalar access is to a non-global not address-taken
8297 decl that is not BLKmode assume we can access it with a single
8298 non-load/store instruction. */
8299 if (DECL_P (base)
8300 && !is_global_var (base)
8301 && !TREE_ADDRESSABLE (base)
8302 && DECL_MODE (base) != BLKmode)
8303 kind = scalar_stmt;
8304 else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8305 kind = scalar_load;
8306 else
8307 kind = scalar_store;
8309 else if (vect_nop_conversion_p (orig_stmt_info))
8310 continue;
8311 /* For single-argument PHIs assume coalescing which means zero cost
8312 for the scalar and the vector PHIs. This avoids artificially
8313 favoring the vector path (but may pessimize it in some cases). */
8314 else if (is_a <gphi *> (orig_stmt_info->stmt)
8315 && gimple_phi_num_args
8316 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8317 continue;
8318 else
8319 kind = scalar_stmt;
8320 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8321 SLP_TREE_VECTYPE (node), 0, vect_body);
8324 auto_vec<bool, 20> subtree_life;
8325 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8327 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8329 /* Do not directly pass LIFE to the recursive call, copy it to
8330 confine changes in the callee to the current child/subtree. */
8331 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8333 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8334 for (unsigned j = 0;
8335 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8337 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8338 if (perm.first == i)
8339 subtree_life[perm.second] = (*life)[j];
8342 else
8344 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8345 subtree_life.safe_splice (*life);
8347 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8348 vectorized_scalar_stmts, visited);
8349 subtree_life.truncate (0);
8354 /* Comparator for the loop-index sorted cost vectors. */
8356 static int
8357 li_cost_vec_cmp (const void *a_, const void *b_)
8359 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8360 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8361 if (a->first < b->first)
8362 return -1;
8363 else if (a->first == b->first)
8364 return 0;
8365 return 1;
8368 /* Check if vectorization of the basic block is profitable for the
8369 subgraph denoted by SLP_INSTANCES. */
8371 static bool
8372 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8373 vec<slp_instance> slp_instances,
8374 loop_p orig_loop)
8376 slp_instance instance;
8377 int i;
8378 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8379 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8381 if (dump_enabled_p ())
8383 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8384 hash_set<slp_tree> visited;
8385 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8386 vect_print_slp_graph (MSG_NOTE, vect_location,
8387 SLP_INSTANCE_TREE (instance), visited);
8390 /* Compute the set of scalar stmts we know will go away 'locally' when
8391 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8392 not accurate for nodes promoted extern late or for scalar stmts that
8393 are used both in extern defs and in vectorized defs. */
8394 hash_set<stmt_vec_info> vectorized_scalar_stmts;
8395 hash_set<stmt_vec_info> scalar_stmts_in_externs;
8396 hash_set<slp_tree> visited;
8397 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8399 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8400 SLP_INSTANCE_TREE (instance),
8401 visited,
8402 vectorized_scalar_stmts,
8403 scalar_stmts_in_externs);
8404 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8405 vectorized_scalar_stmts.add (rstmt);
 8407 /* Scalar stmts used as defs in external nodes need to be preserved, so
8408 remove them from vectorized_scalar_stmts. */
8409 for (stmt_vec_info stmt : scalar_stmts_in_externs)
8410 vectorized_scalar_stmts.remove (stmt);
8412 /* Calculate scalar cost and sum the cost for the vector stmts
8413 previously collected. */
8414 stmt_vector_for_cost scalar_costs = vNULL;
8415 stmt_vector_for_cost vector_costs = vNULL;
8416 visited.empty ();
8417 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8419 auto_vec<bool, 20> life;
8420 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8421 true);
8422 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8423 record_stmt_cost (&scalar_costs,
8424 SLP_INSTANCE_ROOT_STMTS (instance).length (),
8425 scalar_stmt,
8426 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8427 vect_bb_slp_scalar_cost (bb_vinfo,
8428 SLP_INSTANCE_TREE (instance),
8429 &life, &scalar_costs, vectorized_scalar_stmts,
8430 visited);
8431 vector_costs.safe_splice (instance->cost_vec);
8432 instance->cost_vec.release ();
8435 if (dump_enabled_p ())
8436 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
 8438 /* When costing non-loop vectorization we need to consider each covered
 8439 loop independently and make sure vectorization is profitable. For
 8440 now we assume a loop may not be entered or may execute an arbitrary
 8441 number of iterations (??? static information can provide more
 8442 precise info here), which means we can simply cost the stmts of each
 8443 containing loop separately. */
8445 /* First produce cost vectors sorted by loop index. */
8446 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8447 li_scalar_costs (scalar_costs.length ());
8448 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8449 li_vector_costs (vector_costs.length ());
8450 stmt_info_for_cost *cost;
8451 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8453 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8454 li_scalar_costs.quick_push (std::make_pair (l, cost));
 8456 /* Use an arbitrary loop from the scalar costs as fallback in case the first
 8457 vector_costs entry does not have a stmt_info associated with it. */
8458 unsigned l = li_scalar_costs[0].first;
8459 FOR_EACH_VEC_ELT (vector_costs, i, cost)
 8461 /* We inherit the loop from the previous COST; invariants, externals and
 8462 extracts immediately follow the cost for the related stmt. */
8463 if (cost->stmt_info)
8464 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8465 li_vector_costs.quick_push (std::make_pair (l, cost));
8467 li_scalar_costs.qsort (li_cost_vec_cmp);
8468 li_vector_costs.qsort (li_cost_vec_cmp);
8470 /* Now cost the portions individually. */
8471 unsigned vi = 0;
8472 unsigned si = 0;
8473 bool profitable = true;
8474 while (si < li_scalar_costs.length ()
8475 && vi < li_vector_costs.length ())
8477 unsigned sl = li_scalar_costs[si].first;
8478 unsigned vl = li_vector_costs[vi].first;
8479 if (sl != vl)
8481 if (dump_enabled_p ())
8482 dump_printf_loc (MSG_NOTE, vect_location,
8483 "Scalar %d and vector %d loop part do not "
8484 "match up, skipping scalar part\n", sl, vl);
8485 /* Skip the scalar part, assuming zero cost on the vector side. */
8488 si++;
8490 while (si < li_scalar_costs.length ()
8491 && li_scalar_costs[si].first == sl);
8492 continue;
8495 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8498 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
8499 si++;
8501 while (si < li_scalar_costs.length ()
8502 && li_scalar_costs[si].first == sl);
8503 unsigned dummy;
8504 finish_cost (scalar_target_cost_data, nullptr,
8505 &dummy, &scalar_cost, &dummy);
8507 /* Complete the target-specific vector cost calculation. */
8508 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
8511 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
8512 vi++;
8514 while (vi < li_vector_costs.length ()
8515 && li_vector_costs[vi].first == vl);
8516 finish_cost (vect_target_cost_data, scalar_target_cost_data,
8517 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
8518 delete scalar_target_cost_data;
8519 delete vect_target_cost_data;
8521 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
8523 if (dump_enabled_p ())
8525 dump_printf_loc (MSG_NOTE, vect_location,
8526 "Cost model analysis for part in loop %d:\n", sl);
8527 dump_printf (MSG_NOTE, " Vector cost: %d\n",
8528 vec_inside_cost + vec_outside_cost);
8529 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
 8532 /* Vectorization is profitable if its cost does not exceed the cost of the
 8533 scalar version. Note that we err on the vector side for equal cost because
8534 the cost estimate is otherwise quite pessimistic (constant uses are
8535 free on the scalar side but cost a load on the vector side for
8536 example). */
8537 if (vec_outside_cost + vec_inside_cost > scalar_cost)
8539 profitable = false;
8540 break;
8543 if (profitable && vi < li_vector_costs.length ())
8545 if (dump_enabled_p ())
8546 dump_printf_loc (MSG_NOTE, vect_location,
8547 "Excess vector cost for part in loop %d:\n",
8548 li_vector_costs[vi].first);
8549 profitable = false;
8552 /* Unset visited flag. This is delayed when the subgraph is profitable
8553 and we process the loop for remaining unvectorized if-converted code. */
8554 if (!orig_loop || !profitable)
8555 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8556 gimple_set_visited (cost->stmt_info->stmt, false);
8558 scalar_costs.release ();
8559 vector_costs.release ();
8561 return profitable;
8564 /* qsort comparator for lane defs. */
8566 static int
8567 vld_cmp (const void *a_, const void *b_)
8569 auto *a = (const std::pair<unsigned, tree> *)a_;
8570 auto *b = (const std::pair<unsigned, tree> *)b_;
8571 return a->first - b->first;
8574 /* Return true if USE_STMT is a vector lane insert into VEC and set
8575 *THIS_LANE to the lane number that is set. */
8577 static bool
8578 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
8580 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
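 /* Match a BIT_INSERT_EXPR into VEC (or, when VEC is NULL, into whatever
    vector RHS1 is) of a matching element at a constant lane position.  */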
8581 if (!use_ass
8582 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
8583 || (vec
8584 ? gimple_assign_rhs1 (use_ass) != vec
8585 : ((vec = gimple_assign_rhs1 (use_ass)), false))
8586 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
8587 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
8588 || !constant_multiple_p
8589 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
8590 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
8591 this_lane))
8592 return false;
8593 return true;
 8596 /* Find any vectorizable constructors and other SLP graph roots and record
 8597 them in the roots array of BB_VINFO. */
8599 static void
8600 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
8602 for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
8603 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
8604 !gsi_end_p (gsi); gsi_next (&gsi))
8606 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
8607 if (!assign)
8608 continue;
8610 tree rhs = gimple_assign_rhs1 (assign);
8611 enum tree_code code = gimple_assign_rhs_code (assign);
8612 use_operand_p use_p;
8613 gimple *use_stmt;
8614 if (code == CONSTRUCTOR)
8616 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8617 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
8618 CONSTRUCTOR_NELTS (rhs))
8619 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
8620 || uniform_vector_p (rhs))
8621 continue;
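 /* All constructor elements have to be SSA names defined in the region.  */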
8623 unsigned j;
8624 tree val;
8625 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8626 if (TREE_CODE (val) != SSA_NAME
8627 || !bb_vinfo->lookup_def (val))
8628 break;
8629 if (j != CONSTRUCTOR_NELTS (rhs))
8630 continue;
8632 vec<stmt_vec_info> roots = vNULL;
8633 roots.safe_push (bb_vinfo->lookup_stmt (assign));
8634 vec<stmt_vec_info> stmts;
8635 stmts.create (CONSTRUCTOR_NELTS (rhs));
8636 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8637 stmts.quick_push
8638 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
8639 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8640 stmts, roots));
8642 else if (code == BIT_INSERT_EXPR
8643 && VECTOR_TYPE_P (TREE_TYPE (rhs))
8644 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
8645 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
8646 && integer_zerop (gimple_assign_rhs3 (assign))
8647 && useless_type_conversion_p
8648 (TREE_TYPE (TREE_TYPE (rhs)),
8649 TREE_TYPE (gimple_assign_rhs2 (assign)))
8650 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
8652 /* We start to match on insert to lane zero but since the
8653 inserts need not be ordered we'd have to search both
8654 the def and the use chains. */
8655 tree vectype = TREE_TYPE (rhs);
8656 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
8657 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
8658 auto_sbitmap lanes (nlanes);
8659 bitmap_clear (lanes);
8660 bitmap_set_bit (lanes, 0);
8661 tree def = gimple_assign_lhs (assign);
8662 lane_defs.quick_push
8663 (std::make_pair (0, gimple_assign_rhs2 (assign)));
8664 unsigned lanes_found = 1;
8665 /* Start with the use chains, the last stmt will be the root. */
8666 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
8667 vec<stmt_vec_info> roots = vNULL;
8668 roots.safe_push (last);
8671 use_operand_p use_p;
8672 gimple *use_stmt;
8673 if (!single_imm_use (def, &use_p, &use_stmt))
8674 break;
8675 unsigned this_lane;
8676 if (!bb_vinfo->lookup_stmt (use_stmt)
8677 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
8678 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
8679 break;
8680 if (bitmap_bit_p (lanes, this_lane))
8681 break;
8682 lanes_found++;
8683 bitmap_set_bit (lanes, this_lane);
8684 gassign *use_ass = as_a <gassign *> (use_stmt);
8685 lane_defs.quick_push (std::make_pair
8686 (this_lane, gimple_assign_rhs2 (use_ass)));
8687 last = bb_vinfo->lookup_stmt (use_ass);
8688 roots.safe_push (last);
8689 def = gimple_assign_lhs (use_ass);
8691 while (lanes_found < nlanes);
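 /* The last insert seen on the use chain is the root stmt of the group;
    move it to the front of ROOTS.  */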
8692 if (roots.length () > 1)
8693 std::swap(roots[0], roots[roots.length () - 1]);
8694 if (lanes_found < nlanes)
8696 /* Now search the def chain. */
8697 def = gimple_assign_rhs1 (assign);
8700 if (TREE_CODE (def) != SSA_NAME
8701 || !has_single_use (def))
8702 break;
8703 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
8704 unsigned this_lane;
8705 if (!bb_vinfo->lookup_stmt (def_stmt)
8706 || !vect_slp_is_lane_insert (def_stmt,
8707 NULL_TREE, &this_lane)
8708 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
8709 break;
8710 if (bitmap_bit_p (lanes, this_lane))
8711 break;
8712 lanes_found++;
8713 bitmap_set_bit (lanes, this_lane);
8714 lane_defs.quick_push (std::make_pair
8715 (this_lane,
8716 gimple_assign_rhs2 (def_stmt)));
8717 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
8718 def = gimple_assign_rhs1 (def_stmt);
8720 while (lanes_found < nlanes);
8722 if (lanes_found == nlanes)
8724 /* Sort lane_defs by lane index and register the root. */
8725 lane_defs.qsort (vld_cmp);
8726 vec<stmt_vec_info> stmts;
8727 stmts.create (nlanes);
8728 for (unsigned i = 0; i < nlanes; ++i)
8729 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
8730 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8731 stmts, roots));
8733 else
8734 roots.release ();
8736 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8737 && (associative_tree_code (code) || code == MINUS_EXPR)
8738 /* ??? This pessimizes a two-element reduction. PR54400.
8739 ??? In-order reduction could be handled if we only
8740 traverse one operand chain in vect_slp_linearize_chain. */
8741 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
8742 /* Ops with constants at the tail can be stripped here. */
8743 && TREE_CODE (rhs) == SSA_NAME
8744 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
8745 /* Should be the chain end. */
8746 && (!single_imm_use (gimple_assign_lhs (assign),
8747 &use_p, &use_stmt)
8748 || !is_gimple_assign (use_stmt)
8749 || (gimple_assign_rhs_code (use_stmt) != code
8750 && ((code != PLUS_EXPR && code != MINUS_EXPR)
8751 || (gimple_assign_rhs_code (use_stmt)
8752 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
8754 /* We start the match at the end of a possible association
8755 chain. */
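     /* Editorial sketch (made-up SSA names, only to illustrate the
        matching): for a scalar chain like
          t1 = a + b;  t2 = t1 + c;  t3 = t2 + d;
        we match at the chain end t3, vect_slp_linearize_chain collects
        the leaf operands a, b, c, d, and those become the lanes of a
        slp_inst_kind_bb_reduc root that is later reduced with the
        target's reduction support.  */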
8756 auto_vec<chain_op_t> chain;
8757 auto_vec<std::pair<tree_code, gimple *> > worklist;
8758 auto_vec<gimple *> chain_stmts;
8759 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
8760 if (code == MINUS_EXPR)
8761 code = PLUS_EXPR;
8762 internal_fn reduc_fn;
8763 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
8764 || reduc_fn == IFN_LAST)
8765 continue;
8766 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
8767 /* ??? */
8768 code_stmt, alt_code_stmt, &chain_stmts);
8769 if (chain.length () > 1)
8771 /* Sort the chain according to def_type and operation. */
8772 chain.sort (dt_sort_cmp, bb_vinfo);
8773 /* ??? Now we'd want to strip externals and constants
8774 but record those to be handled in the epilogue. */
8775 /* ??? For now do not allow mixing ops or externs/constants. */
8776 bool invalid = false;
8777 unsigned remain_cnt = 0;
8778 unsigned last_idx = 0;
8779 for (unsigned i = 0; i < chain.length (); ++i)
8781 if (chain[i].code != code)
8783 invalid = true;
8784 break;
8786 if (chain[i].dt != vect_internal_def
8787 /* Avoid stmts where the def is not the LHS, like
8788 ASMs. */
8789 || (gimple_get_lhs (bb_vinfo->lookup_def
8790 (chain[i].op)->stmt)
8791 != chain[i].op))
8792 remain_cnt++;
8793 else
8794 last_idx = i;
8796 /* Make sure to have an even number of lanes as we later do
8797 all-or-nothing discovery, not trying to split further. */
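     /* Editorial example (illustrative only): with five qualifying
        internal defs in the chain, remain_cnt is bumped from 0 to 1 so
        that four lanes go into SLP discovery and the fifth operand is
        added back via the "remain" vector in the reduction epilogue.  */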
8798 if ((chain.length () - remain_cnt) & 1)
8799 remain_cnt++;
8800 if (!invalid && chain.length () - remain_cnt > 1)
8802 vec<stmt_vec_info> stmts;
8803 vec<tree> remain = vNULL;
8804 stmts.create (chain.length ());
8805 if (remain_cnt > 0)
8806 remain.create (remain_cnt);
8807 for (unsigned i = 0; i < chain.length (); ++i)
8809 stmt_vec_info stmt_info;
8810 if (chain[i].dt == vect_internal_def
8811 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
8812 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
8813 && (i != last_idx
8814 || (stmts.length () & 1)))
8815 stmts.quick_push (stmt_info);
8816 else
8817 remain.quick_push (chain[i].op);
8819 vec<stmt_vec_info> roots;
8820 roots.create (chain_stmts.length ());
8821 for (unsigned i = 0; i < chain_stmts.length (); ++i)
8822 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
8823 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
8824 stmts, roots, remain));
8831 /* Walk the grouped store chains and replace entries with their
8832 pattern variant if any. */
8834 static void
8835 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
8837 stmt_vec_info first_element;
8838 unsigned i;
8840 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
8842 /* We also have CTORs in this array. */
8843 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
8844 continue;
8845 if (STMT_VINFO_IN_PATTERN_P (first_element))
8847 stmt_vec_info orig = first_element;
8848 first_element = STMT_VINFO_RELATED_STMT (first_element);
8849 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
8850 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
8851 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
8852 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
8853 vinfo->grouped_stores[i] = first_element;
8855 stmt_vec_info prev = first_element;
8856 while (DR_GROUP_NEXT_ELEMENT (prev))
8858 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
8859 if (STMT_VINFO_IN_PATTERN_P (elt))
8861 stmt_vec_info orig = elt;
8862 elt = STMT_VINFO_RELATED_STMT (elt);
8863 DR_GROUP_NEXT_ELEMENT (prev) = elt;
8864 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
8865 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
8867 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
8868 prev = elt;
8873 /* Check if the region described by BB_VINFO can be vectorized, returning
8874 true if so. When returning false, set FATAL to true if the same failure
8875 would prevent vectorization at other vector sizes, false if it is still
8876 worth trying other sizes. N_STMTS is the number of statements in the
8877 region. */
8879 static bool
8880 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
8881 vec<int> *dataref_groups)
8883 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
8885 slp_instance instance;
8886 int i;
8887 poly_uint64 min_vf = 2;
8889 /* The first group of checks is independent of the vector size. */
8890 fatal = true;
8892 /* Analyze the data references. */
8894 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
8896 if (dump_enabled_p ())
8897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8898 "not vectorized: unhandled data-ref in basic "
8899 "block.\n");
8900 return false;
8903 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
8905 if (dump_enabled_p ())
8906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8907 "not vectorized: unhandled data access in "
8908 "basic block.\n");
8909 return false;
8912 vect_slp_check_for_roots (bb_vinfo);
8914 /* If there are no grouped stores and no constructors in the region
8915 there is no need to continue with pattern recog as vect_analyze_slp
8916 will fail anyway. */
8917 if (bb_vinfo->grouped_stores.is_empty ()
8918 && bb_vinfo->roots.is_empty ())
8920 if (dump_enabled_p ())
8921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8922 "not vectorized: no grouped stores in "
8923 "basic block.\n");
8924 return false;
8927 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not necessarily fatal. */
8928 fatal = false;
8930 vect_pattern_recog (bb_vinfo);
8932 /* Update store groups from pattern processing. */
8933 vect_fixup_store_groups_with_patterns (bb_vinfo);
8935 /* Check the SLP opportunities in the basic block, analyze and build SLP
8936 trees. */
8937 if (!vect_analyze_slp (bb_vinfo, n_stmts))
8939 if (dump_enabled_p ())
8941 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8942 "Failed to SLP the basic block.\n");
8943 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8944 "not vectorized: failed to find SLP opportunities "
8945 "in basic block.\n");
8947 return false;
8950 /* Optimize permutations. */
8951 vect_optimize_slp (bb_vinfo);
8953 /* Gather the loads reachable from the SLP graph entries. */
8954 vect_gather_slp_loads (bb_vinfo);
8956 vect_record_base_alignments (bb_vinfo);
8958 /* Analyze and verify the alignment of data references and the
8959 dependence in the SLP instances. */
8960 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
8962 vect_location = instance->location ();
8963 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
8964 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
8966 slp_tree node = SLP_INSTANCE_TREE (instance);
8967 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_NOTE, vect_location,
8970 "removing SLP instance operations starting from: %G",
8971 stmt_info->stmt);
8972 vect_free_slp_instance (instance);
8973 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
8974 continue;
8977 /* Mark all the statements that we want to vectorize as pure SLP and
8978 relevant. */
8979 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
8980 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
8981 unsigned j;
8982 stmt_vec_info root;
8983 /* Likewise consider instance root stmts as vectorized. */
8984 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
8985 STMT_SLP_TYPE (root) = pure_slp;
8987 i++;
8989 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
8990 return false;
8992 if (!vect_slp_analyze_operations (bb_vinfo))
8994 if (dump_enabled_p ())
8995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8996 "not vectorized: bad operation in basic block.\n");
8997 return false;
9000 vect_bb_partition_graph (bb_vinfo);
9002 return true;
9005 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
9006 basic blocks in BBS, returning true on success.
9007 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
9009 static bool
9010 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
9011 vec<int> *dataref_groups, unsigned int n_stmts,
9012 loop_p orig_loop)
9014 bb_vec_info bb_vinfo;
9015 auto_vector_modes vector_modes;
9017 /* Autodetect first vector size we try. */
9018 machine_mode next_vector_mode = VOIDmode;
9019 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
9020 unsigned int mode_i = 0;
9022 vec_info_shared shared;
9024 machine_mode autodetected_vector_mode = VOIDmode;
9025 while (1)
9027 bool vectorized = false;
9028 bool fatal = false;
9029 bb_vinfo = new _bb_vec_info (bbs, &shared);
9031 bool first_time_p = shared.datarefs.is_empty ();
9032 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
9033 if (first_time_p)
9034 bb_vinfo->shared->save_datarefs ();
9035 else
9036 bb_vinfo->shared->check_datarefs ();
9037 bb_vinfo->vector_mode = next_vector_mode;
9039 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
9041 if (dump_enabled_p ())
9043 dump_printf_loc (MSG_NOTE, vect_location,
9044 "***** Analysis succeeded with vector mode"
9045 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
9046 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
9049 bb_vinfo->shared->check_datarefs ();
9051 bool force_clear = false;
9052 auto_vec<slp_instance> profitable_subgraphs;
9053 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
9055 if (instance->subgraph_entries.is_empty ())
9056 continue;
9058 dump_user_location_t saved_vect_location = vect_location;
9059 vect_location = instance->location ();
9060 if (!unlimited_cost_model (NULL)
9061 && !vect_bb_vectorization_profitable_p
9062 (bb_vinfo, instance->subgraph_entries, orig_loop))
9064 if (dump_enabled_p ())
9065 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9066 "not vectorized: vectorization is not "
9067 "profitable.\n");
9068 vect_location = saved_vect_location;
9069 continue;
9072 vect_location = saved_vect_location;
9073 if (!dbg_cnt (vect_slp))
9075 force_clear = true;
9076 continue;
9079 profitable_subgraphs.safe_push (instance);
9082 /* When we're vectorizing an if-converted loop body make sure
9083 we vectorized all if-converted code. */
9084 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
9086 gcc_assert (bb_vinfo->nbbs == 1);
9087 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
9088 !gsi_end_p (gsi); gsi_next (&gsi))
9090 /* The costing above left us with DCEable vectorized scalar
9091 stmts having the visited flag set on profitable
9092 subgraphs. Do the delayed clearing of the flag here. */
9093 if (gimple_visited_p (gsi_stmt (gsi)))
9095 gimple_set_visited (gsi_stmt (gsi), false);
9096 continue;
9098 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9099 continue;
9101 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9102 if (gimple_assign_rhs_code (ass) == COND_EXPR)
9104 if (!profitable_subgraphs.is_empty ()
9105 && dump_enabled_p ())
9106 dump_printf_loc (MSG_NOTE, vect_location,
9107 "not profitable because of "
9108 "unprofitable if-converted scalar "
9109 "code\n");
9110 profitable_subgraphs.truncate (0);
9115 /* Finally schedule the profitable subgraphs. */
9116 for (slp_instance instance : profitable_subgraphs)
9118 if (!vectorized && dump_enabled_p ())
9119 dump_printf_loc (MSG_NOTE, vect_location,
9120 "Basic block will be vectorized "
9121 "using SLP\n");
9122 vectorized = true;
9124 /* Dump before scheduling as store vectorization will remove
9125 the original stores and mess with the instance tree
9126 so querying its location will eventually ICE. */
9127 if (flag_checking)
9128 for (slp_instance sub : instance->subgraph_entries)
9129 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9130 unsigned HOST_WIDE_INT bytes;
9131 if (dump_enabled_p ())
9132 for (slp_instance sub : instance->subgraph_entries)
9134 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9135 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9136 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9137 sub->location (),
9138 "basic block part vectorized using %wu "
9139 "byte vectors\n", bytes);
9140 else
9141 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9142 sub->location (),
9143 "basic block part vectorized using "
9144 "variable length vectors\n");
9147 dump_user_location_t saved_vect_location = vect_location;
9148 vect_location = instance->location ();
9150 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9152 vect_location = saved_vect_location;
9155 else
9157 if (dump_enabled_p ())
9158 dump_printf_loc (MSG_NOTE, vect_location,
9159 "***** Analysis failed with vector mode %s\n",
9160 GET_MODE_NAME (bb_vinfo->vector_mode));
9163 if (mode_i == 0)
9164 autodetected_vector_mode = bb_vinfo->vector_mode;
9166 if (!fatal)
9167 while (mode_i < vector_modes.length ()
9168 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9170 if (dump_enabled_p ())
9171 dump_printf_loc (MSG_NOTE, vect_location,
9172 "***** The result for vector mode %s would"
9173 " be the same\n",
9174 GET_MODE_NAME (vector_modes[mode_i]));
9175 mode_i += 1;
9178 delete bb_vinfo;
9180 if (mode_i < vector_modes.length ()
9181 && VECTOR_MODE_P (autodetected_vector_mode)
9182 && (related_vector_mode (vector_modes[mode_i],
9183 GET_MODE_INNER (autodetected_vector_mode))
9184 == autodetected_vector_mode)
9185 && (related_vector_mode (autodetected_vector_mode,
9186 GET_MODE_INNER (vector_modes[mode_i]))
9187 == vector_modes[mode_i]))
9189 if (dump_enabled_p ())
9190 dump_printf_loc (MSG_NOTE, vect_location,
9191 "***** Skipping vector mode %s, which would"
9192 " repeat the analysis for %s\n",
9193 GET_MODE_NAME (vector_modes[mode_i]),
9194 GET_MODE_NAME (autodetected_vector_mode));
9195 mode_i += 1;
9198 if (vectorized
9199 || mode_i == vector_modes.length ()
9200 || autodetected_vector_mode == VOIDmode
9201 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9202 vector sizes will fail do not bother iterating. */
9203 || fatal)
9204 return vectorized;
9206 /* Try the next biggest vector size. */
9207 next_vector_mode = vector_modes[mode_i++];
9208 if (dump_enabled_p ())
9209 dump_printf_loc (MSG_NOTE, vect_location,
9210 "***** Re-trying analysis with vector mode %s\n",
9211 GET_MODE_NAME (next_vector_mode));
9216 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
9217 true if anything in the basic-block was vectorized. */
9219 static bool
9220 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9222 vec<data_reference_p> datarefs = vNULL;
9223 auto_vec<int> dataref_groups;
9224 int insns = 0;
9225 int current_group = 0;
9227 for (unsigned i = 0; i < bbs.length (); i++)
9229 basic_block bb = bbs[i];
9230 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9231 gsi_next (&gsi))
9233 gimple *stmt = gsi_stmt (gsi);
9234 if (is_gimple_debug (stmt))
9235 continue;
9237 insns++;
9239 if (gimple_location (stmt) != UNKNOWN_LOCATION)
9240 vect_location = stmt;
9242 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9243 &dataref_groups, current_group))
9244 ++current_group;
9246 /* New BBs always start a new DR group. */
9247 ++current_group;
9250 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9253 /* Special entry for the BB vectorizer. Analyze and transform a single
9254 if-converted BB with ORIG_LOOP's body being the non-if-converted
9255 representation. Returns true if anything in the basic-block was
9256 vectorized. */
9258 bool
9259 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9261 auto_vec<basic_block> bbs;
9262 bbs.safe_push (bb);
9263 return vect_slp_bbs (bbs, orig_loop);
9266 /* Main entry for the BB vectorizer. Analyze and transform the basic
9267 blocks of FUN, returning true if anything was vectorized. */
9269 bool
9270 vect_slp_function (function *fun)
9272 bool r = false;
9273 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9274 auto_bitmap exit_bbs;
9275 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9276 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9277 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9278 true, rpo, NULL);
9280 /* For the moment split the function into pieces to avoid making
9281 the iteration on the vector mode moot. Split at points we know
9282 we do not handle well, which are CFG merges (SLP discovery doesn't
9283 handle non-loop-header PHIs) and loop exits. Since pattern
9284 recog requires reverse iteration to visit uses before defs
9285 simply chop RPO into pieces. */
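   /* Editorial sketch (made-up block numbers, only to illustrate the
      splitting below): if a region currently starts at the then-block
      bb3 of an if, the join block bb5 that follows in the RPO is not
      dominated by bb3, so the region collected so far is handed to
      vect_slp_bbs and a fresh region is started at bb5.  Regions are
      split the same way when the RPO leaves the loop containing the
      region's first block or reaches a dont_vectorize loop header.  */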
9286 auto_vec<basic_block> bbs;
9287 for (unsigned i = 0; i < n; i++)
9289 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9290 bool split = false;
9292 /* Split when a BB is not dominated by the first block. */
9293 if (!bbs.is_empty ()
9294 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9296 if (dump_enabled_p ())
9297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9298 "splitting region at dominance boundary bb%d\n",
9299 bb->index);
9300 split = true;
9302 /* Split when the loop determined by the first block
9303 is exited. This is because we eventually insert
9304 invariants at region begin. */
9305 else if (!bbs.is_empty ()
9306 && bbs[0]->loop_father != bb->loop_father
9307 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9309 if (dump_enabled_p ())
9310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9311 "splitting region at loop %d exit at bb%d\n",
9312 bbs[0]->loop_father->num, bb->index);
9313 split = true;
9315 else if (!bbs.is_empty ()
9316 && bb->loop_father->header == bb
9317 && bb->loop_father->dont_vectorize)
9319 if (dump_enabled_p ())
9320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9321 "splitting region at dont-vectorize loop %d "
9322 "entry at bb%d\n",
9323 bb->loop_father->num, bb->index);
9324 split = true;
9327 if (split && !bbs.is_empty ())
9329 r |= vect_slp_bbs (bbs, NULL);
9330 bbs.truncate (0);
9333 if (bbs.is_empty ())
9335 /* We need to be able to insert at the head of the region, which
9336 we cannot do for a region starting with a returns-twice call. */
9337 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9338 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9340 if (dump_enabled_p ())
9341 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9342 "skipping bb%d as start of region as it "
9343 "starts with returns-twice call\n",
9344 bb->index);
9345 continue;
9347 /* If the loop this BB belongs to is marked as not to be vectorized
9348 honor that also for BB vectorization. */
9349 if (bb->loop_father->dont_vectorize)
9350 continue;
9353 bbs.safe_push (bb);
9355 /* When a stmt ends this block and defines a value, inserting
9356 a vector containing its definition after it would require
9357 inserting on edges. Avoid this for now. */
9358 if (gimple *last = *gsi_last_bb (bb))
9359 if (gimple_get_lhs (last)
9360 && is_ctrl_altering_stmt (last))
9362 if (dump_enabled_p ())
9363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9364 "splitting region at control altering "
9365 "definition %G", last);
9366 r |= vect_slp_bbs (bbs, NULL);
9367 bbs.truncate (0);
9371 if (!bbs.is_empty ())
9372 r |= vect_slp_bbs (bbs, NULL);
9374 free (rpo);
9376 return r;
9379 /* Build a variable-length vector in which the elements in ELTS are repeated
9380 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9381 RESULTS and add any new instructions to SEQ.
9383 The approach we use is:
9385 (1) Find a vector mode VM with integer elements of mode IM.
9387 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9388 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9389 from small vectors to IM.
9391 (3) Duplicate each ELTS'[I] into a vector of mode VM.
9393 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9394 correct byte contents.
9396 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9398 We try to find the largest IM for which this sequence works, in order
9399 to cut down on the number of interleaves. */
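/* Editorial worked example (illustrative only; the modes are assumed,
   not mandated by the code): with NELTS == 8 32-bit elements a0...a7,
   one possible choice is IM == DImode.  The pairs {a0,a1} ... {a6,a7}
   are view-converted to four DImode scalars (step 2), each scalar is
   duplicated into a vector of DImode elements (step 3), a tree of
   interleaving VEC_PERM_EXPRs restores the a0 a1 a2 ... byte order
   (step 4) and the results are view-converted to VECTOR_TYPE (step 5).  */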
9401 void
9402 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9403 const vec<tree> &elts, unsigned int nresults,
9404 vec<tree> &results)
9406 unsigned int nelts = elts.length ();
9407 tree element_type = TREE_TYPE (vector_type);
9409 /* (1) Find a vector mode VM with integer elements of mode IM. */
9410 unsigned int nvectors = 1;
9411 tree new_vector_type;
9412 tree permutes[2];
9413 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9414 &nvectors, &new_vector_type,
9415 permutes))
9416 gcc_unreachable ();
9418 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9419 unsigned int partial_nelts = nelts / nvectors;
9420 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9422 tree_vector_builder partial_elts;
9423 auto_vec<tree, 32> pieces (nvectors * 2);
9424 pieces.quick_grow_cleared (nvectors * 2);
9425 for (unsigned int i = 0; i < nvectors; ++i)
9427 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9428 ELTS' has mode IM. */
9429 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9430 for (unsigned int j = 0; j < partial_nelts; ++j)
9431 partial_elts.quick_push (elts[i * partial_nelts + j]);
9432 tree t = gimple_build_vector (seq, &partial_elts);
9433 t = gimple_build (seq, VIEW_CONVERT_EXPR,
9434 TREE_TYPE (new_vector_type), t);
9436 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9437 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9440 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9441 correct byte contents.
9443 Conceptually, we need to repeat the following operation log2(nvectors)
9444 times, where hi_start = nvectors / 2:
9446 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9447 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9449 However, if each input repeats every N elements and the VF is
9450 a multiple of N * 2, the HI result is the same as the LO result.
9451 This will be true for the first N1 iterations of the outer loop,
9452 followed by N2 iterations for which both the LO and HI results
9453 are needed. I.e.:
9455 N1 + N2 = log2(nvectors)
9457 Each "N1 iteration" doubles the number of redundant vectors and the
9458 effect of the process as a whole is to have a sequence of nvectors/2**N1
9459 vectors that repeats 2**N1 times. Rather than generate these redundant
9460 vectors, we halve the number of vectors for each N1 iteration. */
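  /* Editorial example (illustrative only, assuming the multiple_p check
     in the loop below succeeds on the first iteration): with
     nvectors == 4 the first "N1 iteration" keeps only the even-numbered
     outputs, so new_nvectors drops from 4 to 2, and the next iteration
     then needs both the LO and HI permutes of those two vectors.  */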
9461 unsigned int in_start = 0;
9462 unsigned int out_start = nvectors;
9463 unsigned int new_nvectors = nvectors;
9464 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9466 unsigned int hi_start = new_nvectors / 2;
9467 unsigned int out_i = 0;
9468 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9470 if ((in_i & 1) != 0
9471 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9472 2 * in_repeat))
9473 continue;
9475 tree output = make_ssa_name (new_vector_type);
9476 tree input1 = pieces[in_start + (in_i / 2)];
9477 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9478 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9479 input1, input2,
9480 permutes[in_i & 1]);
9481 gimple_seq_add_stmt (seq, stmt);
9482 pieces[out_start + out_i] = output;
9483 out_i += 1;
9485 std::swap (in_start, out_start);
9486 new_nvectors = out_i;
9489 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9490 results.reserve (nresults);
9491 for (unsigned int i = 0; i < nresults; ++i)
9492 if (i < new_nvectors)
9493 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
9494 pieces[in_start + i]));
9495 else
9496 results.quick_push (results[i - new_nvectors]);
9500 /* For constant and loop invariant defs in OP_NODE this function creates
9501 vector defs that will be used in the vectorized stmts and stores them
9502 to SLP_TREE_VEC_DEFS of OP_NODE. */
9504 static void
9505 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
9507 unsigned HOST_WIDE_INT nunits;
9508 tree vec_cst;
9509 unsigned j, number_of_places_left_in_vector;
9510 tree vector_type;
9511 tree vop;
9512 int group_size = op_node->ops.length ();
9513 unsigned int vec_num, i;
9514 unsigned number_of_copies = 1;
9515 bool constant_p;
9516 gimple_seq ctor_seq = NULL;
9517 auto_vec<tree, 16> permute_results;
9519 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
9520 vector_type = SLP_TREE_VECTYPE (op_node);
9522 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
9523 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
9524 auto_vec<tree> voprnds (number_of_vectors);
9526 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
9527 created vectors. It is greater than 1 if unrolling is performed.
9529 For example, we have two scalar operands, s1 and s2 (e.g., group of
9530 strided accesses of size two), while NUNITS is four (i.e., four scalars
9531 of this type can be packed in a vector). The output vector will contain
9532 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
9533 will be 2).
9535 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
9536 containing the operands.
9538 For example, NUNITS is four as before, and the group size is 8
9539 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
9540 {s5, s6, s7, s8}. */
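  /* Editorial example (illustrative numbers): with GROUP_SIZE == 2,
     NUNITS == 4 and NUMBER_OF_VECTORS == 1 the computation below gives
     NUMBER_OF_COPIES == 4 * 1 / 2 == 2, matching the {s1, s2, s1, s2}
     case above; with GROUP_SIZE == 8 and NUMBER_OF_VECTORS == 2 it
     gives 4 * 2 / 8 == 1 copy spread over two vectors.  */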
9542 /* When using duplicate_and_interleave, we just need one element for
9543 each scalar statement. */
9544 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
9545 nunits = group_size;
9547 number_of_copies = nunits * number_of_vectors / group_size;
9549 number_of_places_left_in_vector = nunits;
9550 constant_p = true;
9551 tree uniform_elt = NULL_TREE;
9552 tree_vector_builder elts (vector_type, nunits, 1);
9553 elts.quick_grow (nunits);
9554 stmt_vec_info insert_after = NULL;
9555 for (j = 0; j < number_of_copies; j++)
9557 tree op;
9558 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
9560 /* Create 'vect_ = {op0,op1,...,opn}'. */
9561 tree orig_op = op;
9562 if (number_of_places_left_in_vector == nunits)
9563 uniform_elt = op;
9564 else if (uniform_elt && operand_equal_p (uniform_elt, op))
9565 op = elts[number_of_places_left_in_vector];
9566 else
9567 uniform_elt = NULL_TREE;
9568 number_of_places_left_in_vector--;
9569 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
9571 if (CONSTANT_CLASS_P (op))
9573 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9575 /* Can't use VIEW_CONVERT_EXPR for booleans because
9576 of possibly different sizes of scalar value and
9577 vector element. */
9578 if (integer_zerop (op))
9579 op = build_int_cst (TREE_TYPE (vector_type), 0);
9580 else if (integer_onep (op))
9581 op = build_all_ones_cst (TREE_TYPE (vector_type));
9582 else
9583 gcc_unreachable ();
9585 else
9586 op = fold_unary (VIEW_CONVERT_EXPR,
9587 TREE_TYPE (vector_type), op);
9588 gcc_assert (op && CONSTANT_CLASS_P (op));
9590 else
9592 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
9593 gimple *init_stmt;
9594 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9596 tree true_val
9597 = build_all_ones_cst (TREE_TYPE (vector_type));
9598 tree false_val
9599 = build_zero_cst (TREE_TYPE (vector_type));
9600 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
9601 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
9602 op, true_val,
9603 false_val);
9605 else
9607 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
9608 op);
9609 init_stmt
9610 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
9611 op);
9613 gimple_seq_add_stmt (&ctor_seq, init_stmt);
9614 op = new_temp;
9617 elts[number_of_places_left_in_vector] = op;
9618 if (!CONSTANT_CLASS_P (op))
9619 constant_p = false;
9620 /* For BB vectorization we have to compute an insert location
9621 when a def is inside the analyzed region since we cannot
9622 simply insert at the BB start in this case. */
9623 stmt_vec_info opdef;
9624 if (TREE_CODE (orig_op) == SSA_NAME
9625 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
9626 && is_a <bb_vec_info> (vinfo)
9627 && (opdef = vinfo->lookup_def (orig_op)))
9629 if (!insert_after)
9630 insert_after = opdef;
9631 else
9632 insert_after = get_later_stmt (insert_after, opdef);
9635 if (number_of_places_left_in_vector == 0)
9637 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
9638 if (uniform_elt)
9639 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
9640 elts[0]);
9641 else if (constant_p
9642 ? multiple_p (type_nunits, nunits)
9643 : known_eq (type_nunits, nunits))
9644 vec_cst = gimple_build_vector (&ctor_seq, &elts);
9645 else
9647 if (permute_results.is_empty ())
9648 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
9649 elts, number_of_vectors,
9650 permute_results);
9651 vec_cst = permute_results[number_of_vectors - j - 1];
9653 if (!gimple_seq_empty_p (ctor_seq))
9655 if (insert_after)
9657 gimple_stmt_iterator gsi;
9658 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
9660 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
9661 gsi_insert_seq_before (&gsi, ctor_seq,
9662 GSI_CONTINUE_LINKING);
9664 else if (!stmt_ends_bb_p (insert_after->stmt))
9666 gsi = gsi_for_stmt (insert_after->stmt);
9667 gsi_insert_seq_after (&gsi, ctor_seq,
9668 GSI_CONTINUE_LINKING);
9670 else
9672 /* When we want to insert after a def whose
9673 defining stmt throws, insert on the fallthru
9674 edge. */
9675 edge e = find_fallthru_edge
9676 (gimple_bb (insert_after->stmt)->succs);
9677 basic_block new_bb
9678 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
9679 gcc_assert (!new_bb);
9682 else
9683 vinfo->insert_seq_on_entry (NULL, ctor_seq);
9684 ctor_seq = NULL;
9686 voprnds.quick_push (vec_cst);
9687 insert_after = NULL;
9688 number_of_places_left_in_vector = nunits;
9689 constant_p = true;
9690 elts.new_vector (vector_type, nunits, 1);
9691 elts.quick_grow (nunits);
9696 /* Since the vectors are created in the reverse order, we should invert
9697 them. */
9698 vec_num = voprnds.length ();
9699 for (j = vec_num; j != 0; j--)
9701 vop = voprnds[j - 1];
9702 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9705 /* When VF is greater than the unrolling factor needed for the SLP
9706 group of stmts, NUMBER_OF_VECTORS to be created is greater than
9707 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
9708 to replicate the vectors. */
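  /* Editorial example (illustrative only): if NUMBER_OF_VECTORS is 4 but
     only two defs {v0, v1} were pushed above, the loop below appends v0
     and v1 once more so the defs become {v0, v1, v0, v1}.  */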
9709 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
9710 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
9711 i++)
9712 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9715 /* Get the Ith vectorized definition from SLP_NODE. */
9717 tree
9718 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
9720 return SLP_TREE_VEC_DEFS (slp_node)[i];
9723 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
9725 void
9726 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
9728 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
9729 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
9732 /* Get N vectorized definitions for SLP_NODE. */
9734 void
9735 vect_get_slp_defs (vec_info *,
9736 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
9738 if (n == -1U)
9739 n = SLP_TREE_CHILDREN (slp_node).length ();
9741 for (unsigned i = 0; i < n; ++i)
9743 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9744 vec<tree> vec_defs = vNULL;
9745 vect_get_slp_defs (child, &vec_defs);
9746 vec_oprnds->quick_push (vec_defs);
9750 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
9751 - PERM gives the permutation that the caller wants to use for NODE,
9752 which might be different from SLP_LOAD_PERMUTATION.
9753 - DUMP_P controls whether the function dumps information. */
9755 static bool
9756 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
9757 load_permutation_t &perm,
9758 const vec<tree> &dr_chain,
9759 gimple_stmt_iterator *gsi, poly_uint64 vf,
9760 bool analyze_only, bool dump_p,
9761 unsigned *n_perms, unsigned int *n_loads,
9762 bool dce_chain)
9764 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9765 int vec_index = 0;
9766 tree vectype = SLP_TREE_VECTYPE (node);
9767 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
9768 unsigned int mask_element;
9769 unsigned dr_group_size;
9770 machine_mode mode;
9772 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
9773 dr_group_size = 1;
9774 else
9776 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9777 dr_group_size = DR_GROUP_SIZE (stmt_info);
9780 mode = TYPE_MODE (vectype);
9781 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9782 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9784 /* Initialize the vect stmts of NODE to properly insert the generated
9785 stmts later. */
9786 if (! analyze_only)
9787 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
9788 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
9790 /* Generate permutation masks for every NODE. Number of masks for each NODE
9791 is equal to GROUP_SIZE.
9792 E.g., we have a group of three nodes with three loads from the same
9793 location in each node, and the vector size is 4. I.e., we have an
9794 a0b0c0a1b1c1... sequence and we need to create the following vectors:
9795 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
9796 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
9799 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
9800 The last mask is illegal since we assume two operands for permute
9801 operation, and the mask element values can't be outside that range.
9802 Hence, the last mask must be converted into {2,5,5,5}.
9803 For the first two permutations we need the first and the second input
9804 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
9805 we need the second and the third vectors: {b1,c1,a2,b2} and
9806 {c2,a3,b3,c3}. */
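   /* Editorial note (restating the example above in terms of the code
      below): each mask element is computed as
        i = iter_num * dr_group_size + perm[stmt_num]
      so for the a's, with dr_group_size == 3 and perm == {0, 0, 0}, this
      yields the lane sequence 0,0,0,3,3,3,6,6,6,9,9,9 which is then
      chopped into the per-vector masks shown above.  */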
9808 int vect_stmts_counter = 0;
9809 unsigned int index = 0;
9810 int first_vec_index = -1;
9811 int second_vec_index = -1;
9812 bool noop_p = true;
9813 *n_perms = 0;
9815 vec_perm_builder mask;
9816 unsigned int nelts_to_build;
9817 unsigned int nvectors_per_build;
9818 unsigned int in_nlanes;
9819 bool repeating_p = (group_size == dr_group_size
9820 && multiple_p (nunits, group_size));
9821 if (repeating_p)
9823 /* A single vector contains a whole number of copies of the node, so:
9824 (a) all permutes can use the same mask; and
9825 (b) the permutes only need a single vector input. */
9826 mask.new_vector (nunits, group_size, 3);
9827 nelts_to_build = mask.encoded_nelts ();
9828 /* It's possible to obtain zero nstmts during analyze_only, so make
9829 it at least one to ensure the later computation for n_perms
9830 proceeds. */
9831 nvectors_per_build = nstmts > 0 ? nstmts : 1;
9832 in_nlanes = dr_group_size * 3;
9834 else
9836 /* We need to construct a separate mask for each vector statement. */
9837 unsigned HOST_WIDE_INT const_nunits, const_vf;
9838 if (!nunits.is_constant (&const_nunits)
9839 || !vf.is_constant (&const_vf))
9840 return false;
9841 mask.new_vector (const_nunits, const_nunits, 1);
9842 nelts_to_build = const_vf * group_size;
9843 nvectors_per_build = 1;
9844 in_nlanes = const_vf * dr_group_size;
9846 auto_sbitmap used_in_lanes (in_nlanes);
9847 bitmap_clear (used_in_lanes);
9848 auto_bitmap used_defs;
9850 unsigned int count = mask.encoded_nelts ();
9851 mask.quick_grow (count);
9852 vec_perm_indices indices;
9854 for (unsigned int j = 0; j < nelts_to_build; j++)
9856 unsigned int iter_num = j / group_size;
9857 unsigned int stmt_num = j % group_size;
9858 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
9859 bitmap_set_bit (used_in_lanes, i);
9860 if (repeating_p)
9862 first_vec_index = 0;
9863 mask_element = i;
9865 else
9867 /* Enforced before the loop when !repeating_p. */
9868 unsigned int const_nunits = nunits.to_constant ();
9869 vec_index = i / const_nunits;
9870 mask_element = i % const_nunits;
9871 if (vec_index == first_vec_index
9872 || first_vec_index == -1)
9874 first_vec_index = vec_index;
9876 else if (vec_index == second_vec_index
9877 || second_vec_index == -1)
9879 second_vec_index = vec_index;
9880 mask_element += const_nunits;
9882 else
9884 if (dump_p)
9885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9886 "permutation requires at "
9887 "least three vectors %G",
9888 stmt_info->stmt);
9889 gcc_assert (analyze_only);
9890 return false;
9893 gcc_assert (mask_element < 2 * const_nunits);
9896 if (mask_element != index)
9897 noop_p = false;
9898 mask[index++] = mask_element;
9900 if (index == count)
9902 if (!noop_p)
9904 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
9905 if (!can_vec_perm_const_p (mode, mode, indices))
9907 if (dump_p)
9909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9910 "unsupported vect permute { ");
9911 for (i = 0; i < count; ++i)
9913 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9914 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9916 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9918 gcc_assert (analyze_only);
9919 return false;
9922 tree mask_vec = NULL_TREE;
9923 if (!analyze_only)
9924 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9926 if (second_vec_index == -1)
9927 second_vec_index = first_vec_index;
9929 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9931 ++*n_perms;
9932 if (analyze_only)
9933 continue;
9934 /* Generate the permute statement if necessary. */
9935 tree first_vec = dr_chain[first_vec_index + ri];
9936 tree second_vec = dr_chain[second_vec_index + ri];
9937 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
9938 tree perm_dest
9939 = vect_create_destination_var (gimple_assign_lhs (stmt),
9940 vectype);
9941 perm_dest = make_ssa_name (perm_dest);
9942 gimple *perm_stmt
9943 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
9944 second_vec, mask_vec);
9945 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
9946 gsi);
9947 if (dce_chain)
9949 bitmap_set_bit (used_defs, first_vec_index + ri);
9950 bitmap_set_bit (used_defs, second_vec_index + ri);
9953 /* Store the vector statement in NODE. */
9954 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
9957 else if (!analyze_only)
9959 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9961 tree first_vec = dr_chain[first_vec_index + ri];
9962 /* If mask was NULL_TREE generate the requested
9963 identity transform. */
9964 if (dce_chain)
9965 bitmap_set_bit (used_defs, first_vec_index + ri);
9967 /* Store the vector statement in NODE. */
9968 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
9972 index = 0;
9973 first_vec_index = -1;
9974 second_vec_index = -1;
9975 noop_p = true;
9979 if (n_loads)
9981 if (repeating_p)
9982 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9983 else
9985 /* Enforced above when !repeating_p. */
9986 unsigned int const_nunits = nunits.to_constant ();
9987 *n_loads = 0;
9988 bool load_seen = false;
9989 for (unsigned i = 0; i < in_nlanes; ++i)
9991 if (i % const_nunits == 0)
9993 if (load_seen)
9994 *n_loads += 1;
9995 load_seen = false;
9997 if (bitmap_bit_p (used_in_lanes, i))
9998 load_seen = true;
10000 if (load_seen)
10001 *n_loads += 1;
10005 if (dce_chain)
10006 for (unsigned i = 0; i < dr_chain.length (); ++i)
10007 if (!bitmap_bit_p (used_defs, i))
10009 tree def = dr_chain[i];
10012 gimple *stmt = SSA_NAME_DEF_STMT (def);
10013 if (is_gimple_assign (stmt)
10014 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
10015 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
10016 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
10017 else
10018 def = NULL;
10019 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
10020 gsi_remove (&rgsi, true);
10021 release_defs (stmt);
10023 while (def);
10026 return true;
10029 /* Generate vector permute statements from a list of loads in DR_CHAIN.
10030 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
10031 permute statements for the SLP node NODE. Store the number of vector
10032 permute instructions in *N_PERMS and the number of vector load
10033 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
10034 that were not needed. */
10036 bool
10037 vect_transform_slp_perm_load (vec_info *vinfo,
10038 slp_tree node, const vec<tree> &dr_chain,
10039 gimple_stmt_iterator *gsi, poly_uint64 vf,
10040 bool analyze_only, unsigned *n_perms,
10041 unsigned int *n_loads, bool dce_chain)
10043 return vect_transform_slp_perm_load_1 (vinfo, node,
10044 SLP_TREE_LOAD_PERMUTATION (node),
10045 dr_chain, gsi, vf, analyze_only,
10046 dump_enabled_p (), n_perms, n_loads,
10047 dce_chain);
10050 /* Produce the next vector result for SLP permutation NODE by adding a vector
10051 statement at GSI. If MASK_VEC is nonnull, add:
10053 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
10055 otherwise add:
10057 <new SSA name> = FIRST_DEF. */
10059 static void
10060 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10061 slp_tree node, tree first_def, tree second_def,
10062 tree mask_vec, poly_uint64 identity_offset)
10064 tree vectype = SLP_TREE_VECTYPE (node);
10066 /* ??? We SLP match existing vector element extracts but
10067 allow punning, which we need to re-instantiate at uses
10068 but have no good way of representing explicitly. */
10069 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
10070 && !types_compatible_p (TREE_TYPE (first_def), vectype))
10072 gassign *conv_stmt
10073 = gimple_build_assign (make_ssa_name (vectype),
10074 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
10075 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10076 first_def = gimple_assign_lhs (conv_stmt);
10078 gassign *perm_stmt;
10079 tree perm_dest = make_ssa_name (vectype);
10080 if (mask_vec)
10082 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
10083 TYPE_SIZE (vectype))
10084 && !types_compatible_p (TREE_TYPE (second_def), vectype))
10086 gassign *conv_stmt
10087 = gimple_build_assign (make_ssa_name (vectype),
10088 build1 (VIEW_CONVERT_EXPR,
10089 vectype, second_def));
10090 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10091 second_def = gimple_assign_lhs (conv_stmt);
10093 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10094 first_def, second_def,
10095 mask_vec);
10097 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10099 /* For identity permutes we still need to handle the case
10100 of offsetted extracts or concats. */
10101 unsigned HOST_WIDE_INT c;
10102 auto first_def_nunits
10103 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10104 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10106 unsigned HOST_WIDE_INT elsz
10107 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10108 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10109 TYPE_SIZE (vectype),
10110 bitsize_int (identity_offset * elsz));
10111 perm_stmt = gimple_build_assign (perm_dest, lowpart);
10113 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10114 first_def_nunits, &c) && c == 2)
10116 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10117 NULL_TREE, second_def);
10118 perm_stmt = gimple_build_assign (perm_dest, ctor);
10120 else
10121 gcc_unreachable ();
10123 else
10125 /* We need a copy here in case the def was external. */
10126 perm_stmt = gimple_build_assign (perm_dest, first_def);
10128 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10129 /* Store the vector statement in NODE. */
10130 node->push_vec_def (perm_stmt);
10133 /* Subroutine of vectorizable_slp_permutation. Check whether the target
10134 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10135 If GSI is nonnull, emit the permutation there.
10137 When GSI is null, the only purpose of NODE is to give properties
10138 of the result, such as the vector type and number of SLP lanes.
10139 The node does not need to be a VEC_PERM_EXPR.
10141 If the target supports the operation, return the number of individual
10142 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10143 dump file if DUMP_P is true. */
10145 static int
10146 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10147 slp_tree node, lane_permutation_t &perm,
10148 vec<slp_tree> &children, bool dump_p)
10150 tree vectype = SLP_TREE_VECTYPE (node);
10152 /* ??? We currently only support inputs that all have the same vector
10153 type, while the SLP IL should really do a concat + select and thus
10154 accept arbitrary mismatches. */
10155 slp_tree child;
10156 unsigned i;
10157 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10158 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10159 tree op_vectype = NULL_TREE;
10160 FOR_EACH_VEC_ELT (children, i, child)
10161 if (SLP_TREE_VECTYPE (child))
10163 op_vectype = SLP_TREE_VECTYPE (child);
10164 break;
10166 if (!op_vectype)
10167 op_vectype = vectype;
10168 FOR_EACH_VEC_ELT (children, i, child)
10170 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10171 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10172 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10173 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10175 if (dump_p)
10176 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10177 "Unsupported vector types in lane permutation\n");
10178 return -1;
10180 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
10181 repeating_p = false;
10184 gcc_assert (perm.length () == SLP_TREE_LANES (node));
10186 /* Load-lanes permute. This permute only acts as a forwarder to
10187 select the correct vector def of the load-lanes load which
10188 has the permuted vectors in its vector defs like
10189 { v0, w0, r0, v1, w1, r1 ... } for a ld3. */
10190 if (node->ldst_lanes)
10192 gcc_assert (children.length () == 1);
10193 if (!gsi)
10194 /* This is a trivial op always supported. */
10195 return 1;
10196 slp_tree child = children[0];
10197 unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10198 / SLP_TREE_LANES (node));
10199 unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
10200 for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10202 tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10203 node->push_vec_def (def);
10205 return 1;
10208 /* REPEATING_P is true if every output vector is guaranteed to use the
10209 same permute vector. We can handle that case for both variable-length
10210 and constant-length vectors, but we only handle other cases for
10211 constant-length vectors.
10213 Set:
10215 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10216 mask vector that we want to build.
10218 - NCOPIES to the number of copies of PERM that we need in order
10219 to build the necessary permute mask vectors.
10221 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
10222 for each permute mask vector. This is only relevant when GSI is
10223 nonnull. */
10224 uint64_t npatterns;
10225 unsigned nelts_per_pattern;
10226 uint64_t ncopies;
10227 unsigned noutputs_per_mask;
10228 if (repeating_p)
10230 /* We need a single permute mask vector that has the form:
10232 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10234 In other words, the original n-element permute in PERM is
10235 "unrolled" to fill a full vector. The stepped vector encoding
10236 that we use for permutes requires 3n elements. */
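      /* Editorial example (illustrative only): a two-lane reversal
         { op0[1], op0[0] } is encoded here with npatterns == 2 and
         nelts_per_pattern == 3 as the series { 1, 0, 3, 2, 5, 4 }, from
         which the mask for any number of two-lane groups follows.  */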
10237 npatterns = SLP_TREE_LANES (node);
10238 nelts_per_pattern = ncopies = 3;
10239 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10241 else
10243 /* Calculate every element of every permute mask vector explicitly,
10244 instead of relying on the pattern described above. */
10245 if (!nunits.is_constant (&npatterns)
10246 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10247 return -1;
10248 nelts_per_pattern = ncopies = 1;
10249 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
10250 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10251 return -1;
10252 noutputs_per_mask = 1;
10254 unsigned olanes = ncopies * SLP_TREE_LANES (node);
10255 gcc_assert (repeating_p || multiple_p (olanes, nunits));
10257 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10258 from the { SLP operand, scalar lane } permutation as recorded in the
10259 SLP node as an intermediate step. This part should already work
10260 with SLP children with an arbitrary number of lanes.
10261 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
10262 auto_vec<unsigned> active_lane;
10263 vperm.create (olanes);
10264 active_lane.safe_grow_cleared (children.length (), true);
10265 for (unsigned i = 0; i < ncopies; ++i)
10267 for (unsigned pi = 0; pi < perm.length (); ++pi)
10269 std::pair<unsigned, unsigned> p = perm[pi];
10270 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10271 if (repeating_p)
10272 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
10273 else
10275 /* We checked above that the vectors are constant-length. */
10276 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
10277 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
10278 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
10279 vperm.quick_push ({{p.first, vi}, vl});
10282 /* Advance to the next group. */
10283 for (unsigned j = 0; j < children.length (); ++j)
10284 active_lane[j] += SLP_TREE_LANES (children[j]);
10287 if (dump_p)
10289 dump_printf_loc (MSG_NOTE, vect_location,
10290 "vectorizing permutation %p", (void *)node);
10291 for (unsigned i = 0; i < perm.length (); ++i)
10292 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10293 if (repeating_p)
10294 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
10295 dump_printf (MSG_NOTE, "\n");
10296 dump_printf_loc (MSG_NOTE, vect_location, "as");
10297 for (unsigned i = 0; i < vperm.length (); ++i)
10299 if (i != 0
10300 && (repeating_p
10301 ? multiple_p (i, npatterns)
10302 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10303 dump_printf (MSG_NOTE, ",");
10304 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
10305 vperm[i].first.first, vperm[i].first.second,
10306 vperm[i].second);
10308 dump_printf (MSG_NOTE, "\n");
10311 /* We can only handle two-vector permutes; everything else should
10312 be lowered on the SLP level. The following is closely inspired
10313 by vect_transform_slp_perm_load and is supposed to eventually
10314 replace it.
10315 ??? As intermediate step do code-gen in the SLP tree representation
10316 somehow? */
10317 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10318 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10319 unsigned int index = 0;
10320 poly_uint64 mask_element;
10321 vec_perm_builder mask;
10322 mask.new_vector (nunits, npatterns, nelts_per_pattern);
10323 unsigned int count = mask.encoded_nelts ();
10324 mask.quick_grow (count);
10325 vec_perm_indices indices;
10326 unsigned nperms = 0;
10327 for (unsigned i = 0; i < vperm.length (); ++i)
10329 mask_element = vperm[i].second;
10330 if (first_vec.first == -1U
10331 || first_vec == vperm[i].first)
10332 first_vec = vperm[i].first;
10333 else if (second_vec.first == -1U
10334 || second_vec == vperm[i].first)
10336 second_vec = vperm[i].first;
10337 mask_element += nunits;
10339 else
10341 if (dump_p)
10342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10343 "permutation requires at "
10344 "least three vectors\n");
10345 gcc_assert (!gsi);
10346 return -1;
10349 mask[index++] = mask_element;
10351 if (index == count)
10353 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10354 TYPE_VECTOR_SUBPARTS (op_vectype));
10355 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10356 && constant_multiple_p (mask[0], nunits));
10357 machine_mode vmode = TYPE_MODE (vectype);
10358 machine_mode op_vmode = TYPE_MODE (op_vectype);
10359 unsigned HOST_WIDE_INT c;
10360 if ((!identity_p
10361 && !can_vec_perm_const_p (vmode, op_vmode, indices))
10362 || (identity_p
10363 && !known_le (nunits,
10364 TYPE_VECTOR_SUBPARTS (op_vectype))
10365 && (!constant_multiple_p (nunits,
10366 TYPE_VECTOR_SUBPARTS (op_vectype),
10367 &c) || c != 2)))
10369 if (dump_p)
10371 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10372 vect_location,
10373 "unsupported vect permute { ");
10374 for (i = 0; i < count; ++i)
10376 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10377 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10379 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10381 gcc_assert (!gsi);
10382 return -1;
10385 if (!identity_p)
10386 nperms++;
10387 if (gsi)
10389 if (second_vec.first == -1U)
10390 second_vec = first_vec;
10392 slp_tree
10393 first_node = children[first_vec.first],
10394 second_node = children[second_vec.first];
10396 tree mask_vec = NULL_TREE;
10397 if (!identity_p)
10398 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10400 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
10402 tree first_def
10403 = vect_get_slp_vect_def (first_node,
10404 first_vec.second + vi);
10405 tree second_def
10406 = vect_get_slp_vect_def (second_node,
10407 second_vec.second + vi);
10408 vect_add_slp_permutation (vinfo, gsi, node, first_def,
10409 second_def, mask_vec, mask[0]);
10413 index = 0;
10414 first_vec = std::make_pair (-1U, -1U);
10415 second_vec = std::make_pair (-1U, -1U);
10419 return nperms;
10422 /* Vectorize the SLP permutations in NODE as specified
10423 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
10424 child number and lane number.
10425 Interleaving of two two-lane two-child SLP subtrees (not supported):
10426 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
10427 A blend of two four-lane two-child SLP subtrees:
10428 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
10429 Highpart of a four-lane one-child SLP subtree (not supported):
10430 [ { 0, 2 }, { 0, 3 } ]
10431 Currently only a subset of these forms is supported by the code generation below. */
10433 static bool
10434 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10435 slp_tree node, stmt_vector_for_cost *cost_vec)
10437 tree vectype = SLP_TREE_VECTYPE (node);
10438 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
10439 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
10440 SLP_TREE_CHILDREN (node),
10441 dump_enabled_p ());
10442 if (nperms < 0)
10443 return false;
10445 if (!gsi)
10446 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
10448 return true;
10451 /* Vectorize SLP NODE. */
10453 static void
10454 vect_schedule_slp_node (vec_info *vinfo,
10455 slp_tree node, slp_instance instance)
10457 gimple_stmt_iterator si;
10458 int i;
10459 slp_tree child;
10461 /* Vectorize externals and constants. */
10462 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
10463 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
10465 /* ??? vectorizable_shift can end up using a scalar operand which is
10466 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
10467 node in this case. */
10468 if (!SLP_TREE_VECTYPE (node))
10469 return;
10471 /* There are two reasons vector defs might already exist. The first
10472 is that we are vectorizing an existing vector def. The second is
10473 that when performing BB vectorization shared constant/external nodes
10474 are not split apart during partitioning, so during the code-gen
10475 DFS walk we can end up visiting them twice. */
10476 if (! SLP_TREE_VEC_DEFS (node).exists ())
10477 vect_create_constant_vectors (vinfo, node);
10478 return;
10481 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
10483 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
10485 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
10486 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
10488 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10489 && STMT_VINFO_DATA_REF (stmt_info))
10491 /* Vectorized loads go before the first scalar load to make it
10492 ready early; vectorized stores go before the last scalar
10493 stmt, which is where all uses are ready. */
10494 stmt_vec_info last_stmt_info = NULL;
10495 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
10496 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
10497 else /* DR_IS_WRITE */
10498 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
10499 si = gsi_for_stmt (last_stmt_info->stmt);
10501 else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10502 && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
10503 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
10504 || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
10506 /* For PHI node vectorization we do not use the insertion iterator. */
10507 si = gsi_none ();
10509 else
10511 /* Emit other stmts after the children's vectorized defs, which is
10512 the earliest possible insertion point. */
10513 gimple *last_stmt = NULL;
10514 bool seen_vector_def = false;
10515 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10516 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
10518 /* For fold-left reductions we are retaining the scalar
10519 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
10520 set so the representation isn't perfect. Resort to the
10521 last scalar def here. */
10522 if (SLP_TREE_VEC_DEFS (child).is_empty ())
10524 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
10525 == cycle_phi_info_type);
10526 gphi *phi = as_a <gphi *>
10527 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
10528 if (!last_stmt
10529 || vect_stmt_dominates_stmt_p (last_stmt, phi))
10530 last_stmt = phi;
10532 /* We are emitting all vectorized stmts in the same place, so the
10533 last def emitted is also the last one in the IL.
10534 ??? Unless we have a load permutation applied and that
10535 happens to re-use an earlier generated load. */
10536 unsigned j;
10537 tree vdef;
10538 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10540 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10541 if (!last_stmt
10542 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10543 last_stmt = vstmt;
10546 else if (!SLP_TREE_VECTYPE (child))
10548 /* For externals used unvectorized, look at all their scalar defs. */
10549 unsigned j;
10550 tree def;
10551 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
10552 if (TREE_CODE (def) == SSA_NAME
10553 && !SSA_NAME_IS_DEFAULT_DEF (def))
10555 gimple *stmt = SSA_NAME_DEF_STMT (def);
10556 if (!last_stmt
10557 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
10558 last_stmt = stmt;
10561 else
10563 /* For externals we have to look at all defs since their
10564 insertion place is decided per vector. But beware
10565 of pre-existing vectors where we need to make sure
10566 we do not insert before the region boundary. */
10567 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
10568 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
10569 seen_vector_def = true;
10570 else
10572 unsigned j;
10573 tree vdef;
10574 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10575 if (TREE_CODE (vdef) == SSA_NAME
10576 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
10578 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10579 if (!last_stmt
10580 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10581 last_stmt = vstmt;
10585 /* This can happen when all children are pre-existing vectors or
10586 constants. */
10587 if (!last_stmt)
10588 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
10589 if (!last_stmt)
10591 gcc_assert (seen_vector_def);
10592 si = gsi_after_labels (vinfo->bbs[0]);
10594 else if (is_ctrl_altering_stmt (last_stmt))
10596 /* We split regions to vectorize at control-altering stmts
10597 with a definition, so this must be an external which
10598 we can insert at the start of the region. */
10599 si = gsi_after_labels (vinfo->bbs[0]);
10601 else if (is_a <bb_vec_info> (vinfo)
10602 && SLP_TREE_CODE (node) != VEC_PERM_EXPR
10603 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
10604 && gimple_could_trap_p (stmt_info->stmt))
10606 /* We've constrained possibly trapping operations to all come
10607 from the same basic-block; if vectorized defs would allow earlier
10608 scheduling, still force vectorized stmts to the original block.
10609 This is only necessary for BB vectorization since for loop vect
10610 all operations are in a single BB and scalar-stmt-based
10611 placement doesn't play well with epilogue vectorization. */
10612 gcc_assert (dominated_by_p (CDI_DOMINATORS,
10613 gimple_bb (stmt_info->stmt),
10614 gimple_bb (last_stmt)));
10615 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
10617 else if (is_a <gphi *> (last_stmt))
10618 si = gsi_after_labels (gimple_bb (last_stmt));
10619 else
10621 si = gsi_for_stmt (last_stmt);
10622 gsi_next (&si);
10624 /* Avoid scheduling internal defs outside of the loop when
10625 we might have only implicitly tracked loop mask/len defs. */
10626 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
10627 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10628 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10630 gimple_stmt_iterator si2
10631 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
10632 if ((gsi_end_p (si2)
10633 && (LOOP_VINFO_LOOP (loop_vinfo)->header
10634 != gimple_bb (last_stmt))
10635 && dominated_by_p (CDI_DOMINATORS,
10636 LOOP_VINFO_LOOP (loop_vinfo)->header,
10637 gimple_bb (last_stmt)))
10638 || (!gsi_end_p (si2)
10639 && last_stmt != *si2
10640 && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
10641 si = si2;
10646 /* Handle purely internal nodes. */
10647 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
10649 if (dump_enabled_p ())
10650 dump_printf_loc (MSG_NOTE, vect_location,
10651 "------>vectorizing SLP permutation node\n");
10652 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
10653 be shared with different SLP nodes (but usually it's the same
10654 operation, apart from the case where the stmt is only there to denote
10655 the actual scalar lane defs ...). So do not call vect_transform_stmt
10656 but open-code it here (partly). */
10657 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
10658 gcc_assert (done);
10659 stmt_vec_info slp_stmt_info;
10660 unsigned int i;
10661 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
10662 if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
10664 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
10665 instance, i, true, NULL);
10666 gcc_assert (done);
10669 else
10671 if (dump_enabled_p ())
10672 dump_printf_loc (MSG_NOTE, vect_location,
10673 "------>vectorizing SLP node starting from: %G",
10674 stmt_info->stmt);
10675 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
10679 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
10680 For loop vectorization this is done in vectorizable_call, but for SLP
10681 it needs to be deferred until end of vect_schedule_slp, because multiple
10682 SLP instances may refer to the same scalar stmt. */
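/* Illustrative example (not taken from any particular testcase): a
 vectorized scalar call in the SLP node such as
 x_1 = foo (a_2, b_3);
 is replaced below by the trivial assignment
 x_1 = 0;
 (zero of the lhs type), or by a GIMPLE_NOP when the call has no lhs. */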
10684 static void
10685 vect_remove_slp_scalar_calls (vec_info *vinfo,
10686 slp_tree node, hash_set<slp_tree> &visited)
10688 gimple *new_stmt;
10689 gimple_stmt_iterator gsi;
10690 int i;
10691 slp_tree child;
10692 tree lhs;
10693 stmt_vec_info stmt_info;
10695 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10696 return;
10698 if (visited.add (node))
10699 return;
10701 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10702 vect_remove_slp_scalar_calls (vinfo, child, visited);
10704 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
10706 if (!stmt_info)
10707 continue;
10708 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
10709 if (!stmt || gimple_bb (stmt) == NULL)
10710 continue;
10711 if (is_pattern_stmt_p (stmt_info)
10712 || !PURE_SLP_STMT (stmt_info))
10713 continue;
10714 lhs = gimple_call_lhs (stmt);
10715 if (lhs)
10716 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
10717 else
10719 new_stmt = gimple_build_nop ();
10720 unlink_stmt_vdef (stmt_info->stmt);
10722 gsi = gsi_for_stmt (stmt);
10723 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
10724 if (lhs)
10725 SSA_NAME_DEF_STMT (lhs) = new_stmt;
10729 static void
10730 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
10732 hash_set<slp_tree> visited;
10733 vect_remove_slp_scalar_calls (vinfo, node, visited);
10736 /* Vectorize the instance root. */
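/* For example (illustrative): for a constructor root
 x = {a_1, b_2, c_3, d_4};
 with a single vector def the root stmt simply becomes
 x = vect_def_1;
 possibly through a VIEW_CONVERT_EXPR when the vector types differ,
 while multiple vector defs are re-assembled into a CONSTRUCTOR of
 the root's vector type. */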
10738 void
10739 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
10741 gassign *rstmt = NULL;
10743 if (instance->kind == slp_inst_kind_ctor)
10745 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
10747 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
10748 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10749 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
10750 TREE_TYPE (vect_lhs)))
10751 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
10752 vect_lhs);
10753 rstmt = gimple_build_assign (root_lhs, vect_lhs);
10755 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
10757 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10758 tree child_def;
10759 int j;
10760 vec<constructor_elt, va_gc> *v;
10761 vec_alloc (v, nelts);
10763 /* A CTOR can handle V16HI composition from VNx8HI so we
10764 do not need to convert vector elements if the types
10765 do not match. */
10766 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
10767 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
10768 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10769 tree rtype
10770 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
10771 tree r_constructor = build_constructor (rtype, v);
10772 rstmt = gimple_build_assign (lhs, r_constructor);
10775 else if (instance->kind == slp_inst_kind_bb_reduc)
10777 /* Largely inspired by reduction chain epilogue handling in
10778 vect_create_epilog_for_reduction. */
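/* Rough sketch of the epilogue built below (illustrative, for a signed
 integer PLUS reduction with two vector defs v0 and v1):
 v0' = VIEW_CONVERT_EXPR <unsigned-vectype> (v0);
 v1' = VIEW_CONVERT_EXPR <unsigned-vectype> (v1);
 sum = v0' + v1';
 scalar = .REDUC_PLUS (sum);
 followed by folding in SLP_INSTANCE_REMAIN_DEFS and converting the
 result back to the original type. */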
10779 vec<tree> vec_defs = vNULL;
10780 vect_get_slp_defs (node, &vec_defs);
10781 enum tree_code reduc_code
10782 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
10783 /* ??? We actually have to reflect signs somewhere. */
10784 if (reduc_code == MINUS_EXPR)
10785 reduc_code = PLUS_EXPR;
10786 gimple_seq epilogue = NULL;
10787 /* We may end up with more than one vector result, reduce them
10788 to one vector. */
10789 tree vec_def = vec_defs[0];
10790 tree vectype = TREE_TYPE (vec_def);
10791 tree compute_vectype = vectype;
10792 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
10793 && TYPE_OVERFLOW_UNDEFINED (vectype)
10794 && operation_can_overflow (reduc_code));
10795 if (pun_for_overflow_p)
10797 compute_vectype = unsigned_type_for (vectype);
10798 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10799 compute_vectype, vec_def);
10801 for (unsigned i = 1; i < vec_defs.length (); ++i)
10803 tree def = vec_defs[i];
10804 if (pun_for_overflow_p)
10805 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10806 compute_vectype, def);
10807 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
10808 vec_def, def);
10810 vec_defs.release ();
10811 /* ??? Support other schemes than direct internal fn. */
10812 internal_fn reduc_fn;
10813 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
10814 || reduc_fn == IFN_LAST)
10815 gcc_unreachable ();
10816 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
10817 TREE_TYPE (compute_vectype), vec_def);
10818 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
10820 tree rem_def = NULL_TREE;
10821 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
10823 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
10824 if (!rem_def)
10825 rem_def = def;
10826 else
10827 rem_def = gimple_build (&epilogue, reduc_code,
10828 TREE_TYPE (scalar_def),
10829 rem_def, def);
10831 scalar_def = gimple_build (&epilogue, reduc_code,
10832 TREE_TYPE (scalar_def),
10833 scalar_def, rem_def);
10835 scalar_def = gimple_convert (&epilogue,
10836 TREE_TYPE (vectype), scalar_def);
10837 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10838 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
10839 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
10840 update_stmt (gsi_stmt (rgsi));
10841 return;
10843 else
10844 gcc_unreachable ();
10846 gcc_assert (rstmt);
10848 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10849 gsi_replace (&rgsi, rstmt, true);
10852 struct slp_scc_info
10853 {
10854 bool on_stack;
10855 int dfs;
10856 int lowlink;
10857 };
10859 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
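/* This is in essence Tarjan's SCC algorithm: info->dfs records the DFS
 preorder number, info->lowlink the smallest preorder number reachable
 from the node, and a node closes an SCC once info->lowlink == info->dfs
 after visiting all children. Non-internal leafs are scheduled
 immediately and never pushed onto the stack (summary for orientation
 only). */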
10861 static void
10862 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
10863 hash_map<slp_tree, slp_scc_info> &scc_info,
10864 int &maxdfs, vec<slp_tree> &stack)
10866 bool existed_p;
10867 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
10868 gcc_assert (!existed_p);
10869 info->dfs = maxdfs;
10870 info->lowlink = maxdfs;
10871 maxdfs++;
10873 /* Leaf. */
10874 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10876 info->on_stack = false;
10877 vect_schedule_slp_node (vinfo, node, instance);
10878 return;
10881 info->on_stack = true;
10882 stack.safe_push (node);
10884 unsigned i;
10885 slp_tree child;
10886 /* DFS recurse. */
10887 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10889 if (!child)
10890 continue;
10891 slp_scc_info *child_info = scc_info.get (child);
10892 if (!child_info)
10894 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
10895 /* Recursion might have re-allocated the node. */
10896 info = scc_info.get (node);
10897 child_info = scc_info.get (child);
10898 info->lowlink = MIN (info->lowlink, child_info->lowlink);
10900 else if (child_info->on_stack)
10901 info->lowlink = MIN (info->lowlink, child_info->dfs);
10903 if (info->lowlink != info->dfs)
10904 return;
10906 auto_vec<slp_tree, 4> phis_to_fixup;
10908 /* Singleton. */
10909 if (stack.last () == node)
10911 stack.pop ();
10912 info->on_stack = false;
10913 vect_schedule_slp_node (vinfo, node, instance);
10914 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10915 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
10916 phis_to_fixup.quick_push (node);
10918 else
10920 /* SCC. */
10921 int last_idx = stack.length () - 1;
10922 while (stack[last_idx] != node)
10923 last_idx--;
10924 /* We can break the cycle at PHIs which have at least one child
10925 code generated. Then we could re-start the DFS walk until
10926 all nodes in the SCC are covered (we might have new entries
10927 for only back-reachable nodes). But it's simpler to just
10928 iterate and schedule those that are ready. */
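/* For example (illustrative): for a simple reduction SCC consisting of
 a PHI and an ADD, the PHI is ready first because its preheader child
 is already code generated, the ADD becomes ready once the PHI is off
 the stack, and the missing backedge PHI argument is filled in by the
 fixup loop at the end of this function. */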
10929 unsigned todo = stack.length () - last_idx;
10932 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
10934 slp_tree entry = stack[idx];
10935 if (!entry)
10936 continue;
10937 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
10938 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
10939 bool ready = !phi;
10940 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
10941 if (!child)
10943 gcc_assert (phi);
10944 ready = true;
10945 break;
10947 else if (scc_info.get (child)->on_stack)
10949 if (!phi)
10951 ready = false;
10952 break;
10955 else
10957 if (phi)
10959 ready = true;
10960 break;
10963 if (ready)
10965 vect_schedule_slp_node (vinfo, entry, instance);
10966 scc_info.get (entry)->on_stack = false;
10967 stack[idx] = NULL;
10968 todo--;
10969 if (phi)
10970 phis_to_fixup.safe_push (entry);
10974 while (todo != 0);
10976 /* Pop the SCC. */
10977 stack.truncate (last_idx);
10980 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
10981 slp_tree phi_node;
10982 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
10984 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
10985 edge_iterator ei;
10986 edge e;
10987 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
10989 unsigned dest_idx = e->dest_idx;
10990 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
10991 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
10992 continue;
10993 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
10994 /* Simply fill all args. */
10995 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
10996 != vect_first_order_recurrence)
10997 for (unsigned i = 0; i < n; ++i)
10999 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
11000 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11001 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
11002 e, gimple_phi_arg_location (phi, dest_idx));
11004 else
11006 /* Unless it is a first order recurrence which needs
11007 args filled in for both the PHI node and the permutes. */
11008 gimple *perm
11009 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
11010 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
11011 add_phi_arg (as_a <gphi *> (rphi),
11012 vect_get_slp_vect_def (child, n - 1),
11013 e, gimple_phi_arg_location (phi, dest_idx));
11014 for (unsigned i = 0; i < n; ++i)
11016 gimple *perm
11017 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
11018 if (i > 0)
11019 gimple_assign_set_rhs1 (perm,
11020 vect_get_slp_vect_def (child, i - 1));
11021 gimple_assign_set_rhs2 (perm,
11022 vect_get_slp_vect_def (child, i));
11023 update_stmt (perm);
11030 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
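/* This proceeds in two passes over SLP_INSTANCES (summary): first each
 instance tree is scheduled via vect_schedule_scc and any instance
 root stmt is vectorized, then scalar calls (for loop vectorization)
 and the original scalar stmts of vectorized stores are removed. */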
11032 void
11033 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
11035 slp_instance instance;
11036 unsigned int i;
11038 hash_map<slp_tree, slp_scc_info> scc_info;
11039 int maxdfs = 0;
11040 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11042 slp_tree node = SLP_INSTANCE_TREE (instance);
11043 if (dump_enabled_p ())
11045 dump_printf_loc (MSG_NOTE, vect_location,
11046 "Vectorizing SLP tree:\n");
11047 /* ??? Dump all? */
11048 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11049 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
11050 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
11051 vect_print_slp_graph (MSG_NOTE, vect_location,
11052 SLP_INSTANCE_TREE (instance));
11054 /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
11055 have a PHI be the node breaking the cycle. */
11056 auto_vec<slp_tree> stack;
11057 if (!scc_info.get (node))
11058 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
11060 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11061 vectorize_slp_instance_root_stmt (node, instance);
11063 if (dump_enabled_p ())
11064 dump_printf_loc (MSG_NOTE, vect_location,
11065 "vectorizing stmts using SLP.\n");
11068 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11070 slp_tree root = SLP_INSTANCE_TREE (instance);
11071 stmt_vec_info store_info;
11072 unsigned int j;
11074 /* Remove scalar call stmts. Do not do this for basic-block
11075 vectorization as not all uses may be vectorized.
11076 ??? Why should this be necessary? DCE should be able to
11077 remove the stmts itself.
11078 ??? For BB vectorization we can as well remove scalar
11079 stmts starting from the SLP tree root if they have no
11080 uses. */
11081 if (is_a <loop_vec_info> (vinfo))
11082 vect_remove_slp_scalar_calls (vinfo, root);
11084 /* Remove the original scalar stmts of vectorized stores. */
11085 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
11087 if (!STMT_VINFO_DATA_REF (store_info)
11088 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
11089 break;
11091 store_info = vect_orig_stmt (store_info);
11092 /* Free the attached stmt_vec_info and remove the stmt. */
11093 vinfo->remove_stmt (store_info);
11095 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
11096 so we do not crash in vect_free_slp_tree later. */
11097 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11098 SLP_TREE_REPRESENTATIVE (root) = NULL;