[official-gcc.git] / gcc / tree-vect-slp.cc
blob 1342913affa1e65fb902b9577f556a3ea17dbd02
1 /* SLP - Basic Block Vectorization
2 Copyright (C) 2007-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #define INCLUDE_ALGORITHM
24 #define INCLUDE_MEMORY
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "gimple.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "insn-config.h"
36 #include "recog.h" /* FIXME: for insn_data */
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "gimple-iterator.h"
40 #include "cfgloop.h"
41 #include "tree-vectorizer.h"
42 #include "langhooks.h"
43 #include "gimple-walk.h"
44 #include "dbgcnt.h"
45 #include "tree-vector-builder.h"
46 #include "vec-perm-indices.h"
47 #include "gimple-fold.h"
48 #include "internal-fn.h"
49 #include "dump-context.h"
50 #include "cfganal.h"
51 #include "tree-eh.h"
52 #include "tree-cfg.h"
53 #include "alloc-pool.h"
54 #include "sreal.h"
55 #include "predict.h"
57 static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
58 load_permutation_t &,
59 const vec<tree> &,
60 gimple_stmt_iterator *,
61 poly_uint64, bool, bool,
62 unsigned *,
63 unsigned * = nullptr,
64 bool = false);
65 static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
66 slp_tree, lane_permutation_t &,
67 vec<slp_tree> &, bool);
68 static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
69 slp_tree, stmt_vector_for_cost *);
70 static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
72 static object_allocator<_slp_tree> *slp_tree_pool;
73 static slp_tree slp_first_node;
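/* Every SLP node is allocated from SLP_TREE_POOL and also linked into the
   list headed by SLP_FIRST_NODE (see the _slp_tree constructor below), so
   that vect_slp_fini can reclaim any nodes still live when the pass
   finishes.  */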
75 void
76 vect_slp_init (void)
78 slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
81 void
82 vect_slp_fini (void)
84 while (slp_first_node)
85 delete slp_first_node;
86 delete slp_tree_pool;
87 slp_tree_pool = NULL;
90 void *
91 _slp_tree::operator new (size_t n)
93 gcc_assert (n == sizeof (_slp_tree));
94 return slp_tree_pool->allocate_raw ();
97 void
98 _slp_tree::operator delete (void *node, size_t n)
100 gcc_assert (n == sizeof (_slp_tree));
101 slp_tree_pool->remove_raw (node);
105 /* Initialize a SLP node. */
107 _slp_tree::_slp_tree ()
109 this->prev_node = NULL;
110 if (slp_first_node)
111 slp_first_node->prev_node = this;
112 this->next_node = slp_first_node;
113 slp_first_node = this;
114 SLP_TREE_SCALAR_STMTS (this) = vNULL;
115 SLP_TREE_SCALAR_OPS (this) = vNULL;
116 SLP_TREE_VEC_DEFS (this) = vNULL;
117 SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
118 SLP_TREE_CHILDREN (this) = vNULL;
119 SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
120 SLP_TREE_LANE_PERMUTATION (this) = vNULL;
121 SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
122 SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
123 SLP_TREE_CODE (this) = ERROR_MARK;
124 this->ldst_lanes = false;
125 SLP_TREE_VECTYPE (this) = NULL_TREE;
126 SLP_TREE_REPRESENTATIVE (this) = NULL;
127 SLP_TREE_REF_COUNT (this) = 1;
128 this->failed = NULL;
129 this->max_nunits = 1;
130 this->lanes = 0;
133 /* Tear down a SLP node. */
135 _slp_tree::~_slp_tree ()
137 if (this->prev_node)
138 this->prev_node->next_node = this->next_node;
139 else
140 slp_first_node = this->next_node;
141 if (this->next_node)
142 this->next_node->prev_node = this->prev_node;
143 SLP_TREE_CHILDREN (this).release ();
144 SLP_TREE_SCALAR_STMTS (this).release ();
145 SLP_TREE_SCALAR_OPS (this).release ();
146 SLP_TREE_VEC_DEFS (this).release ();
147 SLP_TREE_LOAD_PERMUTATION (this).release ();
148 SLP_TREE_LANE_PERMUTATION (this).release ();
149 SLP_TREE_SIMD_CLONE_INFO (this).release ();
150 if (this->failed)
151 free (failed);
154 /* Push the single SSA definition in DEF to the vector of vector defs. */
156 void
157 _slp_tree::push_vec_def (gimple *def)
159 if (gphi *phi = dyn_cast <gphi *> (def))
160 vec_defs.quick_push (gimple_phi_result (phi));
161 else
163 def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
164 vec_defs.quick_push (get_def_from_ptr (defop));
168 /* Recursively free the memory allocated for the SLP tree rooted at NODE. */
170 void
171 vect_free_slp_tree (slp_tree node)
173 int i;
174 slp_tree child;
176 if (--SLP_TREE_REF_COUNT (node) != 0)
177 return;
179 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
180 if (child)
181 vect_free_slp_tree (child);
183 /* If the node defines any SLP only patterns then those patterns are no
184 longer valid and should be removed. */
185 stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
186 if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
188 stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
189 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
190 STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
193 delete node;
196 /* Return a location suitable for dumps related to the SLP instance. */
198 dump_user_location_t
199 _slp_instance::location () const
201 if (!root_stmts.is_empty ())
202 return root_stmts[0]->stmt;
203 else
204 return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
208 /* Free the memory allocated for the SLP instance. */
210 void
211 vect_free_slp_instance (slp_instance instance)
213 vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
214 SLP_INSTANCE_LOADS (instance).release ();
215 SLP_INSTANCE_ROOT_STMTS (instance).release ();
216 SLP_INSTANCE_REMAIN_DEFS (instance).release ();
217 instance->subgraph_entries.release ();
218 instance->cost_vec.release ();
219 free (instance);
223 /* Create an SLP node for SCALAR_STMTS. */
225 slp_tree
226 vect_create_new_slp_node (unsigned nops, tree_code code)
228 slp_tree node = new _slp_tree;
229 SLP_TREE_SCALAR_STMTS (node) = vNULL;
230 SLP_TREE_CHILDREN (node).create (nops);
231 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
232 SLP_TREE_CODE (node) = code;
233 return node;
235 /* Create an SLP node for SCALAR_STMTS. */
237 static slp_tree
238 vect_create_new_slp_node (slp_tree node,
239 vec<stmt_vec_info> scalar_stmts, unsigned nops)
241 SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
242 SLP_TREE_CHILDREN (node).create (nops);
243 SLP_TREE_DEF_TYPE (node) = vect_internal_def;
244 SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
245 SLP_TREE_LANES (node) = scalar_stmts.length ();
246 return node;
249 /* Create an SLP node for SCALAR_STMTS. */
251 static slp_tree
252 vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
254 return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
257 /* Create an SLP node for OPS. */
259 static slp_tree
260 vect_create_new_slp_node (slp_tree node, vec<tree> ops)
262 SLP_TREE_SCALAR_OPS (node) = ops;
263 SLP_TREE_DEF_TYPE (node) = vect_external_def;
264 SLP_TREE_LANES (node) = ops.length ();
265 return node;
268 /* Create an SLP node for OPS. */
270 static slp_tree
271 vect_create_new_slp_node (vec<tree> ops)
273 return vect_create_new_slp_node (new _slp_tree, ops);
277 /* This structure is used in creation of an SLP tree. Each instance
278 corresponds to the same operand in a group of scalar stmts in an SLP
279 node. */
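/* For instance, for the two-lane group { a0 = b0 + c0; a1 = b1 + c1 } two
   such instances are created: one collecting the defs of b0 and b1, the
   other those of c0 and c1.  */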
280 typedef struct _slp_oprnd_info
282 /* Def-stmts for the operands. */
283 vec<stmt_vec_info> def_stmts;
284 /* Operands. */
285 vec<tree> ops;
286 /* Information about the first statement, its vector def-type, type, the
287 operand itself in case it's constant, and an indication if it's a pattern
288 stmt and gather/scatter info. */
289 tree first_op_type;
290 enum vect_def_type first_dt;
291 bool any_pattern;
292 bool first_gs_p;
293 gather_scatter_info first_gs_info;
294 } *slp_oprnd_info;
297 /* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
298 operand. */
299 static vec<slp_oprnd_info>
300 vect_create_oprnd_info (int nops, int group_size)
302 int i;
303 slp_oprnd_info oprnd_info;
304 vec<slp_oprnd_info> oprnds_info;
306 oprnds_info.create (nops);
307 for (i = 0; i < nops; i++)
309 oprnd_info = XNEW (struct _slp_oprnd_info);
310 oprnd_info->def_stmts.create (group_size);
311 oprnd_info->ops.create (group_size);
312 oprnd_info->first_dt = vect_uninitialized_def;
313 oprnd_info->first_op_type = NULL_TREE;
314 oprnd_info->any_pattern = false;
315 oprnd_info->first_gs_p = false;
316 oprnds_info.quick_push (oprnd_info);
319 return oprnds_info;
323 /* Free operands info. */
325 static void
326 vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
328 int i;
329 slp_oprnd_info oprnd_info;
331 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
333 oprnd_info->def_stmts.release ();
334 oprnd_info->ops.release ();
335 XDELETE (oprnd_info);
338 oprnds_info.release ();
341 /* Return the execution frequency of NODE (so that a higher value indicates
342 a "more important" node when optimizing for speed). */
344 static sreal
345 vect_slp_node_weight (slp_tree node)
347 stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
348 basic_block bb = gimple_bb (stmt_info->stmt);
349 return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
352 /* Return true if STMTS contains a pattern statement. */
354 static bool
355 vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
357 stmt_vec_info stmt_info;
358 unsigned int i;
359 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
360 if (stmt_info && is_pattern_stmt_p (stmt_info))
361 return true;
362 return false;
365 /* Return true when all lanes in the external or constant NODE have
366 the same value. */
368 static bool
369 vect_slp_tree_uniform_p (slp_tree node)
371 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
372 || SLP_TREE_DEF_TYPE (node) == vect_external_def);
374 /* Pre-existing vectors. */
375 if (SLP_TREE_SCALAR_OPS (node).is_empty ())
376 return false;
378 unsigned i;
379 tree op, first = NULL_TREE;
380 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
381 if (!first)
382 first = op;
383 else if (!operand_equal_p (first, op, 0))
384 return false;
386 return true;
389 /* Find the place of the data-ref in STMT_INFO in the interleaving chain
390 that starts from FIRST_STMT_INFO. Return -1 if the data-ref is not a part
391 of the chain. */
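/* For a contiguous interleaving group the place is simply the element's
   index within the group; gaps recorded in DR_GROUP_GAP increase the
   place accordingly.  */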
394 vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
395 stmt_vec_info first_stmt_info)
397 stmt_vec_info next_stmt_info = first_stmt_info;
398 int result = 0;
400 if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
401 return -1;
405 if (next_stmt_info == stmt_info)
406 return result;
407 next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
408 if (next_stmt_info)
409 result += DR_GROUP_GAP (next_stmt_info);
411 while (next_stmt_info);
413 return -1;
416 /* Check whether it is possible to load COUNT elements of type ELT_TYPE
417 using the method implemented by duplicate_and_interleave. Return true
418 if so, returning the number of intermediate vectors in *NVECTORS_OUT
419 (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
420 (if nonnull). */
422 bool
423 can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
424 tree elt_type, unsigned int *nvectors_out,
425 tree *vector_type_out,
426 tree *permutes)
428 tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
429 if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
430 return false;
432 machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
433 poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
434 unsigned int nvectors = 1;
435 for (;;)
437 scalar_int_mode int_mode;
438 poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
439 if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
441 /* Get the natural vector type for this SLP group size. */
442 tree int_type = build_nonstandard_integer_type
443 (GET_MODE_BITSIZE (int_mode), 1);
444 tree vector_type
445 = get_vectype_for_scalar_type (vinfo, int_type, count);
446 poly_int64 half_nelts;
447 if (vector_type
448 && VECTOR_MODE_P (TYPE_MODE (vector_type))
449 && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
450 GET_MODE_SIZE (base_vector_mode))
451 && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
452 2, &half_nelts))
454 /* Try fusing consecutive sequences of COUNT / NVECTORS elements
455 together into elements of type INT_TYPE and using the result
456 to build NVECTORS vectors. */
457 poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
458 vec_perm_builder sel1 (nelts, 2, 3);
459 vec_perm_builder sel2 (nelts, 2, 3);
461 for (unsigned int i = 0; i < 3; ++i)
463 sel1.quick_push (i);
464 sel1.quick_push (i + nelts);
465 sel2.quick_push (half_nelts + i);
466 sel2.quick_push (half_nelts + i + nelts);
468 vec_perm_indices indices1 (sel1, 2, nelts);
469 vec_perm_indices indices2 (sel2, 2, nelts);
470 machine_mode vmode = TYPE_MODE (vector_type);
471 if (can_vec_perm_const_p (vmode, vmode, indices1)
472 && can_vec_perm_const_p (vmode, vmode, indices2))
474 if (nvectors_out)
475 *nvectors_out = nvectors;
476 if (vector_type_out)
477 *vector_type_out = vector_type;
478 if (permutes)
480 permutes[0] = vect_gen_perm_mask_checked (vector_type,
481 indices1);
482 permutes[1] = vect_gen_perm_mask_checked (vector_type,
483 indices2);
485 return true;
489 if (!multiple_p (elt_bytes, 2, &elt_bytes))
490 return false;
491 nvectors *= 2;
495 /* Return true if DTA and DTB match. */
497 static bool
498 vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
500 return (dta == dtb
501 || ((dta == vect_external_def || dta == vect_constant_def)
502 && (dtb == vect_external_def || dtb == vect_constant_def)));
505 static const int cond_expr_maps[3][5] = {
506 { 4, -1, -2, 1, 2 },
507 { 4, -2, -1, 1, 2 },
508 { 4, -1, -2, 2, 1 }
510 static const int arg0_map[] = { 1, 0 };
511 static const int arg1_map[] = { 1, 1 };
512 static const int arg2_map[] = { 1, 2 };
513 static const int arg1_arg4_map[] = { 2, 1, 4 };
514 static const int arg3_arg2_map[] = { 2, 3, 2 };
515 static const int op1_op0_map[] = { 2, 1, 0 };
516 static const int off_map[] = { 1, -3 };
517 static const int off_op0_map[] = { 2, -3, 0 };
518 static const int off_arg2_map[] = { 2, -3, 2 };
519 static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
520 static const int mask_call_maps[6][7] = {
521 { 1, 1, },
522 { 2, 1, 2, },
523 { 3, 1, 2, 3, },
524 { 4, 1, 2, 3, 4, },
525 { 5, 1, 2, 3, 4, 5, },
526 { 6, 1, 2, 3, 4, 5, 6 },
529 /* For most SLP statements, there is a one-to-one mapping between
530 gimple arguments and child nodes. If that is not true for STMT,
531 return an array that contains:
533 - the number of child nodes, followed by
534 - for each child node, the index of the argument associated with that node.
535 The special index -1 is the first operand of an embedded comparison and
536 the special index -2 is the second operand of an embedded comparison.
537 The special index -3 is the offset of a gather as analyzed by
538 vect_check_gather_scatter.
540 SWAP is as for vect_get_and_check_slp_defs. */
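/* For example, arg2_map above reads { 1, 2 }: one SLP child node,
   corresponding to call argument 2 (the mask of an IFN_MASK_LOAD).
   Likewise cond_expr_maps[0] reads { 4, -1, -2, 1, 2 }: four children,
   the first two being the operands of the embedded comparison and the
   remaining two mapping to gimple arguments 1 and 2 (the then and else
   values) of the COND_EXPR.  */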
542 static const int *
543 vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
544 unsigned char swap = 0)
546 if (auto assign = dyn_cast<const gassign *> (stmt))
548 if (gimple_assign_rhs_code (assign) == COND_EXPR
549 && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
550 return cond_expr_maps[swap];
551 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
552 && swap)
553 return op1_op0_map;
554 if (gather_scatter_p)
555 return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
556 ? off_op0_map : off_map);
558 gcc_assert (!swap);
559 if (auto call = dyn_cast<const gcall *> (stmt))
561 if (gimple_call_internal_p (call))
562 switch (gimple_call_internal_fn (call))
564 case IFN_MASK_LOAD:
565 return gather_scatter_p ? off_arg2_map : arg2_map;
567 case IFN_GATHER_LOAD:
568 return arg1_map;
570 case IFN_MASK_GATHER_LOAD:
571 case IFN_MASK_LEN_GATHER_LOAD:
572 return arg1_arg4_map;
574 case IFN_MASK_STORE:
575 return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;
577 case IFN_MASK_CALL:
579 unsigned nargs = gimple_call_num_args (call);
580 if (nargs >= 2 && nargs <= 7)
581 return mask_call_maps[nargs-2];
582 else
583 return nullptr;
586 case IFN_CLZ:
587 case IFN_CTZ:
588 return arg0_map;
590 default:
591 break;
594 return nullptr;
597 /* Return the SLP node child index for operand OP of STMT. */
600 vect_slp_child_index_for_operand (const gimple *stmt, int op,
601 bool gather_scatter_p)
603 const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
604 if (!opmap)
605 return op;
606 for (int i = 1; i < 1 + opmap[0]; ++i)
607 if (opmap[i] == op)
608 return i - 1;
609 gcc_unreachable ();
612 /* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
613 they are of a valid type and that they match the defs of the first stmt of
614 the SLP group (stored in OPRNDS_INFO). This function tries to match stmts
615 by swapping operands of STMTS[STMT_NUM] when possible. Non-zero SWAP
616 indicates swap is required for cond_expr stmts. Specifically, SWAP
617 is 1 if STMT is cond and operands of comparison need to be swapped;
618 SWAP is 2 if STMT is cond and code of comparison needs to be inverted.
620 If there was a fatal error return -1; if the error could be corrected by
621 swapping operands of father node of this one, return 1; if everything is
622 ok return 0. */
623 static int
624 vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
625 bool *skip_args,
626 vec<stmt_vec_info> stmts, unsigned stmt_num,
627 vec<slp_oprnd_info> *oprnds_info)
629 stmt_vec_info stmt_info = stmts[stmt_num];
630 tree oprnd;
631 unsigned int i, number_of_oprnds;
632 enum vect_def_type dt = vect_uninitialized_def;
633 slp_oprnd_info oprnd_info;
634 gather_scatter_info gs_info;
635 unsigned int gs_op = -1u;
636 unsigned int commutative_op = -1U;
637 bool first = stmt_num == 0;
639 if (!is_a<gcall *> (stmt_info->stmt)
640 && !is_a<gassign *> (stmt_info->stmt)
641 && !is_a<gphi *> (stmt_info->stmt))
642 return -1;
644 number_of_oprnds = gimple_num_args (stmt_info->stmt);
645 const int *map
646 = vect_get_operand_map (stmt_info->stmt,
647 STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
648 if (map)
649 number_of_oprnds = *map++;
650 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
652 if (gimple_call_internal_p (stmt))
654 internal_fn ifn = gimple_call_internal_fn (stmt);
655 commutative_op = first_commutative_argument (ifn);
658 else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
660 if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
661 commutative_op = 0;
664 bool swapped = (swap != 0);
665 bool backedge = false;
666 enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
667 for (i = 0; i < number_of_oprnds; i++)
669 oprnd_info = (*oprnds_info)[i];
670 int opno = map ? map[i] : int (i);
671 if (opno == -3)
673 gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
674 if (!is_a <loop_vec_info> (vinfo)
675 || !vect_check_gather_scatter (stmt_info,
676 as_a <loop_vec_info> (vinfo),
677 first ? &oprnd_info->first_gs_info
678 : &gs_info))
679 return -1;
681 if (first)
683 oprnd_info->first_gs_p = true;
684 oprnd = oprnd_info->first_gs_info.offset;
686 else
688 gs_op = i;
689 oprnd = gs_info.offset;
692 else if (opno < 0)
693 oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
694 else
696 oprnd = gimple_arg (stmt_info->stmt, opno);
697 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
699 edge e = gimple_phi_arg_edge (stmt, opno);
700 backedge = (is_a <bb_vec_info> (vinfo)
701 ? e->flags & EDGE_DFS_BACK
702 : dominated_by_p (CDI_DOMINATORS, e->src,
703 gimple_bb (stmt_info->stmt)));
706 if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
707 oprnd = TREE_OPERAND (oprnd, 0);
709 stmt_vec_info def_stmt_info;
710 if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
712 if (dump_enabled_p ())
713 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
714 "Build SLP failed: can't analyze def for %T\n",
715 oprnd);
717 return -1;
720 if (skip_args[i])
722 oprnd_info->def_stmts.quick_push (NULL);
723 oprnd_info->ops.quick_push (NULL_TREE);
724 oprnd_info->first_dt = vect_uninitialized_def;
725 continue;
728 oprnd_info->def_stmts.quick_push (def_stmt_info);
729 oprnd_info->ops.quick_push (oprnd);
731 if (def_stmt_info
732 && is_pattern_stmt_p (def_stmt_info))
734 if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
735 != def_stmt_info)
736 oprnd_info->any_pattern = true;
737 else
738 /* If we promote this to external use the original stmt def. */
739 oprnd_info->ops.last ()
740 = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
743 /* If there's an extern def on a backedge make sure we can
744 code-generate at the region start.
745 ??? This is another case that could be fixed by adjusting
746 how we split the function but at the moment we'd have conflicting
747 goals there. */
748 if (backedge
749 && dts[i] == vect_external_def
750 && is_a <bb_vec_info> (vinfo)
751 && TREE_CODE (oprnd) == SSA_NAME
752 && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
753 && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
754 gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
756 if (dump_enabled_p ())
757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
758 "Build SLP failed: extern def %T only defined "
759 "on backedge\n", oprnd);
760 return -1;
763 if (first)
765 tree type = TREE_TYPE (oprnd);
766 dt = dts[i];
768 /* For the swapping logic below force vect_reduction_def
769 for the reduction op in a SLP reduction group. */
770 if (!STMT_VINFO_DATA_REF (stmt_info)
771 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
772 && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
773 && def_stmt_info)
774 dts[i] = dt = vect_reduction_def;
776 /* Check the types of the definition. */
777 switch (dt)
779 case vect_external_def:
780 case vect_constant_def:
781 case vect_internal_def:
782 case vect_reduction_def:
783 case vect_double_reduction_def:
784 case vect_induction_def:
785 case vect_nested_cycle:
786 case vect_first_order_recurrence:
787 break;
789 default:
790 /* FORNOW: Not supported. */
791 if (dump_enabled_p ())
792 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
793 "Build SLP failed: illegal type of def %T\n",
794 oprnd);
795 return -1;
798 oprnd_info->first_dt = dt;
799 oprnd_info->first_op_type = type;
802 if (first)
803 return 0;
805 /* Now match the operand definition types to that of the first stmt. */
806 for (i = 0; i < number_of_oprnds;)
808 if (skip_args[i])
810 ++i;
811 continue;
814 oprnd_info = (*oprnds_info)[i];
815 dt = dts[i];
816 stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
817 oprnd = oprnd_info->ops[stmt_num];
818 tree type = TREE_TYPE (oprnd);
820 if (!types_compatible_p (oprnd_info->first_op_type, type))
822 if (dump_enabled_p ())
823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
824 "Build SLP failed: different operand types\n");
825 return 1;
828 if ((gs_op == i) != oprnd_info->first_gs_p)
830 if (dump_enabled_p ())
831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
832 "Build SLP failed: mixed gather and non-gather\n");
833 return 1;
835 else if (gs_op == i)
837 if (!operand_equal_p (oprnd_info->first_gs_info.base,
838 gs_info.base))
840 if (dump_enabled_p ())
841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
842 "Build SLP failed: different gather base\n");
843 return 1;
845 if (oprnd_info->first_gs_info.scale != gs_info.scale)
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
849 "Build SLP failed: different gather scale\n");
850 return 1;
854 /* Not first stmt of the group, check that the def-stmt/s match
855 the def-stmt/s of the first stmt. Allow different definition
856 types for reduction chains: the first stmt must be a
857 vect_reduction_def (a phi node), and the rest
858 end in the reduction chain. */
859 if ((!vect_def_types_match (oprnd_info->first_dt, dt)
860 && !(oprnd_info->first_dt == vect_reduction_def
861 && !STMT_VINFO_DATA_REF (stmt_info)
862 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
863 && def_stmt_info
864 && !STMT_VINFO_DATA_REF (def_stmt_info)
865 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
866 == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
867 || (!STMT_VINFO_DATA_REF (stmt_info)
868 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
869 && ((!def_stmt_info
870 || STMT_VINFO_DATA_REF (def_stmt_info)
871 || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
872 != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
873 != (oprnd_info->first_dt != vect_reduction_def))))
875 /* Try swapping operands if we got a mismatch. For BB
876 vectorization only in case it will clearly improve things. */
877 if (i == commutative_op && !swapped
878 && (!is_a <bb_vec_info> (vinfo)
879 || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
880 dts[i+1])
881 && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
882 || vect_def_types_match
883 ((*oprnds_info)[i+1]->first_dt, dts[i])))))
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "trying swapped operands\n");
888 std::swap (dts[i], dts[i+1]);
889 std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
890 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
891 std::swap ((*oprnds_info)[i]->ops[stmt_num],
892 (*oprnds_info)[i+1]->ops[stmt_num]);
893 /* After swapping some operands we lost track whether an
894 operand has any pattern defs so be conservative here. */
895 if ((*oprnds_info)[i]->any_pattern
896 || (*oprnds_info)[i+1]->any_pattern)
897 (*oprnds_info)[i]->any_pattern
898 = (*oprnds_info)[i+1]->any_pattern = true;
899 swapped = true;
900 continue;
903 if (is_a <bb_vec_info> (vinfo)
904 && !oprnd_info->any_pattern)
906 /* Now for commutative ops we should see whether we can
907 make the other operand matching. */
908 if (dump_enabled_p ())
909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
910 "treating operand as external\n");
911 oprnd_info->first_dt = dt = vect_external_def;
913 else
915 if (dump_enabled_p ())
916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
917 "Build SLP failed: different types\n");
918 return 1;
922 /* Make sure to demote the overall operand to external. */
923 if (dt == vect_external_def)
924 oprnd_info->first_dt = vect_external_def;
925 /* For a SLP reduction chain we want to duplicate the reduction to
926 each of the chain members. That gets us a sane SLP graph (still
927 the stmts are not 100% correct wrt the initial values). */
928 else if ((dt == vect_internal_def
929 || dt == vect_reduction_def)
930 && oprnd_info->first_dt == vect_reduction_def
931 && !STMT_VINFO_DATA_REF (stmt_info)
932 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
933 && !STMT_VINFO_DATA_REF (def_stmt_info)
934 && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
935 == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
937 oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
938 oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
941 ++i;
944 /* Swap operands. */
945 if (swapped)
947 if (dump_enabled_p ())
948 dump_printf_loc (MSG_NOTE, vect_location,
949 "swapped operands to match def types in %G",
950 stmt_info->stmt);
953 return 0;
956 /* Return true if call statements CALL1 and CALL2 are similar enough
957 to be combined into the same SLP group. */
959 bool
960 compatible_calls_p (gcall *call1, gcall *call2)
962 unsigned int nargs = gimple_call_num_args (call1);
963 if (nargs != gimple_call_num_args (call2))
964 return false;
966 if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
967 return false;
969 if (gimple_call_internal_p (call1))
971 if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
972 TREE_TYPE (gimple_call_lhs (call2))))
973 return false;
974 for (unsigned int i = 0; i < nargs; ++i)
975 if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
976 TREE_TYPE (gimple_call_arg (call2, i))))
977 return false;
979 else
981 if (!operand_equal_p (gimple_call_fn (call1),
982 gimple_call_fn (call2), 0))
983 return false;
985 if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
986 return false;
989 /* Check that any unvectorized arguments are equal. */
990 if (const int *map = vect_get_operand_map (call1))
992 unsigned int nkept = *map++;
993 unsigned int mapi = 0;
994 for (unsigned int i = 0; i < nargs; ++i)
995 if (mapi < nkept && map[mapi] == int (i))
996 mapi += 1;
997 else if (!operand_equal_p (gimple_call_arg (call1, i),
998 gimple_call_arg (call2, i)))
999 return false;
1002 return true;
1005 /* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
1006 caller's attempt to find the vector type in STMT_INFO with the narrowest
1007 element type. Return true if VECTYPE is nonnull and if it is valid
1008 for STMT_INFO. When returning true, update MAX_NUNITS to reflect the
1009 number of units in VECTYPE. GROUP_SIZE and MAX_NUNITS are as for
1010 vect_build_slp_tree. */
1012 static bool
1013 vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
1014 unsigned int group_size,
1015 tree vectype, poly_uint64 *max_nunits)
1017 if (!vectype)
1019 if (dump_enabled_p ())
1020 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1021 "Build SLP failed: unsupported data-type in %G\n",
1022 stmt_info->stmt);
1023 /* Fatal mismatch. */
1024 return false;
1027 /* If populating the vector type requires unrolling then fail
1028 before adjusting *max_nunits for basic-block vectorization. */
1029 if (is_a <bb_vec_info> (vinfo)
1030 && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1034 "Build SLP failed: unrolling required "
1035 "in basic block SLP\n");
1036 /* Fatal mismatch. */
1037 return false;
1040 /* In case of multiple types we need to detect the smallest type. */
1041 vect_update_max_nunits (max_nunits, vectype);
1042 return true;
1045 /* Verify if the scalar stmts STMTS are isomorphic, require data
1046 permutation or are of unsupported types of operation. Return
1047 true if they are, otherwise return false and indicate in *MATCHES
1048 which stmts are not isomorphic to the first one. If MATCHES[0]
1049 is false then this indicates the comparison could not be
1050 carried out or the stmts will never be vectorized by SLP.
1052 Note COND_EXPR is possibly isomorphic to another one after swapping its
1053 operands. Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
1054 the first stmt by swapping the two operands of comparison; set SWAP[i]
1055 to 2 if stmt I is isomorphic to the first stmt by inverting the code
1056 of comparison. Take A1 >= B1 ? X1 : Y1 as an example; it can be swapped
1057 to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1. */
1059 static bool
1060 vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
1061 vec<stmt_vec_info> stmts, unsigned int group_size,
1062 poly_uint64 *max_nunits, bool *matches,
1063 bool *two_operators, tree *node_vectype)
1065 unsigned int i;
1066 stmt_vec_info first_stmt_info = stmts[0];
1067 code_helper first_stmt_code = ERROR_MARK;
1068 code_helper alt_stmt_code = ERROR_MARK;
1069 code_helper rhs_code = ERROR_MARK;
1070 code_helper first_cond_code = ERROR_MARK;
1071 tree lhs;
1072 bool need_same_oprnds = false;
1073 tree vectype = NULL_TREE, first_op1 = NULL_TREE;
1074 stmt_vec_info first_load = NULL, prev_first_load = NULL;
1075 bool first_stmt_ldst_p = false, ldst_p = false;
1076 bool first_stmt_phi_p = false, phi_p = false;
1077 int first_reduc_idx = -1;
1078 bool maybe_soft_fail = false;
1079 tree soft_fail_nunits_vectype = NULL_TREE;
1081 /* For every stmt in NODE find its def stmt/s. */
1082 stmt_vec_info stmt_info;
1083 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
1085 swap[i] = 0;
1086 matches[i] = false;
1087 if (!stmt_info)
1089 matches[i] = true;
1090 continue;
1093 gimple *stmt = stmt_info->stmt;
1094 if (dump_enabled_p ())
1095 dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);
1097 /* Fail to vectorize statements marked as unvectorizable, throw
1098 or are volatile. */
1099 if (!STMT_VINFO_VECTORIZABLE (stmt_info)
1100 || stmt_can_throw_internal (cfun, stmt)
1101 || gimple_has_volatile_ops (stmt))
1103 if (dump_enabled_p ())
1104 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1105 "Build SLP failed: unvectorizable statement %G",
1106 stmt);
1107 /* ??? For BB vectorization we want to commutate operands in a way
1108 to shuffle all unvectorizable defs into one operand and have
1109 the other still vectorized. The following doesn't reliably
1110 work for this though but it's the easiest we can do here. */
1111 if (is_a <bb_vec_info> (vinfo) && i != 0)
1112 continue;
1113 /* Fatal mismatch. */
1114 matches[0] = false;
1115 return false;
1118 gcall *call_stmt = dyn_cast <gcall *> (stmt);
1119 lhs = gimple_get_lhs (stmt);
1120 if (lhs == NULL_TREE
1121 && (!call_stmt
1122 || !gimple_call_internal_p (stmt)
1123 || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "Build SLP failed: not GIMPLE_ASSIGN nor "
1128 "GIMPLE_CALL %G", stmt);
1129 if (is_a <bb_vec_info> (vinfo) && i != 0)
1130 continue;
1131 /* Fatal mismatch. */
1132 matches[0] = false;
1133 return false;
1136 tree nunits_vectype;
1137 if (!vect_get_vector_types_for_stmt (vinfo, stmt_info, &vectype,
1138 &nunits_vectype, group_size))
1140 if (is_a <bb_vec_info> (vinfo) && i != 0)
1141 continue;
1142 /* Fatal mismatch. */
1143 matches[0] = false;
1144 return false;
1146 /* Record nunits required but continue analysis, producing matches[]
1147 as if nunits was not an issue. This allows splitting of groups
1148 to happen. */
1149 if (nunits_vectype
1150 && !vect_record_max_nunits (vinfo, stmt_info, group_size,
1151 nunits_vectype, max_nunits))
1153 gcc_assert (is_a <bb_vec_info> (vinfo));
1154 maybe_soft_fail = true;
1155 soft_fail_nunits_vectype = nunits_vectype;
1158 gcc_assert (vectype);
1160 if (call_stmt)
1162 combined_fn cfn = gimple_call_combined_fn (call_stmt);
1163 if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
1164 rhs_code = cfn;
1165 else
1166 rhs_code = CALL_EXPR;
1168 if (cfn == CFN_MASK_LOAD
1169 || cfn == CFN_GATHER_LOAD
1170 || cfn == CFN_MASK_GATHER_LOAD
1171 || cfn == CFN_MASK_LEN_GATHER_LOAD)
1172 ldst_p = true;
1173 else if (cfn == CFN_MASK_STORE)
1175 ldst_p = true;
1176 rhs_code = CFN_MASK_STORE;
1178 else if ((cfn != CFN_LAST
1179 && cfn != CFN_MASK_CALL
1180 && internal_fn_p (cfn)
1181 && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
1182 || gimple_call_tail_p (call_stmt)
1183 || gimple_call_noreturn_p (call_stmt)
1184 || gimple_call_chain (call_stmt))
1186 if (dump_enabled_p ())
1187 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1188 "Build SLP failed: unsupported call type %G",
1189 (gimple *) call_stmt);
1190 if (is_a <bb_vec_info> (vinfo) && i != 0)
1191 continue;
1192 /* Fatal mismatch. */
1193 matches[0] = false;
1194 return false;
1197 else if (gimple_code (stmt) == GIMPLE_PHI)
1199 rhs_code = ERROR_MARK;
1200 phi_p = true;
1202 else
1204 rhs_code = gimple_assign_rhs_code (stmt);
1205 ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
1208 /* Check the operation. */
1209 if (i == 0)
1211 *node_vectype = vectype;
1212 first_stmt_code = rhs_code;
1213 first_stmt_ldst_p = ldst_p;
1214 first_stmt_phi_p = phi_p;
1215 first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);
1217 /* Shift arguments should be equal in all the packed stmts for a
1218 vector shift with scalar shift operand. */
1219 if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
1220 || rhs_code == LROTATE_EXPR
1221 || rhs_code == RROTATE_EXPR)
1223 /* First see if we have a vector/vector shift. */
1224 if (!directly_supported_p (rhs_code, vectype, optab_vector))
1226 /* No vector/vector shift, try for a vector/scalar shift. */
1227 if (!directly_supported_p (rhs_code, vectype, optab_scalar))
1229 if (dump_enabled_p ())
1230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1231 "Build SLP failed: "
1232 "op not supported by target.\n");
1233 if (is_a <bb_vec_info> (vinfo) && i != 0)
1234 continue;
1235 /* Fatal mismatch. */
1236 matches[0] = false;
1237 return false;
1239 need_same_oprnds = true;
1240 first_op1 = gimple_assign_rhs2 (stmt);
1243 else if (rhs_code == WIDEN_LSHIFT_EXPR)
1245 need_same_oprnds = true;
1246 first_op1 = gimple_assign_rhs2 (stmt);
1248 else if (!ldst_p
1249 && rhs_code == BIT_FIELD_REF)
1251 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
1252 if (!is_a <bb_vec_info> (vinfo)
1253 || TREE_CODE (vec) != SSA_NAME
1254 /* When the element types are not compatible we pun the
1255 source to the target vectype which requires equal size. */
1256 || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
1257 || !types_compatible_p (TREE_TYPE (vectype),
1258 TREE_TYPE (TREE_TYPE (vec))))
1259 && !operand_equal_p (TYPE_SIZE (vectype),
1260 TYPE_SIZE (TREE_TYPE (vec)))))
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "Build SLP failed: "
1265 "BIT_FIELD_REF not supported\n");
1266 /* Fatal mismatch. */
1267 matches[0] = false;
1268 return false;
1271 else if (rhs_code == CFN_DIV_POW2)
1273 need_same_oprnds = true;
1274 first_op1 = gimple_call_arg (call_stmt, 1);
1277 else
1279 if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
1280 /* For SLP reduction groups the index isn't necessarily
1281 uniform but only that of the first stmt matters. */
1282 && !(first_reduc_idx != -1
1283 && STMT_VINFO_REDUC_IDX (stmt_info) != -1
1284 && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
1286 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "Build SLP failed: different reduc_idx "
1290 "%d instead of %d in %G",
1291 STMT_VINFO_REDUC_IDX (stmt_info),
1292 first_reduc_idx, stmt);
1294 /* Mismatch. */
1295 continue;
1297 if (first_stmt_code != rhs_code
1298 && alt_stmt_code == ERROR_MARK)
1299 alt_stmt_code = rhs_code;
1300 if ((first_stmt_code != rhs_code
1301 && (first_stmt_code != IMAGPART_EXPR
1302 || rhs_code != REALPART_EXPR)
1303 && (first_stmt_code != REALPART_EXPR
1304 || rhs_code != IMAGPART_EXPR)
1305 /* Handle mismatches in plus/minus by computing both
1306 and merging the results. */
1307 && !((first_stmt_code == PLUS_EXPR
1308 || first_stmt_code == MINUS_EXPR)
1309 && (alt_stmt_code == PLUS_EXPR
1310 || alt_stmt_code == MINUS_EXPR)
1311 && rhs_code == alt_stmt_code)
1312 && !(first_stmt_code.is_tree_code ()
1313 && rhs_code.is_tree_code ()
1314 && (TREE_CODE_CLASS (tree_code (first_stmt_code))
1315 == tcc_comparison)
1316 && (swap_tree_comparison (tree_code (first_stmt_code))
1317 == tree_code (rhs_code)))
1318 && !(STMT_VINFO_GROUPED_ACCESS (stmt_info)
1319 && (first_stmt_code == ARRAY_REF
1320 || first_stmt_code == BIT_FIELD_REF
1321 || first_stmt_code == COMPONENT_REF
1322 || first_stmt_code == REALPART_EXPR
1323 || first_stmt_code == IMAGPART_EXPR
1324 || first_stmt_code == MEM_REF)
1325 && (rhs_code == ARRAY_REF
1326 || rhs_code == BIT_FIELD_REF
1327 || rhs_code == COMPONENT_REF
1328 || rhs_code == REALPART_EXPR
1329 || rhs_code == IMAGPART_EXPR
1330 || rhs_code == MEM_REF)))
1331 || (ldst_p
1332 && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
1333 != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
1334 || (ldst_p
1335 && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1336 != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
1337 || first_stmt_ldst_p != ldst_p
1338 || first_stmt_phi_p != phi_p)
1340 if (dump_enabled_p ())
1342 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343 "Build SLP failed: different operation "
1344 "in stmt %G", stmt);
1345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1346 "original stmt %G", first_stmt_info->stmt);
1348 /* Mismatch. */
1349 continue;
1352 if (!ldst_p
1353 && first_stmt_code == BIT_FIELD_REF
1354 && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
1355 != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
1357 if (dump_enabled_p ())
1358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1359 "Build SLP failed: different BIT_FIELD_REF "
1360 "arguments in %G", stmt);
1361 /* Mismatch. */
1362 continue;
1365 if (call_stmt
1366 && first_stmt_code != CFN_MASK_LOAD
1367 && first_stmt_code != CFN_MASK_STORE)
1369 if (!compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
1370 call_stmt))
1372 if (dump_enabled_p ())
1373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1374 "Build SLP failed: different calls in %G",
1375 stmt);
1376 /* Mismatch. */
1377 continue;
1381 if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
1382 && (gimple_bb (first_stmt_info->stmt)
1383 != gimple_bb (stmt_info->stmt)))
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "Build SLP failed: different BB for PHI "
1388 "or possibly trapping operation in %G", stmt);
1389 /* Mismatch. */
1390 continue;
1393 if (need_same_oprnds)
1395 tree other_op1 = gimple_arg (stmt, 1);
1396 if (!operand_equal_p (first_op1, other_op1, 0))
1398 if (dump_enabled_p ())
1399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400 "Build SLP failed: different shift "
1401 "arguments in %G", stmt);
1402 /* Mismatch. */
1403 continue;
1407 if (!types_compatible_p (vectype, *node_vectype))
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411 "Build SLP failed: different vector type "
1412 "in %G", stmt);
1413 /* Mismatch. */
1414 continue;
1418 /* Grouped store or load. */
1419 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1421 gcc_assert (ldst_p);
1422 if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
1424 /* Store. */
1425 gcc_assert (rhs_code == CFN_MASK_STORE
1426 || REFERENCE_CLASS_P (lhs)
1427 || DECL_P (lhs));
1429 else
1431 /* Load. */
1432 first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
1433 if (prev_first_load)
1435 /* Check that there are no loads from different interleaving
1436 chains in the same node. */
1437 if (prev_first_load != first_load)
1439 if (dump_enabled_p ())
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
1441 vect_location,
1442 "Build SLP failed: different "
1443 "interleaving chains in one node %G",
1444 stmt);
1445 /* Mismatch. */
1446 continue;
1449 else
1450 prev_first_load = first_load;
1453 /* Non-grouped store or load. */
1454 else if (ldst_p)
1456 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1457 && rhs_code != CFN_GATHER_LOAD
1458 && rhs_code != CFN_MASK_GATHER_LOAD
1459 && rhs_code != CFN_MASK_LEN_GATHER_LOAD
1460 && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
1461 /* Not grouped loads are handled as externals for BB
1462 vectorization. For loop vectorization we can handle
1463 splats the same we handle single element interleaving. */
1464 && (is_a <bb_vec_info> (vinfo)
1465 || stmt_info != first_stmt_info))
1467 /* Not grouped load. */
1468 if (dump_enabled_p ())
1469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1470 "Build SLP failed: not grouped load %G", stmt);
1472 if (i != 0)
1473 continue;
1474 /* Fatal mismatch. */
1475 matches[0] = false;
1476 return false;
1479 /* Not memory operation. */
1480 else
1482 if (!phi_p
1483 && rhs_code.is_tree_code ()
1484 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
1485 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
1486 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
1487 && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
1488 && rhs_code != VIEW_CONVERT_EXPR
1489 && rhs_code != CALL_EXPR
1490 && rhs_code != BIT_FIELD_REF)
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "Build SLP failed: operation unsupported %G",
1495 stmt);
1496 if (is_a <bb_vec_info> (vinfo) && i != 0)
1497 continue;
1498 /* Fatal mismatch. */
1499 matches[0] = false;
1500 return false;
1503 if (rhs_code == COND_EXPR)
1505 tree cond_expr = gimple_assign_rhs1 (stmt);
1506 enum tree_code cond_code = TREE_CODE (cond_expr);
1507 enum tree_code swap_code = ERROR_MARK;
1508 enum tree_code invert_code = ERROR_MARK;
1510 if (i == 0)
1511 first_cond_code = TREE_CODE (cond_expr);
1512 else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1514 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
1515 swap_code = swap_tree_comparison (cond_code);
1516 invert_code = invert_tree_comparison (cond_code, honor_nans);
1519 if (first_cond_code == cond_code)
1521 /* Isomorphic can be achieved by swapping. */
1522 else if (first_cond_code == swap_code)
1523 swap[i] = 1;
1524 /* Isomorphic can be achieved by inverting. */
1525 else if (first_cond_code == invert_code)
1526 swap[i] = 2;
1527 else
1529 if (dump_enabled_p ())
1530 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1531 "Build SLP failed: different"
1532 " operation %G", stmt);
1533 /* Mismatch. */
1534 continue;
1538 if (rhs_code.is_tree_code ()
1539 && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
1540 && (swap_tree_comparison ((tree_code)first_stmt_code)
1541 == (tree_code)rhs_code))
1542 swap[i] = 1;
1545 matches[i] = true;
1548 for (i = 0; i < group_size; ++i)
1549 if (!matches[i])
1550 return false;
1552 /* If we allowed a two-operation SLP node verify the target can cope
1553 with the permute we are going to use. */
1554 if (alt_stmt_code != ERROR_MARK
1555 && (!alt_stmt_code.is_tree_code ()
1556 || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
1557 && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
1559 *two_operators = true;
1562 if (maybe_soft_fail)
1564 unsigned HOST_WIDE_INT const_nunits;
1565 if (!TYPE_VECTOR_SUBPARTS
1566 (soft_fail_nunits_vectype).is_constant (&const_nunits)
1567 || const_nunits > group_size)
1568 matches[0] = false;
1569 else
1571 /* With constant vector elements simulate a mismatch at the
1572 point we need to split. */
1573 unsigned tail = group_size & (const_nunits - 1);
1574 memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
1576 return false;
1579 return true;
1582 /* Traits for the hash_set to record failed SLP builds for a stmt set.
1583 Note we never remove apart from at destruction time so we do not
1584 need a special value for deleted that differs from empty. */
1585 struct bst_traits
1587 typedef vec <stmt_vec_info> value_type;
1588 typedef vec <stmt_vec_info> compare_type;
1589 static inline hashval_t hash (value_type);
1590 static inline bool equal (value_type existing, value_type candidate);
1591 static inline bool is_empty (value_type x) { return !x.exists (); }
1592 static inline bool is_deleted (value_type x) { return !x.exists (); }
1593 static const bool empty_zero_p = true;
1594 static inline void mark_empty (value_type &x) { x.release (); }
1595 static inline void mark_deleted (value_type &x) { x.release (); }
1596 static inline void remove (value_type &x) { x.release (); }
1598 inline hashval_t
1599 bst_traits::hash (value_type x)
1601 inchash::hash h;
1602 for (unsigned i = 0; i < x.length (); ++i)
1603 h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
1604 return h.end ();
1606 inline bool
1607 bst_traits::equal (value_type existing, value_type candidate)
1609 if (existing.length () != candidate.length ())
1610 return false;
1611 for (unsigned i = 0; i < existing.length (); ++i)
1612 if (existing[i] != candidate[i])
1613 return false;
1614 return true;
1617 typedef hash_map <vec <stmt_vec_info>, slp_tree,
1618 simple_hashmap_traits <bst_traits, slp_tree> >
1619 scalar_stmts_to_slp_tree_map_t;
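/* The map above memoizes SLP discovery: vect_build_slp_tree records the
   node built (or a stub recording the per-lane failure flags) for each
   vector of scalar stmts, so a later attempt to discover the same stmts
   re-uses that result instead of recursing again.  */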
1621 /* Release BST_MAP. */
1623 static void
1624 release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
1626 /* The map keeps a reference on SLP nodes built, release that. */
1627 for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
1628 it != bst_map->end (); ++it)
1629 if ((*it).second)
1630 vect_free_slp_tree ((*it).second);
1631 delete bst_map;
1634 /* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
1635 but then vec::insert does memmove and that's not compatible with
1636 std::pair. */
1637 struct chain_op_t
1639 chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
1640 : code (code_), dt (dt_), op (op_) {}
1641 tree_code code;
1642 vect_def_type dt;
1643 tree op;
1646 /* Comparator for sorting associatable chains. */
1648 static int
1649 dt_sort_cmp (const void *op1_, const void *op2_, void *)
1651 auto *op1 = (const chain_op_t *) op1_;
1652 auto *op2 = (const chain_op_t *) op2_;
1653 if (op1->dt != op2->dt)
1654 return (int)op1->dt - (int)op2->dt;
1655 return (int)op1->code - (int)op2->code;
1658 /* Linearize the associatable expression chain at START with the
1659 associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
1660 filling CHAIN with the result and using WORKLIST as intermediate storage.
1661 CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
1662 or MINUS_EXPR. *CHAIN_STMTS if not NULL is filled with all computation
1663 stmts, starting with START. */
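/* For instance, a lane computing ((a - b) + c) is linearized into the
   chain entries { +, a }, { -, b } and { +, c } (in no particular order),
   each also recording the vect_def_type of its operand.  */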
1665 static void
1666 vect_slp_linearize_chain (vec_info *vinfo,
1667 vec<std::pair<tree_code, gimple *> > &worklist,
1668 vec<chain_op_t> &chain,
1669 enum tree_code code, gimple *start,
1670 gimple *&code_stmt, gimple *&alt_code_stmt,
1671 vec<gimple *> *chain_stmts)
1673 /* For each lane linearize the addition/subtraction (or other
1674 uniform associatable operation) expression tree. */
1675 worklist.safe_push (std::make_pair (code, start));
1676 while (!worklist.is_empty ())
1678 auto entry = worklist.pop ();
1679 gassign *stmt = as_a <gassign *> (entry.second);
1680 enum tree_code in_code = entry.first;
1681 enum tree_code this_code = gimple_assign_rhs_code (stmt);
1682 /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE. */
1683 if (!code_stmt
1684 && gimple_assign_rhs_code (stmt) == code)
1685 code_stmt = stmt;
1686 else if (!alt_code_stmt
1687 && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
1688 alt_code_stmt = stmt;
1689 if (chain_stmts)
1690 chain_stmts->safe_push (stmt);
1691 for (unsigned opnum = 1; opnum <= 2; ++opnum)
1693 tree op = gimple_op (stmt, opnum);
1694 vect_def_type dt;
1695 stmt_vec_info def_stmt_info;
1696 bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
1697 gcc_assert (res);
1698 if (dt == vect_internal_def
1699 && is_pattern_stmt_p (def_stmt_info))
1700 op = gimple_get_lhs (def_stmt_info->stmt);
1701 gimple *use_stmt;
1702 use_operand_p use_p;
1703 if (dt == vect_internal_def
1704 && single_imm_use (op, &use_p, &use_stmt)
1705 && is_gimple_assign (def_stmt_info->stmt)
1706 && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
1707 || (code == PLUS_EXPR
1708 && (gimple_assign_rhs_code (def_stmt_info->stmt)
1709 == MINUS_EXPR))))
1711 tree_code op_def_code = this_code;
1712 if (op_def_code == MINUS_EXPR && opnum == 1)
1713 op_def_code = PLUS_EXPR;
1714 if (in_code == MINUS_EXPR)
1715 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1716 worklist.safe_push (std::make_pair (op_def_code,
1717 def_stmt_info->stmt));
1719 else
1721 tree_code op_def_code = this_code;
1722 if (op_def_code == MINUS_EXPR && opnum == 1)
1723 op_def_code = PLUS_EXPR;
1724 if (in_code == MINUS_EXPR)
1725 op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
1726 chain.safe_push (chain_op_t (op_def_code, dt, op));
1732 static slp_tree
1733 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1734 vec<stmt_vec_info> stmts, unsigned int group_size,
1735 poly_uint64 *max_nunits,
1736 bool *matches, unsigned *limit, unsigned *tree_size,
1737 scalar_stmts_to_slp_tree_map_t *bst_map);
1739 static slp_tree
1740 vect_build_slp_tree (vec_info *vinfo,
1741 vec<stmt_vec_info> stmts, unsigned int group_size,
1742 poly_uint64 *max_nunits,
1743 bool *matches, unsigned *limit, unsigned *tree_size,
1744 scalar_stmts_to_slp_tree_map_t *bst_map)
1746 if (slp_tree *leader = bst_map->get (stmts))
1748 if (dump_enabled_p ())
1749 dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
1750 !(*leader)->failed ? "" : "failed ",
1751 (void *) *leader);
1752 if (!(*leader)->failed)
1754 SLP_TREE_REF_COUNT (*leader)++;
1755 vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
1756 stmts.release ();
1757 return *leader;
1759 memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
1760 return NULL;
1763 /* Single-lane SLP doesn't have the chance of run-away, so do not account
1764 it against the limit. */
1765 if (stmts.length () > 1)
1767 if (*limit == 0)
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "SLP discovery limit exceeded\n");
1772 memset (matches, 0, sizeof (bool) * group_size);
1773 return NULL;
1775 --*limit;
1778 /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
1779 so we can pick up backedge destinations during discovery. */
1780 slp_tree res = new _slp_tree;
1781 SLP_TREE_DEF_TYPE (res) = vect_internal_def;
1782 SLP_TREE_SCALAR_STMTS (res) = stmts;
1783 bst_map->put (stmts.copy (), res);
1785 if (dump_enabled_p ())
1786 dump_printf_loc (MSG_NOTE, vect_location,
1787 "starting SLP discovery for node %p\n", (void *) res);
1789 poly_uint64 this_max_nunits = 1;
1790 slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
1791 &this_max_nunits,
1792 matches, limit, tree_size, bst_map);
1793 if (!res_)
1795 if (dump_enabled_p ())
1796 dump_printf_loc (MSG_NOTE, vect_location,
1797 "SLP discovery for node %p failed\n", (void *) res);
1798 /* Mark the node invalid so we can detect those when still in use
1799 as backedge destinations. */
1800 SLP_TREE_SCALAR_STMTS (res) = vNULL;
1801 SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
1802 res->failed = XNEWVEC (bool, group_size);
1803 if (flag_checking)
1805 unsigned i;
1806 for (i = 0; i < group_size; ++i)
1807 if (!matches[i])
1808 break;
1809 gcc_assert (i < group_size);
1811 memcpy (res->failed, matches, sizeof (bool) * group_size);
1813 else
1815 if (dump_enabled_p ())
1816 dump_printf_loc (MSG_NOTE, vect_location,
1817 "SLP discovery for node %p succeeded\n",
1818 (void *) res);
1819 gcc_assert (res_ == res);
1820 res->max_nunits = this_max_nunits;
1821 vect_update_max_nunits (max_nunits, this_max_nunits);
1822 /* Keep a reference for the bst_map use. */
1823 SLP_TREE_REF_COUNT (res)++;
1825 return res_;
1828 /* Helper for building an associated SLP node chain. */
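/* This builds the layout used for two-operator groups, e.g. lanes mixing
   PLUS_EXPR and MINUS_EXPR: both operations are materialized as separate
   children computing all lanes, and PERM becomes a VEC_PERM_EXPR node whose
   lane permutation LPERM selects, per lane, the result of the appropriate
   child.  */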
1830 static void
1831 vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
1832 slp_tree op0, slp_tree op1,
1833 stmt_vec_info oper1, stmt_vec_info oper2,
1834 vec<std::pair<unsigned, unsigned> > lperm)
1836 unsigned group_size = SLP_TREE_LANES (op1);
1838 slp_tree child1 = new _slp_tree;
1839 SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
1840 SLP_TREE_VECTYPE (child1) = vectype;
1841 SLP_TREE_LANES (child1) = group_size;
1842 SLP_TREE_CHILDREN (child1).create (2);
1843 SLP_TREE_CHILDREN (child1).quick_push (op0);
1844 SLP_TREE_CHILDREN (child1).quick_push (op1);
1845 SLP_TREE_REPRESENTATIVE (child1) = oper1;
1847 slp_tree child2 = new _slp_tree;
1848 SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
1849 SLP_TREE_VECTYPE (child2) = vectype;
1850 SLP_TREE_LANES (child2) = group_size;
1851 SLP_TREE_CHILDREN (child2).create (2);
1852 SLP_TREE_CHILDREN (child2).quick_push (op0);
1853 SLP_TREE_REF_COUNT (op0)++;
1854 SLP_TREE_CHILDREN (child2).quick_push (op1);
1855 SLP_TREE_REF_COUNT (op1)++;
1856 SLP_TREE_REPRESENTATIVE (child2) = oper2;
1858 SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
1859 SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
1860 SLP_TREE_VECTYPE (perm) = vectype;
1861 SLP_TREE_LANES (perm) = group_size;
1862 /* ??? We should set this NULL but that's not expected. */
1863 SLP_TREE_REPRESENTATIVE (perm) = oper1;
1864 SLP_TREE_LANE_PERMUTATION (perm) = lperm;
1865 SLP_TREE_CHILDREN (perm).quick_push (child1);
1866 SLP_TREE_CHILDREN (perm).quick_push (child2);
1869 /* Recursively build an SLP tree starting from NODE.
1870 Fail (and return NULL) if def-stmts are not
1871 isomorphic, require data permutation or are of unsupported types of
1872 operation. Otherwise, return the built node. MATCHES indicates
1873 which lanes matched the first lane when discovery
1874 failed. */
1876 static slp_tree
1877 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1878 vec<stmt_vec_info> stmts, unsigned int group_size,
1879 poly_uint64 *max_nunits,
1880 bool *matches, unsigned *limit, unsigned *tree_size,
1881 scalar_stmts_to_slp_tree_map_t *bst_map)
1883 unsigned nops, i, this_tree_size = 0;
1884 poly_uint64 this_max_nunits = *max_nunits;
1886 matches[0] = false;
1888 stmt_vec_info stmt_info = stmts[0];
1889 if (!is_a<gcall *> (stmt_info->stmt)
1890 && !is_a<gassign *> (stmt_info->stmt)
1891 && !is_a<gphi *> (stmt_info->stmt))
1892 return NULL;
1894 nops = gimple_num_args (stmt_info->stmt);
1895 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1896 STMT_VINFO_GATHER_SCATTER_P
1897 (stmt_info)))
1898 nops = map[0];
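/* For calls like masked or gather loads the operand map selects the call
arguments that act as SLP operands; its first element gives their number. */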
1900 /* If the SLP node is a PHI (induction or reduction), terminate
1901 the recursion. */
1902 bool *skip_args = XALLOCAVEC (bool, nops);
1903 memset (skip_args, 0, sizeof (bool) * nops);
1904 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1905 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1907 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1908 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1909 group_size);
1910 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1911 max_nunits))
1912 return NULL;
1914 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1915 if (def_type == vect_induction_def)
1917 /* Induction PHIs are not cycles but walk the initial
1918 value. Only for inner loops though; for outer loops
1919 we need to pick up the value from the actual PHIs
1920 to more easily support peeling and epilogue vectorization. */
1921 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1922 if (!nested_in_vect_loop_p (loop, stmt_info))
1923 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1924 else
1925 loop = loop->inner;
1926 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1928 else if (def_type == vect_reduction_def
1929 || def_type == vect_double_reduction_def
1930 || def_type == vect_nested_cycle
1931 || def_type == vect_first_order_recurrence)
1933 /* Else def types have to match. */
1934 stmt_vec_info other_info;
1935 bool all_same = true;
1936 FOR_EACH_VEC_ELT (stmts, i, other_info)
1938 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1939 return NULL;
1940 if (other_info != stmt_info)
1941 all_same = false;
1943 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1944 /* Reduction initial values are not explicitly represented. */
1945 if (def_type != vect_first_order_recurrence
1946 && gimple_bb (stmt_info->stmt) == loop->header)
1947 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1948 /* Reduction chain backedge defs are filled manually.
1949 ??? Need a better way to identify a SLP reduction chain PHI.
1950 Or a better overall way to SLP match those. */
1951 if (stmts.length () > 1
1952 && all_same && def_type == vect_reduction_def)
1953 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1955 else if (def_type != vect_internal_def)
1956 return NULL;
1960 bool two_operators = false;
1961 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1962 tree vectype = NULL_TREE;
1963 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
1964 &this_max_nunits, matches, &two_operators,
1965 &vectype))
1966 return NULL;
1968 /* If the SLP node is a load, terminate the recursion unless masked. */
1969 if (STMT_VINFO_DATA_REF (stmt_info)
1970 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1972 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
1973 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
1974 else
1976 *max_nunits = this_max_nunits;
1977 (*tree_size)++;
1978 node = vect_create_new_slp_node (node, stmts, 0);
1979 SLP_TREE_VECTYPE (node) = vectype;
1980 /* And compute the load permutation. Whether it is actually
1981 a permutation depends on the unrolling factor which is
1982 decided later. */
1983 vec<unsigned> load_permutation;
1984 int j;
1985 stmt_vec_info load_info;
1986 load_permutation.create (group_size);
1987 stmt_vec_info first_stmt_info
1988 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
1989 bool any_permute = false;
1990 bool any_null = false;
1991 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
1993 int load_place;
1994 if (! load_info)
1996 load_place = j;
1997 any_null = true;
1999 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2000 load_place = vect_get_place_in_interleaving_chain
2001 (load_info, first_stmt_info);
2002 else
2003 load_place = 0;
2004 gcc_assert (load_place != -1);
2005 any_permute |= load_place != j;
2006 load_permutation.quick_push (load_place);
2008 if (any_null)
2010 gcc_assert (!any_permute);
2011 load_permutation.release ();
2014 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2016 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD)
2017 || gimple_call_internal_p (stmt, IFN_GATHER_LOAD)
2018 || gimple_call_internal_p (stmt, IFN_MASK_GATHER_LOAD)
2019 || gimple_call_internal_p (stmt,
2020 IFN_MASK_LEN_GATHER_LOAD));
2021 load_permutation.release ();
2022 /* We cannot handle permuted masked loads, see PR114375. */
2023 if (any_permute
2024 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 && DR_GROUP_SIZE (first_stmt_info) != group_size)
2026 || STMT_VINFO_STRIDED_P (stmt_info))
2028 matches[0] = false;
2029 return NULL;
2032 else
2034 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2035 return node;
2039 else if (gimple_assign_single_p (stmt_info->stmt)
2040 && !gimple_vuse (stmt_info->stmt)
2041 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2043 /* vect_build_slp_tree_1 determined that all BIT_FIELD_REFs reference
2044 the same SSA name vector of a type compatible with vectype. */
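/* For instance, lanes extracting v[1] and v[0] from the same vector V are
represented as a VEC_PERM_EXPR node selecting lanes { 1, 0 } from a child
node that merely holds the vector def V. */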
2045 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2046 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2047 stmt_vec_info estmt_info;
2048 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2050 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2051 tree bfref = gimple_assign_rhs1 (estmt);
2052 HOST_WIDE_INT lane;
2053 if (!known_eq (bit_field_size (bfref),
2054 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2055 || !constant_multiple_p (bit_field_offset (bfref),
2056 bit_field_size (bfref), &lane))
2058 lperm.release ();
2059 matches[0] = false;
2060 return NULL;
2062 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2064 slp_tree vnode = vect_create_new_slp_node (vNULL);
2065 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2066 /* ??? We record vectype here but we hide eventually necessary
2067 punning and instead rely on code generation to materialize
2068 VIEW_CONVERT_EXPRs as necessary. We instead should make
2069 this explicit somehow. */
2070 SLP_TREE_VECTYPE (vnode) = vectype;
2071 else
2073 /* For different size but compatible elements we can still
2074 use VEC_PERM_EXPR without punning. */
2075 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2076 && types_compatible_p (TREE_TYPE (vectype),
2077 TREE_TYPE (TREE_TYPE (vec))));
2078 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2080 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2081 unsigned HOST_WIDE_INT const_nunits;
2082 if (nunits.is_constant (&const_nunits))
2083 SLP_TREE_LANES (vnode) = const_nunits;
2084 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2085 /* We are always building a permutation node even if it is an identity
2086 permute to shield the rest of the vectorizer from the odd node
2087 representing an actual vector without any scalar ops.
2088 ??? We could hide it completely by making the permute node
2089 external? */
2090 node = vect_create_new_slp_node (node, stmts, 1);
2091 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2092 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2093 SLP_TREE_VECTYPE (node) = vectype;
2094 SLP_TREE_CHILDREN (node).quick_push (vnode);
2095 return node;
2097 /* When discovery reaches an associatable operation see whether we can
2098 improve that to match up lanes in a way superior to the operand
2099 swapping code which at most looks at two defs.
2100 ??? For BB vectorization we cannot do the brute-force search
2101 for matching as we can succeed by means of builds from scalars
2102 and have no good way to "cost" one build against another. */
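/* For example, with the lanes { (a + b) + c, (c + a) + b } linearizing the
chains to { a, b, c } and { c, a, b } allows matching the operands up by
re-association, which plain swapping of the two outermost operands cannot. */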
2103 else if (is_a <loop_vec_info> (vinfo)
2104 /* Do not bother for single-lane SLP. */
2105 && group_size > 1
2106 /* ??? We don't handle !vect_internal_def defs below. */
2107 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2108 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2109 mapping as long as that exists on the stmt_info level. */
2110 && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2111 && is_gimple_assign (stmt_info->stmt)
2112 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2113 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2114 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2115 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2116 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2118 /* See if we have a chain of (mixed) adds or subtracts or other
2119 associatable ops. */
2120 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2121 if (code == MINUS_EXPR)
2122 code = PLUS_EXPR;
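/* The MINUS to PLUS canonicalization above lets us linearize mixed
add/subtract chains as additions; the per-element codes recorded in each
chain remember which elements are in fact subtracted. */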
2123 stmt_vec_info other_op_stmt_info = NULL;
2124 stmt_vec_info op_stmt_info = NULL;
2125 unsigned chain_len = 0;
2126 auto_vec<chain_op_t> chain;
2127 auto_vec<std::pair<tree_code, gimple *> > worklist;
2128 auto_vec<vec<chain_op_t> > chains (group_size);
2129 auto_vec<slp_tree, 4> children;
2130 bool hard_fail = true;
2131 for (unsigned lane = 0; lane < group_size; ++lane)
2133 /* For each lane linearize the addition/subtraction (or other
2134 uniform associatable operation) expression tree. */
2135 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2136 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2137 stmts[lane]->stmt, op_stmt, other_op_stmt,
2138 NULL);
2139 if (!op_stmt_info && op_stmt)
2140 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2141 if (!other_op_stmt_info && other_op_stmt)
2142 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2143 if (chain.length () == 2)
2145 /* In a chain of just two elements resort to the regular
2146 operand swapping scheme. If we run into a length
2147 mismatch still hard-FAIL. */
2148 if (chain_len == 0)
2149 hard_fail = false;
2150 else
2152 matches[lane] = false;
2153 /* ??? We might want to process the other lanes, but
2154 make sure to not give false matching hints to the
2155 caller for lanes we did not process. */
2156 if (lane != group_size - 1)
2157 matches[0] = false;
2159 break;
2161 else if (chain_len == 0)
2162 chain_len = chain.length ();
2163 else if (chain.length () != chain_len)
2165 /* ??? Here we could slip in magic to compensate with
2166 neutral operands. */
2167 matches[lane] = false;
2168 if (lane != group_size - 1)
2169 matches[0] = false;
2170 break;
2172 chains.quick_push (chain.copy ());
2173 chain.truncate (0);
2175 if (chains.length () == group_size)
2177 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2178 if (!op_stmt_info)
2180 hard_fail = false;
2181 goto out;
2183 /* Now we have a set of chains with the same length. */
2184 /* 1. pre-sort according to def_type and operation. */
2185 for (unsigned lane = 0; lane < group_size; ++lane)
2186 chains[lane].stablesort (dt_sort_cmp, vinfo);
2187 if (dump_enabled_p ())
2189 dump_printf_loc (MSG_NOTE, vect_location,
2190 "pre-sorted chains of %s\n",
2191 get_tree_code_name (code));
2192 for (unsigned lane = 0; lane < group_size; ++lane)
2194 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2195 dump_printf (MSG_NOTE, "%s %T ",
2196 get_tree_code_name (chains[lane][opnum].code),
2197 chains[lane][opnum].op);
2198 dump_printf (MSG_NOTE, "\n");
2201 /* 2. try to build children nodes, associating as necessary. */
2202 for (unsigned n = 0; n < chain_len; ++n)
2204 vect_def_type dt = chains[0][n].dt;
2205 unsigned lane;
2206 for (lane = 0; lane < group_size; ++lane)
2207 if (chains[lane][n].dt != dt)
2209 if (dt == vect_constant_def
2210 && chains[lane][n].dt == vect_external_def)
2211 dt = vect_external_def;
2212 else if (dt == vect_external_def
2213 && chains[lane][n].dt == vect_constant_def)
2215 else
2216 break;
2218 if (lane != group_size)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_NOTE, vect_location,
2222 "giving up on chain due to mismatched "
2223 "def types\n");
2224 matches[lane] = false;
2225 if (lane != group_size - 1)
2226 matches[0] = false;
2227 goto out;
2229 if (dt == vect_constant_def
2230 || dt == vect_external_def)
2232 /* Check whether we can build the invariant. If we can't
2233 we never will be able to. */
2234 tree type = TREE_TYPE (chains[0][n].op);
2235 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2236 && (TREE_CODE (type) == BOOLEAN_TYPE
2237 || !can_duplicate_and_interleave_p (vinfo, group_size,
2238 type)))
2240 matches[0] = false;
2241 goto out;
2243 vec<tree> ops;
2244 ops.create (group_size);
2245 for (lane = 0; lane < group_size; ++lane)
2246 ops.quick_push (chains[lane][n].op);
2247 slp_tree child = vect_create_new_slp_node (ops);
2248 SLP_TREE_DEF_TYPE (child) = dt;
2249 children.safe_push (child);
2251 else if (dt != vect_internal_def)
2253 /* Not sure, we might need something special.
2254 gcc.dg/vect/pr96854.c,
2255 gfortran.dg/vect/fast-math-pr37021.f90
2256 and gfortran.dg/vect/pr61171.f trigger. */
2257 /* Soft-fail for now. */
2258 hard_fail = false;
2259 goto out;
2261 else
2263 vec<stmt_vec_info> op_stmts;
2264 op_stmts.create (group_size);
2265 slp_tree child = NULL;
2266 /* Brute-force our way. We have to consider a lane
2267 failing after fixing an earlier fail up in the
2268 SLP discovery recursion. So track the current
2269 permute per lane. */
2270 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2271 memset (perms, 0, sizeof (unsigned) * group_size);
2274 op_stmts.truncate (0);
2275 for (lane = 0; lane < group_size; ++lane)
2276 op_stmts.quick_push
2277 (vinfo->lookup_def (chains[lane][n].op));
2278 child = vect_build_slp_tree (vinfo, op_stmts,
2279 group_size, &this_max_nunits,
2280 matches, limit,
2281 &this_tree_size, bst_map);
2282 /* ??? We're likely getting too many fatal mismatches
2283 here so maybe we want to ignore them (but then we
2284 have no idea which lanes fatally mismatched). */
2285 if (child || !matches[0])
2286 break;
2287 /* Swap another lane we have not yet matched up into
2288 lanes that did not match. If we run out of
2289 permute possibilities for a lane terminate the
2290 search. */
2291 bool term = false;
2292 for (lane = 1; lane < group_size; ++lane)
2293 if (!matches[lane])
2295 if (n + perms[lane] + 1 == chain_len)
2297 term = true;
2298 break;
2300 std::swap (chains[lane][n],
2301 chains[lane][n + perms[lane] + 1]);
2302 perms[lane]++;
2304 if (term)
2305 break;
2307 while (1);
2308 if (!child)
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "failed to match up op %d\n", n);
2313 op_stmts.release ();
2314 if (lane != group_size - 1)
2315 matches[0] = false;
2316 else
2317 matches[lane] = false;
2318 goto out;
2320 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "matched up op %d to\n", n);
2324 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2326 children.safe_push (child);
2329 /* 3. build SLP nodes to combine the chain. */
2330 for (unsigned lane = 0; lane < group_size; ++lane)
2331 if (chains[lane][0].code != code)
2333 /* See if there's any alternate all-PLUS entry. */
2334 unsigned n;
2335 for (n = 1; n < chain_len; ++n)
2337 for (lane = 0; lane < group_size; ++lane)
2338 if (chains[lane][n].code != code)
2339 break;
2340 if (lane == group_size)
2341 break;
2343 if (n != chain_len)
2345 /* Swap that in at first position. */
2346 std::swap (children[0], children[n]);
2347 for (lane = 0; lane < group_size; ++lane)
2348 std::swap (chains[lane][0], chains[lane][n]);
2350 else
2352 /* ??? When this triggers and we end up with two
2353 vect_constant/external_def up-front things break (ICE)
2354 spectacularly finding an insertion place for the
2355 all-constant op. We should have a fully
2356 vect_internal_def operand though(?) so we can swap
2357 that into first place and then prepend the all-zero
2358 constant. */
2359 if (dump_enabled_p ())
2360 dump_printf_loc (MSG_NOTE, vect_location,
2361 "inserting constant zero to compensate "
2362 "for (partially) negated first "
2363 "operand\n");
2364 chain_len++;
2365 for (lane = 0; lane < group_size; ++lane)
2366 chains[lane].safe_insert
2367 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2368 vec<tree> zero_ops;
2369 zero_ops.create (group_size);
2370 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2371 for (lane = 1; lane < group_size; ++lane)
2372 zero_ops.quick_push (zero_ops[0]);
2373 slp_tree zero = vect_create_new_slp_node (zero_ops);
2374 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2375 children.safe_insert (0, zero);
2377 break;
2379 for (unsigned i = 1; i < children.length (); ++i)
2381 slp_tree op0 = children[i - 1];
2382 slp_tree op1 = children[i];
2383 bool this_two_op = false;
2384 for (unsigned lane = 0; lane < group_size; ++lane)
2385 if (chains[lane][i].code != chains[0][i].code)
2387 this_two_op = true;
2388 break;
2390 slp_tree child;
2391 if (i == children.length () - 1)
2392 child = vect_create_new_slp_node (node, stmts, 2);
2393 else
2394 child = vect_create_new_slp_node (2, ERROR_MARK);
2395 if (this_two_op)
2397 vec<std::pair<unsigned, unsigned> > lperm;
2398 lperm.create (group_size);
2399 for (unsigned lane = 0; lane < group_size; ++lane)
2400 lperm.quick_push (std::make_pair
2401 (chains[lane][i].code != chains[0][i].code, lane));
2402 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2403 (chains[0][i].code == code
2404 ? op_stmt_info
2405 : other_op_stmt_info),
2406 (chains[0][i].code == code
2407 ? other_op_stmt_info
2408 : op_stmt_info),
2409 lperm);
2411 else
2413 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2414 SLP_TREE_VECTYPE (child) = vectype;
2415 SLP_TREE_LANES (child) = group_size;
2416 SLP_TREE_CHILDREN (child).quick_push (op0);
2417 SLP_TREE_CHILDREN (child).quick_push (op1);
2418 SLP_TREE_REPRESENTATIVE (child)
2419 = (chains[0][i].code == code
2420 ? op_stmt_info : other_op_stmt_info);
2422 children[i] = child;
2424 *tree_size += this_tree_size + 1;
2425 *max_nunits = this_max_nunits;
2426 while (!chains.is_empty ())
2427 chains.pop ().release ();
2428 return node;
2430 out:
2431 while (!children.is_empty ())
2432 vect_free_slp_tree (children.pop ());
2433 while (!chains.is_empty ())
2434 chains.pop ().release ();
2435 /* Hard-fail, otherwise we might run into quadratic processing of the
2436 chains starting one stmt into the chain again. */
2437 if (hard_fail)
2438 return NULL;
2439 /* Fall thru to normal processing. */
2442 /* Get at the operands, verifying they are compatible. */
2443 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2444 slp_oprnd_info oprnd_info;
2445 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2447 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2448 stmts, i, &oprnds_info);
2449 if (res != 0)
2450 matches[(res == -1) ? 0 : i] = false;
2451 if (!matches[0])
2452 break;
2454 for (i = 0; i < group_size; ++i)
2455 if (!matches[i])
2457 vect_free_oprnd_info (oprnds_info);
2458 return NULL;
2460 swap = NULL;
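/* When both operands of a two_operators group are defined by the same set
of scalar stmts we can, below, use a single operand SLP node and encode the
lane selection with per-operand permutes instead. */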
2462 bool has_two_operators_perm = false;
2463 auto_vec<unsigned> two_op_perm_indices[2];
2464 vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2466 if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2468 unsigned idx = 0;
2469 hash_map<gimple *, unsigned> seen;
2470 vec<slp_oprnd_info> new_oprnds_info
2471 = vect_create_oprnd_info (1, group_size);
2472 bool success = true;
2474 enum tree_code code = ERROR_MARK;
2475 if (oprnds_info[0]->def_stmts[0]
2476 && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2477 code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2479 for (unsigned j = 0; j < group_size; ++j)
2481 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2483 stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2484 if (!stmt_info || !stmt_info->stmt
2485 || !is_a<gassign *> (stmt_info->stmt)
2486 || gimple_assign_rhs_code (stmt_info->stmt) != code
2487 || skip_args[i])
2489 success = false;
2490 break;
2493 bool exists;
2494 unsigned &stmt_idx
2495 = seen.get_or_insert (stmt_info->stmt, &exists);
2497 if (!exists)
2499 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2500 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2501 stmt_idx = idx;
2502 idx++;
2505 two_op_perm_indices[i].safe_push (stmt_idx);
2508 if (!success)
2509 break;
2512 if (success && idx == group_size)
2514 if (dump_enabled_p ())
2516 dump_printf_loc (MSG_NOTE, vect_location,
2517 "Replace two_operators operands:\n");
2519 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2521 dump_printf_loc (MSG_NOTE, vect_location,
2522 "Operand %u:\n", i);
2523 for (unsigned j = 0; j < group_size; j++)
2524 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2525 j, oprnd_info->def_stmts[j]->stmt);
2528 dump_printf_loc (MSG_NOTE, vect_location,
2529 "With a single operand:\n");
2530 for (unsigned j = 0; j < group_size; j++)
2531 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2532 j, new_oprnds_info[0]->def_stmts[j]->stmt);
2535 two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2536 two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2538 new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2539 new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2540 new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2541 new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2542 new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2544 vect_free_oprnd_info (oprnds_info);
2545 oprnds_info = new_oprnds_info;
2546 nops = 1;
2547 has_two_operators_perm = true;
2551 auto_vec<slp_tree, 4> children;
2553 stmt_info = stmts[0];
2555 /* Create SLP_TREE nodes for the definition node/s. */
2556 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2558 slp_tree child = nullptr;
2559 unsigned int j;
2561 /* We're skipping certain operands from processing, for example
2562 outer loop reduction initial defs. */
2563 if (skip_args[i])
2565 children.safe_push (NULL);
2566 continue;
2569 if (oprnd_info->first_dt == vect_uninitialized_def)
2571 /* COND_EXPRs end up with one operand too many if the condition
2572 is an SSA name. */
2573 gcc_assert (i == 3 && nops == 4);
2574 continue;
2577 if (is_a <bb_vec_info> (vinfo)
2578 && oprnd_info->first_dt == vect_internal_def
2579 && !oprnd_info->any_pattern)
2581 /* For BB vectorization, if all defs are the same do not
2582 bother to continue the build along the single-lane
2583 graph but use a splat of the scalar value. */
2584 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2585 for (j = 1; j < group_size; ++j)
2586 if (oprnd_info->def_stmts[j] != first_def)
2587 break;
2588 if (j == group_size
2589 /* But avoid doing this for loads where we may be
2590 able to CSE things, unless the stmt is not
2591 vectorizable. */
2592 && (!STMT_VINFO_VECTORIZABLE (first_def)
2593 || !gimple_vuse (first_def->stmt)))
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE, vect_location,
2597 "Using a splat of the uniform operand %G",
2598 first_def->stmt);
2599 oprnd_info->first_dt = vect_external_def;
2603 if (oprnd_info->first_dt == vect_external_def
2604 || oprnd_info->first_dt == vect_constant_def)
2606 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2608 tree op0;
2609 tree uniform_val = op0 = oprnd_info->ops[0];
2610 for (j = 1; j < oprnd_info->ops.length (); ++j)
2611 if (!operand_equal_p (uniform_val, oprnd_info->ops[j]))
2613 uniform_val = NULL_TREE;
2614 break;
2616 if (!uniform_val
2617 && !can_duplicate_and_interleave_p (vinfo,
2618 oprnd_info->ops.length (),
2619 TREE_TYPE (op0)))
2621 matches[j] = false;
2622 if (dump_enabled_p ())
2623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2624 "Build SLP failed: invalid type of def "
2625 "for variable-length SLP %T\n", op0);
2626 goto fail;
2629 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2630 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2631 oprnd_info->ops = vNULL;
2632 children.safe_push (invnode);
2633 continue;
2636 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2637 group_size, &this_max_nunits,
2638 matches, limit,
2639 &this_tree_size, bst_map)) != NULL)
2641 oprnd_info->def_stmts = vNULL;
2642 children.safe_push (child);
2643 continue;
2646 /* If the SLP build for operand zero failed and operand zero
2647 and one can be commuted, try that for the scalar stmts
2648 that failed the match. */
2649 if (i == 0
2650 /* A first scalar stmt mismatch signals a fatal mismatch. */
2651 && matches[0]
2652 /* ??? For COND_EXPRs we can swap the comparison operands
2653 as well as the arms under some constraints. */
2654 && nops == 2
2655 && oprnds_info[1]->first_dt == vect_internal_def
2656 && is_gimple_assign (stmt_info->stmt)
2657 /* Swapping operands for reductions breaks assumptions later on. */
2658 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2660 /* See whether we can swap the matching or the non-matching
2661 stmt operands. */
2662 bool swap_not_matching = true;
2665 for (j = 0; j < group_size; ++j)
2667 if (matches[j] != !swap_not_matching)
2668 continue;
2669 stmt_vec_info stmt_info = stmts[j];
2670 /* Verify if we can swap operands of this stmt. */
2671 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2672 if (!stmt
2673 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2675 if (!swap_not_matching)
2676 goto fail;
2677 swap_not_matching = false;
2678 break;
2682 while (j != group_size);
2684 /* Swap mismatched definition stmts. */
2685 if (dump_enabled_p ())
2686 dump_printf_loc (MSG_NOTE, vect_location,
2687 "Re-trying with swapped operands of stmts ");
2688 for (j = 0; j < group_size; ++j)
2689 if (matches[j] == !swap_not_matching)
2691 std::swap (oprnds_info[0]->def_stmts[j],
2692 oprnds_info[1]->def_stmts[j]);
2693 std::swap (oprnds_info[0]->ops[j],
2694 oprnds_info[1]->ops[j]);
2695 if (dump_enabled_p ())
2696 dump_printf (MSG_NOTE, "%d ", j);
2698 if (dump_enabled_p ())
2699 dump_printf (MSG_NOTE, "\n");
2700 /* After swapping some operands we lost track whether an
2701 operand has any pattern defs so be conservative here. */
2702 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2703 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2704 /* And try again with scratch 'matches' ... */
2705 bool *tem = XALLOCAVEC (bool, group_size);
2706 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2707 group_size, &this_max_nunits,
2708 tem, limit,
2709 &this_tree_size, bst_map)) != NULL)
2711 oprnd_info->def_stmts = vNULL;
2712 children.safe_push (child);
2713 continue;
2716 fail:
2718 /* If the SLP build failed and we analyze a basic-block
2719 simply treat nodes we fail to build as externally defined
2720 (and thus build vectors from the scalar defs).
2721 The cost model will reject outright expensive cases.
2722 ??? This doesn't treat cases where permutation ultimately
2723 fails (or we don't try permutation below). Ideally we'd
2724 even compute a permutation that will end up with the maximum
2725 SLP tree size... */
2726 if (is_a <bb_vec_info> (vinfo)
2727 /* ??? Rejecting patterns this way doesn't work. We'd have to
2728 do extra work to cancel the pattern so the uses see the
2729 scalar version. */
2730 && !is_pattern_stmt_p (stmt_info)
2731 && !oprnd_info->any_pattern)
2733 /* But if there's a leading vector sized set of matching stmts
2734 fail here so we can split the group. This matches the condition
2735 vect_analyze_slp_instance uses. */
2736 /* ??? We might want to split here and combine the results to support
2737 multiple vector sizes better. */
2738 for (j = 0; j < group_size; ++j)
2739 if (!matches[j])
2740 break;
2741 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype)))
2743 if (dump_enabled_p ())
2744 dump_printf_loc (MSG_NOTE, vect_location,
2745 "Building vector operands from scalars\n");
2746 this_tree_size++;
2747 child = vect_create_new_slp_node (oprnd_info->ops);
2748 children.safe_push (child);
2749 oprnd_info->ops = vNULL;
2750 continue;
2754 gcc_assert (child == NULL);
2755 FOR_EACH_VEC_ELT (children, j, child)
2756 if (child)
2757 vect_free_slp_tree (child);
2758 vect_free_oprnd_info (oprnds_info);
2759 return NULL;
2762 vect_free_oprnd_info (oprnds_info);
2764 /* If all children of a node are built up from uniform scalars, or if
2765 building the node requires more than one possibly expensive vector
2766 construction, throw the node away and cause it to be built up from
2767 scalars instead. The exception is the SLP node for the vector store. */
2768 if (is_a <bb_vec_info> (vinfo)
2769 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2770 /* ??? Rejecting patterns this way doesn't work. We'd have to
2771 do extra work to cancel the pattern so the uses see the
2772 scalar version. */
2773 && !is_pattern_stmt_p (stmt_info))
2775 slp_tree child;
2776 unsigned j;
2777 bool all_uniform_p = true;
2778 unsigned n_vector_builds = 0;
2779 FOR_EACH_VEC_ELT (children, j, child)
2781 if (!child)
2783 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2784 all_uniform_p = false;
2785 else if (!vect_slp_tree_uniform_p (child))
2787 all_uniform_p = false;
2788 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2789 n_vector_builds++;
2792 if (all_uniform_p
2793 || n_vector_builds > 1
2794 || (n_vector_builds == children.length ()
2795 && is_a <gphi *> (stmt_info->stmt)))
2797 /* Roll back. */
2798 matches[0] = false;
2799 FOR_EACH_VEC_ELT (children, j, child)
2800 if (child)
2801 vect_free_slp_tree (child);
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_NOTE, vect_location,
2805 "Building parent vector operands from "
2806 "scalars instead\n");
2807 return NULL;
2811 *tree_size += this_tree_size + 1;
2812 *max_nunits = this_max_nunits;
2814 if (two_operators)
2816 /* ??? We'd likely want to either cache in bst_map something like
2817 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2818 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2819 explicit stmts to put in so the keying on 'stmts' doesn't
2820 work (but we have the same issue with nodes that use 'ops'). */
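/* For instance the group { a0+b0, a1-b1, a2+b2, a3-b3 } is built as one
node computing all PLUS lanes, one computing all MINUS lanes and a
VEC_PERM_EXPR node selecting, per lane, the result of the matching one. */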
2822 if (has_two_operators_perm)
2824 slp_tree child = children[0];
2825 children.truncate (0);
2826 for (i = 0; i < 2; i++)
2828 slp_tree pnode
2829 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
2830 SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
2831 SLP_TREE_VECTYPE (pnode) = vectype;
2832 SLP_TREE_CHILDREN (pnode).quick_push (child);
2833 SLP_TREE_CHILDREN (pnode).quick_push (child);
2834 lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
2835 children.safe_push (pnode);
2837 for (unsigned j = 0; j < stmts.length (); j++)
2838 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
2841 SLP_TREE_REF_COUNT (child) += 4;
2844 slp_tree one = new _slp_tree;
2845 slp_tree two = new _slp_tree;
2846 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
2847 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
2848 SLP_TREE_VECTYPE (one) = vectype;
2849 SLP_TREE_VECTYPE (two) = vectype;
2850 SLP_TREE_CHILDREN (one).safe_splice (children);
2851 SLP_TREE_CHILDREN (two).safe_splice (children);
2852 slp_tree child;
2853 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
2854 SLP_TREE_REF_COUNT (child)++;
2856 /* Here we record the original defs since this
2857 node represents the final lane configuration. */
2858 node = vect_create_new_slp_node (node, stmts, 2);
2859 SLP_TREE_VECTYPE (node) = vectype;
2860 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2861 SLP_TREE_CHILDREN (node).quick_push (one);
2862 SLP_TREE_CHILDREN (node).quick_push (two);
2863 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
2864 enum tree_code code0 = gimple_assign_rhs_code (stmt);
2865 enum tree_code ocode = ERROR_MARK;
2866 stmt_vec_info ostmt_info;
2867 unsigned j = 0;
2868 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
2870 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
2871 if (gimple_assign_rhs_code (ostmt) != code0)
2873 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
2874 ocode = gimple_assign_rhs_code (ostmt);
2875 j = i;
2877 else
2878 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
2881 SLP_TREE_CODE (one) = code0;
2882 SLP_TREE_CODE (two) = ocode;
2883 SLP_TREE_LANES (one) = stmts.length ();
2884 SLP_TREE_LANES (two) = stmts.length ();
2885 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
2886 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
2888 return node;
2891 node = vect_create_new_slp_node (node, stmts, nops);
2892 SLP_TREE_VECTYPE (node) = vectype;
2893 SLP_TREE_CHILDREN (node).splice (children);
2894 return node;
2897 /* Dump a single SLP tree NODE. */
2899 static void
2900 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
2901 slp_tree node)
2903 unsigned i, j;
2904 slp_tree child;
2905 stmt_vec_info stmt_info;
2906 tree op;
2908 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
2909 dump_user_location_t user_loc = loc.get_user_location ();
2910 dump_printf_loc (metadata, user_loc,
2911 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
2912 ", refcnt=%u)",
2913 SLP_TREE_DEF_TYPE (node) == vect_external_def
2914 ? " (external)"
2915 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
2916 ? " (constant)"
2917 : ""), (void *) node,
2918 estimated_poly_value (node->max_nunits),
2919 SLP_TREE_REF_COUNT (node));
2920 if (SLP_TREE_VECTYPE (node))
2921 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
2922 dump_printf (metadata, "\n");
2923 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
2925 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
2926 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
2927 else
2928 dump_printf_loc (metadata, user_loc, "op template: %G",
2929 SLP_TREE_REPRESENTATIVE (node)->stmt);
2931 if (SLP_TREE_SCALAR_STMTS (node).exists ())
2932 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
2933 if (stmt_info)
2934 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
2935 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
2936 i, stmt_info->stmt);
2937 else
2938 dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
2939 else
2941 dump_printf_loc (metadata, user_loc, "\t{ ");
2942 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
2943 dump_printf (metadata, "%T%s ", op,
2944 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
2945 dump_printf (metadata, "}\n");
2947 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
2949 dump_printf_loc (metadata, user_loc, "\tload permutation {");
2950 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
2951 dump_printf (dump_kind, " %u", j);
2952 dump_printf (dump_kind, " }\n");
2954 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
2956 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
2957 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
2958 dump_printf (dump_kind, " %u[%u]",
2959 SLP_TREE_LANE_PERMUTATION (node)[i].first,
2960 SLP_TREE_LANE_PERMUTATION (node)[i].second);
2961 dump_printf (dump_kind, " }\n");
2963 if (SLP_TREE_CHILDREN (node).is_empty ())
2964 return;
2965 dump_printf_loc (metadata, user_loc, "\tchildren");
2966 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
2967 dump_printf (dump_kind, " %p", (void *)child);
2968 dump_printf (dump_kind, "\n");
2971 DEBUG_FUNCTION void
2972 debug (slp_tree node)
2974 debug_dump_context ctx;
2975 vect_print_slp_tree (MSG_NOTE,
2976 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2977 node);
2980 /* Recursive helper for the dot producer below. */
2982 static void
2983 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
2985 if (visited.add (node))
2986 return;
2988 fprintf (f, "\"%p\" [label=\"", (void *)node);
2989 vect_print_slp_tree (MSG_NOTE,
2990 dump_location_t::from_location_t (UNKNOWN_LOCATION),
2991 node);
2992 fprintf (f, "\"];\n");
2995 for (slp_tree child : SLP_TREE_CHILDREN (node))
2996 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
2998 for (slp_tree child : SLP_TREE_CHILDREN (node))
2999 if (child)
3000 dot_slp_tree (f, child, visited);
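/* Write the SLP graph rooted at NODE in graphviz dot format to the file
FNAME; the output can be rendered with, for example, "dot -Tpdf". */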
3003 DEBUG_FUNCTION void
3004 dot_slp_tree (const char *fname, slp_tree node)
3006 FILE *f = fopen (fname, "w");
3007 fprintf (f, "digraph {\n");
3008 fflush (f);
3010 debug_dump_context ctx (f);
3011 hash_set<slp_tree> visited;
3012 dot_slp_tree (f, node, visited);
3014 fflush (f);
3015 fprintf (f, "}\n");
3016 fclose (f);
3019 DEBUG_FUNCTION void
3020 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3022 FILE *f = fopen (fname, "w");
3023 fprintf (f, "digraph {\n");
3024 fflush (f);
3026 debug_dump_context ctx (f);
3027 hash_set<slp_tree> visited;
3028 for (auto inst : slp_instances)
3029 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3031 fflush (f);
3032 fprintf (f, "}\n");
3033 fclose (f);
3036 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3038 static void
3039 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3040 slp_tree node, hash_set<slp_tree> &visited)
3042 unsigned i;
3043 slp_tree child;
3045 if (visited.add (node))
3046 return;
3048 vect_print_slp_tree (dump_kind, loc, node);
3050 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3051 if (child)
3052 vect_print_slp_graph (dump_kind, loc, child, visited);
3055 static void
3056 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3057 slp_tree entry)
3059 hash_set<slp_tree> visited;
3060 vect_print_slp_graph (dump_kind, loc, entry, visited);
3063 DEBUG_FUNCTION void
3064 debug (slp_instance instance)
3066 debug_dump_context ctx;
3067 vect_print_slp_graph (MSG_NOTE,
3068 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3069 SLP_INSTANCE_TREE (instance));
3072 /* Mark the tree rooted at NODE with PURE_SLP. */
3074 static void
3075 vect_mark_slp_stmts (slp_tree node, hash_set<slp_tree> &visited)
3077 int i;
3078 stmt_vec_info stmt_info;
3079 slp_tree child;
3081 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3082 return;
3084 if (visited.add (node))
3085 return;
3087 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3088 if (stmt_info)
3089 STMT_SLP_TYPE (stmt_info) = pure_slp;
3091 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3092 if (child)
3093 vect_mark_slp_stmts (child, visited);
3096 static void
3097 vect_mark_slp_stmts (slp_tree node)
3099 hash_set<slp_tree> visited;
3100 vect_mark_slp_stmts (node, visited);
3103 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3105 static void
3106 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3108 int i;
3109 stmt_vec_info stmt_info;
3110 slp_tree child;
3112 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3113 return;
3115 if (visited.add (node))
3116 return;
3118 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3119 if (stmt_info)
3121 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3122 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3123 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3126 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3127 if (child)
3128 vect_mark_slp_stmts_relevant (child, visited);
3131 static void
3132 vect_mark_slp_stmts_relevant (slp_tree node)
3134 hash_set<slp_tree> visited;
3135 vect_mark_slp_stmts_relevant (node, visited);
3139 /* Gather loads in the SLP graph NODE and populate the LOADS array. */
3141 static void
3142 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3143 hash_set<slp_tree> &visited)
3145 if (!node || visited.add (node))
3146 return;
3148 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3149 return;
3151 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3153 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3154 if (STMT_VINFO_DATA_REF (stmt_info)
3155 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3156 loads.safe_push (node);
3159 unsigned i;
3160 slp_tree child;
3161 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3162 vect_gather_slp_loads (loads, child, visited);
3166 /* Find the last scalar stmt in NODE. */
3168 stmt_vec_info
3169 vect_find_last_scalar_stmt_in_slp (slp_tree node)
3171 stmt_vec_info last = NULL;
3172 stmt_vec_info stmt_vinfo;
3174 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3175 if (stmt_vinfo)
3177 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3178 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3181 return last;
3184 /* Find the first stmt in NODE. */
3186 stmt_vec_info
3187 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3189 stmt_vec_info first = NULL;
3190 stmt_vec_info stmt_vinfo;
3192 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3193 if (stmt_vinfo)
3195 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3196 if (!first
3197 || get_later_stmt (stmt_vinfo, first) == first)
3198 first = stmt_vinfo;
3201 return first;
3204 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3205 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3206 (also containing the first GROUP1_SIZE stmts, since stores are
3207 consecutive), the second containing the remainder.
3208 Return the first stmt in the second group. */
3210 static stmt_vec_info
3211 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3213 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3214 gcc_assert (group1_size > 0);
3215 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3216 gcc_assert (group2_size > 0);
3217 DR_GROUP_SIZE (first_vinfo) = group1_size;
3219 stmt_vec_info stmt_info = first_vinfo;
3220 for (unsigned i = group1_size; i > 1; i--)
3222 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3223 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3225 /* STMT is now the last element of the first group. */
3226 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3227 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3229 DR_GROUP_SIZE (group2) = group2_size;
3230 for (stmt_info = group2; stmt_info;
3231 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3233 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3234 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3237 /* For the second group, the DR_GROUP_GAP is that before the original group,
3238 plus skipping over the first vector. */
3239 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3241 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3242 DR_GROUP_GAP (first_vinfo) += group2_size;
3244 if (dump_enabled_p ())
3245 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3246 group1_size, group2_size);
3248 return group2;
3251 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3252 statements and a vector of NUNITS elements. */
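/* For example, a group of three stores vectorized with four-element vectors
gets an unrolling factor of common_multiple (4, 3) / 3 = 4. */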
3254 static poly_uint64
3255 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3257 return exact_div (common_multiple (nunits, group_size), group_size);
3260 /* Helper that checks to see if a node is a load node. */
3262 static inline bool
3263 vect_is_slp_load_node (slp_tree root)
3265 return SLP_TREE_DEF_TYPE (root) == vect_internal_def
3266 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3267 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root)));
3271 /* Helper function of optimize_load_redistribution that performs the operation
3272 recursively. */
3274 static slp_tree
3275 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3276 vec_info *vinfo, unsigned int group_size,
3277 hash_map<slp_tree, slp_tree> *load_map,
3278 slp_tree root)
3280 if (slp_tree *leader = load_map->get (root))
3281 return *leader;
3283 slp_tree node;
3284 unsigned i;
3286 /* For now, we don't know anything about externals so do not do anything. */
3287 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3288 return NULL;
3289 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3291 /* First convert this node into a load node and add it to the leaves
3292 list and flatten the permute from a lane to a load one. If it's
3293 unneeded it will be elided later. */
3294 vec<stmt_vec_info> stmts;
3295 stmts.create (SLP_TREE_LANES (root));
3296 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3297 for (unsigned j = 0; j < lane_perm.length (); j++)
3299 std::pair<unsigned, unsigned> perm = lane_perm[j];
3300 node = SLP_TREE_CHILDREN (root)[perm.first];
3302 if (!vect_is_slp_load_node (node)
3303 || SLP_TREE_CHILDREN (node).exists ())
3305 stmts.release ();
3306 goto next;
3309 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3312 if (dump_enabled_p ())
3313 dump_printf_loc (MSG_NOTE, vect_location,
3314 "converting stmts on permute node %p\n",
3315 (void *) root);
3317 bool *matches = XALLOCAVEC (bool, group_size);
3318 poly_uint64 max_nunits = 1;
3319 unsigned tree_size = 0, limit = 1;
3320 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3321 matches, &limit, &tree_size, bst_map);
3322 if (!node)
3323 stmts.release ();
3325 load_map->put (root, node);
3326 return node;
3329 next:
3330 load_map->put (root, NULL);
3332 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3334 slp_tree value
3335 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3336 node);
3337 if (value)
3339 SLP_TREE_REF_COUNT (value)++;
3340 SLP_TREE_CHILDREN (root)[i] = value;
3341 /* ??? We know the original leaves of the replaced nodes will
3342 be referenced by bst_map, only the permutes created by
3343 pattern matching are not. */
3344 if (SLP_TREE_REF_COUNT (node) == 1)
3345 load_map->remove (node);
3346 vect_free_slp_tree (node);
3350 return NULL;
3353 /* Temporary workaround for loads not being CSEd during SLP build. This
3354 function will traverse the SLP tree rooted in ROOT and find VEC_PERM
3355 nodes that blend vectors from multiple nodes that all read from the
3356 same DR such that the final operation is equal to a permuted load. Such
3357 nodes are then directly converted into load nodes themselves. The nodes are
3358 CSEd using BST_MAP. */
3360 static void
3361 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3362 vec_info *vinfo, unsigned int group_size,
3363 hash_map<slp_tree, slp_tree> *load_map,
3364 slp_tree root)
3366 slp_tree node;
3367 unsigned i;
3369 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3371 slp_tree value
3372 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3373 node);
3374 if (value)
3376 SLP_TREE_REF_COUNT (value)++;
3377 SLP_TREE_CHILDREN (root)[i] = value;
3378 /* ??? We know the original leaves of the replaced nodes will
3379 be referenced by bst_map, only the permutes created by
3380 pattern matching are not. */
3381 if (SLP_TREE_REF_COUNT (node) == 1)
3382 load_map->remove (node);
3383 vect_free_slp_tree (node);
3388 /* Helper function of vect_match_slp_patterns.
3390 Attempts to match patterns against the slp tree rooted in REF_NODE using
3391 VINFO. Patterns are matched in post-order traversal.
3393 If matching is successful the node referenced by REF_NODE is replaced in
3394 place and true is returned, otherwise false is returned. */
3396 static bool
3397 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3398 slp_tree_to_load_perm_map_t *perm_cache,
3399 slp_compat_nodes_map_t *compat_cache,
3400 hash_set<slp_tree> *visited)
3402 unsigned i;
3403 slp_tree node = *ref_node;
3404 bool found_p = false;
3405 if (!node || visited->add (node))
3406 return false;
3408 slp_tree child;
3409 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3410 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3411 vinfo, perm_cache, compat_cache,
3412 visited);
3414 for (unsigned x = 0; x < num__slp_patterns; x++)
3416 vect_pattern *pattern
3417 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3418 if (pattern)
3420 pattern->build (vinfo);
3421 delete pattern;
3422 found_p = true;
3426 return found_p;
3429 /* Applies pattern matching to the SLP tree of INSTANCE using
3430 vec_info VINFO.
3432 Returns true if any pattern matched; the instance's SLP tree is then
3433 modified in place. Patterns are tried in order and multiple may match. */
3435 static bool
3436 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3437 hash_set<slp_tree> *visited,
3438 slp_tree_to_load_perm_map_t *perm_cache,
3439 slp_compat_nodes_map_t *compat_cache)
3441 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3442 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3444 if (dump_enabled_p ())
3445 dump_printf_loc (MSG_NOTE, vect_location,
3446 "Analyzing SLP tree %p for patterns\n",
3447 (void *) SLP_INSTANCE_TREE (instance));
3449 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3450 visited);
3453 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3454 splitting into two, with the first split group having size NEW_GROUP_SIZE.
3455 Return true if we could use IFN_STORE_LANES instead and if that appears
3456 to be the better approach. */
3458 static bool
3459 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3460 unsigned int group_size,
3461 unsigned int new_group_size)
3463 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3464 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3465 if (!vectype)
3466 return false;
3467 /* Allow the split if one of the two new groups would operate on full
3468 vectors *within* rather than across one scalar loop iteration.
3469 This is purely a heuristic, but it should work well for group
3470 sizes of 3 and 4, where the possible splits are:
3472 3->2+1: OK if the vector has exactly two elements
3473 4->2+2: Likewise
3474 4->3+1: Less clear-cut. */
3475 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3476 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3477 return false;
3478 return vect_store_lanes_supported (vectype, group_size, false) != IFN_LAST;
3481 /* Analyze an SLP instance starting from a group of grouped stores. Call
3482 vect_build_slp_tree to build a tree of packed stmts if possible.
3483 Return FALSE if it's impossible to SLP any stmt in the loop. */
3485 static bool
3486 vect_analyze_slp_instance (vec_info *vinfo,
3487 scalar_stmts_to_slp_tree_map_t *bst_map,
3488 stmt_vec_info stmt_info, slp_instance_kind kind,
3489 unsigned max_tree_size, unsigned *limit,
3490 bool force_single_lane = false);
3492 /* Build an interleaving scheme for the store sources RHS_NODES from
3493 SCALAR_STMTS. */
3495 static slp_tree
3496 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3497 vec<stmt_vec_info> &scalar_stmts)
3499 unsigned int group_size = scalar_stmts.length ();
3500 slp_tree node = vect_create_new_slp_node (scalar_stmts,
3501 SLP_TREE_CHILDREN
3502 (rhs_nodes[0]).length ());
3503 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3504 for (unsigned l = 0;
3505 l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3507 /* And a permute merging all RHS SLP trees. */
3508 slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3509 VEC_PERM_EXPR);
3510 SLP_TREE_CHILDREN (node).quick_push (perm);
3511 SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3512 SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3513 SLP_TREE_LANES (perm) = group_size;
3514 /* ??? We should set this NULL but that's not expected. */
3515 SLP_TREE_REPRESENTATIVE (perm)
3516 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3517 for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3519 SLP_TREE_CHILDREN (perm)
3520 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3521 SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3522 for (unsigned k = 0;
3523 k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3525 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3526 or SLP_TREE_SCALAR_OPS but then we might have
3527 a mix of both in our children. */
3528 SLP_TREE_LANE_PERMUTATION (perm)
3529 .quick_push (std::make_pair (j, k));
3533 /* Now we have a single permute node but we cannot code-generate
3534 the case with more than two inputs.
3535 Perform pairwise reduction, reducing the two inputs
3536 with the least number of lanes to one and then repeat until
3537 we end up with two inputs. That scheme makes sure we end
3538 up with permutes satisfying the restriction of requiring at
3539 most two vector inputs to produce a single vector output
3540 when the number of lanes is even. */
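/* For example, with inputs of two, two and four lanes the two two-lane
nodes are merged into a four-lane node first, leaving the final permute
with just two four-lane inputs. */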
3541 while (SLP_TREE_CHILDREN (perm).length () > 2)
3543 /* When we have three equal sized groups left the pairwise
3544 reduction does not result in a scheme that avoids using
3545 three vectors. Instead merge the first two groups
3546 to the final size with do-not-care elements (chosen
3547 from the first group) and then merge with the third.
3548 { A0, B0, x, A1, B1, x, ... }
3549 -> { A0, B0, C0, A1, B1, C1, ... }
3550 This handles group size of three (and at least
3551 power-of-two multiples of that). */
3552 if (SLP_TREE_CHILDREN (perm).length () == 3
3553 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3554 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3555 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3556 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3558 int ai = 0;
3559 int bi = 1;
3560 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3561 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3562 unsigned n = SLP_TREE_LANES (perm);
3564 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3565 SLP_TREE_LANES (permab) = n;
3566 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3567 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3568 /* ??? Should be NULL but that's not expected. */
3569 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3570 SLP_TREE_CHILDREN (permab).quick_push (a);
3571 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3572 SLP_TREE_LANE_PERMUTATION (permab)
3573 .quick_push (std::make_pair (0, k));
3574 SLP_TREE_CHILDREN (permab).quick_push (b);
3575 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3576 SLP_TREE_LANE_PERMUTATION (permab)
3577 .quick_push (std::make_pair (1, k));
3578 /* Push the do-not-care lanes. */
3579 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3580 SLP_TREE_LANE_PERMUTATION (permab)
3581 .quick_push (std::make_pair (0, k));
3583 /* Put the merged node into 'perm', in place of a. */
3584 SLP_TREE_CHILDREN (perm)[ai] = permab;
3585 /* Adjust the references to b in the permutation
3586 of perm and to the later children which we'll
3587 remove. */
3588 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3590 std::pair<unsigned, unsigned> &p
3591 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3592 if (p.first == (unsigned) bi)
3594 p.first = ai;
3595 p.second += SLP_TREE_LANES (a);
3597 else if (p.first > (unsigned) bi)
3598 p.first--;
3600 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3601 break;
3604 /* Pick the two nodes with the least number of lanes,
3605 prefer the earliest candidate and maintain ai < bi. */
3606 int ai = -1;
3607 int bi = -1;
3608 for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3610 if (ai == -1)
3611 ai = ci;
3612 else if (bi == -1)
3613 bi = ci;
3614 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3615 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3616 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3617 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3619 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3620 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3621 bi = ci;
3622 else
3624 ai = bi;
3625 bi = ci;
3630 /* Produce a merge of nodes ai and bi. */
3631 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3632 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3633 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3634 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3635 SLP_TREE_LANES (permab) = n;
3636 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3637 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3638 /* ??? Should be NULL but that's not expected. */
3639 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3640 SLP_TREE_CHILDREN (permab).quick_push (a);
3641 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3642 SLP_TREE_LANE_PERMUTATION (permab)
3643 .quick_push (std::make_pair (0, k));
3644 SLP_TREE_CHILDREN (permab).quick_push (b);
3645 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3646 SLP_TREE_LANE_PERMUTATION (permab)
3647 .quick_push (std::make_pair (1, k));
3649 /* Put the merged node into 'perm', in place of a. */
3650 SLP_TREE_CHILDREN (perm)[ai] = permab;
3651 /* Adjust the references to b in the permutation
3652 of perm and to the later children which we'll
3653 remove. */
3654 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3656 std::pair<unsigned, unsigned> &p
3657 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3658 if (p.first == (unsigned) bi)
3660 p.first = ai;
3661 p.second += SLP_TREE_LANES (a);
3663 else if (p.first > (unsigned) bi)
3664 p.first--;
3666 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3670 return node;
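/* A minimal standalone sketch (using std::vector for the children's lane
   counts and made-up inputs; it does not use the vectorizer's slp_tree or
   vec types) of the merge order chosen above: repeatedly pick the two
   children with the fewest lanes, keep ai < bi, put the merged two-input
   permute in a's slot and drop b, until a single child remains.

     #include <cstdio>
     #include <vector>

     int main ()
     {
       // Lane counts of the children, e.g. four single-lane RHS nodes
       // and one two-lane RHS subtree.
       std::vector<unsigned> lanes = { 1, 1, 2, 1, 1 };

       while (lanes.size () > 1)
         {
           // Pick the two children with the least number of lanes,
           // preferring earlier candidates and maintaining ai < bi.
           int ai = -1, bi = -1;
           for (unsigned ci = 0; ci < lanes.size (); ++ci)
             {
               if (ai == -1)
                 ai = ci;
               else if (bi == -1)
                 bi = ci;
               else if (lanes[ci] < lanes[ai] || lanes[ci] < lanes[bi])
                 {
                   if (lanes[ai] <= lanes[bi])
                     bi = ci;
                   else
                     {
                       ai = bi;
                       bi = ci;
                     }
                 }
             }
           std::printf ("merge child %d (%u lanes) with child %d (%u lanes)\n",
                        ai, lanes[ai], bi, lanes[bi]);
           // The merged node takes a's place; b is removed.
           lanes[ai] += lanes[bi];
           lanes.erase (lanes.begin () + bi);
         }
       return 0;
     }  */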
3673 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3674 of KIND. Return true if successful. */
3676 static bool
3677 vect_build_slp_instance (vec_info *vinfo,
3678 slp_instance_kind kind,
3679 vec<stmt_vec_info> &scalar_stmts,
3680 vec<stmt_vec_info> &root_stmt_infos,
3681 vec<tree> &remain,
3682 unsigned max_tree_size, unsigned *limit,
3683 scalar_stmts_to_slp_tree_map_t *bst_map,
3684 /* ??? We need stmt_info for group splitting. */
3685 stmt_vec_info stmt_info_,
3686 bool force_single_lane = false)
3688 /* If there's no budget left bail out early. */
3689 if (*limit == 0)
3690 return false;
3692 if (kind == slp_inst_kind_ctor)
3694 if (dump_enabled_p ())
3695 dump_printf_loc (MSG_NOTE, vect_location,
3696 "Analyzing vectorizable constructor: %G\n",
3697 root_stmt_infos[0]->stmt);
3700 if (dump_enabled_p ())
3702 dump_printf_loc (MSG_NOTE, vect_location,
3703 "Starting SLP discovery for\n");
3704 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3705 dump_printf_loc (MSG_NOTE, vect_location,
3706 " %G", scalar_stmts[i]->stmt);
3709 /* Build the tree for the SLP instance. */
3710 unsigned int group_size = scalar_stmts.length ();
3711 bool *matches = XALLOCAVEC (bool, group_size);
3712 poly_uint64 max_nunits = 1;
3713 unsigned tree_size = 0;
3714 unsigned i;
3716 slp_tree node = NULL;
3717 if (force_single_lane)
3719 matches[0] = true;
3720 matches[1] = false;
3722 else
3723 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3724 &max_nunits, matches, limit,
3725 &tree_size, bst_map);
3726 if (node != NULL)
3728 /* Calculate the unrolling factor based on the smallest type. */
3729 poly_uint64 unrolling_factor
3730 = calculate_unrolling_factor (max_nunits, group_size);
3732 if (maybe_ne (unrolling_factor, 1U)
3733 && is_a <bb_vec_info> (vinfo))
3735 unsigned HOST_WIDE_INT const_max_nunits;
3736 if (!max_nunits.is_constant (&const_max_nunits)
3737 || const_max_nunits > group_size)
3739 if (dump_enabled_p ())
3740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3741 "Build SLP failed: store group "
3742 "size not a multiple of the vector size "
3743 "in basic block SLP\n");
3744 vect_free_slp_tree (node);
3745 return false;
3747 /* Fatal mismatch. */
3748 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_NOTE, vect_location,
3750 "SLP discovery succeeded but node needs "
3751 "splitting\n");
3752 memset (matches, true, group_size);
3753 matches[group_size / const_max_nunits * const_max_nunits] = false;
3754 vect_free_slp_tree (node);
3756 else
3758 /* Create a new SLP instance. */
3759 slp_instance new_instance = XNEW (class _slp_instance);
3760 SLP_INSTANCE_TREE (new_instance) = node;
3761 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
3762 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3763 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3764 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3765 SLP_INSTANCE_KIND (new_instance) = kind;
3766 new_instance->reduc_phis = NULL;
3767 new_instance->cost_vec = vNULL;
3768 new_instance->subgraph_entries = vNULL;
3770 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 "SLP size %u vs. limit %u.\n",
3773 tree_size, max_tree_size);
3775 /* Fixup SLP reduction chains. */
3776 if (kind == slp_inst_kind_reduc_chain)
3778 /* If this is a reduction chain with a conversion in front
3779 amend the SLP tree with a node for that. */
3780 gimple *scalar_def
3781 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
3782 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
3784 /* Get at the conversion stmt - we know it's the single use
3785 of the last stmt of the reduction chain. */
3786 use_operand_p use_p;
3787 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
3788 &use_p, &scalar_def);
3789 gcc_assert (r);
3790 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
3791 next_info = vect_stmt_to_vectorize (next_info);
3792 scalar_stmts = vNULL;
3793 scalar_stmts.create (group_size);
3794 for (unsigned i = 0; i < group_size; ++i)
3795 scalar_stmts.quick_push (next_info);
3796 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
3797 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
3798 SLP_TREE_CHILDREN (conv).quick_push (node);
3799 SLP_INSTANCE_TREE (new_instance) = conv;
3800 /* We also have to fake this conversion stmt as SLP reduction
3801 group so we don't have to mess with too much code
3802 elsewhere. */
3803 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
3804 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
3806 /* Fill the backedge child of the PHI SLP node. The
3807 general matching code cannot find it because the
3808 scalar code does not reflect how we vectorize the
3809 reduction. */
3810 use_operand_p use_p;
3811 imm_use_iterator imm_iter;
3812 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
3813 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
3814 gimple_get_lhs (scalar_def))
3815 /* There are exactly two non-debug uses, the reduction
3816 PHI and the loop-closed PHI node. */
3817 if (!is_gimple_debug (USE_STMT (use_p))
3818 && gimple_bb (USE_STMT (use_p)) == loop->header)
3820 auto_vec<stmt_vec_info, 64> phis (group_size);
3821 stmt_vec_info phi_info
3822 = vinfo->lookup_stmt (USE_STMT (use_p));
3823 for (unsigned i = 0; i < group_size; ++i)
3824 phis.quick_push (phi_info);
3825 slp_tree *phi_node = bst_map->get (phis);
3826 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
3827 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
3828 = SLP_INSTANCE_TREE (new_instance);
3829 SLP_INSTANCE_TREE (new_instance)->refcnt++;
3833 vinfo->slp_instances.safe_push (new_instance);
3835 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
3836 the number of scalar stmts in the root in a few places.
3837 Verify that assumption holds. */
3838 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
3839 .length () == group_size);
3841 if (dump_enabled_p ())
3843 dump_printf_loc (MSG_NOTE, vect_location,
3844 "Final SLP tree for instance %p:\n",
3845 (void *) new_instance);
3846 vect_print_slp_graph (MSG_NOTE, vect_location,
3847 SLP_INSTANCE_TREE (new_instance));
3850 return true;
3853 /* Failed to SLP. */
3855 stmt_vec_info stmt_info = stmt_info_;
3856 /* Try to break the group up into pieces. */
3857 if (*limit > 0 && kind == slp_inst_kind_store)
3859 /* ??? We could delay all the actual splitting of store-groups
3860 until after SLP discovery of the original group completed.
3861 Then we can recurse to vect_build_slp_instance directly. */
3862 for (i = 0; i < group_size; i++)
3863 if (!matches[i])
3864 break;
3866 /* For basic block SLP, try to break the group up into multiples of
3867 a vector size. */
3868 if (is_a <bb_vec_info> (vinfo)
3869 && (i > 1 && i < group_size))
3871 /* Free the allocated memory. */
3872 scalar_stmts.release ();
3874 tree scalar_type
3875 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3876 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
3877 1 << floor_log2 (i));
3878 unsigned HOST_WIDE_INT const_nunits;
3879 if (vectype
3880 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
3882 /* Split into two groups at the first vector boundary. */
3883 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
3884 unsigned group1_size = i & ~(const_nunits - 1);
3886 if (dump_enabled_p ())
3887 dump_printf_loc (MSG_NOTE, vect_location,
3888 "Splitting SLP group at stmt %u\n", i);
3889 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
3890 group1_size);
3891 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
3892 kind, max_tree_size,
3893 limit);
3894 /* Split the rest at the failure point and possibly
3895 re-analyze the remaining matching part if it has
3896 at least two lanes. */
3897 if (group1_size < i
3898 && (i + 1 < group_size
3899 || i - group1_size > 1))
3901 stmt_vec_info rest2 = rest;
3902 rest = vect_split_slp_store_group (rest, i - group1_size);
3903 if (i - group1_size > 1)
3904 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
3905 kind, max_tree_size,
3906 limit);
3908 /* Re-analyze the non-matching tail if it has at least
3909 two lanes. */
3910 if (i + 1 < group_size)
3911 res |= vect_analyze_slp_instance (vinfo, bst_map,
3912 rest, kind, max_tree_size,
3913 limit);
3914 return res;
3918 /* For loop vectorization split the RHS into arbitrary pieces of
3919 size >= 1. */
3920 else if (is_a <loop_vec_info> (vinfo)
3921 && (group_size != 1 && i < group_size))
3923 /* There are targets that cannot do even/odd interleaving schemes
3924 so they absolutely need to use load/store-lanes. For now
3925 force single-lane SLP for them - they would be happy with
3926 uniform power-of-two lanes (but depending on element size),
3927 but even if we can use 'i' as indicator we would need to
3928 backtrack when later lanes fail to discover with the same
3929 granularity. We cannot turn strided or scatter stores
3930 into store-lanes. */
3931 /* ??? If this is not in sync with what get_load_store_type
3932 later decides the SLP representation is not good for other
3933 store vectorization methods. */
3934 bool want_store_lanes
3935 = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
3936 && ! STMT_VINFO_STRIDED_P (stmt_info)
3937 && compare_step_with_zero (vinfo, stmt_info) > 0
3938 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info,
3939 group_size, 1));
3940 if (want_store_lanes || force_single_lane)
3941 i = 1;
3943 /* A fatal discovery fail doesn't always mean single-lane SLP
3944 isn't a possibility, so try. */
3945 if (i == 0)
3946 i = 1;
3948 if (dump_enabled_p ())
3949 dump_printf_loc (MSG_NOTE, vect_location,
3950 "Splitting SLP group at stmt %u\n", i);
3952 /* Analyze the stored values and pinch them together with
3953 a permute node so we can preserve the whole store group. */
3954 auto_vec<slp_tree> rhs_nodes;
3956 /* Calculate the unrolling factor based on the smallest type. */
3957 poly_uint64 unrolling_factor = 1;
3959 unsigned int start = 0, end = i;
3960 while (start < group_size)
3962 gcc_assert (end - start >= 1);
3963 vec<stmt_vec_info> substmts;
3964 substmts.create (end - start);
3965 for (unsigned j = start; j < end; ++j)
3966 substmts.quick_push (scalar_stmts[j]);
3967 max_nunits = 1;
3968 node = vect_build_slp_tree (vinfo, substmts, end - start,
3969 &max_nunits,
3970 matches, limit, &tree_size, bst_map);
3971 if (node)
3973 /* ??? Possibly not safe, but not sure how to check
3974 and fail SLP build? */
3975 unrolling_factor
3976 = force_common_multiple (unrolling_factor,
3977 calculate_unrolling_factor
3978 (max_nunits, end - start));
3979 rhs_nodes.safe_push (node);
3980 start = end;
3981 if (want_store_lanes || force_single_lane)
3982 end = start + 1;
3983 else
3984 end = group_size;
3986 else
3988 substmts.release ();
3989 if (end - start == 1)
3991 /* Single-lane discovery failed. Free resources. */
3992 for (auto node : rhs_nodes)
3993 vect_free_slp_tree (node);
3994 scalar_stmts.release ();
3995 if (dump_enabled_p ())
3996 dump_printf_loc (MSG_NOTE, vect_location,
3997 "SLP discovery failed\n");
3998 return false;
4001 /* ??? It really happens that we soft-fail SLP
4002 build at a mismatch but the matching part hard-fails
4003 later. As we know we arrived here with a group
4004 larger than one try a group of size one! */
4005 if (!matches[0])
4006 end = start + 1;
4007 else
4008 for (unsigned j = start; j < end; j++)
4009 if (!matches[j - start])
4011 end = j;
4012 break;
4017 /* Now we assume we can build the root SLP node from all stores. */
4018 if (want_store_lanes)
4020 /* For store-lanes feed the store node with all RHS nodes
4021 in order. */
4022 node = vect_create_new_slp_node (scalar_stmts,
4023 SLP_TREE_CHILDREN
4024 (rhs_nodes[0]).length ());
4025 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4026 node->ldst_lanes = true;
4027 SLP_TREE_CHILDREN (node)
4028 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4029 + rhs_nodes.length () - 1);
4030 /* First store value and possibly mask. */
4031 SLP_TREE_CHILDREN (node)
4032 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4033 /* Rest of the store values. All mask nodes are the same,
4034 this should be guaranteed by dataref group discovery. */
4035 for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4036 SLP_TREE_CHILDREN (node)
4037 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4038 for (slp_tree child : SLP_TREE_CHILDREN (node))
4039 child->refcnt++;
4041 else
4042 node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts);
4044 while (!rhs_nodes.is_empty ())
4045 vect_free_slp_tree (rhs_nodes.pop ());
4047 /* Create a new SLP instance. */
4048 slp_instance new_instance = XNEW (class _slp_instance);
4049 SLP_INSTANCE_TREE (new_instance) = node;
4050 SLP_INSTANCE_UNROLLING_FACTOR (new_instance) = unrolling_factor;
4051 SLP_INSTANCE_LOADS (new_instance) = vNULL;
4052 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4053 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4054 SLP_INSTANCE_KIND (new_instance) = kind;
4055 new_instance->reduc_phis = NULL;
4056 new_instance->cost_vec = vNULL;
4057 new_instance->subgraph_entries = vNULL;
4059 if (dump_enabled_p ())
4060 dump_printf_loc (MSG_NOTE, vect_location,
4061 "SLP size %u vs. limit %u.\n",
4062 tree_size, max_tree_size);
4064 vinfo->slp_instances.safe_push (new_instance);
4066 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4067 the number of scalar stmts in the root in a few places.
4068 Verify that assumption holds. */
4069 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4070 .length () == group_size);
4072 if (dump_enabled_p ())
4074 dump_printf_loc (MSG_NOTE, vect_location,
4075 "Final SLP tree for instance %p:\n",
4076 (void *) new_instance);
4077 vect_print_slp_graph (MSG_NOTE, vect_location,
4078 SLP_INSTANCE_TREE (new_instance));
4080 return true;
4082 else
4083 /* Free the allocated memory. */
4084 scalar_stmts.release ();
4086 /* Even though the first vector did not all match, we might be able to SLP
4087 (some) of the remainder. FORNOW ignore this possibility. */
4089 else
4090 /* Free the allocated memory. */
4091 scalar_stmts.release ();
4093 /* Failed to SLP. */
4094 if (dump_enabled_p ())
4095 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4096 return false;
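/* A small standalone arithmetic sketch (hypothetical numbers) of the
   basic-block store-group splitting done above: when discovery matched only
   the first I stmts of a GROUP_SIZE store group and the chosen vector type
   has CONST_NUNITS lanes, the group is first split at the last vector
   boundary at or before the mismatch, and the remaining pieces are
   re-analyzed when they have at least two lanes.

     #include <cstdio>

     int main ()
     {
       unsigned group_size = 11, i = 6, const_nunits = 4;

       // Same computation as "i & ~(const_nunits - 1)" above.
       unsigned group1_size = i & ~(const_nunits - 1);

       std::printf ("group1: stmts [0, %u)\n", group1_size);
       std::printf ("group2: stmts [%u, %u), re-analyzed if >= 2 lanes\n",
                    group1_size, i);
       std::printf ("group3: stmts [%u, %u), the non-matching tail\n",
                    i, group_size);
       return 0;
     }  */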
4100 /* Analyze an SLP instance starting from a group of grouped stores. Call
4101 vect_build_slp_tree to build a tree of packed stmts if possible.
4102 Return FALSE if it's impossible to SLP any stmt in the loop. */
4104 static bool
4105 vect_analyze_slp_instance (vec_info *vinfo,
4106 scalar_stmts_to_slp_tree_map_t *bst_map,
4107 stmt_vec_info stmt_info,
4108 slp_instance_kind kind,
4109 unsigned max_tree_size, unsigned *limit,
4110 bool force_single_lane)
4112 vec<stmt_vec_info> scalar_stmts;
4114 if (is_a <bb_vec_info> (vinfo))
4115 vect_location = stmt_info->stmt;
4117 stmt_vec_info next_info = stmt_info;
4118 if (kind == slp_inst_kind_store)
4120 /* Collect the stores and store them in scalar_stmts. */
4121 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4122 while (next_info)
4124 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4125 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4128 else if (kind == slp_inst_kind_reduc_chain)
4130 /* Collect the reduction stmts and store them in scalar_stmts. */
4131 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4132 while (next_info)
4134 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4135 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4137 /* Mark the first element of the reduction chain as reduction to properly
4138 transform the node. In the reduction analysis phase only the last
4139 element of the chain is marked as reduction. */
4140 STMT_VINFO_DEF_TYPE (stmt_info)
4141 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4142 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4143 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4145 else
4146 gcc_unreachable ();
4148 vec<stmt_vec_info> roots = vNULL;
4149 vec<tree> remain = vNULL;
4150 /* Build the tree for the SLP instance. */
4151 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4152 roots, remain,
4153 max_tree_size, limit, bst_map,
4154 kind == slp_inst_kind_store
4155 ? stmt_info : NULL, force_single_lane);
4157 /* ??? If this is slp_inst_kind_store and the above succeeded here's
4158 where we should do store group splitting. */
4160 return res;
4163 /* qsort comparator ordering SLP load nodes. */
4165 static int
4166 vllp_cmp (const void *a_, const void *b_)
4168 const slp_tree a = *(const slp_tree *)a_;
4169 const slp_tree b = *(const slp_tree *)b_;
4170 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4171 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4172 if (STMT_VINFO_GROUPED_ACCESS (a0)
4173 && STMT_VINFO_GROUPED_ACCESS (b0)
4174 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4176 /* Same group, order after lanes used. */
4177 if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4178 return 1;
4179 else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4180 return -1;
4181 else
4183 /* Try to order loads using the same lanes together, breaking
4184 the tie with the lane number that first differs. */
4185 if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4186 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4187 return 0;
4188 else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4189 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4190 return 1;
4191 else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4192 && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4193 return -1;
4194 else
4196 for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4197 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4198 != SLP_TREE_LOAD_PERMUTATION (b)[i])
4200 /* In-order lane first, that's what the above case for
4201 no permutation does. */
4202 if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4203 return -1;
4204 else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4205 return 1;
4206 else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4207 < SLP_TREE_LOAD_PERMUTATION (b)[i])
4208 return -1;
4209 else
4210 return 1;
4212 return 0;
4216 else /* Different groups or non-groups. */
4218 /* Order groups as their first element to keep them together. */
4219 if (STMT_VINFO_GROUPED_ACCESS (a0))
4220 a0 = DR_GROUP_FIRST_ELEMENT (a0);
4221 if (STMT_VINFO_GROUPED_ACCESS (b0))
4222 b0 = DR_GROUP_FIRST_ELEMENT (b0);
4223 if (a0 == b0)
4224 return 0;
4225 /* Tie using UID. */
4226 else if (gimple_uid (STMT_VINFO_STMT (a0))
4227 < gimple_uid (STMT_VINFO_STMT (b0)))
4228 return -1;
4229 else
4231 gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4232 != gimple_uid (STMT_VINFO_STMT (b0)));
4233 return 1;
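/* A simplified standalone analog of the ordering vllp_cmp implements
   (hypothetical data; the per-group tie-breaking is reduced to a plain
   lexicographic comparison of the load permutations and std::sort is used
   instead of qsort): loads of the same dataref group end up adjacent, with
   the loads using more lanes first.

     #include <algorithm>
     #include <cstdio>
     #include <vector>

     struct load
     {
       unsigned group;              // uid of the group's first element
       std::vector<unsigned> perm;  // lanes used, in load order
     };

     static bool
     vllp_less (const load &a, const load &b)
     {
       if (a.group != b.group)
         return a.group < b.group;
       if (a.perm.size () != b.perm.size ())
         return a.perm.size () > b.perm.size ();
       return a.perm < b.perm;
     }

     int main ()
     {
       std::vector<load> loads = { { 42, { 2 } },
                                   { 17, { 0, 1 } },
                                   { 42, { 0, 1, 2, 3 } },
                                   { 42, { 0 } } };
       std::sort (loads.begin (), loads.end (), vllp_less);
       for (const load &l : loads)
         {
           std::printf ("group %u:", l.group);
           for (unsigned p : l.perm)
             std::printf (" %u", p);
           std::printf ("\n");
         }
       return 0;
     }  */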
4238 /* Process the set of LOADS that are all from the same dataref group. */
4240 static void
4241 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4242 scalar_stmts_to_slp_tree_map_t *bst_map,
4243 const array_slice<slp_tree> &loads)
4245 /* At this point we want to lower without a fixed VF or vector
4246 size in mind, which means we cannot actually compute whether we
4247 need three or more vectors for a load permutation yet. So always
4248 lower. */
4249 stmt_vec_info first
4250 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4251 unsigned group_lanes = DR_GROUP_SIZE (first);
4253 /* Verify if all load permutations can be implemented with a suitably
4254 large element load-lanes operation. */
4255 unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4256 if (STMT_VINFO_STRIDED_P (first)
4257 || compare_step_with_zero (loop_vinfo, first) <= 0
4258 || exact_log2 (ld_lanes_lanes) == -1
4259 /* ??? For now only support the single-lane case as there is
4260 missing support on the store-lane side and code generation
4261 isn't up to the task yet. */
4262 || ld_lanes_lanes != 1
4263 || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4264 group_lanes / ld_lanes_lanes,
4265 false) == IFN_LAST)
4266 ld_lanes_lanes = 0;
4267 else
4268 /* Verify the loads access the same number of lanes aligned to
4269 ld_lanes_lanes. */
4270 for (slp_tree load : loads)
4272 if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4274 ld_lanes_lanes = 0;
4275 break;
4277 unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4278 if (first % ld_lanes_lanes != 0)
4280 ld_lanes_lanes = 0;
4281 break;
4283 for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4284 if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4286 ld_lanes_lanes = 0;
4287 break;
4291 /* Only a power-of-two number of lanes matches interleaving with N levels.
4292 ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4293 at each step. */
4294 if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4295 return;
4297 for (slp_tree load : loads)
4299 /* Leave masked or gather loads alone for now. */
4300 if (!SLP_TREE_CHILDREN (load).is_empty ())
4301 continue;
4303 /* We want to pattern-match special cases here and keep those
4304 alone. Candidates are splats and load-lane. */
4306 /* We need to lower only loads of less than half of the group's
4307 lanes, including duplicate lanes. Note this leaves nodes
4308 with a non-1:1 load permutation around instead of canonicalizing
4309 those into a load and a permute node. Removing this early
4310 check would do such canonicalization. */
4311 if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4312 && ld_lanes_lanes == 0)
4313 continue;
4315 /* First build (and possibly re-use) a load node for the
4316 unpermuted group. Gaps in the middle and on the end are
4317 represented with NULL stmts. */
4318 vec<stmt_vec_info> stmts;
4319 stmts.create (group_lanes);
4320 for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4322 if (s != first)
4323 for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4324 stmts.quick_push (NULL);
4325 stmts.quick_push (s);
4327 for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4328 stmts.quick_push (NULL);
4329 poly_uint64 max_nunits = 1;
4330 bool *matches = XALLOCAVEC (bool, group_lanes);
4331 unsigned limit = 1;
4332 unsigned tree_size = 0;
4333 slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4334 group_lanes,
4335 &max_nunits, matches, &limit,
4336 &tree_size, bst_map);
4338 /* Build the permute to get the original load permutation order. */
4339 lane_permutation_t final_perm;
4340 final_perm.create (SLP_TREE_LANES (load));
4341 for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4342 final_perm.quick_push
4343 (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4345 if (ld_lanes_lanes != 0)
4347 /* ??? If this is not in sync with what get_load_store_type
4348 later decides the SLP representation is not good for other
4349 store vectorization methods. */
4350 l0->ldst_lanes = true;
4351 load->ldst_lanes = true;
4354 while (1)
4356 unsigned group_lanes = SLP_TREE_LANES (l0);
4357 if (ld_lanes_lanes != 0
4358 || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4359 break;
4361 /* Try to lower by reducing the group to half its size using an
4362 interleaving scheme. For this try to compute whether all
4363 elements needed for this load are in even or odd elements of
4364 an even/odd decomposition with N consecutive elements.
4365 Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4366 with N == 2. */
4367 /* ??? Only an even number of lanes can be handled this way, but the
4368 fallback below could work for any number. We have to make sure
4369 to round up in that case. */
4370 gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4371 unsigned even = 0, odd = 0;
4372 if ((group_lanes & 1) == 0)
4374 even = (1 << ceil_log2 (group_lanes)) - 1;
4375 odd = even;
4376 for (auto l : final_perm)
4378 even &= ~l.second;
4379 odd &= l.second;
4383 /* Now build an even or odd extraction from the unpermuted load. */
4384 lane_permutation_t perm;
4385 perm.create ((group_lanes + 1) / 2);
4386 unsigned level;
4387 if (even
4388 && ((level = 1 << ctz_hwi (even)), true)
4389 && group_lanes % (2 * level) == 0)
4391 /* { 0, 1, ... 4, 5 ..., } */
4392 unsigned level = 1 << ctz_hwi (even);
4393 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4394 for (unsigned j = 0; j < level; ++j)
4395 perm.quick_push (std::make_pair (0, 2 * i * level + j));
4397 else if (odd)
4399 /* { ..., 2, 3, ... 6, 7 } */
4400 unsigned level = 1 << ctz_hwi (odd);
4401 gcc_assert (group_lanes % (2 * level) == 0);
4402 for (unsigned i = 0; i < group_lanes / 2 / level; ++i)
4403 for (unsigned j = 0; j < level; ++j)
4404 perm.quick_push (std::make_pair (0, (2 * i + 1) * level + j));
4406 else
4408 /* As a fallback, extract all used lanes and fill to half the
4409 group size by repeating the last element.
4410 ??? This is quite a bad strategy for re-use - we could
4411 brute force our way to find more optimal filling lanes to
4412 maximize re-use when looking at all loads from the group. */
4413 auto_bitmap l;
4414 for (auto p : final_perm)
4415 bitmap_set_bit (l, p.second);
4416 unsigned i = 0;
4417 bitmap_iterator bi;
4418 EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4419 perm.quick_push (std::make_pair (0, i));
4420 while (perm.length () < (group_lanes + 1) / 2)
4421 perm.quick_push (perm.last ());
4424 /* Update final_perm with the intermediate permute. */
4425 for (unsigned i = 0; i < final_perm.length (); ++i)
4427 unsigned l = final_perm[i].second;
4428 unsigned j;
4429 for (j = 0; j < perm.length (); ++j)
4430 if (perm[j].second == l)
4432 final_perm[i].second = j;
4433 break;
4435 gcc_assert (j < perm.length ());
4438 /* And create scalar stmts. */
4439 vec<stmt_vec_info> perm_stmts;
4440 perm_stmts.create (perm.length ());
4441 for (unsigned i = 0; i < perm.length (); ++i)
4442 perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4444 slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4445 SLP_TREE_CHILDREN (p).quick_push (l0);
4446 SLP_TREE_LANE_PERMUTATION (p) = perm;
4447 SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4448 SLP_TREE_LANES (p) = perm.length ();
4449 SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4450 /* ??? As we have scalar stmts for this intermediate permute we
4451 could CSE it via bst_map but we do not want to pick up
4452 another SLP node with a load permutation. We instead should
4453 have a "local" CSE map here. */
4454 SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4456 /* We now have a node for (group_lanes + 1) / 2 lanes. */
4457 l0 = p;
4460 /* And finally from the ordered reduction node create the
4461 permute to shuffle the lanes into the original load-permutation
4462 order. We replace the original load node with this. */
4463 SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4464 SLP_TREE_LOAD_PERMUTATION (load).release ();
4465 SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4466 SLP_TREE_CHILDREN (load).create (1);
4467 SLP_TREE_CHILDREN (load).quick_push (l0);
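/* A worked standalone example (hypothetical lane numbers) of the even/odd
   classification used above.  For a dataref group of 8 lanes and a load
   that uses lanes { 0, 1, 4, 5 }, the masks come out as even == 2 and
   odd == 0: bit 1 is clear in every used lane index, so one even-extraction
   step with N == 2 consecutive elements (lanes { 0, 1, 4, 5 } of the
   unpermuted load) already halves the group.  The GCC helpers ceil_log2 and
   ctz_hwi are replaced by literal values here.

     #include <cstdio>

     int main ()
     {
       unsigned wanted[] = { 0, 1, 4, 5 };

       unsigned even = 7, odd = 7;    // (1 << ceil_log2 (8)) - 1
       for (unsigned l : wanted)
         {
           even &= ~l;
           odd &= l;
         }
       // even == 2, odd == 0; level = 1 << ctz (even) == 2 and
       // 8 % (2 * level) == 0, so the even extraction applies.
       std::printf ("even = %u, odd = %u\n", even, odd);
       return 0;
     }  */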
4471 /* Transform SLP loads in the SLP graph created by SLP discovery to
4472 group loads from the same group and lower load permutations that
4473 are unlikely to be supported into a series of permutes.
4474 In the degenerate case of having only single-lane SLP instances
4475 this should result in a series of permute nodes emulating an
4476 interleaving scheme. */
4478 static void
4479 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4480 scalar_stmts_to_slp_tree_map_t *bst_map)
4482 /* Gather and sort loads across all instances. */
4483 hash_set<slp_tree> visited;
4484 auto_vec<slp_tree> loads;
4485 for (auto inst : loop_vinfo->slp_instances)
4486 vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4487 if (loads.is_empty ())
4488 return;
4489 loads.qsort (vllp_cmp);
4491 /* Now process each dataref group separately. */
4492 unsigned firsti = 0;
4493 for (unsigned i = 1; i < loads.length (); ++i)
4495 slp_tree first = loads[firsti];
4496 slp_tree next = loads[i];
4497 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4498 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4499 if (STMT_VINFO_GROUPED_ACCESS (a0)
4500 && STMT_VINFO_GROUPED_ACCESS (b0)
4501 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4502 continue;
4503 /* Just one SLP load of a possible group, leave those alone. */
4504 if (i == firsti + 1)
4506 firsti = i;
4507 continue;
4509 /* Now we have multiple SLP loads of the same group from
4510 firsti to i - 1. */
4511 vect_lower_load_permutations (loop_vinfo, bst_map,
4512 make_array_slice (&loads[firsti],
4513 i - firsti));
4514 firsti = i;
4516 if (firsti < loads.length () - 1)
4517 vect_lower_load_permutations (loop_vinfo, bst_map,
4518 make_array_slice (&loads[firsti],
4519 loads.length () - firsti));
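/* The grouping done by the driver above amounts to processing maximal runs
   of equal dataref groups in the sorted vector.  A standalone sketch with
   made-up group ids (slightly simplified: the trailing run is handled
   inside the loop instead of after it):

     #include <cstdio>
     #include <vector>

     int main ()
     {
       std::vector<int> group = { 7, 7, 7, 9, 3, 3 };

       unsigned firsti = 0;
       for (unsigned i = 1; i <= group.size (); ++i)
         {
           if (i < group.size () && group[i] == group[firsti])
             continue;
           if (i - firsti > 1)
             std::printf ("lower loads [%u, %u) of group %d together\n",
                          firsti, i, group[firsti]);
           else
             std::printf ("single load %u of group %d, left alone\n",
                          firsti, group[firsti]);
           firsti = i;
         }
       return 0;
     }  */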
4522 /* Check if there are stmts in the loop that can be vectorized using SLP. Build SLP
4523 trees of packed scalar stmts if SLP is possible. */
4525 opt_result
4526 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size)
4528 unsigned int i;
4529 stmt_vec_info first_element;
4530 slp_instance instance;
4532 DUMP_VECT_SCOPE ("vect_analyze_slp");
4534 unsigned limit = max_tree_size;
4536 scalar_stmts_to_slp_tree_map_t *bst_map
4537 = new scalar_stmts_to_slp_tree_map_t ();
4539 /* Find SLP sequences starting from groups of grouped stores. */
4540 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4541 vect_analyze_slp_instance (vinfo, bst_map, first_element,
4542 slp_inst_kind_store, max_tree_size, &limit);
4544 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4546 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4548 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4549 /* Apply patterns. */
4550 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4551 bb_vinfo->roots[i].stmts[j]
4552 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4553 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4554 bb_vinfo->roots[i].stmts,
4555 bb_vinfo->roots[i].roots,
4556 bb_vinfo->roots[i].remain,
4557 max_tree_size, &limit, bst_map, NULL))
4559 bb_vinfo->roots[i].stmts = vNULL;
4560 bb_vinfo->roots[i].roots = vNULL;
4561 bb_vinfo->roots[i].remain = vNULL;
4566 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4568 /* Find SLP sequences starting from reduction chains. */
4569 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4570 if (! STMT_VINFO_RELEVANT_P (first_element)
4571 && ! STMT_VINFO_LIVE_P (first_element))
4573 else if (! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4574 slp_inst_kind_reduc_chain,
4575 max_tree_size, &limit))
4577 /* Dissolve reduction chain group. */
4578 stmt_vec_info vinfo = first_element;
4579 stmt_vec_info last = NULL;
4580 while (vinfo)
4582 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4583 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4584 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4585 last = vinfo;
4586 vinfo = next;
4588 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4589 /* It can still be vectorized as part of an SLP reduction. */
4590 loop_vinfo->reductions.safe_push (last);
4593 /* Find SLP sequences starting from groups of reductions. */
4594 if (loop_vinfo->reductions.length () > 0)
4596 /* Collect reduction statements we can combine into
4597 a SLP reduction. */
4598 vec<stmt_vec_info> scalar_stmts;
4599 scalar_stmts.create (loop_vinfo->reductions.length ());
4600 for (auto next_info : loop_vinfo->reductions)
4602 next_info = vect_stmt_to_vectorize (next_info);
4603 if ((STMT_VINFO_RELEVANT_P (next_info)
4604 || STMT_VINFO_LIVE_P (next_info))
4605 /* ??? Make sure we didn't skip a conversion around a
4606 reduction path. In that case we'd have to reverse
4607 engineer that conversion stmt following the chain using
4608 reduc_idx and from the PHI using reduc_def. */
4609 && STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def)
4611 /* Do not discover SLP reductions combining lane-reducing
4612 ops, that will fail later. */
4613 if (!lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4614 scalar_stmts.quick_push (next_info);
4615 else
4617 /* Do SLP discovery for single-lane reductions. */
4618 vec<stmt_vec_info> stmts;
4619 vec<stmt_vec_info> roots = vNULL;
4620 vec<tree> remain = vNULL;
4621 stmts.create (1);
4622 stmts.quick_push (next_info);
4623 vect_build_slp_instance (vinfo,
4624 slp_inst_kind_reduc_group,
4625 stmts, roots, remain,
4626 max_tree_size, &limit,
4627 bst_map, NULL);
4631 /* Save for re-processing on failure. */
4632 vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4633 vec<stmt_vec_info> roots = vNULL;
4634 vec<tree> remain = vNULL;
4635 if (scalar_stmts.length () <= 1
4636 || !vect_build_slp_instance (loop_vinfo,
4637 slp_inst_kind_reduc_group,
4638 scalar_stmts, roots, remain,
4639 max_tree_size, &limit, bst_map,
4640 NULL))
4642 if (scalar_stmts.length () <= 1)
4643 scalar_stmts.release ();
4644 /* Do SLP discovery for single-lane reductions. */
4645 for (auto stmt_info : saved_stmts)
4647 vec<stmt_vec_info> stmts;
4648 vec<stmt_vec_info> roots = vNULL;
4649 vec<tree> remain = vNULL;
4650 stmts.create (1);
4651 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4652 vect_build_slp_instance (vinfo,
4653 slp_inst_kind_reduc_group,
4654 stmts, roots, remain,
4655 max_tree_size, &limit,
4656 bst_map, NULL);
4658 saved_stmts.release ();
4663 hash_set<slp_tree> visited_patterns;
4664 slp_tree_to_load_perm_map_t perm_cache;
4665 slp_compat_nodes_map_t compat_cache;
4667 /* See if any patterns can be found in the SLP tree. */
4668 bool pattern_found = false;
4669 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4670 pattern_found |= vect_match_slp_patterns (instance, vinfo,
4671 &visited_patterns, &perm_cache,
4672 &compat_cache);
4674 /* If any were found optimize permutations of loads. */
4675 if (pattern_found)
4677 hash_map<slp_tree, slp_tree> load_map;
4678 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4680 slp_tree root = SLP_INSTANCE_TREE (instance);
4681 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
4682 &load_map, root);
4686 /* Check whether we should force some SLP instances to use load/store-lanes
4687 and do so by forcing SLP re-discovery with single lanes. We used
4688 to cancel SLP when this applied to all instances in a loop but now
4689 we decide this per SLP instance. It's important to do this only
4690 after SLP pattern recognition. */
4691 if (is_a <loop_vec_info> (vinfo))
4692 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4693 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
4694 && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
4696 slp_tree slp_root = SLP_INSTANCE_TREE (instance);
4697 int group_size = SLP_TREE_LANES (slp_root);
4698 tree vectype = SLP_TREE_VECTYPE (slp_root);
4700 auto_vec<slp_tree> loads;
4701 hash_set<slp_tree> visited;
4702 vect_gather_slp_loads (loads, slp_root, visited);
4704 /* Check whether any load in the SLP instance is possibly
4705 permuted. */
4706 bool loads_permuted = false;
4707 slp_tree load_node;
4708 unsigned j;
4709 FOR_EACH_VEC_ELT (loads, j, load_node)
4711 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
4712 continue;
4713 unsigned k;
4714 stmt_vec_info load_info;
4715 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
4716 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
4718 loads_permuted = true;
4719 break;
4723 gimple *rep = STMT_VINFO_STMT (SLP_TREE_REPRESENTATIVE (slp_root));
4724 bool masked = (is_gimple_call (rep)
4725 && gimple_call_internal_p (rep)
4726 && internal_fn_mask_index
4727 (gimple_call_internal_fn (rep)) != -1);
4728 /* If the loads and stores can use load/store-lanes force re-discovery
4729 with single lanes. */
4730 if (loads_permuted
4731 && !slp_root->ldst_lanes
4732 && vect_store_lanes_supported (vectype, group_size, masked)
4733 != IFN_LAST)
4735 bool can_use_lanes = true;
4736 FOR_EACH_VEC_ELT (loads, j, load_node)
4737 if (STMT_VINFO_GROUPED_ACCESS
4738 (SLP_TREE_REPRESENTATIVE (load_node)))
4740 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
4741 (SLP_TREE_REPRESENTATIVE (load_node));
4742 rep = STMT_VINFO_STMT (stmt_vinfo);
4743 masked = (is_gimple_call (rep)
4744 && gimple_call_internal_p (rep)
4745 && internal_fn_mask_index
4746 (gimple_call_internal_fn (rep)));
4747 /* Use SLP for strided accesses (or if we can't
4748 load-lanes). */
4749 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
4750 || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
4751 || vect_load_lanes_supported
4752 (STMT_VINFO_VECTYPE (stmt_vinfo),
4753 DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
4754 /* ??? During SLP re-discovery with a single lane
4755 a masked grouped load will appear permuted and
4756 discovery will fail. We have to rework this
4757 on the discovery side - for now avoid ICEing. */
4758 || masked)
4760 can_use_lanes = false;
4761 break;
4765 if (can_use_lanes)
4767 if (dump_enabled_p ())
4768 dump_printf_loc (MSG_NOTE, vect_location,
4769 "SLP instance %p can use load/store-lanes,"
4770 " re-discovering with single-lanes\n",
4771 (void *) instance);
4773 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
4775 vect_free_slp_instance (instance);
4776 limit = max_tree_size;
4777 bool res = vect_analyze_slp_instance (vinfo, bst_map,
4778 stmt_info,
4779 slp_inst_kind_store,
4780 max_tree_size, &limit,
4781 true);
4782 gcc_assert (res);
4783 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
4784 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
4789 /* When we end up with load permutations that we cannot possibly handle,
4790 like those requiring three vector inputs, lower them using
4791 interleaving-like schemes. */
4792 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4794 vect_lower_load_permutations (loop_vinfo, bst_map);
4795 if (dump_enabled_p ())
4797 dump_printf_loc (MSG_NOTE, vect_location,
4798 "SLP graph after lowering permutations:\n");
4799 hash_set<slp_tree> visited;
4800 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4801 vect_print_slp_graph (MSG_NOTE, vect_location,
4802 SLP_INSTANCE_TREE (instance), visited);
4806 release_scalar_stmts_to_slp_tree_map (bst_map);
4808 if (pattern_found && dump_enabled_p ())
4810 dump_printf_loc (MSG_NOTE, vect_location,
4811 "Pattern matched SLP tree\n");
4812 hash_set<slp_tree> visited;
4813 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
4814 vect_print_slp_graph (MSG_NOTE, vect_location,
4815 SLP_INSTANCE_TREE (instance), visited);
4818 return opt_result::success ();
4821 /* Estimates the cost of inserting layout changes into the SLP graph.
4822 It can also say that the insertion is impossible. */
4824 struct slpg_layout_cost
4826 slpg_layout_cost () = default;
4827 slpg_layout_cost (sreal, bool);
4829 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
4830 bool is_possible () const { return depth != sreal::max (); }
4832 bool operator== (const slpg_layout_cost &) const;
4833 bool operator!= (const slpg_layout_cost &) const;
4835 bool is_better_than (const slpg_layout_cost &, bool) const;
4837 void add_parallel_cost (const slpg_layout_cost &);
4838 void add_serial_cost (const slpg_layout_cost &);
4839 void split (unsigned int);
4841 /* The longest sequence of layout changes needed during any traversal
4842 of the partition dag, weighted by execution frequency.
4844 This is the most important metric when optimizing for speed, since
4845 it helps to ensure that we keep the number of operations on
4846 critical paths to a minimum. */
4847 sreal depth = 0;
4849 /* An estimate of the total number of operations needed. It is weighted by
4850 execution frequency when optimizing for speed but not when optimizing for
4851 size. In order to avoid double-counting, a node with a fanout of N will
4852 distribute 1/N of its total cost to each successor.
4854 This is the most important metric when optimizing for size, since
4855 it helps to keep the total number of operations to a minimum. */
4856 sreal total = 0;
4859 /* Construct costs for a node with weight WEIGHT. A higher weight
4860 indicates more frequent execution. IS_FOR_SIZE is true if we are
4861 optimizing for size rather than speed. */
4863 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
4864 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
4868 bool
4869 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
4871 return depth == other.depth && total == other.total;
4874 bool
4875 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
4877 return !operator== (other);
4880 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
4881 true if we are optimizing for size rather than speed. */
4883 bool
4884 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
4885 bool is_for_size) const
4887 if (is_for_size)
4889 if (total != other.total)
4890 return total < other.total;
4891 return depth < other.depth;
4893 else
4895 if (depth != other.depth)
4896 return depth < other.depth;
4897 return total < other.total;
4901 /* Increase the costs to account for something with cost INPUT_COST
4902 happening in parallel with the current costs. */
4904 void
4905 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
4907 depth = std::max (depth, input_cost.depth);
4908 total += input_cost.total;
4911 /* Increase the costs to account for something with cost INPUT_COST
4912 happening in series with the current costs. */
4914 void
4915 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
4917 depth += other.depth;
4918 total += other.total;
4921 /* Split the total cost among TIMES successors or predecessors. */
4923 void
4924 slpg_layout_cost::split (unsigned int times)
4926 if (times > 1)
4927 total /= times;
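/* To make the depth/total bookkeeping concrete, a standalone stand-in that
   uses double instead of sreal (the arithmetic mirrors the member functions
   above; the numbers are made up):

     #include <algorithm>
     #include <cstdio>

     struct cost
     {
       double depth, total;
       void add_parallel (const cost &c)
       {
         depth = std::max (depth, c.depth);
         total += c.total;
       }
       void add_serial (const cost &c)
       {
         depth += c.depth;
         total += c.total;
       }
       void split (unsigned times)
       {
         if (times > 1)
           total /= times;
       }
     };

     int main ()
     {
       cost in1 = { 2, 1 }, in2 = { 3, 1 }, own = { 1, 1 };
       cost acc = in1;
       acc.add_parallel (in2);   // depth = max (2, 3) = 3, total = 2
       acc.add_serial (own);     // depth = 4, total = 3
       acc.split (2);            // total = 1.5, depth unchanged
       std::printf ("depth %g total %g\n", acc.depth, acc.total);
       return 0;
     }  */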
4930 /* Information about one node in the SLP graph, for use during
4931 vect_optimize_slp_pass. */
4933 struct slpg_vertex
4935 slpg_vertex (slp_tree node_) : node (node_) {}
4937 /* The node itself. */
4938 slp_tree node;
4940 /* Which partition the node belongs to, or -1 if none. Nodes outside of
4941 partitions are flexible; they can have whichever layout consumers
4942 want them to have. */
4943 int partition = -1;
4945 /* The number of nodes that directly use the result of this one
4946 (i.e. the number of nodes that count this one as a child). */
4947 unsigned int out_degree = 0;
4949 /* The execution frequency of the node. */
4950 sreal weight = 0;
4952 /* The total execution frequency of all nodes that directly use the
4953 result of this one. */
4954 sreal out_weight = 0;
4957 /* Information about one partition of the SLP graph, for use during
4958 vect_optimize_slp_pass. */
4960 struct slpg_partition_info
4962 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
4963 of m_partitioned_nodes. */
4964 unsigned int node_begin = 0;
4965 unsigned int node_end = 0;
4967 /* Which layout we've chosen to use for this partition, or -1 if
4968 we haven't picked one yet. */
4969 int layout = -1;
4971 /* The number of predecessors and successors in the partition dag.
4972 The predecessors always have lower partition numbers and the
4973 successors always have higher partition numbers.
4975 Note that the directions of these edges are not necessarily the
4976 same as in the data flow graph. For example, if an SCC has separate
4977 partitions for an inner loop and an outer loop, the inner loop's
4978 partition will have at least two incoming edges from the outer loop's
4979 partition: one for a live-in value and one for a live-out value.
4980 In data flow terms, one of these edges would also be from the outer loop
4981 to the inner loop, but the other would be in the opposite direction. */
4982 unsigned int in_degree = 0;
4983 unsigned int out_degree = 0;
4986 /* Information about the costs of using a particular layout for a
4987 particular partition. It can also say that the combination is
4988 impossible. */
4990 struct slpg_partition_layout_costs
4992 bool is_possible () const { return internal_cost.is_possible (); }
4993 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
4995 /* The costs inherited from predecessor partitions. */
4996 slpg_layout_cost in_cost;
4998 /* The inherent cost of the layout within the node itself. For example,
4999 this is nonzero for a load if choosing a particular layout would require
5000 the load to permute the loaded elements. It is nonzero for a
5001 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5002 to full-vector moves. */
5003 slpg_layout_cost internal_cost;
5005 /* The costs inherited from successor partitions. */
5006 slpg_layout_cost out_cost;
5009 /* This class tries to optimize the layout of vectors in order to avoid
5010 unnecessary shuffling. At the moment, the set of possible layouts are
5011 restricted to bijective permutations.
5013 The goal of the pass depends on whether we're optimizing for size or
5014 for speed. When optimizing for size, the goal is to reduce the overall
5015 number of layout changes (including layout changes implied by things
5016 like load permutations). When optimizing for speed, the goal is to
5017 reduce the maximum latency attributable to layout changes on any
5018 non-cyclical path through the data flow graph.
5020 For example, when optimizing a loop nest for speed, we will prefer
5021 to make layout changes outside of a loop rather than inside of a loop,
5022 and will prefer to make layout changes in parallel rather than serially,
5023 even if that increases the overall number of layout changes.
5025 The high-level procedure is:
5027 (1) Build a graph in which edges go from uses (parents) to definitions
5028 (children).
5030 (2) Divide the graph into a dag of strongly-connected components (SCCs).
5032 (3) When optimizing for speed, partition the nodes in each SCC based
5033 on their containing cfg loop. When optimizing for size, treat
5034 each SCC as a single partition.
5036 This gives us a dag of partitions. The goal is now to assign a
5037 layout to each partition.
5039 (4) Construct a set of vector layouts that are worth considering.
5040 Record which nodes must keep their current layout.
5042 (5) Perform a forward walk over the partition dag (from loads to stores)
5043 accumulating the "forward" cost of using each layout. When visiting
5044 each partition, assign a tentative choice of layout to the partition
5045 and use that choice when calculating the cost of using a different
5046 layout in successor partitions.
5048 (6) Perform a backward walk over the partition dag (from stores to loads),
5049 accumulating the "backward" cost of using each layout. When visiting
5050 each partition, make a final choice of layout for that partition based
5051 on the accumulated forward costs (from (5)) and backward costs
5052 (from (6)).
5054 (7) Apply the chosen layouts to the SLP graph.
5056 For example, consider the SLP statements:
5058 S1: a_1 = load
5059 loop:
5060 S2: a_2 = PHI<a_1, a_3>
5061 S3: b_1 = load
5062 S4: a_3 = a_2 + b_1
5063 exit:
5064 S5: a_4 = PHI<a_3>
5065 S6: store a_4
5067 S2 and S4 form an SCC and are part of the same loop. Every other
5068 statement is in a singleton SCC. In this example there is a one-to-one
5069 mapping between SCCs and partitions and the partition dag looks like this;
        S1     S3
         \     /
          S2+S4
            |
           S5
            |
           S6
5079 S2, S3 and S4 will have a higher execution frequency than the other
5080 statements, so when optimizing for speed, the goal is to avoid any
5081 layout changes:
5083 - within S3
5084 - within S2+S4
5085 - on the S3->S2+S4 edge
5087 For example, if S3 was originally a reversing load, the goal of the
5088 pass is to make it an unreversed load and change the layout on the
5089 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5090 on S1->S2+S4 and S5->S6 would also be acceptable.)
5092 The difference between SCCs and partitions becomes important if we
5093 add an outer loop:
5095 S1: a_1 = ...
5096 loop1:
5097 S2: a_2 = PHI<a_1, a_6>
5098 S3: b_1 = load
5099 S4: a_3 = a_2 + b_1
5100 loop2:
5101 S5: a_4 = PHI<a_3, a_5>
5102 S6: c_1 = load
5103 S7: a_5 = a_4 + c_1
5104 exit2:
5105 S8: a_6 = PHI<a_5>
5106 S9: store a_6
5107 exit1:
5109 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5110 for speed, we usually do not want restrictions in the outer loop to "infect"
5111 the decision for the inner loop. For example, if an outer-loop node
5112 in the SCC contains a statement with a fixed layout, that should not
5113 prevent the inner loop from using a different layout. Conversely,
5114 the inner loop should not dictate a layout to the outer loop: if the
5115 outer loop does a lot of computation, then it may not be efficient to
5116 do all of that computation in the inner loop's preferred layout.
5118 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5119 and S5+S7 (inner). We also try to arrange partitions so that:
5121 - the partition for an outer loop comes before the partition for
5122 an inner loop
5124 - if a sibling loop A dominates a sibling loop B, A's partition
5125 comes before B's
5127 This gives the following partition dag for the example above:
        S1       S3
         \       /
          S2+S4+S8    S6
           |   \\     /
           |    S5+S7
           |
          S9
5137 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5138 one for a reversal of the edge S7->S8.
5140 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5141 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5142 preferred layout against the cost of changing the layout on entry to the
5143 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5145 Although this works well when optimizing for speed, it has the downside
5146 when optimizing for size that the choice of layout for S5+S7 is completely
5147 independent of S9, which lessens the chance of reducing the overall number
5148 of permutations. We therefore do not partition SCCs when optimizing
5149 for size.
5151 To give a concrete example of the difference between optimizing
5152 for size and speed, consider:
5154 a[0] = (b[1] << c[3]) - d[1];
5155 a[1] = (b[0] << c[2]) - d[0];
5156 a[2] = (b[3] << c[1]) - d[3];
5157 a[3] = (b[2] << c[0]) - d[2];
5159 There are three different layouts here: one for a, one for b and d,
5160 and one for c. When optimizing for speed it is better to permute each
5161 of b, c and d into the order required by a, since those permutations
5162 happen in parallel. But when optimizing for size, it is better to:
5164 - permute c into the same order as b
5165 - do the arithmetic
5166 - permute the result into the order required by a
5168 This gives 2 permutations rather than 3. */
5170 class vect_optimize_slp_pass
5172 public:
5173 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5174 void run ();
5176 private:
5177 /* Graph building. */
5178 struct loop *containing_loop (slp_tree);
5179 bool is_cfg_latch_edge (graph_edge *);
5180 void build_vertices (hash_set<slp_tree> &, slp_tree);
5181 void build_vertices ();
5182 void build_graph ();
5184 /* Partitioning. */
5185 void create_partitions ();
5186 template<typename T> void for_each_partition_edge (unsigned int, T);
5188 /* Layout selection. */
5189 bool is_compatible_layout (slp_tree, unsigned int);
5190 int change_layout_cost (slp_tree, unsigned int, unsigned int);
5191 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5192 unsigned int);
5193 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5194 int, unsigned int);
5195 int internal_node_cost (slp_tree, int, unsigned int);
5196 void start_choosing_layouts ();
5198 /* Cost propagation. */
5199 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5200 unsigned int, unsigned int);
5201 slpg_layout_cost total_in_cost (unsigned int);
5202 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5203 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5204 void forward_pass ();
5205 void backward_pass ();
5207 /* Rematerialization. */
5208 slp_tree get_result_with_layout (slp_tree, unsigned int);
5209 void materialize ();
5211 /* Clean-up. */
5212 void remove_redundant_permutations ();
5214 void dump ();
5216 vec_info *m_vinfo;
5218 /* True if we should optimize the graph for size, false if we should
5219 optimize it for speed. (It wouldn't be easy to make this decision
5220 more locally.) */
5221 bool m_optimize_size;
5223 /* A graph of all SLP nodes, with edges leading from uses to definitions.
5224 In other words, a node's predecessors are its slp_tree parents and
5225 a node's successors are its slp_tree children. */
5226 graph *m_slpg = nullptr;
5228 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5229 auto_vec<slpg_vertex> m_vertices;
5231 /* The list of all leaves of M_SLPG, such as external definitions, constants,
5232 and loads. */
5233 auto_vec<int> m_leafs;
5235 /* This array has one entry for every vector layout that we're considering.
5236 Element 0 is null and indicates "no change". Other entries describe
5237 permutations that are inherent in the current graph and that we would
5238 like to reverse if possible.
5240 For example, a permutation { 1, 2, 3, 0 } means that something has
5241 effectively been permuted in that way, such as a load group
5242 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5243 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5244 in order to put things "back" in order. */
5245 auto_vec<vec<unsigned> > m_perms;
5247 /* A partitioning of the nodes for which a layout must be chosen.
5248 Each partition represents an <SCC, cfg loop> pair; that is,
5249 nodes in different SCCs belong to different partitions, and nodes
5250 within an SCC can be further partitioned according to a containing
5251 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5253 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5254 from leaves (such as loads) to roots (such as stores).
5256 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5257 auto_vec<slpg_partition_info> m_partitions;
5259 /* The list of all nodes for which a layout must be chosen. Nodes for
5260 partition P come before the nodes for partition P+1. Nodes within a
5261 partition are in reverse postorder. */
5262 auto_vec<unsigned int> m_partitioned_nodes;
5264 /* Index P * num-layouts + L contains the cost of using layout L
5265 for partition P. */
5266 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5268 /* Index N * num-layouts + L, if nonnull, is a node that provides the
5269 original output of node N adjusted to have layout L. */
5270 auto_vec<slp_tree> m_node_layouts;
5273 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
5274 Also record whether we should optimize anything for speed rather
5275 than size. */
5277 void
5278 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5279 slp_tree node)
5281 unsigned i;
5282 slp_tree child;
5284 if (visited.add (node))
5285 return;
5287 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5289 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5290 if (optimize_bb_for_speed_p (bb))
5291 m_optimize_size = false;
5294 node->vertex = m_vertices.length ();
5295 m_vertices.safe_push (slpg_vertex (node));
5297 bool leaf = true;
5298 bool force_leaf = false;
5299 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5300 if (child)
5302 leaf = false;
5303 build_vertices (visited, child);
5305 else
5306 force_leaf = true;
5307 /* Since SLP discovery works along use-def edges all cycles have an
5308 entry - but there's the exception of cycles where we do not handle
5309 the entry explicitly (but with a NULL SLP node), like some reductions
5310 and inductions. Force those SLP PHIs to act as leafs to make them
5311 backwards reachable. */
5312 if (leaf || force_leaf)
5313 m_leafs.safe_push (node->vertex);
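/* A standalone sketch of the leaf collection above on a toy graph (made-up
   adjacency; -1 models a NULL SLP child): a node with no children is a
   leaf, and a node with a NULL child is forced to act as a leaf even though
   it has other children.

     #include <cstdio>
     #include <vector>

     struct toy_node { std::vector<int> children; };

     static void
     collect_leaves (const std::vector<toy_node> &g, int n,
                     std::vector<bool> &visited, std::vector<int> &leaves)
     {
       if (visited[n])
         return;
       visited[n] = true;
       bool leaf = true, force_leaf = false;
       for (int c : g[n].children)
         if (c >= 0)
           {
             leaf = false;
             collect_leaves (g, c, visited, leaves);
           }
         else
           force_leaf = true;
       if (leaf || force_leaf)
         leaves.push_back (n);
     }

     int main ()
     {
       // Node 0 uses nodes 1 and 2; node 2 has a real child and a NULL
       // backedge child, so both 1 and 2 are recorded as leaves.
       std::vector<toy_node> g = { { { 1, 2 } }, { { } }, { { -1, 1 } } };
       std::vector<bool> visited (g.size ());
       std::vector<int> leaves;
       collect_leaves (g, 0, visited, leaves);
       for (int l : leaves)
         std::printf ("leaf %d\n", l);
       return 0;
     }  */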
5316 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
5318 void
5319 vect_optimize_slp_pass::build_vertices ()
5321 hash_set<slp_tree> visited;
5322 unsigned i;
5323 slp_instance instance;
5324 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5325 build_vertices (visited, SLP_INSTANCE_TREE (instance));
5328 /* Apply (reverse) bijective PERM to VEC. */
5330 template <class T>
5331 static void
5332 vect_slp_permute (vec<unsigned> perm,
5333 vec<T> &vec, bool reverse)
5335 auto_vec<T, 64> saved;
5336 saved.create (vec.length ());
5337 for (unsigned i = 0; i < vec.length (); ++i)
5338 saved.quick_push (vec[i]);
5340 if (reverse)
5342 for (unsigned i = 0; i < vec.length (); ++i)
5343 vec[perm[i]] = saved[i];
5344 for (unsigned i = 0; i < vec.length (); ++i)
5345 gcc_assert (vec[perm[i]] == saved[i]);
5347 else
5349 for (unsigned i = 0; i < vec.length (); ++i)
5350 vec[i] = saved[perm[i]];
5351 for (unsigned i = 0; i < vec.length (); ++i)
5352 gcc_assert (vec[i] == saved[perm[i]]);
5356 /* Return the cfg loop that contains NODE. */
5358 struct loop *
5359 vect_optimize_slp_pass::containing_loop (slp_tree node)
5361 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5362 if (!rep)
5363 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5364 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5367 /* Return true if UD (an edge from a use to a definition) is associated
5368 with a loop latch edge in the cfg. */
5370 bool
5371 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5373 slp_tree use = m_vertices[ud->src].node;
5374 slp_tree def = m_vertices[ud->dest].node;
5375 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5376 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5377 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5378 return false;
5380 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5381 return (is_a<gphi *> (use_rep->stmt)
5382 && bb_loop_header_p (gimple_bb (use_rep->stmt))
5383 && containing_loop (def) == containing_loop (use));
5386 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5387 a nonnull data field. */
5389 void
5390 vect_optimize_slp_pass::build_graph ()
5392 m_optimize_size = true;
5393 build_vertices ();
5395 m_slpg = new_graph (m_vertices.length ());
5396 for (slpg_vertex &v : m_vertices)
5397 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5398 if (child)
5400 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5401 if (is_cfg_latch_edge (ud))
5402 ud->data = this;
5406 /* Return true if E corresponds to a loop latch edge in the cfg. */
5408 static bool
5409 skip_cfg_latch_edges (graph_edge *e)
5411 return e->data;
5414 /* Create the node partitions. */
5416 void
5417 vect_optimize_slp_pass::create_partitions ()
5419 /* Calculate a postorder of the graph, ignoring edges that correspond
5420 to natural latch edges in the cfg. Reading the vector from the end
5421 to the beginning gives the reverse postorder. */
5422 auto_vec<int> initial_rpo;
5423 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5424 false, NULL, skip_cfg_latch_edges);
5425 gcc_assert (initial_rpo.length () == m_vertices.length ());
5427 /* Calculate the strongly connected components of the graph. */
5428 auto_vec<int> scc_grouping;
5429 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5431 /* Create a new index order in which all nodes from the same SCC are
5432 consecutive. Use scc_pos to record the index of the first node in
5433 each SCC. */
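/* For example (sizes invented): with three SCCs containing 2, 1 and 3
nodes respectively, scc_pos is built as { 0, 2, 3 }; the distribution
loop below then bumps each entry past its SCC's nodes, leaving
{ 2, 3, 6 }, i.e. one past the last node of each SCC.  */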
5434 auto_vec<unsigned int> scc_pos (num_sccs);
5435 int last_component = -1;
5436 unsigned int node_count = 0;
5437 for (unsigned int node_i : scc_grouping)
5439 if (last_component != m_slpg->vertices[node_i].component)
5441 last_component = m_slpg->vertices[node_i].component;
5442 gcc_assert (last_component == int (scc_pos.length ()));
5443 scc_pos.quick_push (node_count);
5445 node_count += 1;
5447 gcc_assert (node_count == initial_rpo.length ()
5448 && last_component + 1 == int (num_sccs));
5450 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5451 inside each SCC following the RPO we calculated above. The fact that
5452 we ignored natural latch edges when calculating the RPO should ensure
5453 that, for natural loop nests:
5455 - the first node that we encounter in a cfg loop is the loop header phi
5456 - the loop header phis are in dominance order
5458 Arranging for this is an optimization (see below) rather than a
5459 correctness issue. Unnatural loops with a tangled mess of backedges
5460 will still work correctly, but might give poorer results.
5462 Also update scc_pos so that it gives 1 + the index of the last node
5463 in the SCC. */
5464 m_partitioned_nodes.safe_grow (node_count);
5465 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5467 unsigned int node_i = initial_rpo[old_i];
5468 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5469 m_partitioned_nodes[new_i] = node_i;
5472 /* When optimizing for speed, partition each SCC based on the containing
5473 cfg loop. The order we constructed above should ensure that, for natural
5474 cfg loops, we'll create sub-SCC partitions for outer loops before
5475 the corresponding sub-SCC partitions for inner loops. Similarly,
5476 when one sibling loop A dominates another sibling loop B, we should
5477 create a sub-SCC partition for A before a sub-SCC partition for B.
5479 As above, nothing depends for correctness on whether this achieves
5480 a natural nesting, but we should get better results when it does. */
5481 m_partitions.reserve (m_vertices.length ());
5482 unsigned int next_partition_i = 0;
5483 hash_map<struct loop *, int> loop_partitions;
5484 unsigned int rpo_begin = 0;
5485 unsigned int num_partitioned_nodes = 0;
5486 for (unsigned int rpo_end : scc_pos)
5488 loop_partitions.empty ();
5489 unsigned int partition_i = next_partition_i;
5490 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5492 /* Handle externals and constants optimistically throughout.
5493 But treat existing vectors as fixed since we do not handle
5494 permuting them. */
5495 unsigned int node_i = m_partitioned_nodes[rpo_i];
5496 auto &vertex = m_vertices[node_i];
5497 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5498 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5499 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5500 vertex.partition = -1;
5501 else
5503 bool existed;
5504 if (m_optimize_size)
5505 existed = next_partition_i > partition_i;
5506 else
5508 struct loop *loop = containing_loop (vertex.node);
5509 auto &entry = loop_partitions.get_or_insert (loop, &existed);
5510 if (!existed)
5511 entry = next_partition_i;
5512 partition_i = entry;
5514 if (!existed)
5516 m_partitions.quick_push (slpg_partition_info ());
5517 next_partition_i += 1;
5519 vertex.partition = partition_i;
5520 num_partitioned_nodes += 1;
5521 m_partitions[partition_i].node_end += 1;
5524 rpo_begin = rpo_end;
5527 /* Assign ranges of consecutive node indices to each partition,
5528 in partition order. Start with node_end being the same as
5529 node_begin so that the next loop can use it as a counter. */
5530 unsigned int node_begin = 0;
5531 for (auto &partition : m_partitions)
5533 partition.node_begin = node_begin;
5534 node_begin += partition.node_end;
5535 partition.node_end = partition.node_begin;
5537 gcc_assert (node_begin == num_partitioned_nodes);
5539 /* Finally build the list of nodes in partition order. */
5540 m_partitioned_nodes.truncate (num_partitioned_nodes);
5541 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5543 int partition_i = m_vertices[node_i].partition;
5544 if (partition_i >= 0)
5546 unsigned int order_i = m_partitions[partition_i].node_end++;
5547 m_partitioned_nodes[order_i] = node_i;
5552 /* Look for edges from earlier partitions into node NODE_I and edges from
5553 node NODE_I into later partitions. Call:
5555 FN (ud, other_node_i)
5557 for each such use-to-def edge ud, where other_node_i is the node at the
5558 other end of the edge. */
5560 template<typename T>
5561 void
5562 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5564 int partition_i = m_vertices[node_i].partition;
5565 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5566 pred; pred = pred->pred_next)
5568 int src_partition_i = m_vertices[pred->src].partition;
5569 if (src_partition_i >= 0 && src_partition_i != partition_i)
5570 fn (pred, pred->src);
5572 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5573 succ; succ = succ->succ_next)
5575 int dest_partition_i = m_vertices[succ->dest].partition;
5576 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
5577 fn (succ, succ->dest);
5581 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
5582 that NODE would operate on. This test is independent of NODE's actual
5583 operation. */
5585 bool
5586 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
5587 unsigned int layout_i)
5589 if (layout_i == 0)
5590 return true;
5592 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
5593 return false;
5595 return true;
5598 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
5599 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
5600 layouts is incompatible with NODE or if the change is not possible for
5601 some other reason.
5603 The properties taken from NODE include the number of lanes and the
5604 vector type. The actual operation doesn't matter. */
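/* For instance (a made-up case): with SLP_TREE_LANES (NODE) == 4,
FROM_LAYOUT_I describing { 1, 0, 3, 2 } and TO_LAYOUT_I == 0, the code
below costs the single-input VEC_PERM_EXPR that selects lanes
{ 1, 0, 3, 2 } of NODE; the result is the number of vector permutations
the target needs for that, clamped to a minimum of 1.  */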
5606 int
5607 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
5608 unsigned int from_layout_i,
5609 unsigned int to_layout_i)
5611 if (!is_compatible_layout (node, from_layout_i)
5612 || !is_compatible_layout (node, to_layout_i))
5613 return -1;
5615 if (from_layout_i == to_layout_i)
5616 return 0;
5618 auto_vec<slp_tree, 1> children (1);
5619 children.quick_push (node);
5620 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
5621 if (from_layout_i > 0)
5622 for (unsigned int i : m_perms[from_layout_i])
5623 perm.quick_push ({ 0, i });
5624 else
5625 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
5626 perm.quick_push ({ 0, i });
5627 if (to_layout_i > 0)
5628 vect_slp_permute (m_perms[to_layout_i], perm, true);
5629 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
5630 children, false);
5631 if (count >= 0)
5632 return MAX (count, 1);
5634 /* ??? In principle we could try changing via layout 0, giving two
5635 layout changes rather than 1. Doing that would require
5636 corresponding support in get_result_with_layout. */
5637 return -1;
5640 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
5642 inline slpg_partition_layout_costs &
5643 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
5644 unsigned int layout_i)
5646 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
5649 /* Change PERM in one of two ways:
5651 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
5652 chosen for child I of NODE.
5654 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
5656 In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */
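/* A small worked example (entries invented): suppose PERM is
{ (0, 1), (0, 0) }, i.e. it swaps the two lanes of child 0. If that
child's partition has chosen a layout whose permutation is { 1, 0 }
(the IN_LAYOUT_I < 0 case), each selected lane S is rewritten to
m_perms[layout][S], giving { (0, 0), (0, 1) } - the permutation folds
away. If the permutation for OUT_LAYOUT_I is also { 1, 0 }, the
entries are then reverse-permuted to put the output into that layout,
giving { (0, 1), (0, 0) } again.  */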
5658 void
5659 vect_optimize_slp_pass::
5660 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
5661 int in_layout_i, unsigned int out_layout_i)
5663 for (auto &entry : perm)
5665 int this_in_layout_i = in_layout_i;
5666 if (this_in_layout_i < 0)
5668 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
5669 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
5670 if (in_partition_i == -1u)
5671 continue;
5672 this_in_layout_i = m_partitions[in_partition_i].layout;
5674 if (this_in_layout_i > 0)
5675 entry.second = m_perms[this_in_layout_i][entry.second];
5677 if (out_layout_i > 0)
5678 vect_slp_permute (m_perms[out_layout_i], perm, true);
5681 /* Check whether the target allows NODE to be rearranged so that the node's
5682 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
5683 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
5685 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
5686 NODE can adapt to the layout changes that have (perhaps provisionally)
5687 been chosen for NODE's children, so that no extra permutations are
5688 needed on either the input or the output of NODE.
5690 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
5691 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
5693 IN_LAYOUT_I has no meaning for other types of node.
5695 Keeping the node as-is is always valid. If the target doesn't appear
5696 to support the node as-is, but might realistically support other layouts,
5697 then layout 0 instead has the cost of a worst-case permutation. On the
5698 one hand, this ensures that every node has at least one valid layout,
5699 avoiding what would otherwise be an awkward special case. On the other,
5700 it still encourages the pass to change an invalid pre-existing layout
5701 choice into a valid one. */
5703 int
5704 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
5705 unsigned int out_layout_i)
5707 const int fallback_cost = 1;
5709 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
5711 auto_lane_permutation_t tmp_perm;
5712 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
5714 /* Check that the child nodes support the chosen layout. Checking
5715 the first child is enough, since any second child would have the
5716 same shape. */
5717 auto first_child = SLP_TREE_CHILDREN (node)[0];
5718 if (in_layout_i > 0
5719 && !is_compatible_layout (first_child, in_layout_i))
5720 return -1;
5722 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
5723 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
5724 node, tmp_perm,
5725 SLP_TREE_CHILDREN (node),
5726 false);
5727 if (count < 0)
5729 if (in_layout_i == 0 && out_layout_i == 0)
5731 /* Use the fallback cost if the node could in principle support
5732 some nonzero layout for both the inputs and the outputs.
5733 Otherwise assume that the node will be rejected later
5734 and rebuilt from scalars. */
5735 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
5736 return fallback_cost;
5737 return 0;
5739 return -1;
5742 /* We currently have no way of telling whether the new layout is cheaper
5743 or more expensive than the old one. But at least in principle,
5744 it should be worth making zero permutations (whole-vector shuffles)
5745 cheaper than real permutations, in case the pass is able to remove
5746 the latter. */
5747 return count == 0 ? 0 : 1;
5750 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5751 if (rep
5752 && STMT_VINFO_DATA_REF (rep)
5753 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
5754 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
5756 auto_load_permutation_t tmp_perm;
5757 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5758 if (out_layout_i > 0)
5759 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
5761 poly_uint64 vf = 1;
5762 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
5763 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5764 unsigned int n_perms;
5765 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
5766 nullptr, vf, true, false, &n_perms))
5768 auto rep = SLP_TREE_REPRESENTATIVE (node);
5769 if (out_layout_i == 0)
5771 /* Use the fallback cost if the load is an N-to-N permutation.
5772 Otherwise assume that the node will be rejected later
5773 and rebuilt from scalars. */
5774 if (STMT_VINFO_GROUPED_ACCESS (rep)
5775 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
5776 == SLP_TREE_LANES (node)))
5777 return fallback_cost;
5778 return 0;
5780 return -1;
5783 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
5784 return n_perms == 0 ? 0 : 1;
5787 return 0;
5790 /* Decide which element layouts we should consider using. Calculate the
5791 weights associated with inserting layout changes on partition edges.
5792 Also mark partitions that cannot change layout, by setting their
5793 layout to zero. */
5795 void
5796 vect_optimize_slp_pass::start_choosing_layouts ()
5798 /* Used to assign unique permutation indices. */
5799 using perm_hash = unbounded_hashmap_traits<
5800 vec_free_hash_base<int_hash_base<unsigned>>,
5801 int_hash<int, -1, -2>
5803 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
5805 /* Layout 0 is "no change". */
5806 m_perms.safe_push (vNULL);
5808 /* Create layouts from existing permutations. */
5809 auto_load_permutation_t tmp_perm;
5810 for (unsigned int node_i : m_partitioned_nodes)
5812 /* Leafs also double as entries to the reverse graph. Allow the
5813 layout of those to be changed. */
5814 auto &vertex = m_vertices[node_i];
5815 auto &partition = m_partitions[vertex.partition];
5816 if (!m_slpg->vertices[node_i].succ)
5817 partition.layout = 0;
5819 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
5820 slp_tree node = vertex.node;
5821 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
5822 slp_tree child;
5823 unsigned HOST_WIDE_INT imin, imax = 0;
5824 bool any_permute = false;
5825 tmp_perm.truncate (0);
5826 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
5828 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
5829 unpermuted, record a layout that reverses this permutation.
5831 We would need more work to cope with loads that are internally
5832 permuted and also have inputs (such as masks for
5833 IFN_MASK_LOADs). */
5834 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
5835 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
5837 partition.layout = -1;
5838 continue;
5840 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
5841 imin = DR_GROUP_SIZE (dr_stmt) + 1;
5842 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
5844 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
5845 && SLP_TREE_CHILDREN (node).length () == 1
5846 && (child = SLP_TREE_CHILDREN (node)[0])
5847 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
5848 .is_constant (&imin)))
5850 /* If the child has the same vector size as this node,
5851 reversing the permutation can make the permutation a no-op.
5852 In other cases it can change a true permutation into a
5853 full-vector extract. */
5854 tmp_perm.reserve (SLP_TREE_LANES (node));
5855 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5856 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
5858 else
5859 continue;
5861 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5863 unsigned idx = tmp_perm[j];
5864 imin = MIN (imin, idx);
5865 imax = MAX (imax, idx);
5866 if (idx - tmp_perm[0] != j)
5867 any_permute = true;
5869 /* If the span doesn't match we'd disrupt VF computation; avoid
5870 that for now. */
5871 if (imax - imin + 1 != SLP_TREE_LANES (node))
5872 continue;
5873 /* If there's no permute there's no need to split one out. In this case
5874 we can consider turning a load into a permuted load, if that
5875 turns out to be cheaper than alternatives. */
5876 if (!any_permute)
5878 partition.layout = -1;
5879 continue;
5882 /* For now only handle true permutes, like
5883 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
5884 when permuting constants and invariants, keeping the permute
5885 bijective. */
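/* For instance (hypothetical): a group of four loads accessed as
{ a[2], a[3], a[0], a[1] } gives tmp_perm { 2, 3, 0, 1 }; every index
in [imin, imax] occurs exactly once, so the check below accepts it and
{ 2, 3, 0, 1 } (biased by imin) is recorded as a layout candidate.  */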
5886 auto_sbitmap load_index (SLP_TREE_LANES (node));
5887 bitmap_clear (load_index);
5888 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5889 bitmap_set_bit (load_index, tmp_perm[j] - imin);
5890 unsigned j;
5891 for (j = 0; j < SLP_TREE_LANES (node); ++j)
5892 if (!bitmap_bit_p (load_index, j))
5893 break;
5894 if (j != SLP_TREE_LANES (node))
5895 continue;
5897 vec<unsigned> perm = vNULL;
5898 perm.safe_grow (SLP_TREE_LANES (node), true);
5899 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
5900 perm[j] = tmp_perm[j] - imin;
5902 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
5904 /* Continue to use existing layouts, but don't add any more. */
5905 int *entry = layout_ids.get (perm);
5906 partition.layout = entry ? *entry : 0;
5907 perm.release ();
5909 else
5911 bool existed;
5912 int &layout_i = layout_ids.get_or_insert (perm, &existed);
5913 if (existed)
5914 perm.release ();
5915 else
5917 layout_i = m_perms.length ();
5918 m_perms.safe_push (perm);
5920 partition.layout = layout_i;
5924 /* Initially assume that every layout is possible and has zero cost
5925 in every partition. */
5926 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
5927 * m_perms.length ());
5929 /* We have to mark outgoing permutations facing non-associating-reduction
5930 graph entries that are not represented as to be materialized.
5931 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
5932 for (slp_instance instance : m_vinfo->slp_instances)
5933 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
5935 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
5936 m_partitions[m_vertices[node_i].partition].layout = 0;
5938 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
5940 stmt_vec_info stmt_info
5941 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
5942 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
5943 if (needs_fold_left_reduction_p (TREE_TYPE
5944 (gimple_get_lhs (stmt_info->stmt)),
5945 STMT_VINFO_REDUC_CODE (reduc_info)))
5947 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
5948 m_partitions[m_vertices[node_i].partition].layout = 0;
5952 /* Check which layouts each node and partition can handle. Calculate the
5953 weights associated with inserting layout changes on edges. */
5954 for (unsigned int node_i : m_partitioned_nodes)
5956 auto &vertex = m_vertices[node_i];
5957 auto &partition = m_partitions[vertex.partition];
5958 slp_tree node = vertex.node;
5960 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5962 vertex.weight = vect_slp_node_weight (node);
5964 /* We do not handle stores with a permutation, so all
5965 incoming permutations must have been materialized.
5967 We also don't handle masked grouped loads, which lack a
5968 permutation vector. In this case the memory locations
5969 form an implicit second input to the loads, on top of the
5970 explicit mask input, and the memory input's layout cannot
5971 be changed.
5973 On the other hand, we do support permuting gather loads and
5974 masked gather loads, where each scalar load is independent
5975 of the others. This can be useful if the address/index input
5976 benefits from permutation. */
5977 if (STMT_VINFO_DATA_REF (rep)
5978 && STMT_VINFO_GROUPED_ACCESS (rep)
5979 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
5980 partition.layout = 0;
5982 /* We cannot change the layout of an operation that does not
5983 operate independently on each lane. Note this is an explicit
5984 negative list since that's much shorter than the respective
5985 positive one, but it's critical to keep maintaining it. */
5986 if (is_gimple_call (STMT_VINFO_STMT (rep)))
5987 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
5989 case CFN_COMPLEX_ADD_ROT90:
5990 case CFN_COMPLEX_ADD_ROT270:
5991 case CFN_COMPLEX_MUL:
5992 case CFN_COMPLEX_MUL_CONJ:
5993 case CFN_VEC_ADDSUB:
5994 case CFN_VEC_FMADDSUB:
5995 case CFN_VEC_FMSUBADD:
5996 partition.layout = 0;
5997 default:;
6001 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6003 auto &other_vertex = m_vertices[other_node_i];
6005 /* Count the number of edges from earlier partitions and the number
6006 of edges to later partitions. */
6007 if (other_vertex.partition < vertex.partition)
6008 partition.in_degree += 1;
6009 else
6010 partition.out_degree += 1;
6012 /* If the current node uses the result of OTHER_NODE_I, accumulate
6013 the effects of that. */
6014 if (ud->src == int (node_i))
6016 other_vertex.out_weight += vertex.weight;
6017 other_vertex.out_degree += 1;
6020 for_each_partition_edge (node_i, process_edge);
6024 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6025 its current (provisional) choice of layout. The inputs do not necessarily
6026 have the same layout as each other. */
6028 slpg_layout_cost
6029 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6031 auto &vertex = m_vertices[node_i];
6032 slpg_layout_cost cost;
6033 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6035 auto &other_vertex = m_vertices[other_node_i];
6036 if (other_vertex.partition < vertex.partition)
6038 auto &other_partition = m_partitions[other_vertex.partition];
6039 auto &other_costs = partition_layout_costs (other_vertex.partition,
6040 other_partition.layout);
6041 slpg_layout_cost this_cost = other_costs.in_cost;
6042 this_cost.add_serial_cost (other_costs.internal_cost);
6043 this_cost.split (other_partition.out_degree);
6044 cost.add_parallel_cost (this_cost);
6047 for_each_partition_edge (node_i, add_cost);
6048 return cost;
6051 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6052 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6053 slpg_layout_cost::impossible () if the change isn't possible. */
6055 slpg_layout_cost
6056 vect_optimize_slp_pass::
6057 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6058 unsigned int layout2_i)
6060 auto &def_vertex = m_vertices[ud->dest];
6061 auto &use_vertex = m_vertices[ud->src];
6062 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6063 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6064 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6065 use_layout_i);
6066 if (factor < 0)
6067 return slpg_layout_cost::impossible ();
6069 /* We have a choice of putting the layout change at the site of the
6070 definition or at the site of the use. Prefer the former when
6071 optimizing for size or when the execution frequency of the
6072 definition is no greater than the combined execution frequencies of
6073 the uses. When putting the layout change at the site of the definition,
6074 divvy up the cost among all consumers. */
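/* As an illustration (numbers invented): if the definition has weight 2
and out_degree 4, its uses have a combined weight of 5 and the change
has factor 1, then the cost 2 * 1 is charged at the definition and
divided among the 4 outgoing edges. If instead the definition were
hotter than all of its uses combined, the cost charged on this
particular edge would be the use's weight times the factor.  */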
6075 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6077 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6078 cost.split (def_vertex.out_degree);
6079 return cost;
6081 return { use_vertex.weight * factor, m_optimize_size };
6084 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6085 partition; FROM_NODE_I could be the definition node or the use node.
6086 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6087 Return the cost of any necessary fix-ups on edge UD, or return
6088 slpg_layout_cost::impossible () if the change isn't possible.
6090 At this point, FROM_NODE_I's partition has chosen the cheapest
6091 layout based on the information available so far, but this choice
6092 is only provisional. */
6094 slpg_layout_cost
6095 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6096 unsigned int to_layout_i)
6098 auto &from_vertex = m_vertices[from_node_i];
6099 unsigned int from_partition_i = from_vertex.partition;
6100 slpg_partition_info &from_partition = m_partitions[from_partition_i];
6101 gcc_assert (from_partition.layout >= 0);
6103 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6104 with its current layout preference. */
6105 slpg_layout_cost cost = slpg_layout_cost::impossible ();
6106 auto edge_cost = edge_layout_cost (ud, from_node_i,
6107 from_partition.layout, to_layout_i);
6108 if (edge_cost.is_possible ())
6110 auto &from_costs = partition_layout_costs (from_partition_i,
6111 from_partition.layout);
6112 cost = from_costs.in_cost;
6113 cost.add_serial_cost (from_costs.internal_cost);
6114 cost.split (from_partition.out_degree);
6115 cost.add_serial_cost (edge_cost);
6117 else if (from_partition.layout == 0)
6118 /* We must allow the source partition to have layout 0 as a fallback,
6119 in case all other options turn out to be impossible. */
6120 return cost;
6122 /* Take the minimum of that cost and the cost that applies if
6123 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6124 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6125 to_layout_i);
6126 if (direct_layout_costs.is_possible ())
6128 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6129 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6130 direct_cost.split (from_partition.out_degree);
6131 if (!cost.is_possible ()
6132 || direct_cost.is_better_than (cost, m_optimize_size))
6133 cost = direct_cost;
6136 return cost;
6139 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6140 partition; TO_NODE_I could be the definition node or the use node.
6141 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6142 return the cost of any necessary fix-ups on edge UD, or
6143 slpg_layout_cost::impossible () if the choice cannot be made.
6145 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6147 slpg_layout_cost
6148 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6149 unsigned int from_layout_i)
6151 auto &to_vertex = m_vertices[to_node_i];
6152 unsigned int to_partition_i = to_vertex.partition;
6153 slpg_partition_info &to_partition = m_partitions[to_partition_i];
6154 gcc_assert (to_partition.layout >= 0);
6156 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6157 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6158 any other inputs keep their current choice of layout. */
6159 auto &to_costs = partition_layout_costs (to_partition_i,
6160 to_partition.layout);
6161 if (ud->src == int (to_node_i)
6162 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6164 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6165 auto old_layout = from_partition.layout;
6166 from_partition.layout = from_layout_i;
6167 int factor = internal_node_cost (to_vertex.node, -1,
6168 to_partition.layout);
6169 from_partition.layout = old_layout;
6170 if (factor >= 0)
6172 slpg_layout_cost cost = to_costs.out_cost;
6173 cost.add_serial_cost ({ to_vertex.weight * factor,
6174 m_optimize_size });
6175 cost.split (to_partition.in_degree);
6176 return cost;
6180 /* Compute the cost if we insert any necessary layout change on edge UD. */
6181 auto edge_cost = edge_layout_cost (ud, to_node_i,
6182 to_partition.layout, from_layout_i);
6183 if (edge_cost.is_possible ())
6185 slpg_layout_cost cost = to_costs.out_cost;
6186 cost.add_serial_cost (to_costs.internal_cost);
6187 cost.split (to_partition.in_degree);
6188 cost.add_serial_cost (edge_cost);
6189 return cost;
6192 return slpg_layout_cost::impossible ();
6195 /* Make a forward pass through the partitions, accumulating input costs.
6196 Make a tentative (provisional) choice of layout for each partition,
6197 ensuring that this choice still allows later partitions to keep
6198 their original layout. */
6200 void
6201 vect_optimize_slp_pass::forward_pass ()
6203 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6204 ++partition_i)
6206 auto &partition = m_partitions[partition_i];
6208 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6209 the incoming cost that would apply if every predecessor partition
6210 keeps its current layout. This is used within the loop below. */
6211 slpg_layout_cost in_cost;
6212 slp_tree single_node = nullptr;
6213 if (partition.node_end == partition.node_begin + 1)
6215 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6216 single_node = m_vertices[node_i].node;
6217 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6218 in_cost = total_in_cost (node_i);
6221 /* Go through the possible layouts. Decide which ones are valid
6222 for this partition and record which of the valid layouts has
6223 the lowest cost. */
6224 unsigned int min_layout_i = 0;
6225 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6226 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6228 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6229 if (!layout_costs.is_possible ())
6230 continue;
6232 /* If the recorded layout is already 0 then the layout cannot
6233 change. */
6234 if (partition.layout == 0 && layout_i != 0)
6236 layout_costs.mark_impossible ();
6237 continue;
6240 bool is_possible = true;
6241 for (unsigned int order_i = partition.node_begin;
6242 order_i < partition.node_end; ++order_i)
6244 unsigned int node_i = m_partitioned_nodes[order_i];
6245 auto &vertex = m_vertices[node_i];
6247 /* Reject the layout if it is individually incompatible
6248 with any node in the partition. */
6249 if (!is_compatible_layout (vertex.node, layout_i))
6251 is_possible = false;
6252 break;
6255 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6257 auto &other_vertex = m_vertices[other_node_i];
6258 if (other_vertex.partition < vertex.partition)
6260 /* Accumulate the incoming costs from earlier
6261 partitions, plus the cost of any layout changes
6262 on UD itself. */
6263 auto cost = forward_cost (ud, other_node_i, layout_i);
6264 if (!cost.is_possible ())
6265 is_possible = false;
6266 else
6267 layout_costs.in_cost.add_parallel_cost (cost);
6269 else
6270 /* Reject the layout if it would make layout 0 impossible
6271 for later partitions. This amounts to testing that the
6272 target supports reversing the layout change on edges
6273 to later partitions.
6275 In principle, it might be possible to push a layout
6276 change all the way down a graph, so that it never
6277 needs to be reversed and so that the target doesn't
6278 need to support the reverse operation. But it would
6279 be awkward to bail out if we hit a partition that
6280 does not support the new layout, especially since
6281 we are not dealing with a lattice. */
6282 is_possible &= edge_layout_cost (ud, other_node_i, 0,
6283 layout_i).is_possible ();
6285 for_each_partition_edge (node_i, add_cost);
6287 /* Accumulate the cost of using LAYOUT_I within NODE,
6288 both for the inputs and the outputs. */
6289 int factor = internal_node_cost (vertex.node, layout_i,
6290 layout_i);
6291 if (factor < 0)
6293 is_possible = false;
6294 break;
6296 else if (factor)
6297 layout_costs.internal_cost.add_serial_cost
6298 ({ vertex.weight * factor, m_optimize_size });
6300 if (!is_possible)
6302 layout_costs.mark_impossible ();
6303 continue;
6306 /* Combine the incoming and partition-internal costs. */
6307 slpg_layout_cost combined_cost = layout_costs.in_cost;
6308 combined_cost.add_serial_cost (layout_costs.internal_cost);
6310 /* If this partition consists of a single VEC_PERM_EXPR, see
6311 if the VEC_PERM_EXPR can be changed to support output layout
6312 LAYOUT_I while keeping all the provisional choices of input
6313 layout. */
6314 if (single_node
6315 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6317 int factor = internal_node_cost (single_node, -1, layout_i);
6318 if (factor >= 0)
6320 auto weight = m_vertices[single_node->vertex].weight;
6321 slpg_layout_cost internal_cost
6322 = { weight * factor, m_optimize_size };
6324 slpg_layout_cost alt_cost = in_cost;
6325 alt_cost.add_serial_cost (internal_cost);
6326 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6328 combined_cost = alt_cost;
6329 layout_costs.in_cost = in_cost;
6330 layout_costs.internal_cost = internal_cost;
6335 /* Record the layout with the lowest cost. Prefer layout 0 in
6336 the event of a tie between it and another layout. */
6337 if (!min_layout_cost.is_possible ()
6338 || combined_cost.is_better_than (min_layout_cost,
6339 m_optimize_size))
6341 min_layout_i = layout_i;
6342 min_layout_cost = combined_cost;
6346 /* This loop's handling of earlier partitions should ensure that
6347 choosing the original layout for the current partition is no
6348 less valid than it was in the original graph, even with the
6349 provisional layout choices for those earlier partitions. */
6350 gcc_assert (min_layout_cost.is_possible ());
6351 partition.layout = min_layout_i;
6355 /* Make a backward pass through the partitions, accumulating output costs.
6356 Make a final choice of layout for each partition. */
6358 void
6359 vect_optimize_slp_pass::backward_pass ()
6361 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6363 auto &partition = m_partitions[partition_i];
6365 unsigned int min_layout_i = 0;
6366 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6367 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6369 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6370 if (!layout_costs.is_possible ())
6371 continue;
6373 /* Accumulate the costs from successor partitions. */
6374 bool is_possible = true;
6375 for (unsigned int order_i = partition.node_begin;
6376 order_i < partition.node_end; ++order_i)
6378 unsigned int node_i = m_partitioned_nodes[order_i];
6379 auto &vertex = m_vertices[node_i];
6380 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6382 auto &other_vertex = m_vertices[other_node_i];
6383 auto &other_partition = m_partitions[other_vertex.partition];
6384 if (other_vertex.partition > vertex.partition)
6386 /* Accumulate the costs from later
6387 partitions, plus the cost of any layout changes
6388 on UD itself. */
6389 auto cost = backward_cost (ud, other_node_i, layout_i);
6390 if (!cost.is_possible ())
6391 is_possible = false;
6392 else
6393 layout_costs.out_cost.add_parallel_cost (cost);
6395 else
6396 /* Make sure that earlier partitions can (if necessary
6397 or beneficial) keep the layout that they chose in
6398 the forward pass. This ensures that there is at
6399 least one valid choice of layout. */
6400 is_possible &= edge_layout_cost (ud, other_node_i,
6401 other_partition.layout,
6402 layout_i).is_possible ();
6404 for_each_partition_edge (node_i, add_cost);
6406 if (!is_possible)
6408 layout_costs.mark_impossible ();
6409 continue;
6412 /* Locally combine the costs from the forward and backward passes.
6413 (This combined cost is not passed on, since that would lead
6414 to double counting.) */
6415 slpg_layout_cost combined_cost = layout_costs.in_cost;
6416 combined_cost.add_serial_cost (layout_costs.internal_cost);
6417 combined_cost.add_serial_cost (layout_costs.out_cost);
6419 /* Record the layout with the lowest cost. Prefer layout 0 in
6420 the event of a tie between it and another layout. */
6421 if (!min_layout_cost.is_possible ()
6422 || combined_cost.is_better_than (min_layout_cost,
6423 m_optimize_size))
6425 min_layout_i = layout_i;
6426 min_layout_cost = combined_cost;
6430 gcc_assert (min_layout_cost.is_possible ());
6431 partition.layout = min_layout_i;
6435 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6436 NODE already has the layout that was selected for its partition. */
6438 slp_tree
6439 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6440 unsigned int to_layout_i)
6442 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6443 slp_tree result = m_node_layouts[result_i];
6444 if (result)
6445 return result;
6447 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6448 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6449 /* We can't permute vector defs in place. */
6450 && SLP_TREE_VEC_DEFS (node).is_empty ()))
6452 /* If the vector is uniform or unchanged, there's nothing to do. */
6453 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6454 result = node;
6455 else
6457 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6458 result = vect_create_new_slp_node (scalar_ops);
6459 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6462 else
6464 unsigned int partition_i = m_vertices[node->vertex].partition;
6465 unsigned int from_layout_i = m_partitions[partition_i].layout;
6466 if (from_layout_i == to_layout_i)
6467 return node;
6469 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6470 permutation instead of a serial one. Leave the new permutation
6471 in TMP_PERM on success. */
6472 auto_lane_permutation_t tmp_perm;
6473 unsigned int num_inputs = 1;
6474 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6476 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6477 if (from_layout_i != 0)
6478 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6479 if (to_layout_i != 0)
6480 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6481 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6482 tmp_perm,
6483 SLP_TREE_CHILDREN (node),
6484 false) >= 0)
6485 num_inputs = SLP_TREE_CHILDREN (node).length ();
6486 else
6487 tmp_perm.truncate (0);
6490 if (dump_enabled_p ())
6492 if (tmp_perm.length () > 0)
6493 dump_printf_loc (MSG_NOTE, vect_location,
6494 "duplicating permutation node %p with"
6495 " layout %d\n",
6496 (void *) node, to_layout_i);
6497 else
6498 dump_printf_loc (MSG_NOTE, vect_location,
6499 "inserting permutation node in place of %p\n",
6500 (void *) node);
6503 unsigned int num_lanes = SLP_TREE_LANES (node);
6504 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6505 if (SLP_TREE_SCALAR_STMTS (node).length ())
6507 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6508 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6509 if (from_layout_i != 0)
6510 vect_slp_permute (m_perms[from_layout_i], stmts, false);
6511 if (to_layout_i != 0)
6512 vect_slp_permute (m_perms[to_layout_i], stmts, true);
6514 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6515 SLP_TREE_LANES (result) = num_lanes;
6516 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6517 result->vertex = -1;
6519 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6520 if (tmp_perm.length ())
6522 lane_perm.safe_splice (tmp_perm);
6523 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6525 else
6527 lane_perm.create (num_lanes);
6528 for (unsigned j = 0; j < num_lanes; ++j)
6529 lane_perm.quick_push ({ 0, j });
6530 if (from_layout_i != 0)
6531 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6532 if (to_layout_i != 0)
6533 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6534 SLP_TREE_CHILDREN (result).safe_push (node);
6536 for (slp_tree child : SLP_TREE_CHILDREN (result))
6537 child->refcnt++;
6539 m_node_layouts[result_i] = result;
6540 return result;
6543 /* Apply the chosen vector layouts to the SLP graph. */
6545 void
6546 vect_optimize_slp_pass::materialize ()
6548 /* We no longer need the costs, so avoid having two O(N * P) arrays
6549 live at the same time. */
6550 m_partition_layout_costs.release ();
6551 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6553 auto_sbitmap fully_folded (m_vertices.length ());
6554 bitmap_clear (fully_folded);
6555 for (unsigned int node_i : m_partitioned_nodes)
6557 auto &vertex = m_vertices[node_i];
6558 slp_tree node = vertex.node;
6559 int layout_i = m_partitions[vertex.partition].layout;
6560 gcc_assert (layout_i >= 0);
6562 /* Rearrange the scalar statements to match the chosen layout. */
6563 if (layout_i > 0)
6564 vect_slp_permute (m_perms[layout_i],
6565 SLP_TREE_SCALAR_STMTS (node), true);
6567 /* Update load and lane permutations. */
6568 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6570 /* First try to absorb the input vector layouts. If that fails,
6571 force the inputs to have layout LAYOUT_I too. We checked that
6572 that was possible before deciding to use nonzero output layouts.
6573 (Note that at this stage we don't really have any guarantee that
6574 the target supports the original VEC_PERM_EXPR.) */
6575 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6576 auto_lane_permutation_t tmp_perm;
6577 tmp_perm.safe_splice (perm);
6578 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
6579 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6580 tmp_perm,
6581 SLP_TREE_CHILDREN (node),
6582 false) >= 0)
6584 if (dump_enabled_p ()
6585 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
6586 perm.begin ()))
6587 dump_printf_loc (MSG_NOTE, vect_location,
6588 "absorbing input layouts into %p\n",
6589 (void *) node);
6590 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
6591 bitmap_set_bit (fully_folded, node_i);
6593 else
6595 /* Not MSG_MISSED because it would make no sense to users. */
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_NOTE, vect_location,
6598 "failed to absorb input layouts into %p\n",
6599 (void *) node);
6600 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
6603 else
6605 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
6606 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
6607 if (layout_i > 0)
6608 /* ??? When we handle non-bijective permutes the idea
6609 is that we can force the load-permutation to be
6610 { min, min + 1, min + 2, ... max }. But then the
6611 scalar defs might no longer match the lane content
6612 which means wrong-code with live lane vectorization.
6613 So we possibly have to have NULL entries for those. */
6614 vect_slp_permute (m_perms[layout_i], load_perm, true);
6618 /* Do this before any nodes disappear, since it involves a walk
6619 over the leaves. */
6620 remove_redundant_permutations ();
6622 /* Replace each child with a correctly laid-out version. */
6623 for (unsigned int node_i : m_partitioned_nodes)
6625 /* Skip nodes that have already been handled above. */
6626 if (bitmap_bit_p (fully_folded, node_i))
6627 continue;
6629 auto &vertex = m_vertices[node_i];
6630 int in_layout_i = m_partitions[vertex.partition].layout;
6631 gcc_assert (in_layout_i >= 0);
6633 unsigned j;
6634 slp_tree child;
6635 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
6637 if (!child)
6638 continue;
6640 slp_tree new_child = get_result_with_layout (child, in_layout_i);
6641 if (new_child != child)
6643 vect_free_slp_tree (child);
6644 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
6645 new_child->refcnt += 1;
6651 /* Elide load permutations that are not necessary. Such permutations might
6652 be pre-existing, rather than created by the layout optimizations. */
6654 void
6655 vect_optimize_slp_pass::remove_redundant_permutations ()
6657 for (unsigned int node_i : m_leafs)
6659 slp_tree node = m_vertices[node_i].node;
6660 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
6661 continue;
6663 /* In basic block vectorization we allow any subchain of an interleaving
6664 chain.
6665 FORNOW: not in loop SLP because of realignment complications. */
6666 if (is_a <bb_vec_info> (m_vinfo))
6668 bool subchain_p = true;
6669 stmt_vec_info next_load_info = NULL;
6670 stmt_vec_info load_info;
6671 unsigned j;
6672 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6674 if (j != 0
6675 && (next_load_info != load_info
6676 || ! load_info
6677 || DR_GROUP_GAP (load_info) != 1))
6679 subchain_p = false;
6680 break;
6682 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
6684 if (subchain_p)
6686 SLP_TREE_LOAD_PERMUTATION (node).release ();
6687 continue;
6690 else
6692 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
6693 stmt_vec_info load_info;
6694 bool this_load_permuted = false;
6695 unsigned j;
6696 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
6697 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
6699 this_load_permuted = true;
6700 break;
6702 /* When this isn't a grouped access we know it's single element
6703 and contiguous. */
6704 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
6706 if (!this_load_permuted
6707 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6708 || SLP_TREE_LANES (node) == 1))
6709 SLP_TREE_LOAD_PERMUTATION (node).release ();
6710 continue;
6712 stmt_vec_info first_stmt_info
6713 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
6714 if (!this_load_permuted
6715 /* The load requires permutation when unrolling exposes
6716 a gap either because the group is larger than the SLP
6717 group-size or because there is a gap between the groups. */
6718 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
6719 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
6720 && DR_GROUP_GAP (first_stmt_info) == 0)))
6722 SLP_TREE_LOAD_PERMUTATION (node).release ();
6723 continue;
6729 /* Print the partition graph and layout information to the dump file. */
6731 void
6732 vect_optimize_slp_pass::dump ()
6734 dump_printf_loc (MSG_NOTE, vect_location,
6735 "SLP optimize permutations:\n");
6736 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
6738 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
6739 const char *sep = "";
6740 for (unsigned int idx : m_perms[layout_i])
6742 dump_printf (MSG_NOTE, "%s%d", sep, idx);
6743 sep = ", ";
6745 dump_printf (MSG_NOTE, " }\n");
6747 dump_printf_loc (MSG_NOTE, vect_location,
6748 "SLP optimize partitions:\n");
6749 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6750 ++partition_i)
6752 auto &partition = m_partitions[partition_i];
6753 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
6754 dump_printf_loc (MSG_NOTE, vect_location,
6755 " partition %d (layout %d):\n",
6756 partition_i, partition.layout);
6757 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
6758 for (unsigned int order_i = partition.node_begin;
6759 order_i < partition.node_end; ++order_i)
6761 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
6762 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
6763 (void *) vertex.node);
6764 dump_printf_loc (MSG_NOTE, vect_location,
6765 " weight: %f\n",
6766 vertex.weight.to_double ());
6767 if (vertex.out_degree)
6768 dump_printf_loc (MSG_NOTE, vect_location,
6769 " out weight: %f (degree %d)\n",
6770 vertex.out_weight.to_double (),
6771 vertex.out_degree);
6772 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
6773 dump_printf_loc (MSG_NOTE, vect_location,
6774 " op: VEC_PERM_EXPR\n");
6775 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
6776 dump_printf_loc (MSG_NOTE, vect_location,
6777 " op template: %G", rep->stmt);
6779 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
6780 for (unsigned int order_i = partition.node_begin;
6781 order_i < partition.node_end; ++order_i)
6783 unsigned int node_i = m_partitioned_nodes[order_i];
6784 auto &vertex = m_vertices[node_i];
6785 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
6787 auto &other_vertex = m_vertices[other_node_i];
6788 if (other_vertex.partition < vertex.partition)
6789 dump_printf_loc (MSG_NOTE, vect_location,
6790 " - %p [%d] --> %p\n",
6791 (void *) other_vertex.node,
6792 other_vertex.partition,
6793 (void *) vertex.node);
6794 else
6795 dump_printf_loc (MSG_NOTE, vect_location,
6796 " - %p --> [%d] %p\n",
6797 (void *) vertex.node,
6798 other_vertex.partition,
6799 (void *) other_vertex.node);
6801 for_each_partition_edge (node_i, print_edge);
6804 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6806 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6807 if (layout_costs.is_possible ())
6809 dump_printf_loc (MSG_NOTE, vect_location,
6810 " layout %d:%s\n", layout_i,
6811 partition.layout == int (layout_i)
6812 ? " (*)" : "");
6813 slpg_layout_cost combined_cost = layout_costs.in_cost;
6814 combined_cost.add_serial_cost (layout_costs.internal_cost);
6815 combined_cost.add_serial_cost (layout_costs.out_cost);
6816 #define TEMPLATE "{depth: %f, total: %f}"
6817 dump_printf_loc (MSG_NOTE, vect_location,
6818 " " TEMPLATE "\n",
6819 layout_costs.in_cost.depth.to_double (),
6820 layout_costs.in_cost.total.to_double ());
6821 dump_printf_loc (MSG_NOTE, vect_location,
6822 " + " TEMPLATE "\n",
6823 layout_costs.internal_cost.depth.to_double (),
6824 layout_costs.internal_cost.total.to_double ());
6825 dump_printf_loc (MSG_NOTE, vect_location,
6826 " + " TEMPLATE "\n",
6827 layout_costs.out_cost.depth.to_double (),
6828 layout_costs.out_cost.total.to_double ());
6829 dump_printf_loc (MSG_NOTE, vect_location,
6830 " = " TEMPLATE "\n",
6831 combined_cost.depth.to_double (),
6832 combined_cost.total.to_double ());
6833 #undef TEMPLATE
6835 else
6836 dump_printf_loc (MSG_NOTE, vect_location,
6837 " layout %d: rejected\n", layout_i);
6842 /* Main entry point for the SLP graph optimization pass. */
6844 void
6845 vect_optimize_slp_pass::run ()
6847 build_graph ();
6848 create_partitions ();
6849 start_choosing_layouts ();
6850 if (m_perms.length () > 1)
6852 forward_pass ();
6853 backward_pass ();
6854 if (dump_enabled_p ())
6855 dump ();
6856 materialize ();
6857 while (!m_perms.is_empty ())
6858 m_perms.pop ().release ();
6860 else
6861 remove_redundant_permutations ();
6862 free_graph (m_slpg);
6865 /* Apply CSE to NODE and its children using BST_MAP. */
6867 static void
6868 vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
6870 bool put_p = false;
6871 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
6872 /* Besides some VEC_PERM_EXPR, two-operator nodes also
6873 lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
6874 we'd have something that works for all internal and external nodes. */
6875 && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
6877 slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
6878 if (leader)
6880 /* We've visited this node already. */
6881 if (!*leader || *leader == node)
6882 return;
6884 if (dump_enabled_p ())
6885 dump_printf_loc (MSG_NOTE, vect_location,
6886 "re-using SLP tree %p for %p\n",
6887 (void *)*leader, (void *)node);
6888 vect_free_slp_tree (node);
6889 (*leader)->refcnt += 1;
6890 node = *leader;
6891 return;
6894 /* Avoid creating a cycle by populating the map only after recursion. */
6895 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
6896 node->refcnt += 1;
6897 put_p = true;
6898 /* And recurse. */
6901 for (slp_tree &child : SLP_TREE_CHILDREN (node))
6902 if (child)
6903 vect_cse_slp_nodes (bst_map, child);
6905 /* Now record the node for CSE in other siblings. */
6906 if (put_p)
6907 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), node);
6910 /* Optimize the SLP graph of VINFO. */
6912 void
6913 vect_optimize_slp (vec_info *vinfo)
6915 if (vinfo->slp_instances.is_empty ())
6916 return;
6917 vect_optimize_slp_pass (vinfo).run ();
6919 /* Apply CSE again to nodes after permute optimization. */
6920 scalar_stmts_to_slp_tree_map_t *bst_map
6921 = new scalar_stmts_to_slp_tree_map_t ();
6923 for (auto inst : vinfo->slp_instances)
6924 vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
6926 release_scalar_stmts_to_slp_tree_map (bst_map);
6929 /* Gather loads reachable from the individual SLP graph entries. */
6931 void
6932 vect_gather_slp_loads (vec_info *vinfo)
6934 unsigned i;
6935 slp_instance instance;
6936 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
6938 hash_set<slp_tree> visited;
6939 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
6940 SLP_INSTANCE_TREE (instance), visited);
6945 /* For each possible SLP instance decide whether to SLP it and calculate overall
6946 unrolling factor needed to SLP the loop. Return TRUE if decided to SLP at
6947 least one instance. */
6949 bool
6950 vect_make_slp_decision (loop_vec_info loop_vinfo)
6952 unsigned int i;
6953 poly_uint64 unrolling_factor = 1;
6954 const vec<slp_instance> &slp_instances
6955 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
6956 slp_instance instance;
6957 int decided_to_slp = 0;
6959 DUMP_VECT_SCOPE ("vect_make_slp_decision");
6961 FOR_EACH_VEC_ELT (slp_instances, i, instance)
6963 /* FORNOW: SLP if you can. */
6964 /* All unroll factors have the form:
6966 GET_MODE_SIZE (vinfo->vector_mode) * X
6968 for some rational X, so they must have a common multiple. */
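/* For instance, unrolling factors of 2 and 4 combine to 4, while 2 and 3
would combine to 6 - force_common_multiple computes their least common
multiple.  */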
6969 unrolling_factor
6970 = force_common_multiple (unrolling_factor,
6971 SLP_INSTANCE_UNROLLING_FACTOR (instance));
6973 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
6974 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
6975 loop-based vectorization. Such stmts will be marked as HYBRID. */
6976 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
6977 decided_to_slp++;
6980 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
6982 if (decided_to_slp && dump_enabled_p ())
6984 dump_printf_loc (MSG_NOTE, vect_location,
6985 "Decided to SLP %d instances. Unrolling factor ",
6986 decided_to_slp);
6987 dump_dec (MSG_NOTE, unrolling_factor);
6988 dump_printf (MSG_NOTE, "\n");
6991 return (decided_to_slp > 0);
6994 /* Private data for vect_detect_hybrid_slp. */
6995 struct vdhs_data
6997 loop_vec_info loop_vinfo;
6998 vec<stmt_vec_info> *worklist;
7001 /* Walker for walk_gimple_op. */
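/* That is, a PURE_SLP definition reached while walking the operands of a
statement on the worklist (a loop_vect or hybrid statement) is marked
hybrid here: its value is also needed outside the SLP graph, so it has
to be vectorized by the loop-based path as well as by SLP.  */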
7003 static tree
7004 vect_detect_hybrid_slp (tree *tp, int *, void *data)
7006 walk_stmt_info *wi = (walk_stmt_info *)data;
7007 vdhs_data *dat = (vdhs_data *)wi->info;
7009 if (wi->is_lhs)
7010 return NULL_TREE;
7012 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7013 if (!def_stmt_info)
7014 return NULL_TREE;
7015 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
7016 if (PURE_SLP_STMT (def_stmt_info))
7018 if (dump_enabled_p ())
7019 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7020 def_stmt_info->stmt);
7021 STMT_SLP_TYPE (def_stmt_info) = hybrid;
7022 dat->worklist->safe_push (def_stmt_info);
7025 return NULL_TREE;
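/* Editorial illustration, not part of the pass: a def belonging to an SLP
   node but also consumed by a loop_vect (non-SLP) stmt becomes hybrid:

     a_1 = b_2 + c_3;   <-- pure_slp, lane of an SLP node
     d_4 = a_1 * e_5;   <-- loop_vect use outside any SLP node

   the walker above re-marks a_1's def stmt as STMT_SLP_TYPE == hybrid so
   it is code-generated both as part of a vector stmt and as a scalar. */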
7028 /* Check whether STMT_INFO is consumed by SLP indirectly and mark it pure_slp
7029 if so, otherwise push it to WORKLIST. */
7031 static void
7032 maybe_push_to_hybrid_worklist (vec_info *vinfo,
7033 vec<stmt_vec_info> &worklist,
7034 stmt_vec_info stmt_info)
7036 if (dump_enabled_p ())
7037 dump_printf_loc (MSG_NOTE, vect_location,
7038 "Processing hybrid candidate : %G", stmt_info->stmt);
7039 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7040 imm_use_iterator iter2;
7041 ssa_op_iter iter1;
7042 use_operand_p use_p;
7043 def_operand_p def_p;
7044 bool any_def = false;
7045 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7047 any_def = true;
7048 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7050 if (is_gimple_debug (USE_STMT (use_p)))
7051 continue;
7052 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
7053 /* An out-of-loop use means this is a loop_vect sink. */
7054 if (!use_info)
7056 if (dump_enabled_p ())
7057 dump_printf_loc (MSG_NOTE, vect_location,
7058 "Found loop_vect sink: %G", stmt_info->stmt);
7059 worklist.safe_push (stmt_info);
7060 return;
7062 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7064 if (dump_enabled_p ())
7065 dump_printf_loc (MSG_NOTE, vect_location,
7066 "Found loop_vect use: %G", use_info->stmt);
7067 worklist.safe_push (stmt_info);
7068 return;
7072 /* No def means this is a loop_vect sink. */
7073 if (!any_def)
7075 if (dump_enabled_p ())
7076 dump_printf_loc (MSG_NOTE, vect_location,
7077 "Found loop_vect sink: %G", stmt_info->stmt);
7078 worklist.safe_push (stmt_info);
7079 return;
7081 if (dump_enabled_p ())
7082 dump_printf_loc (MSG_NOTE, vect_location,
7083 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7084 STMT_SLP_TYPE (stmt_info) = pure_slp;
7087 /* Find stmts that must be both vectorized and SLPed. */
7089 void
7090 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7092 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7094 /* All stmts participating in SLP are marked pure_slp; all other
7095 stmts are loop_vect.
7096 First collect all loop_vect stmts into a worklist.
7097 SLP patterns cause not all original scalar stmts to appear in
7098 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
7099 Rectify this here by doing a backward walk over the IL, considering a
7100 stmt loop_vect only when it is used by a loop_vect stmt, and otherwise
7101 marking it pure_slp. */
7102 auto_vec<stmt_vec_info> worklist;
7103 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7105 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7106 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7107 gsi_next (&gsi))
7109 gphi *phi = gsi.phi ();
7110 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7111 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7112 maybe_push_to_hybrid_worklist (loop_vinfo,
7113 worklist, stmt_info);
7115 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7116 gsi_prev (&gsi))
7118 gimple *stmt = gsi_stmt (gsi);
7119 if (is_gimple_debug (stmt))
7120 continue;
7121 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7122 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7124 for (gimple_stmt_iterator gsi2
7125 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7126 !gsi_end_p (gsi2); gsi_next (&gsi2))
7128 stmt_vec_info patt_info
7129 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7130 if (!STMT_SLP_TYPE (patt_info)
7131 && STMT_VINFO_RELEVANT (patt_info))
7132 maybe_push_to_hybrid_worklist (loop_vinfo,
7133 worklist, patt_info);
7135 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7137 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7138 maybe_push_to_hybrid_worklist (loop_vinfo,
7139 worklist, stmt_info);
7143 /* Now we have a worklist of non-SLP stmts; follow use->def chains and
7144 mark any SLP vectorized stmt as hybrid.
7145 ??? We're visiting def stmts N times (once for each non-SLP and
7146 once for each hybrid-SLP use). */
7147 walk_stmt_info wi;
7148 vdhs_data dat;
7149 dat.worklist = &worklist;
7150 dat.loop_vinfo = loop_vinfo;
7151 memset (&wi, 0, sizeof (wi));
7152 wi.info = (void *)&dat;
7153 while (!worklist.is_empty ())
7155 stmt_vec_info stmt_info = worklist.pop ();
7156 /* Since SSA operands are not set up for pattern stmts we need
7157 to use walk_gimple_op. */
7158 wi.is_lhs = 0;
7159 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
7160 /* For gather/scatter make sure to walk the offset operand, which
7161 can be a scaling and conversion away. */
7162 gather_scatter_info gs_info;
7163 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7164 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7166 int dummy;
7167 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7173 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
7175 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7176 : vec_info (vec_info::bb, shared),
7177 roots (vNULL)
7179 /* The region we are operating on. bbs[0] is the entry, excluding
7180 its PHI nodes. In the future we might want to track an explicit
7181 entry edge to cover bbs[0] PHI nodes and have a region entry
7182 insert location. */
7183 bbs = _bbs.address ();
7184 nbbs = _bbs.length ();
7186 for (unsigned i = 0; i < nbbs; ++i)
7188 if (i != 0)
7189 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7190 gsi_next (&si))
7192 gphi *phi = si.phi ();
7193 gimple_set_uid (phi, 0);
7194 add_stmt (phi);
7196 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7197 !gsi_end_p (gsi); gsi_next (&gsi))
7199 gimple *stmt = gsi_stmt (gsi);
7200 gimple_set_uid (stmt, 0);
7201 if (is_gimple_debug (stmt))
7202 continue;
7203 add_stmt (stmt);
7209 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7210 stmts in the basic block. */
7212 _bb_vec_info::~_bb_vec_info ()
7214 /* Reset region marker. */
7215 for (unsigned i = 0; i < nbbs; ++i)
7217 if (i != 0)
7218 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7219 gsi_next (&si))
7221 gphi *phi = si.phi ();
7222 gimple_set_uid (phi, -1);
7224 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7225 !gsi_end_p (gsi); gsi_next (&gsi))
7227 gimple *stmt = gsi_stmt (gsi);
7228 gimple_set_uid (stmt, -1);
7232 for (unsigned i = 0; i < roots.length (); ++i)
7234 roots[i].stmts.release ();
7235 roots[i].roots.release ();
7236 roots[i].remain.release ();
7238 roots.release ();
7241 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
7242 given that child nodes have already been processed, and that
7243 their def types currently match their SLP node's def type. */
7245 static bool
7246 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7247 slp_instance node_instance,
7248 stmt_vector_for_cost *cost_vec)
7250 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7252 /* Calculate the number of vector statements to be created for the scalar
7253 stmts in this node. It is the number of scalar elements in one scalar
7254 iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
7255 elements in a vector. For a single-defuse-cycle, a lane-reducing op, and a
7256 PHI statement that starts a reduction comprised of only lane-reducing ops,
7257 the number is larger than the number of effective vector statements actually required. */
7258 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7260 /* Handle purely internal nodes. */
7261 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7263 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7264 return false;
7266 stmt_vec_info slp_stmt_info;
7267 unsigned int i;
7268 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7270 if (slp_stmt_info
7271 && STMT_VINFO_LIVE_P (slp_stmt_info)
7272 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7273 node_instance, i,
7274 false, cost_vec))
7275 return false;
7277 return true;
7280 bool dummy;
7281 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7282 node, node_instance, cost_vec);
7285 /* Try to build NODE from scalars, returning true on success.
7286 NODE_INSTANCE is the SLP instance that contains NODE. */
7288 static bool
7289 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7290 slp_instance node_instance)
7292 stmt_vec_info stmt_info;
7293 unsigned int i;
7295 if (!is_a <bb_vec_info> (vinfo)
7296 || node == SLP_INSTANCE_TREE (node_instance)
7297 || !SLP_TREE_SCALAR_STMTS (node).exists ()
7298 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7299 /* Force the mask use to be built from scalars instead. */
7300 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node)))
7301 return false;
7303 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7304 if (!stmt_info)
7305 return false;
7307 if (dump_enabled_p ())
7308 dump_printf_loc (MSG_NOTE, vect_location,
7309 "Building vector operands of %p from scalars instead\n",
7310 (void *) node);
7312 /* Don't remove and free the child nodes here, since they could be
7313 referenced by other structures. The analysis and scheduling phases
7314 (need to) ignore child nodes of anything that isn't vect_internal_def. */
7315 unsigned int group_size = SLP_TREE_LANES (node);
7316 SLP_TREE_DEF_TYPE (node) = vect_external_def;
7317 /* Invariants get their vector type from the uses. */
7318 SLP_TREE_VECTYPE (node) = NULL_TREE;
7319 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7320 SLP_TREE_LOAD_PERMUTATION (node).release ();
7321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7323 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7324 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7326 return true;
7329 /* Return true if all elements of the slice are the same. */
7330 bool
7331 vect_scalar_ops_slice::all_same_p () const
7333 for (unsigned int i = 1; i < length; ++i)
7334 if (!operand_equal_p (op (0), op (i)))
7335 return false;
7336 return true;
7339 hashval_t
7340 vect_scalar_ops_slice_hash::hash (const value_type &s)
7342 hashval_t hash = 0;
7343 for (unsigned i = 0; i < s.length; ++i)
7344 hash = iterative_hash_expr (s.op (i), hash);
7345 return hash;
7348 bool
7349 vect_scalar_ops_slice_hash::equal (const value_type &s1,
7350 const compare_type &s2)
7352 if (s1.length != s2.length)
7353 return false;
7354 for (unsigned i = 0; i < s1.length; ++i)
7355 if (!operand_equal_p (s1.op (i), s2.op (i)))
7356 return false;
7357 return true;
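/* Editorial illustration, not part of the pass: a slice denotes LENGTH
   scalar operands of a node starting at START, so with
   ops = { a_1, b_2, a_1, b_2 } the two two-element slices { ops, 0, 2 }
   and { ops, 2, 2 } hash and compare equal; vect_prologue_cost_for_slp
   below uses this to cost such a repeated vector only once. */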
7360 /* Compute the prologue cost for invariant or constant operands represented
7361 by NODE. */
7363 static void
7364 vect_prologue_cost_for_slp (slp_tree node,
7365 stmt_vector_for_cost *cost_vec)
7367 /* There's a special case of an existing vector, which costs nothing. */
7368 if (SLP_TREE_SCALAR_OPS (node).length () == 0
7369 && !SLP_TREE_VEC_DEFS (node).is_empty ())
7370 return;
7371 /* Without looking at the actual initializer a vector of
7372 constants can be implemented as load from the constant pool.
7373 When all elements are the same we can use a splat. */
7374 tree vectype = SLP_TREE_VECTYPE (node);
7375 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7376 unsigned HOST_WIDE_INT const_nunits;
7377 unsigned nelt_limit;
7378 auto ops = &SLP_TREE_SCALAR_OPS (node);
7379 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7380 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7381 && ! multiple_p (const_nunits, group_size))
7383 nelt_limit = const_nunits;
7384 hash_set<vect_scalar_ops_slice_hash> vector_ops;
7385 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7386 if (!vector_ops.add ({ ops, i * const_nunits, const_nunits }))
7387 starts.quick_push (i * const_nunits);
7389 else
7391 /* If either the vector has variable length or the vectors
7392 are composed of repeated whole groups we only need to
7393 cost construction once. All vectors will be the same. */
7394 nelt_limit = group_size;
7395 starts.quick_push (0);
7397 /* ??? We're just tracking whether vectors in a single node are the same.
7398 Ideally we'd do something more global. */
7399 bool passed = false;
7400 for (unsigned int start : starts)
7402 vect_cost_for_stmt kind;
7403 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7404 kind = vector_load;
7405 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7406 kind = scalar_to_vec;
7407 else
7408 kind = vec_construct;
7409 /* The target cost hook has no idea which part of the SLP node
7410 we are costing so avoid passing it down more than once. Pass
7411 it to the first vec_construct or scalar_to_vec part since for those
7412 the x86 backend tries to account for GPR to XMM register moves. */
7413 record_stmt_cost (cost_vec, 1, kind,
7414 (kind != vector_load && !passed) ? node : nullptr,
7415 vectype, 0, vect_prologue);
7416 if (kind != vector_load)
7417 passed = true;
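/* Editorial illustration, not part of the pass: for a V4SI node the
   classification above maps scalar operand vectors to costs roughly as

     { 1, 2, 3, 4 }          constant defs           -> vector_load
     { x_1, x_1, x_1, x_1 }  all-equal external defs -> scalar_to_vec (splat)
     { x_1, y_2, z_3, w_4 }  mixed external defs     -> vec_construct

   with duplicate vector-sized slices counted only once via the
   vect_scalar_ops_slice_hash set. */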
7421 /* Analyze statements contained in SLP tree NODE after recursively analyzing
7422 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7424 Return true if the operations are supported. */
7426 static bool
7427 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7428 slp_instance node_instance,
7429 hash_set<slp_tree> &visited_set,
7430 vec<slp_tree> &visited_vec,
7431 stmt_vector_for_cost *cost_vec)
7433 int i, j;
7434 slp_tree child;
7436 /* Assume we can code-generate all invariants. */
7437 if (!node
7438 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7439 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7440 return true;
7442 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
7444 if (dump_enabled_p ())
7445 dump_printf_loc (MSG_NOTE, vect_location,
7446 "Failed cyclic SLP reference in %p\n", (void *) node);
7447 return false;
7449 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
7451 /* If we already analyzed the exact same set of scalar stmts we're done.
7452 We share the generated vector stmts for those. */
7453 if (visited_set.add (node))
7454 return true;
7455 visited_vec.safe_push (node);
7457 bool res = true;
7458 unsigned visited_rec_start = visited_vec.length ();
7459 unsigned cost_vec_rec_start = cost_vec->length ();
7460 bool seen_non_constant_child = false;
7461 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7463 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
7464 visited_set, visited_vec,
7465 cost_vec);
7466 if (!res)
7467 break;
7468 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
7469 seen_non_constant_child = true;
7471 /* We're having difficulties scheduling nodes with just constant
7472 operands and no scalar stmts since we then cannot compute a stmt
7473 insertion place. */
7474 if (!seen_non_constant_child && SLP_TREE_SCALAR_STMTS (node).is_empty ())
7476 if (dump_enabled_p ())
7477 dump_printf_loc (MSG_NOTE, vect_location,
7478 "Cannot vectorize all-constant op node %p\n",
7479 (void *) node);
7480 res = false;
7483 if (res)
7484 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
7485 cost_vec);
7486 /* If analysis failed we have to pop all recursive visited nodes
7487 plus ourselves. */
7488 if (!res)
7490 while (visited_vec.length () >= visited_rec_start)
7491 visited_set.remove (visited_vec.pop ());
7492 cost_vec->truncate (cost_vec_rec_start);
7495 /* When the node can be vectorized cost invariant nodes it references.
7496 This is not done in DFS order to allow the referring node's
7497 vectorizable_* calls to nail down the invariant node's vector type
7498 and possibly unshare it if it needs a different vector type than
7499 other referrers. */
7500 if (res)
7501 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
7502 if (child
7503 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
7504 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
7505 /* Perform usual caching, note code-generation still
7506 code-gens these nodes multiple times but we expect
7507 to CSE them later. */
7508 && !visited_set.add (child))
7510 visited_vec.safe_push (child);
7511 /* ??? After auditing more code paths make a "default"
7512 and push the vector type from NODE to all children
7513 if it is not already set. */
7514 /* Compute the number of vectors to be generated. */
7515 tree vector_type = SLP_TREE_VECTYPE (child);
7516 if (!vector_type)
7518 /* For shifts with a scalar argument we don't need
7519 to cost or code-generate anything.
7520 ??? Represent this more explicitly. */
7521 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
7522 == shift_vec_info_type)
7523 && j == 1);
7524 continue;
7527 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
7528 = vect_get_num_copies (vinfo, child);
7529 /* And cost them. */
7530 vect_prologue_cost_for_slp (child, cost_vec);
7533 /* If this node or any of its children can't be vectorized, try pruning
7534 the tree here rather than felling the whole thing. */
7535 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
7537 /* We'll need to revisit this for invariant costing and number
7538 of vectorized stmt setting. */
7539 res = true;
7542 return res;
7545 /* Given a definition DEF, analyze if it will have any live scalar use after
7546 performing SLP vectorization whose information is represented by BB_VINFO,
7547 and record the result into hash map SCALAR_USE_MAP as a cache for later
7548 fast checks. If recursion DEPTH exceeds a limit, stop the analysis and
7549 make a conservative assumption. Return 0 if there is no scalar use, 1 if
7550 there is one, and -1 if recursion was limited. */
7552 static int
7553 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
7554 hash_map<tree, int> &scalar_use_map,
7555 int depth = 0)
7557 const int depth_limit = 2;
7558 imm_use_iterator use_iter;
7559 gimple *use_stmt;
7561 if (int *res = scalar_use_map.get (def))
7562 return *res;
7564 int scalar_use = 1;
7566 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
7568 if (is_gimple_debug (use_stmt))
7569 continue;
7571 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
7573 if (!use_stmt_info)
7574 break;
7576 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7577 continue;
7579 /* Do not step forward when encountering a PHI statement, since it may
7580 involve a cyclic reference and cause infinite recursive invocation. */
7581 if (gimple_code (use_stmt) == GIMPLE_PHI)
7582 break;
7584 /* When pattern recognition is involved, a statement whose definition is
7585 consumed in some pattern may not be included in the final replacement
7586 pattern statements, so it would be skipped when building the SLP graph.
7588 * Original
7589 char a_c = *(char *) a;
7590 char b_c = *(char *) b;
7591 unsigned short a_s = (unsigned short) a_c;
7592 int a_i = (int) a_s;
7593 int b_i = (int) b_c;
7594 int r_i = a_i - b_i;
7596 * After pattern replacement
7597 a_s = (unsigned short) a_c;
7598 a_i = (int) a_s;
7600 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
7601 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
7603 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
7604 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
7606 The definitions of a_i (original statement) and b_i (pattern statement)
7607 are related to, but not actually part of, the widen_minus pattern.
7608 Vectorizing the pattern does not cause these definition statements to
7609 be marked as PURE_SLP. For this case, we need to recursively check
7610 whether their uses are all absorbed into vectorized code. But there
7611 is an exception: some use may participate in a vectorized
7612 operation via an external SLP node containing that use as an element.
7613 The parameter "scalar_use_map" tags such SSA names as having a scalar
7614 use in advance. */
7615 tree lhs = gimple_get_lhs (use_stmt);
7617 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
7618 break;
7620 if (depth_limit && depth >= depth_limit)
7621 return -1;
7623 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
7624 depth + 1)))
7625 break;
7628 if (end_imm_use_stmt_p (&use_iter))
7629 scalar_use = 0;
7631 /* If recursion is limited, do not cache result for non-root defs. */
7632 if (!depth || scalar_use >= 0)
7634 bool added = scalar_use_map.put (def, scalar_use);
7635 gcc_assert (!added);
7638 return scalar_use;
7641 /* Mark lanes of NODE that are live outside of the basic-block vectorized
7642 region and that can be vectorized using vectorizable_live_operation
7643 with STMT_VINFO_LIVE_P. Live operations that are not handled cause the
7644 scalar code computing them to be retained. */
7646 static void
7647 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
7648 slp_instance instance,
7649 stmt_vector_for_cost *cost_vec,
7650 hash_map<tree, int> &scalar_use_map,
7651 hash_set<stmt_vec_info> &svisited,
7652 hash_set<slp_tree> &visited)
7654 if (visited.add (node))
7655 return;
7657 unsigned i;
7658 stmt_vec_info stmt_info;
7659 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
7660 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7662 if (!stmt_info || svisited.contains (stmt_info))
7663 continue;
7664 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
7665 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
7666 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
7667 /* Only the pattern root stmt computes the original scalar value. */
7668 continue;
7669 bool mark_visited = true;
7670 gimple *orig_stmt = orig_stmt_info->stmt;
7671 ssa_op_iter op_iter;
7672 def_operand_p def_p;
7673 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
7675 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
7676 scalar_use_map))
7678 STMT_VINFO_LIVE_P (stmt_info) = true;
7679 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
7680 instance, i, false, cost_vec))
7681 /* ??? So we know we can vectorize the live stmt from one SLP
7682 node. If we cannot do so from all or none consistently
7683 we'd have to record which SLP node (and lane) we want to
7684 use for the live operation. So make sure we can
7685 code-generate from all nodes. */
7686 mark_visited = false;
7687 else
7688 STMT_VINFO_LIVE_P (stmt_info) = false;
7691 /* We have to verify whether we can insert the lane extract
7692 before all uses. The following is a conservative approximation.
7693 We cannot put this into vectorizable_live_operation because
7694 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
7695 doesn't work.
7696 Note that while the fact that we emit code for loads at the
7697 first load should make this a non-problem, leafs we construct
7698 from scalars are vectorized after the last scalar def.
7699 ??? If we'd actually compute the insert location during
7700 analysis we could use sth less conservative than the last
7701 scalar stmt in the node for the dominance check. */
7702 /* ??? What remains is "live" uses in vector CTORs in the same
7703 SLP graph which is where those uses can end up code-generated
7704 right after their definition instead of close to their original
7705 use. But that would restrict us to code-generate lane-extracts
7706 from the latest stmt in a node. So we compensate for this
7707 during code-generation, simply not replacing uses for those
7708 hopefully rare cases. */
7709 imm_use_iterator use_iter;
7710 gimple *use_stmt;
7711 stmt_vec_info use_stmt_info;
7713 if (STMT_VINFO_LIVE_P (stmt_info))
7714 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
7715 if (!is_gimple_debug (use_stmt)
7716 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
7717 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
7718 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
7720 if (dump_enabled_p ())
7721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7722 "Cannot determine insertion place for "
7723 "lane extract\n");
7724 STMT_VINFO_LIVE_P (stmt_info) = false;
7725 mark_visited = true;
7728 if (mark_visited)
7729 svisited.add (stmt_info);
7732 slp_tree child;
7733 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7734 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
7735 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
7736 scalar_use_map, svisited, visited);
7739 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
7740 are live outside of the basic-block vectorized region and that can be
7741 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
7743 static void
7744 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
7746 if (bb_vinfo->slp_instances.is_empty ())
7747 return;
7749 hash_set<stmt_vec_info> svisited;
7750 hash_set<slp_tree> visited;
7751 hash_map<tree, int> scalar_use_map;
7752 auto_vec<slp_tree> worklist;
7754 for (slp_instance instance : bb_vinfo->slp_instances)
7756 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
7757 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
7758 if (TREE_CODE (op) == SSA_NAME)
7759 scalar_use_map.put (op, 1);
7760 if (!visited.add (SLP_INSTANCE_TREE (instance)))
7761 worklist.safe_push (SLP_INSTANCE_TREE (instance));
7766 slp_tree node = worklist.pop ();
7768 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
7770 for (tree op : SLP_TREE_SCALAR_OPS (node))
7771 if (TREE_CODE (op) == SSA_NAME)
7772 scalar_use_map.put (op, 1);
7774 else
7776 for (slp_tree child : SLP_TREE_CHILDREN (node))
7777 if (child && !visited.add (child))
7778 worklist.safe_push (child);
7781 while (!worklist.is_empty ());
7783 visited.empty ();
7785 for (slp_instance instance : bb_vinfo->slp_instances)
7787 vect_location = instance->location ();
7788 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
7789 instance, &instance->cost_vec,
7790 scalar_use_map, svisited, visited);
7794 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
7796 static bool
7797 vectorizable_bb_reduc_epilogue (slp_instance instance,
7798 stmt_vector_for_cost *cost_vec)
7800 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
7801 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
7802 if (reduc_code == MINUS_EXPR)
7803 reduc_code = PLUS_EXPR;
7804 internal_fn reduc_fn;
7805 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
7806 if (!vectype
7807 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
7808 || reduc_fn == IFN_LAST
7809 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
7810 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
7811 TREE_TYPE (vectype)))
7813 if (dump_enabled_p ())
7814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7815 "not vectorized: basic block reduction epilogue "
7816 "operation unsupported.\n");
7817 return false;
7820 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
7821 cost log2 vector operations plus shuffles and one extraction. */
7822 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
7823 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
7824 vectype, 0, vect_body);
7825 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
7826 vectype, 0, vect_body);
7827 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
7828 vectype, 0, vect_body);
7830 /* Since we replace all stmts of a possibly longer scalar reduction
7831 chain, account for the extra scalar stmts for that. */
7832 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
7833 instance->root_stmts[0], 0, vect_body);
7834 return true;
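/* Editorial illustration, not part of the pass: for a V4SF reduction the
   epilogue cost recorded above amounts to

     steps = floor_log2 (4) = 2
     => 2 * vector_stmt + 2 * vec_perm + 1 * vec_to_scalar

   i.e. a log2-depth shuffle-and-op ladder followed by one lane extract. */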
7837 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
7838 and recurse to children. */
7840 static void
7841 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
7842 hash_set<slp_tree> &visited)
7844 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7845 || visited.add (node))
7846 return;
7848 stmt_vec_info stmt;
7849 unsigned i;
7850 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
7851 if (stmt)
7852 roots.remove (vect_orig_stmt (stmt));
7854 slp_tree child;
7855 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7856 if (child)
7857 vect_slp_prune_covered_roots (child, roots, visited);
7860 /* Analyze statements in SLP instances of VINFO. Return true if the
7861 operations are supported. */
7863 bool
7864 vect_slp_analyze_operations (vec_info *vinfo)
7866 slp_instance instance;
7867 int i;
7869 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
7871 hash_set<slp_tree> visited;
7872 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
7874 auto_vec<slp_tree> visited_vec;
7875 stmt_vector_for_cost cost_vec;
7876 cost_vec.create (2);
7877 if (is_a <bb_vec_info> (vinfo))
7878 vect_location = instance->location ();
7879 if (!vect_slp_analyze_node_operations (vinfo,
7880 SLP_INSTANCE_TREE (instance),
7881 instance, visited, visited_vec,
7882 &cost_vec)
7883 /* CTOR instances require vectorized defs for the SLP tree root. */
7884 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
7885 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
7886 != vect_internal_def
7887 /* Make sure we vectorized with the expected type. */
7888 || !useless_type_conversion_p
7889 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
7890 (instance->root_stmts[0]->stmt))),
7891 TREE_TYPE (SLP_TREE_VECTYPE
7892 (SLP_INSTANCE_TREE (instance))))))
7893 /* Check we can vectorize the reduction. */
7894 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
7895 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec)))
7897 slp_tree node = SLP_INSTANCE_TREE (instance);
7898 stmt_vec_info stmt_info;
7899 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7900 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
7901 else
7902 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
7903 if (dump_enabled_p ())
7904 dump_printf_loc (MSG_NOTE, vect_location,
7905 "removing SLP instance operations starting from: %G",
7906 stmt_info->stmt);
7907 vect_free_slp_instance (instance);
7908 vinfo->slp_instances.ordered_remove (i);
7909 cost_vec.release ();
7910 while (!visited_vec.is_empty ())
7911 visited.remove (visited_vec.pop ());
7913 else
7915 i++;
7916 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
7918 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
7919 cost_vec.release ();
7921 else
7922 /* For BB vectorization remember the SLP graph entry
7923 cost for later. */
7924 instance->cost_vec = cost_vec;
7928 /* Now look for SLP instances with a root that are covered by other
7929 instances and remove them. */
7930 hash_set<stmt_vec_info> roots;
7931 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
7932 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
7933 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
7934 if (!roots.is_empty ())
7936 visited.empty ();
7937 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
7938 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
7939 visited);
7940 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
7941 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
7942 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
7944 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
7945 if (dump_enabled_p ())
7946 dump_printf_loc (MSG_NOTE, vect_location,
7947 "removing SLP instance operations starting "
7948 "from: %G", root->stmt);
7949 vect_free_slp_instance (instance);
7950 vinfo->slp_instances.ordered_remove (i);
7952 else
7953 ++i;
7956 /* Compute vectorizable live stmts. */
7957 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
7958 vect_bb_slp_mark_live_stmts (bb_vinfo);
7960 return !vinfo->slp_instances.is_empty ();
7963 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
7964 compressing any leader chain encountered along the way. */
7966 static slp_instance
7967 get_ultimate_leader (slp_instance instance,
7968 hash_map<slp_instance, slp_instance> &instance_leader)
7970 auto_vec<slp_instance *, 8> chain;
7971 slp_instance *tem;
7972 while (*(tem = instance_leader.get (instance)) != instance)
7974 chain.safe_push (tem);
7975 instance = *tem;
7977 while (!chain.is_empty ())
7978 *chain.pop () = instance;
7979 return instance;
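/* Editorial sketch, not part of the pass: the leader chain above behaves
   like union-find with path compression. A standalone analogue over ints
   (hypothetical helper, assuming the map is pre-seeded with self-leaders):

     #include <unordered_map>

     static int
     find_leader (std::unordered_map<int, int> &leader, int x)
     {
       int root = x;
       while (leader[root] != root)  // walk up to the ultimate leader
         root = leader[root];
       while (leader[x] != root)     // compress the visited path
         {
           int next = leader[x];
           leader[x] = root;
           x = next;
         }
       return root;
     }

   get_ultimate_leader does the same keyed on slp_instance, remembering
   pointers to the visited map slots and rewriting them in place. */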
7982 namespace {
7983 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
7984 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
7985 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
7987 INSTANCE_LEADER is as for get_ultimate_leader. */
7989 template<typename T>
7990 bool
7991 vect_map_to_instance (slp_instance instance, T key,
7992 hash_map<T, slp_instance> &key_to_instance,
7993 hash_map<slp_instance, slp_instance> &instance_leader)
7995 bool existed_p;
7996 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
7997 if (!existed_p)
7999 else if (key_instance != instance)
8001 /* If we're running into a previously marked key make us the
8002 leader of the current ultimate leader. This keeps the
8003 leader chain acyclic and works even when the current instance
8004 connects two previously independent graph parts. */
8005 slp_instance key_leader
8006 = get_ultimate_leader (key_instance, instance_leader);
8007 if (key_leader != instance)
8008 instance_leader.put (key_leader, instance);
8010 key_instance = instance;
8011 return existed_p;
8015 /* Worker of vect_bb_partition_graph, recurse on NODE. */
8017 static void
8018 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8019 slp_instance instance, slp_tree node,
8020 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8021 hash_map<slp_tree, slp_instance> &node_to_instance,
8022 hash_map<slp_instance, slp_instance> &instance_leader)
8024 stmt_vec_info stmt_info;
8025 unsigned i;
8027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8028 if (stmt_info)
8029 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8030 instance_leader);
8032 if (vect_map_to_instance (instance, node, node_to_instance,
8033 instance_leader))
8034 return;
8036 slp_tree child;
8037 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8038 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8039 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8040 node_to_instance, instance_leader);
8043 /* Partition the SLP graph into pieces that can be costed independently. */
8045 static void
8046 vect_bb_partition_graph (bb_vec_info bb_vinfo)
8048 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8050 /* First walk the SLP graph assigning each involved scalar stmt a
8051 corresponding SLP graph entry and upon visiting a previously
8052 marked stmt, make the stmt's leader the current SLP graph entry. */
8053 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8054 hash_map<slp_tree, slp_instance> node_to_instance;
8055 hash_map<slp_instance, slp_instance> instance_leader;
8056 slp_instance instance;
8057 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8059 instance_leader.put (instance, instance);
8060 vect_bb_partition_graph_r (bb_vinfo,
8061 instance, SLP_INSTANCE_TREE (instance),
8062 stmt_to_instance, node_to_instance,
8063 instance_leader);
8066 /* Then collect entries to each independent subgraph. */
8067 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8069 slp_instance leader = get_ultimate_leader (instance, instance_leader);
8070 leader->subgraph_entries.safe_push (instance);
8071 if (dump_enabled_p ()
8072 && leader != instance)
8073 dump_printf_loc (MSG_NOTE, vect_location,
8074 "instance %p is leader of %p\n",
8075 (void *) leader, (void *) instance);
8079 /* Compute the set of scalar stmts participating in internal and external
8080 nodes. */
8082 static void
8083 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8084 hash_set<slp_tree> &visited,
8085 hash_set<stmt_vec_info> &vstmts,
8086 hash_set<stmt_vec_info> &estmts)
8088 int i;
8089 stmt_vec_info stmt_info;
8090 slp_tree child;
8092 if (visited.add (node))
8093 return;
8095 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8097 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8098 if (stmt_info)
8099 vstmts.add (stmt_info);
8101 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8102 if (child)
8103 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8104 vstmts, estmts);
8106 else
8107 for (tree def : SLP_TREE_SCALAR_OPS (node))
8109 stmt_vec_info def_stmt = vinfo->lookup_def (def);
8110 if (def_stmt)
8111 estmts.add (def_stmt);
8116 /* Compute the scalar cost of the SLP node NODE and its children
8117 and record it in COST_VEC. Do not account defs that are marked in LIFE and
8118 update LIFE according to uses of NODE. */
8120 static void
8121 vect_bb_slp_scalar_cost (vec_info *vinfo,
8122 slp_tree node, vec<bool, va_heap> *life,
8123 stmt_vector_for_cost *cost_vec,
8124 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8125 hash_set<slp_tree> &visited)
8127 unsigned i;
8128 stmt_vec_info stmt_info;
8129 slp_tree child;
8131 if (visited.add (node))
8132 return;
8134 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8136 ssa_op_iter op_iter;
8137 def_operand_p def_p;
8139 if (!stmt_info || (*life)[i])
8140 continue;
8142 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8143 gimple *orig_stmt = orig_stmt_info->stmt;
8145 /* If there is a non-vectorized use of the defs then the scalar
8146 stmt is kept live in which case we do not account it or any
8147 required defs in the SLP children in the scalar cost. This
8148 way we make the vectorization more costly when compared to
8149 the scalar cost. */
8150 if (!STMT_VINFO_LIVE_P (stmt_info))
8152 auto_vec<gimple *, 8> worklist;
8153 hash_set<gimple *> *worklist_visited = NULL;
8154 worklist.quick_push (orig_stmt);
8157 gimple *work_stmt = worklist.pop ();
8158 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8160 imm_use_iterator use_iter;
8161 gimple *use_stmt;
8162 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8163 DEF_FROM_PTR (def_p))
8164 if (!is_gimple_debug (use_stmt))
8166 stmt_vec_info use_stmt_info
8167 = vinfo->lookup_stmt (use_stmt);
8168 if (!use_stmt_info
8169 || !vectorized_scalar_stmts.contains (use_stmt_info))
8171 if (use_stmt_info
8172 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
8174 /* For stmts participating in patterns we have
8175 to check their uses recursively. */
8176 if (!worklist_visited)
8177 worklist_visited = new hash_set<gimple *> ();
8178 if (!worklist_visited->add (use_stmt))
8179 worklist.safe_push (use_stmt);
8180 continue;
8182 (*life)[i] = true;
8183 goto next_lane;
8188 while (!worklist.is_empty ());
8189 next_lane:
8190 if (worklist_visited)
8191 delete worklist_visited;
8192 if ((*life)[i])
8193 continue;
8196 /* Count scalar stmts only once. */
8197 if (gimple_visited_p (orig_stmt))
8198 continue;
8199 gimple_set_visited (orig_stmt, true);
8201 vect_cost_for_stmt kind;
8202 if (STMT_VINFO_DATA_REF (orig_stmt_info))
8204 data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8205 tree base = get_base_address (DR_REF (dr));
8206 /* When the scalar access is to a non-global not address-taken
8207 decl that is not BLKmode assume we can access it with a single
8208 non-load/store instruction. */
8209 if (DECL_P (base)
8210 && !is_global_var (base)
8211 && !TREE_ADDRESSABLE (base)
8212 && DECL_MODE (base) != BLKmode)
8213 kind = scalar_stmt;
8214 else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8215 kind = scalar_load;
8216 else
8217 kind = scalar_store;
8219 else if (vect_nop_conversion_p (orig_stmt_info))
8220 continue;
8221 /* For single-argument PHIs assume coalescing which means zero cost
8222 for the scalar and the vector PHIs. This avoids artificially
8223 favoring the vector path (but may pessimize it in some cases). */
8224 else if (is_a <gphi *> (orig_stmt_info->stmt)
8225 && gimple_phi_num_args
8226 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8227 continue;
8228 else
8229 kind = scalar_stmt;
8230 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8231 SLP_TREE_VECTYPE (node), 0, vect_body);
8234 auto_vec<bool, 20> subtree_life;
8235 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8237 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8239 /* Do not directly pass LIFE to the recursive call, copy it to
8240 confine changes in the callee to the current child/subtree. */
8241 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8243 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8244 for (unsigned j = 0;
8245 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8247 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8248 if (perm.first == i)
8249 subtree_life[perm.second] = (*life)[j];
8252 else
8254 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8255 subtree_life.safe_splice (*life);
8257 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8258 vectorized_scalar_stmts, visited);
8259 subtree_life.truncate (0);
8264 /* Comparator for the loop-index sorted cost vectors. */
8266 static int
8267 li_cost_vec_cmp (const void *a_, const void *b_)
8269 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8270 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8271 if (a->first < b->first)
8272 return -1;
8273 else if (a->first == b->first)
8274 return 0;
8275 return 1;
8278 /* Check if vectorization of the basic block is profitable for the
8279 subgraph denoted by SLP_INSTANCES. */
8281 static bool
8282 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8283 vec<slp_instance> slp_instances,
8284 loop_p orig_loop)
8286 slp_instance instance;
8287 int i;
8288 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8289 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8291 if (dump_enabled_p ())
8293 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8294 hash_set<slp_tree> visited;
8295 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8296 vect_print_slp_graph (MSG_NOTE, vect_location,
8297 SLP_INSTANCE_TREE (instance), visited);
8300 /* Compute the set of scalar stmts we know will go away 'locally' when
8301 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8302 not accurate for nodes promoted extern late or for scalar stmts that
8303 are used both in extern defs and in vectorized defs. */
8304 hash_set<stmt_vec_info> vectorized_scalar_stmts;
8305 hash_set<stmt_vec_info> scalar_stmts_in_externs;
8306 hash_set<slp_tree> visited;
8307 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8309 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8310 SLP_INSTANCE_TREE (instance),
8311 visited,
8312 vectorized_scalar_stmts,
8313 scalar_stmts_in_externs);
8314 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8315 vectorized_scalar_stmts.add (rstmt);
8317 /* Scalar stmts used as defs in external nodes need to be preserved, so
8318 remove them from vectorized_scalar_stmts. */
8319 for (stmt_vec_info stmt : scalar_stmts_in_externs)
8320 vectorized_scalar_stmts.remove (stmt);
8322 /* Calculate scalar cost and sum the cost for the vector stmts
8323 previously collected. */
8324 stmt_vector_for_cost scalar_costs = vNULL;
8325 stmt_vector_for_cost vector_costs = vNULL;
8326 visited.empty ();
8327 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8329 auto_vec<bool, 20> life;
8330 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8331 true);
8332 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8333 record_stmt_cost (&scalar_costs,
8334 SLP_INSTANCE_ROOT_STMTS (instance).length (),
8335 scalar_stmt,
8336 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8337 vect_bb_slp_scalar_cost (bb_vinfo,
8338 SLP_INSTANCE_TREE (instance),
8339 &life, &scalar_costs, vectorized_scalar_stmts,
8340 visited);
8341 vector_costs.safe_splice (instance->cost_vec);
8342 instance->cost_vec.release ();
8345 if (dump_enabled_p ())
8346 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
8348 /* When costing non-loop vectorization we need to consider each covered
8349 loop independently and make sure vectorization is profitable. For
8351 now we assume a loop may not be entered or may be executed an arbitrary
8352 number of iterations (??? static information can provide more
8353 precise info here), which means we can simply cost each containing
8354 loop's stmts separately. */
8355 /* First produce cost vectors sorted by loop index. */
8356 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8357 li_scalar_costs (scalar_costs.length ());
8358 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8359 li_vector_costs (vector_costs.length ());
8360 stmt_info_for_cost *cost;
8361 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8363 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8364 li_scalar_costs.quick_push (std::make_pair (l, cost));
8366 /* Use an arbitrary already-used loop as fallback in case the first
8367 vector_costs entry does not have a stmt_info associated with it. */
8368 unsigned l = li_scalar_costs[0].first;
8369 FOR_EACH_VEC_ELT (vector_costs, i, cost)
8371 /* We inherit the loop from the previous COST; invariants, externals and
8372 extracts immediately follow the cost for the related stmt. */
8373 if (cost->stmt_info)
8374 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8375 li_vector_costs.quick_push (std::make_pair (l, cost));
8377 li_scalar_costs.qsort (li_cost_vec_cmp);
8378 li_vector_costs.qsort (li_cost_vec_cmp);
8380 /* Now cost the portions individually. */
8381 unsigned vi = 0;
8382 unsigned si = 0;
8383 bool profitable = true;
8384 while (si < li_scalar_costs.length ()
8385 && vi < li_vector_costs.length ())
8387 unsigned sl = li_scalar_costs[si].first;
8388 unsigned vl = li_vector_costs[vi].first;
8389 if (sl != vl)
8391 if (dump_enabled_p ())
8392 dump_printf_loc (MSG_NOTE, vect_location,
8393 "Scalar %d and vector %d loop part do not "
8394 "match up, skipping scalar part\n", sl, vl);
8395 /* Skip the scalar part, assuming zero cost on the vector side. */
8398 si++;
8400 while (si < li_scalar_costs.length ()
8401 && li_scalar_costs[si].first == sl);
8402 continue;
8405 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8408 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
8409 si++;
8411 while (si < li_scalar_costs.length ()
8412 && li_scalar_costs[si].first == sl);
8413 unsigned dummy;
8414 finish_cost (scalar_target_cost_data, nullptr,
8415 &dummy, &scalar_cost, &dummy);
8417 /* Complete the target-specific vector cost calculation. */
8418 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
8421 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
8422 vi++;
8424 while (vi < li_vector_costs.length ()
8425 && li_vector_costs[vi].first == vl);
8426 finish_cost (vect_target_cost_data, scalar_target_cost_data,
8427 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
8428 delete scalar_target_cost_data;
8429 delete vect_target_cost_data;
8431 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
8433 if (dump_enabled_p ())
8435 dump_printf_loc (MSG_NOTE, vect_location,
8436 "Cost model analysis for part in loop %d:\n", sl);
8437 dump_printf (MSG_NOTE, " Vector cost: %d\n",
8438 vec_inside_cost + vec_outside_cost);
8439 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
8442 /* Vectorization is profitable if its cost is not more than the cost of the
8443 scalar version. Note that we err on the vector side for equal cost because
8444 the cost estimate is otherwise quite pessimistic (constant uses are
8445 free on the scalar side but cost a load on the vector side for
8446 example). */
8447 if (vec_outside_cost + vec_inside_cost > scalar_cost)
8449 profitable = false;
8450 break;
8453 if (profitable && vi < li_vector_costs.length ())
8455 if (dump_enabled_p ())
8456 dump_printf_loc (MSG_NOTE, vect_location,
8457 "Excess vector cost for part in loop %d:\n",
8458 li_vector_costs[vi].first);
8459 profitable = false;
8462 /* Unset visited flag. This is delayed when the subgraph is profitable
8463 and we process the loop for remaining unvectorized if-converted code. */
8464 if (!orig_loop || !profitable)
8465 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8466 gimple_set_visited (cost->stmt_info->stmt, false);
8468 scalar_costs.release ();
8469 vector_costs.release ();
8471 return profitable;
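/* Editorial illustration, not part of the pass: with one subgraph part
   costed as vec_prologue_cost = 4, vec_inside_cost = 6, vec_epilogue_cost = 0
   and scalar_cost = 10 the check above computes

     vec_outside_cost + vec_inside_cost = 4 + 6 = 10 <= scalar_cost = 10

   so the part stays profitable (ties go to the vector side), whereas a
   vector cost of 11 would mark the whole subgraph unprofitable. */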
8474 /* qsort comparator for lane defs. */
8476 static int
8477 vld_cmp (const void *a_, const void *b_)
8479 auto *a = (const std::pair<unsigned, tree> *)a_;
8480 auto *b = (const std::pair<unsigned, tree> *)b_;
8481 return a->first - b->first;
8484 /* Return true if USE_STMT is a vector lane insert into VEC and set
8485 *THIS_LANE to the lane number that is set. */
8487 static bool
8488 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
8490 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
8491 if (!use_ass
8492 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
8493 || (vec
8494 ? gimple_assign_rhs1 (use_ass) != vec
8495 : ((vec = gimple_assign_rhs1 (use_ass)), false))
8496 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
8497 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
8498 || !constant_multiple_p
8499 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
8500 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
8501 this_lane))
8502 return false;
8503 return true;
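/* Editorial illustration, not part of the pass: the lane is recovered from
   the constant bit position of the insert, e.g. for a V4SI vector

     vec_2 = BIT_INSERT_EXPR <vec_1, s_3, 64>;

   has element size TYPE_SIZE (int) == 32 bits, so 64 / 32 yields
   *THIS_LANE == 2. */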
8506 /* Find any vectorizable constructors and other SLP roots in the region and
8507 record them in BB_VINFO->roots. */
8509 static void
8510 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
8512 for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
8513 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
8514 !gsi_end_p (gsi); gsi_next (&gsi))
8516 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
8517 if (!assign)
8518 continue;
8520 tree rhs = gimple_assign_rhs1 (assign);
8521 enum tree_code code = gimple_assign_rhs_code (assign);
8522 use_operand_p use_p;
8523 gimple *use_stmt;
8524 if (code == CONSTRUCTOR)
8526 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8527 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
8528 CONSTRUCTOR_NELTS (rhs))
8529 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
8530 || uniform_vector_p (rhs))
8531 continue;
8533 unsigned j;
8534 tree val;
8535 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8536 if (TREE_CODE (val) != SSA_NAME
8537 || !bb_vinfo->lookup_def (val))
8538 break;
8539 if (j != CONSTRUCTOR_NELTS (rhs))
8540 continue;
8542 vec<stmt_vec_info> roots = vNULL;
8543 roots.safe_push (bb_vinfo->lookup_stmt (assign));
8544 vec<stmt_vec_info> stmts;
8545 stmts.create (CONSTRUCTOR_NELTS (rhs));
8546 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
8547 stmts.quick_push
8548 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
8549 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8550 stmts, roots));
8552 else if (code == BIT_INSERT_EXPR
8553 && VECTOR_TYPE_P (TREE_TYPE (rhs))
8554 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
8555 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
8556 && integer_zerop (gimple_assign_rhs3 (assign))
8557 && useless_type_conversion_p
8558 (TREE_TYPE (TREE_TYPE (rhs)),
8559 TREE_TYPE (gimple_assign_rhs2 (assign)))
8560 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
8562 /* We start matching on an insert to lane zero, but since the
8563 inserts need not be ordered we have to search both
8564 the def and the use chains. */
8565 tree vectype = TREE_TYPE (rhs);
8566 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
8567 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
8568 auto_sbitmap lanes (nlanes);
8569 bitmap_clear (lanes);
8570 bitmap_set_bit (lanes, 0);
8571 tree def = gimple_assign_lhs (assign);
8572 lane_defs.quick_push
8573 (std::make_pair (0, gimple_assign_rhs2 (assign)));
8574 unsigned lanes_found = 1;
8575 /* Start with the use chain; the last stmt will be the root. */
8576 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
8577 vec<stmt_vec_info> roots = vNULL;
8578 roots.safe_push (last);
8581 use_operand_p use_p;
8582 gimple *use_stmt;
8583 if (!single_imm_use (def, &use_p, &use_stmt))
8584 break;
8585 unsigned this_lane;
8586 if (!bb_vinfo->lookup_stmt (use_stmt)
8587 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
8588 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
8589 break;
8590 if (bitmap_bit_p (lanes, this_lane))
8591 break;
8592 lanes_found++;
8593 bitmap_set_bit (lanes, this_lane);
8594 gassign *use_ass = as_a <gassign *> (use_stmt);
8595 lane_defs.quick_push (std::make_pair
8596 (this_lane, gimple_assign_rhs2 (use_ass)));
8597 last = bb_vinfo->lookup_stmt (use_ass);
8598 roots.safe_push (last);
8599 def = gimple_assign_lhs (use_ass);
8601 while (lanes_found < nlanes);
8602 if (roots.length () > 1)
8603 std::swap (roots[0], roots[roots.length () - 1]);
8604 if (lanes_found < nlanes)
8606 /* Now search the def chain. */
8607 def = gimple_assign_rhs1 (assign);
8610 if (TREE_CODE (def) != SSA_NAME
8611 || !has_single_use (def))
8612 break;
8613 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
8614 unsigned this_lane;
8615 if (!bb_vinfo->lookup_stmt (def_stmt)
8616 || !vect_slp_is_lane_insert (def_stmt,
8617 NULL_TREE, &this_lane)
8618 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
8619 break;
8620 if (bitmap_bit_p (lanes, this_lane))
8621 break;
8622 lanes_found++;
8623 bitmap_set_bit (lanes, this_lane);
8624 lane_defs.quick_push (std::make_pair
8625 (this_lane,
8626 gimple_assign_rhs2 (def_stmt)));
8627 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
8628 def = gimple_assign_rhs1 (def_stmt);
8630 while (lanes_found < nlanes);
8632 if (lanes_found == nlanes)
8634 /* Sort lane_defs by the lane index and register the root. */
8635 lane_defs.qsort (vld_cmp);
8636 vec<stmt_vec_info> stmts;
8637 stmts.create (nlanes);
8638 for (unsigned i = 0; i < nlanes; ++i)
8639 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
8640 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
8641 stmts, roots));
8643 else
8644 roots.release ();
8646 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
8647 && (associative_tree_code (code) || code == MINUS_EXPR)
8648 /* ??? This pessimizes a two-element reduction. PR54400.
8649 ??? In-order reduction could be handled if we only
8650 traverse one operand chain in vect_slp_linearize_chain. */
8651 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
8652 /* Ops with constants at the tail can be stripped here. */
8653 && TREE_CODE (rhs) == SSA_NAME
8654 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
8655 /* Should be the chain end. */
8656 && (!single_imm_use (gimple_assign_lhs (assign),
8657 &use_p, &use_stmt)
8658 || !is_gimple_assign (use_stmt)
8659 || (gimple_assign_rhs_code (use_stmt) != code
8660 && ((code != PLUS_EXPR && code != MINUS_EXPR)
8661 || (gimple_assign_rhs_code (use_stmt)
8662 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
8664 /* We start the match at the end of a possible association
8665 chain. */
8666 auto_vec<chain_op_t> chain;
8667 auto_vec<std::pair<tree_code, gimple *> > worklist;
8668 auto_vec<gimple *> chain_stmts;
8669 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
8670 if (code == MINUS_EXPR)
8671 code = PLUS_EXPR;
8672 internal_fn reduc_fn;
8673 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
8674 || reduc_fn == IFN_LAST)
8675 continue;
8676 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
8677 /* ??? */
8678 code_stmt, alt_code_stmt, &chain_stmts);
8679 if (chain.length () > 1)
8681 /* Sort the chain according to def_type and operation. */
8682 chain.sort (dt_sort_cmp, bb_vinfo);
8683 /* ??? Now we'd want to strip externals and constants
8684 but record those to be handled in the epilogue. */
8685 /* ??? For now do not allow mixing ops or externs/constants. */
8686 bool invalid = false;
8687 unsigned remain_cnt = 0;
8688 unsigned last_idx = 0;
8689 for (unsigned i = 0; i < chain.length (); ++i)
8691 if (chain[i].code != code)
8693 invalid = true;
8694 break;
8696 if (chain[i].dt != vect_internal_def
8697 /* Avoid stmts where the def is not the LHS, like
8698 ASMs. */
8699 || (gimple_get_lhs (bb_vinfo->lookup_def
8700 (chain[i].op)->stmt)
8701 != chain[i].op))
8702 remain_cnt++;
8703 else
8704 last_idx = i;
8706 /* Make sure to have an even number of lanes as we later do
8707 all-or-nothing discovery, not trying to split further. */
8708 if ((chain.length () - remain_cnt) & 1)
8709 remain_cnt++;
8710 if (!invalid && chain.length () - remain_cnt > 1)
8712 vec<stmt_vec_info> stmts;
8713 vec<tree> remain = vNULL;
8714 stmts.create (chain.length ());
8715 if (remain_cnt > 0)
8716 remain.create (remain_cnt);
8717 for (unsigned i = 0; i < chain.length (); ++i)
8719 stmt_vec_info stmt_info;
8720 if (chain[i].dt == vect_internal_def
8721 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
8722 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
8723 && (i != last_idx
8724 || (stmts.length () & 1)))
8725 stmts.quick_push (stmt_info);
8726 else
8727 remain.quick_push (chain[i].op);
8729 vec<stmt_vec_info> roots;
8730 roots.create (chain_stmts.length ());
8731 for (unsigned i = 0; i < chain_stmts.length (); ++i)
8732 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
8733 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
8734 stmts, roots, remain));
8741 /* Walk the grouped store chains and replace entries with their
8742 pattern variant if any. */
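 /* For instance (an illustrative sketch): if the group chain is
    S1 -> S2 -> S3 and S2 was replaced by a recognized pattern stmt P2,
    the chain is rewritten to S1 -> P2 -> S3, with
    DR_GROUP_FIRST_ELEMENT (P2) pointing back at S1 and P2 inheriting
    S2's DR_GROUP_GAP.  The group head is fixed up the same way.  */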
8744 static void
8745 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
8747 stmt_vec_info first_element;
8748 unsigned i;
8750 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
8752 /* We also have CTORs in this array. */
8753 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
8754 continue;
8755 if (STMT_VINFO_IN_PATTERN_P (first_element))
8757 stmt_vec_info orig = first_element;
8758 first_element = STMT_VINFO_RELATED_STMT (first_element);
8759 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
8760 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
8761 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
8762 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
8763 vinfo->grouped_stores[i] = first_element;
8765 stmt_vec_info prev = first_element;
8766 while (DR_GROUP_NEXT_ELEMENT (prev))
8768 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
8769 if (STMT_VINFO_IN_PATTERN_P (elt))
8771 stmt_vec_info orig = elt;
8772 elt = STMT_VINFO_RELATED_STMT (elt);
8773 DR_GROUP_NEXT_ELEMENT (prev) = elt;
8774 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
8775 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
8777 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
8778 prev = elt;
8783 /* Check if the region described by BB_VINFO can be vectorized, returning
8784 true if so. When returning false, set FATAL to true if the same failure
8785 would prevent vectorization at other vector sizes, false if it is still
8786 worth trying other sizes. N_STMTS is the number of statements in the
8787 region. */
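 /* A sketch of the caller protocol (compare vect_slp_region below,
    where FATAL is consumed; the variable names follow that caller):

      bool fatal = false;
      if (!vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups)
	  && fatal)
	... give up without retrying other vector modes ...  */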
8789 static bool
8790 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
8791 vec<int> *dataref_groups)
8793 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
8795 slp_instance instance;
8796 int i;
8797 poly_uint64 min_vf = 2;
8799 /* The first group of checks is independent of the vector size. */
8800 fatal = true;
8802 /* Analyze the data references. */
8804 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
8806 if (dump_enabled_p ())
8807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8808 "not vectorized: unhandled data-ref in basic "
8809 "block.\n");
8810 return false;
8813 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
8815 if (dump_enabled_p ())
8816 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8817 "not vectorized: unhandled data access in "
8818 "basic block.\n");
8819 return false;
8822 vect_slp_check_for_roots (bb_vinfo);
8824 /* If there are no grouped stores and no constructors in the region
8825 there is no need to continue with pattern recog as vect_analyze_slp
8826 will fail anyway. */
8827 if (bb_vinfo->grouped_stores.is_empty ()
8828 && bb_vinfo->roots.is_empty ())
8830 if (dump_enabled_p ())
8831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8832 "not vectorized: no grouped stores in "
8833 "basic block.\n");
8834 return false;
8837 /* The rest of the analysis below depends on the vector size in some way. */
8838 fatal = false;
8840 vect_pattern_recog (bb_vinfo);
8842 /* Update store groups from pattern processing. */
8843 vect_fixup_store_groups_with_patterns (bb_vinfo);
8845 /* Check the SLP opportunities in the basic block, analyze and build SLP
8846 trees. */
8847 if (!vect_analyze_slp (bb_vinfo, n_stmts))
8849 if (dump_enabled_p ())
8851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8852 "Failed to SLP the basic block.\n");
8853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8854 "not vectorized: failed to find SLP opportunities "
8855 "in basic block.\n");
8857 return false;
8860 /* Optimize permutations. */
8861 vect_optimize_slp (bb_vinfo);
8863 /* Gather the loads reachable from the SLP graph entries. */
8864 vect_gather_slp_loads (bb_vinfo);
8866 vect_record_base_alignments (bb_vinfo);
8868 /* Analyze and verify the alignment of data references and the
8869 dependence in the SLP instances. */
8870 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
8872 vect_location = instance->location ();
8873 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
8874 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
8876 slp_tree node = SLP_INSTANCE_TREE (instance);
8877 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8878 if (dump_enabled_p ())
8879 dump_printf_loc (MSG_NOTE, vect_location,
8880 "removing SLP instance operations starting from: %G",
8881 stmt_info->stmt);
8882 vect_free_slp_instance (instance);
8883 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
8884 continue;
8887 /* Mark all the statements that we want to vectorize as pure SLP and
8888 relevant. */
8889 vect_mark_slp_stmts (SLP_INSTANCE_TREE (instance));
8890 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
8891 unsigned j;
8892 stmt_vec_info root;
8893 /* Likewise consider instance root stmts as vectorized. */
8894 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
8895 STMT_SLP_TYPE (root) = pure_slp;
8897 i++;
8899 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
8900 return false;
8902 if (!vect_slp_analyze_operations (bb_vinfo))
8904 if (dump_enabled_p ())
8905 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8906 "not vectorized: bad operation in basic block.\n");
8907 return false;
8910 vect_bb_partition_graph (bb_vinfo);
8912 return true;
8915 /* Subroutine of vect_slp_bb. Try to vectorize the statements for all
8916 basic blocks in BBS, returning true on success.
8917 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
8919 static bool
8920 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
8921 vec<int> *dataref_groups, unsigned int n_stmts,
8922 loop_p orig_loop)
8924 bb_vec_info bb_vinfo;
8925 auto_vector_modes vector_modes;
8927 /* Autodetect first vector size we try. */
8928 machine_mode next_vector_mode = VOIDmode;
8929 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
8930 unsigned int mode_i = 0;
8932 vec_info_shared shared;
8934 machine_mode autodetected_vector_mode = VOIDmode;
8935 while (1)
8937 bool vectorized = false;
8938 bool fatal = false;
8939 bb_vinfo = new _bb_vec_info (bbs, &shared);
8941 bool first_time_p = shared.datarefs.is_empty ();
8942 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
8943 if (first_time_p)
8944 bb_vinfo->shared->save_datarefs ();
8945 else
8946 bb_vinfo->shared->check_datarefs ();
8947 bb_vinfo->vector_mode = next_vector_mode;
8949 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
8951 if (dump_enabled_p ())
8953 dump_printf_loc (MSG_NOTE, vect_location,
8954 "***** Analysis succeeded with vector mode"
8955 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
8956 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
8959 bb_vinfo->shared->check_datarefs ();
8961 bool force_clear = false;
8962 auto_vec<slp_instance> profitable_subgraphs;
8963 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
8965 if (instance->subgraph_entries.is_empty ())
8966 continue;
8968 dump_user_location_t saved_vect_location = vect_location;
8969 vect_location = instance->location ();
8970 if (!unlimited_cost_model (NULL)
8971 && !vect_bb_vectorization_profitable_p
8972 (bb_vinfo, instance->subgraph_entries, orig_loop))
8974 if (dump_enabled_p ())
8975 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8976 "not vectorized: vectorization is not "
8977 "profitable.\n");
8978 vect_location = saved_vect_location;
8979 continue;
8982 vect_location = saved_vect_location;
8983 if (!dbg_cnt (vect_slp))
8985 force_clear = true;
8986 continue;
8989 profitable_subgraphs.safe_push (instance);
8992 /* When we're vectorizing an if-converted loop body make sure
8993 we vectorized all if-converted code. */
8994 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
8996 gcc_assert (bb_vinfo->nbbs == 1);
8997 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
8998 !gsi_end_p (gsi); gsi_next (&gsi))
9000 /* The costing above left us with DCEable vectorized scalar
9001 stmts having the visited flag set on profitable
9002 subgraphs. Do the delayed clearing of the flag here. */
9003 if (gimple_visited_p (gsi_stmt (gsi)))
9005 gimple_set_visited (gsi_stmt (gsi), false);
9006 continue;
9008 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9009 continue;
9011 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9012 if (gimple_assign_rhs_code (ass) == COND_EXPR)
9014 if (!profitable_subgraphs.is_empty ()
9015 && dump_enabled_p ())
9016 dump_printf_loc (MSG_NOTE, vect_location,
9017 "not profitable because of "
9018 "unprofitable if-converted scalar "
9019 "code\n");
9020 profitable_subgraphs.truncate (0);
9025 /* Finally schedule the profitable subgraphs. */
9026 for (slp_instance instance : profitable_subgraphs)
9028 if (!vectorized && dump_enabled_p ())
9029 dump_printf_loc (MSG_NOTE, vect_location,
9030 "Basic block will be vectorized "
9031 "using SLP\n");
9032 vectorized = true;
9034 /* Dump before scheduling as store vectorization will remove
9035 the original stores and mess with the instance tree
9036 so querying its location will eventually ICE. */
9037 if (flag_checking)
9038 for (slp_instance sub : instance->subgraph_entries)
9039 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9040 unsigned HOST_WIDE_INT bytes;
9041 if (dump_enabled_p ())
9042 for (slp_instance sub : instance->subgraph_entries)
9044 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9045 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9046 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9047 sub->location (),
9048 "basic block part vectorized using %wu "
9049 "byte vectors\n", bytes);
9050 else
9051 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9052 sub->location (),
9053 "basic block part vectorized using "
9054 "variable length vectors\n");
9057 dump_user_location_t saved_vect_location = vect_location;
9058 vect_location = instance->location ();
9060 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9062 vect_location = saved_vect_location;
9065 else
9067 if (dump_enabled_p ())
9068 dump_printf_loc (MSG_NOTE, vect_location,
9069 "***** Analysis failed with vector mode %s\n",
9070 GET_MODE_NAME (bb_vinfo->vector_mode));
9073 if (mode_i == 0)
9074 autodetected_vector_mode = bb_vinfo->vector_mode;
9076 if (!fatal)
9077 while (mode_i < vector_modes.length ()
9078 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9080 if (dump_enabled_p ())
9081 dump_printf_loc (MSG_NOTE, vect_location,
9082 "***** The result for vector mode %s would"
9083 " be the same\n",
9084 GET_MODE_NAME (vector_modes[mode_i]));
9085 mode_i += 1;
9088 delete bb_vinfo;
9090 if (mode_i < vector_modes.length ()
9091 && VECTOR_MODE_P (autodetected_vector_mode)
9092 && (related_vector_mode (vector_modes[mode_i],
9093 GET_MODE_INNER (autodetected_vector_mode))
9094 == autodetected_vector_mode)
9095 && (related_vector_mode (autodetected_vector_mode,
9096 GET_MODE_INNER (vector_modes[mode_i]))
9097 == vector_modes[mode_i]))
9099 if (dump_enabled_p ())
9100 dump_printf_loc (MSG_NOTE, vect_location,
9101 "***** Skipping vector mode %s, which would"
9102 " repeat the analysis for %s\n",
9103 GET_MODE_NAME (vector_modes[mode_i]),
9104 GET_MODE_NAME (autodetected_vector_mode));
9105 mode_i += 1;
9108 if (vectorized
9109 || mode_i == vector_modes.length ()
9110 || autodetected_vector_mode == VOIDmode
9111 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9112 vector sizes will fail do not bother iterating. */
9113 || fatal)
9114 return vectorized;
9116 /* Try the next biggest vector size. */
9117 next_vector_mode = vector_modes[mode_i++];
9118 if (dump_enabled_p ())
9119 dump_printf_loc (MSG_NOTE, vect_location,
9120 "***** Re-trying analysis with vector mode %s\n",
9121 GET_MODE_NAME (next_vector_mode));
9126 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
9127 true if anything in the basic-block was vectorized. */
9129 static bool
9130 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9132 vec<data_reference_p> datarefs = vNULL;
9133 auto_vec<int> dataref_groups;
9134 int insns = 0;
9135 int current_group = 0;
9137 for (unsigned i = 0; i < bbs.length (); i++)
9139 basic_block bb = bbs[i];
9140 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9141 gsi_next (&gsi))
9143 gimple *stmt = gsi_stmt (gsi);
9144 if (is_gimple_debug (stmt))
9145 continue;
9147 insns++;
9149 if (gimple_location (stmt) != UNKNOWN_LOCATION)
9150 vect_location = stmt;
9152 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9153 &dataref_groups, current_group))
9154 ++current_group;
9156 /* New BBs always start a new DR group. */
9157 ++current_group;
9160 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9163 /* Special entry for the BB vectorizer. Analyze and transform a single
9164 if-converted BB with ORIG_LOOPs body being the not if-converted
9165 representation. Returns true if anything in the basic-block was
9166 vectorized. */
9168 bool
9169 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9171 auto_vec<basic_block> bbs;
9172 bbs.safe_push (bb);
9173 return vect_slp_bbs (bbs, orig_loop);
9176 /* Main entry for the BB vectorizer. Analyze and transform FUN, returning
9177 true if anything in the function was vectorized. */
9179 bool
9180 vect_slp_function (function *fun)
9182 bool r = false;
9183 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9184 auto_bitmap exit_bbs;
9185 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9186 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9187 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9188 true, rpo, NULL);
9190 /* For the moment split the function into pieces to avoid making
9191 the iteration on the vector mode moot. Split at points we know
9192 to not handle well, which are CFG merges (SLP discovery doesn't
9193 handle non-loop-header PHIs) and loop exits. Since pattern
9194 recog requires reverse iteration to visit uses before defs
9195 simply chop RPO into pieces. */
9196 auto_vec<basic_block> bbs;
9197 for (unsigned i = 0; i < n; i++)
9199 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9200 bool split = false;
9202 /* Split when a BB is not dominated by the first block. */
9203 if (!bbs.is_empty ()
9204 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9206 if (dump_enabled_p ())
9207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9208 "splitting region at dominance boundary bb%d\n",
9209 bb->index);
9210 split = true;
9212 /* Split when the loop determined by the first block
9213 is exited. This is because we eventually insert
9214 invariants at region begin. */
9215 else if (!bbs.is_empty ()
9216 && bbs[0]->loop_father != bb->loop_father
9217 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9219 if (dump_enabled_p ())
9220 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9221 "splitting region at loop %d exit at bb%d\n",
9222 bbs[0]->loop_father->num, bb->index);
9223 split = true;
9225 else if (!bbs.is_empty ()
9226 && bb->loop_father->header == bb
9227 && bb->loop_father->dont_vectorize)
9229 if (dump_enabled_p ())
9230 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9231 "splitting region at dont-vectorize loop %d "
9232 "entry at bb%d\n",
9233 bb->loop_father->num, bb->index);
9234 split = true;
9237 if (split && !bbs.is_empty ())
9239 r |= vect_slp_bbs (bbs, NULL);
9240 bbs.truncate (0);
9243 if (bbs.is_empty ())
9245 /* We need to be able to insert at the head of the region which
9246 we cannot do for a region starting with a returns-twice call. */
9247 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9248 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9250 if (dump_enabled_p ())
9251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9252 "skipping bb%d as start of region as it "
9253 "starts with returns-twice call\n",
9254 bb->index);
9255 continue;
9257 /* If the loop this BB belongs to is marked as not to be vectorized
9258 honor that also for BB vectorization. */
9259 if (bb->loop_father->dont_vectorize)
9260 continue;
9263 bbs.safe_push (bb);
9265 /* When we have a stmt ending this block and defining a
9266 value we have to insert on edges when inserting after it for
9267 a vector containing its definition. Avoid this for now. */
9268 if (gimple *last = *gsi_last_bb (bb))
9269 if (gimple_get_lhs (last)
9270 && is_ctrl_altering_stmt (last))
9272 if (dump_enabled_p ())
9273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9274 "splitting region at control altering "
9275 "definition %G", last);
9276 r |= vect_slp_bbs (bbs, NULL);
9277 bbs.truncate (0);
9281 if (!bbs.is_empty ())
9282 r |= vect_slp_bbs (bbs, NULL);
9284 free (rpo);
9286 return r;
9289 /* Build a variable-length vector in which the elements in ELTS are repeated
9290 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9291 RESULTS and add any new instructions to SEQ.
9293 The approach we use is:
9295 (1) Find a vector mode VM with integer elements of mode IM.
9297 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9298 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9299 from small vectors to IM.
9301 (3) Duplicate each ELTS'[I] into a vector of mode VM.
9303 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9304 correct byte contents.
9306 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9308 We try to find the largest IM for which this sequence works, in order
9309 to cut down on the number of interleaves. */
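 /* A worked example (illustrative; the concrete modes depend on the
    target): take ELTS = { a, b, c, d } of type short and assume
    can_duplicate_and_interleave_p picks an integer mode IM twice the
    size of short, so NELTS' == 2 and NVECTORS == 2.  Step (2)
    view-converts { a, b } and { c, d } into one IM element each,
    step (3) duplicates each of them across a VM vector, a single
    interleave in step (4) yields the byte pattern a b c d a b c d ...
    and step (5) reinterprets that as VECTOR_TYPE.  */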
9311 void
9312 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9313 const vec<tree> &elts, unsigned int nresults,
9314 vec<tree> &results)
9316 unsigned int nelts = elts.length ();
9317 tree element_type = TREE_TYPE (vector_type);
9319 /* (1) Find a vector mode VM with integer elements of mode IM. */
9320 unsigned int nvectors = 1;
9321 tree new_vector_type;
9322 tree permutes[2];
9323 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9324 &nvectors, &new_vector_type,
9325 permutes))
9326 gcc_unreachable ();
9328 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9329 unsigned int partial_nelts = nelts / nvectors;
9330 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9332 tree_vector_builder partial_elts;
9333 auto_vec<tree, 32> pieces (nvectors * 2);
9334 pieces.quick_grow_cleared (nvectors * 2);
9335 for (unsigned int i = 0; i < nvectors; ++i)
9337 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9338 ELTS' has mode IM. */
9339 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9340 for (unsigned int j = 0; j < partial_nelts; ++j)
9341 partial_elts.quick_push (elts[i * partial_nelts + j]);
9342 tree t = gimple_build_vector (seq, &partial_elts);
9343 t = gimple_build (seq, VIEW_CONVERT_EXPR,
9344 TREE_TYPE (new_vector_type), t);
9346 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9347 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9350 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9351 correct byte contents.
9353 Conceptually, we need to repeat the following operation log2(nvectors)
9354 times, where hi_start = nvectors / 2:
9356 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9357 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9359 However, if each input repeats every N elements and the VF is
9360 a multiple of N * 2, the HI result is the same as the LO result.
9361 This will be true for the first N1 iterations of the outer loop,
9362 followed by N2 iterations for which both the LO and HI results
9363 are needed. I.e.:
9365 N1 + N2 = log2(nvectors)
9367 Each "N1 iteration" doubles the number of redundant vectors and the
9368 effect of the process as a whole is to have a sequence of nvectors/2**N1
9369 vectors that repeats 2**N1 times. Rather than generate these redundant
9370 vectors, we halve the number of vectors for each N1 iteration. */
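     /* For instance (an illustrative reading of the above): with
	NVECTORS == 4 the outer loop runs log2(4) == 2 times.  If the
	HI == LO condition holds for the first iteration only
	(N1 == 1, N2 == 1), that iteration keeps just the LO outputs
	and NEW_NVECTORS drops from 4 to 2; the second iteration keeps
	both LO and HI outputs and step (5) below re-uses the resulting
	vectors cyclically to fill the NRESULTS requested results.  */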
9371 unsigned int in_start = 0;
9372 unsigned int out_start = nvectors;
9373 unsigned int new_nvectors = nvectors;
9374 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9376 unsigned int hi_start = new_nvectors / 2;
9377 unsigned int out_i = 0;
9378 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9380 if ((in_i & 1) != 0
9381 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9382 2 * in_repeat))
9383 continue;
9385 tree output = make_ssa_name (new_vector_type);
9386 tree input1 = pieces[in_start + (in_i / 2)];
9387 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9388 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9389 input1, input2,
9390 permutes[in_i & 1]);
9391 gimple_seq_add_stmt (seq, stmt);
9392 pieces[out_start + out_i] = output;
9393 out_i += 1;
9395 std::swap (in_start, out_start);
9396 new_nvectors = out_i;
9399 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9400 results.reserve (nresults);
9401 for (unsigned int i = 0; i < nresults; ++i)
9402 if (i < new_nvectors)
9403 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
9404 pieces[in_start + i]));
9405 else
9406 results.quick_push (results[i - new_nvectors]);
9410 /* For constant and loop invariant defs in OP_NODE this function creates
9411 vector defs that will be used in the vectorized stmts and stores them
9412 to SLP_TREE_VEC_DEFS of OP_NODE. */
9414 static void
9415 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
9417 unsigned HOST_WIDE_INT nunits;
9418 tree vec_cst;
9419 unsigned j, number_of_places_left_in_vector;
9420 tree vector_type;
9421 tree vop;
9422 int group_size = op_node->ops.length ();
9423 unsigned int vec_num, i;
9424 unsigned number_of_copies = 1;
9425 bool constant_p;
9426 gimple_seq ctor_seq = NULL;
9427 auto_vec<tree, 16> permute_results;
9429 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
9430 vector_type = SLP_TREE_VECTYPE (op_node);
9432 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
9433 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
9434 auto_vec<tree> voprnds (number_of_vectors);
9436 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
9437 created vectors. It is greater than 1 if unrolling is performed.
9439 For example, we have two scalar operands, s1 and s2 (e.g., group of
9440 strided accesses of size two), while NUNITS is four (i.e., four scalars
9441 of this type can be packed in a vector). The output vector will contain
9442 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
9443 will be 2).
9445 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
9446 containing the operands.
9448 For example, NUNITS is four as before, and the group size is 8
9449 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
9450 {s5, s6, s7, s8}. */
9452 /* When using duplicate_and_interleave, we just need one element for
9453 each scalar statement. */
9454 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
9455 nunits = group_size;
9457 number_of_copies = nunits * number_of_vectors / group_size;
9459 number_of_places_left_in_vector = nunits;
9460 constant_p = true;
9461 tree uniform_elt = NULL_TREE;
9462 tree_vector_builder elts (vector_type, nunits, 1);
9463 elts.quick_grow (nunits);
9464 stmt_vec_info insert_after = NULL;
9465 for (j = 0; j < number_of_copies; j++)
9467 tree op;
9468 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
9470 /* Create 'vect_ = {op0,op1,...,opn}'. */
9471 tree orig_op = op;
9472 if (number_of_places_left_in_vector == nunits)
9473 uniform_elt = op;
9474 else if (uniform_elt && operand_equal_p (uniform_elt, op))
9475 op = elts[number_of_places_left_in_vector];
9476 else
9477 uniform_elt = NULL_TREE;
9478 number_of_places_left_in_vector--;
9479 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
9481 if (CONSTANT_CLASS_P (op))
9483 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9485 /* Can't use VIEW_CONVERT_EXPR for booleans because
9486 of possibly different sizes of scalar value and
9487 vector element. */
9488 if (integer_zerop (op))
9489 op = build_int_cst (TREE_TYPE (vector_type), 0);
9490 else if (integer_onep (op))
9491 op = build_all_ones_cst (TREE_TYPE (vector_type));
9492 else
9493 gcc_unreachable ();
9495 else
9496 op = fold_unary (VIEW_CONVERT_EXPR,
9497 TREE_TYPE (vector_type), op);
9498 gcc_assert (op && CONSTANT_CLASS_P (op));
9500 else
9502 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
9503 gimple *init_stmt;
9504 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
9506 tree true_val
9507 = build_all_ones_cst (TREE_TYPE (vector_type));
9508 tree false_val
9509 = build_zero_cst (TREE_TYPE (vector_type));
9510 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
9511 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
9512 op, true_val,
9513 false_val);
9515 else
9517 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
9518 op);
9519 init_stmt
9520 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
9521 op);
9523 gimple_seq_add_stmt (&ctor_seq, init_stmt);
9524 op = new_temp;
9527 elts[number_of_places_left_in_vector] = op;
9528 if (!CONSTANT_CLASS_P (op))
9529 constant_p = false;
9530 /* For BB vectorization we have to compute an insert location
9531 when a def is inside the analyzed region since we cannot
9532 simply insert at the BB start in this case. */
9533 stmt_vec_info opdef;
9534 if (TREE_CODE (orig_op) == SSA_NAME
9535 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
9536 && is_a <bb_vec_info> (vinfo)
9537 && (opdef = vinfo->lookup_def (orig_op)))
9539 if (!insert_after)
9540 insert_after = opdef;
9541 else
9542 insert_after = get_later_stmt (insert_after, opdef);
9545 if (number_of_places_left_in_vector == 0)
9547 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
9548 if (uniform_elt)
9549 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
9550 elts[0]);
9551 else if (constant_p
9552 ? multiple_p (type_nunits, nunits)
9553 : known_eq (type_nunits, nunits))
9554 vec_cst = gimple_build_vector (&ctor_seq, &elts);
9555 else
9557 if (permute_results.is_empty ())
9558 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
9559 elts, number_of_vectors,
9560 permute_results);
9561 vec_cst = permute_results[number_of_vectors - j - 1];
9563 if (!gimple_seq_empty_p (ctor_seq))
9565 if (insert_after)
9567 gimple_stmt_iterator gsi;
9568 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
9570 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
9571 gsi_insert_seq_before (&gsi, ctor_seq,
9572 GSI_CONTINUE_LINKING);
9574 else if (!stmt_ends_bb_p (insert_after->stmt))
9576 gsi = gsi_for_stmt (insert_after->stmt);
9577 gsi_insert_seq_after (&gsi, ctor_seq,
9578 GSI_CONTINUE_LINKING);
9580 else
9582 /* When we want to insert after a def where the
9583 defining stmt throws then insert on the fallthru
9584 edge. */
9585 edge e = find_fallthru_edge
9586 (gimple_bb (insert_after->stmt)->succs);
9587 basic_block new_bb
9588 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
9589 gcc_assert (!new_bb);
9592 else
9593 vinfo->insert_seq_on_entry (NULL, ctor_seq);
9594 ctor_seq = NULL;
9596 voprnds.quick_push (vec_cst);
9597 insert_after = NULL;
9598 number_of_places_left_in_vector = nunits;
9599 constant_p = true;
9600 elts.new_vector (vector_type, nunits, 1);
9601 elts.quick_grow (nunits);
9606 /* Since the vectors are created in the reverse order, we should invert
9607 them. */
9608 vec_num = voprnds.length ();
9609 for (j = vec_num; j != 0; j--)
9611 vop = voprnds[j - 1];
9612 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9615 /* If VF is greater than the unrolling factor needed for the SLP
9616 group of stmts, NUMBER_OF_VECTORS to be created is greater than
9617 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
9618 to replicate the vectors. */
9619 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
9620 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
9621 i++)
9622 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
9625 /* Get the Ith vectorized definition from SLP_NODE. */
9627 tree
9628 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
9630 return SLP_TREE_VEC_DEFS (slp_node)[i];
9633 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
9635 void
9636 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
9638 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
9639 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
9642 /* Get N vectorized definitions for SLP_NODE. */
9644 void
9645 vect_get_slp_defs (vec_info *,
9646 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
9648 if (n == -1U)
9649 n = SLP_TREE_CHILDREN (slp_node).length ();
9651 for (unsigned i = 0; i < n; ++i)
9653 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
9654 vec<tree> vec_defs = vNULL;
9655 vect_get_slp_defs (child, &vec_defs);
9656 vec_oprnds->quick_push (vec_defs);
9660 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
9661 - PERM gives the permutation that the caller wants to use for NODE,
9662 which might be different from SLP_LOAD_PERMUTATION.
9663 - DUMP_P controls whether the function dumps information. */
9665 static bool
9666 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
9667 load_permutation_t &perm,
9668 const vec<tree> &dr_chain,
9669 gimple_stmt_iterator *gsi, poly_uint64 vf,
9670 bool analyze_only, bool dump_p,
9671 unsigned *n_perms, unsigned int *n_loads,
9672 bool dce_chain)
9674 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9675 int vec_index = 0;
9676 tree vectype = SLP_TREE_VECTYPE (node);
9677 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
9678 unsigned int mask_element;
9679 unsigned dr_group_size;
9680 machine_mode mode;
9682 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
9683 dr_group_size = 1;
9684 else
9686 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
9687 dr_group_size = DR_GROUP_SIZE (stmt_info);
9690 mode = TYPE_MODE (vectype);
9691 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9692 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9694 /* Initialize the vect stmts of NODE to properly insert the generated
9695 stmts later. */
9696 if (! analyze_only)
9697 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
9698 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
9700 /* Generate permutation masks for every NODE. Number of masks for each NODE
9701 is equal to GROUP_SIZE.
9702 E.g., we have a group of three nodes with three loads from the same
9703 location in each node, and the vector size is 4. I.e., we have an
9704 a0b0c0a1b1c1... sequence and we need to create the following vectors:
9705 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
9706 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
9709 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
9710 The last mask is illegal since we assume two operands for permute
9711 operation, and the mask element values can't be outside that range.
9712 Hence, the last mask must be converted into {2,5,5,5}.
9713 For the first two permutations we need the first and the second input
9714 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
9715 we need the second and the third vectors: {b1,c1,a2,b2} and
9716 {c2,a3,b3,c3}. */
9718 int vect_stmts_counter = 0;
9719 unsigned int index = 0;
9720 int first_vec_index = -1;
9721 int second_vec_index = -1;
9722 bool noop_p = true;
9723 *n_perms = 0;
9725 vec_perm_builder mask;
9726 unsigned int nelts_to_build;
9727 unsigned int nvectors_per_build;
9728 unsigned int in_nlanes;
9729 bool repeating_p = (group_size == dr_group_size
9730 && multiple_p (nunits, group_size));
9731 if (repeating_p)
9733 /* A single vector contains a whole number of copies of the node, so:
9734 (a) all permutes can use the same mask; and
9735 (b) the permutes only need a single vector input. */
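      /* A small worked case (illustrative): for PERM = { 1, 0 } with
	 GROUP_SIZE == DR_GROUP_SIZE == 2 the loop below computes
	 i = iter_num * DR_GROUP_SIZE + perm[stmt_num] and so encodes
	 the mask as { 1, 0, 3, 2, 5, 4 }, the stepped pattern that
	 swaps each pair of lanes across the whole vector.  */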
9736 mask.new_vector (nunits, group_size, 3);
9737 nelts_to_build = mask.encoded_nelts ();
9738 /* It's possible to obtain zero nstmts during analyze_only, so make
9739 it at least one to ensure the later computation for n_perms
9740 proceeds. */
9741 nvectors_per_build = nstmts > 0 ? nstmts : 1;
9742 in_nlanes = dr_group_size * 3;
9744 else
9746 /* We need to construct a separate mask for each vector statement. */
9747 unsigned HOST_WIDE_INT const_nunits, const_vf;
9748 if (!nunits.is_constant (&const_nunits)
9749 || !vf.is_constant (&const_vf))
9750 return false;
9751 mask.new_vector (const_nunits, const_nunits, 1);
9752 nelts_to_build = const_vf * group_size;
9753 nvectors_per_build = 1;
9754 in_nlanes = const_vf * dr_group_size;
9756 auto_sbitmap used_in_lanes (in_nlanes);
9757 bitmap_clear (used_in_lanes);
9758 auto_bitmap used_defs;
9760 unsigned int count = mask.encoded_nelts ();
9761 mask.quick_grow (count);
9762 vec_perm_indices indices;
9764 for (unsigned int j = 0; j < nelts_to_build; j++)
9766 unsigned int iter_num = j / group_size;
9767 unsigned int stmt_num = j % group_size;
9768 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
9769 bitmap_set_bit (used_in_lanes, i);
9770 if (repeating_p)
9772 first_vec_index = 0;
9773 mask_element = i;
9775 else
9777 /* Enforced before the loop when !repeating_p. */
9778 unsigned int const_nunits = nunits.to_constant ();
9779 vec_index = i / const_nunits;
9780 mask_element = i % const_nunits;
9781 if (vec_index == first_vec_index
9782 || first_vec_index == -1)
9784 first_vec_index = vec_index;
9786 else if (vec_index == second_vec_index
9787 || second_vec_index == -1)
9789 second_vec_index = vec_index;
9790 mask_element += const_nunits;
9792 else
9794 if (dump_p)
9795 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9796 "permutation requires at "
9797 "least three vectors %G",
9798 stmt_info->stmt);
9799 gcc_assert (analyze_only);
9800 return false;
9803 gcc_assert (mask_element < 2 * const_nunits);
9806 if (mask_element != index)
9807 noop_p = false;
9808 mask[index++] = mask_element;
9810 if (index == count)
9812 if (!noop_p)
9814 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
9815 if (!can_vec_perm_const_p (mode, mode, indices))
9817 if (dump_p)
9819 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9820 "unsupported vect permute { ");
9821 for (i = 0; i < count; ++i)
9823 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
9824 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
9826 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
9828 gcc_assert (analyze_only);
9829 return false;
9832 tree mask_vec = NULL_TREE;
9833 if (!analyze_only)
9834 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
9836 if (second_vec_index == -1)
9837 second_vec_index = first_vec_index;
9839 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9841 ++*n_perms;
9842 if (analyze_only)
9843 continue;
9844 /* Generate the permute statement if necessary. */
9845 tree first_vec = dr_chain[first_vec_index + ri];
9846 tree second_vec = dr_chain[second_vec_index + ri];
9847 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
9848 tree perm_dest
9849 = vect_create_destination_var (gimple_assign_lhs (stmt),
9850 vectype);
9851 perm_dest = make_ssa_name (perm_dest);
9852 gimple *perm_stmt
9853 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
9854 second_vec, mask_vec);
9855 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
9856 gsi);
9857 if (dce_chain)
9859 bitmap_set_bit (used_defs, first_vec_index + ri);
9860 bitmap_set_bit (used_defs, second_vec_index + ri);
9863 /* Store the vector statement in NODE. */
9864 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
9867 else if (!analyze_only)
9869 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
9871 tree first_vec = dr_chain[first_vec_index + ri];
9872 /* If mask was NULL_TREE generate the requested
9873 identity transform. */
9874 if (dce_chain)
9875 bitmap_set_bit (used_defs, first_vec_index + ri);
9877 /* Store the vector statement in NODE. */
9878 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
9882 index = 0;
9883 first_vec_index = -1;
9884 second_vec_index = -1;
9885 noop_p = true;
9889 if (n_loads)
9891 if (repeating_p)
9892 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
9893 else
9895 /* Enforced above when !repeating_p. */
9896 unsigned int const_nunits = nunits.to_constant ();
9897 *n_loads = 0;
9898 bool load_seen = false;
9899 for (unsigned i = 0; i < in_nlanes; ++i)
9901 if (i % const_nunits == 0)
9903 if (load_seen)
9904 *n_loads += 1;
9905 load_seen = false;
9907 if (bitmap_bit_p (used_in_lanes, i))
9908 load_seen = true;
9910 if (load_seen)
9911 *n_loads += 1;
9915 if (dce_chain)
9916 for (unsigned i = 0; i < dr_chain.length (); ++i)
9917 if (!bitmap_bit_p (used_defs, i))
9919 tree def = dr_chain[i];
9922 gimple *stmt = SSA_NAME_DEF_STMT (def);
9923 if (is_gimple_assign (stmt)
9924 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
9925 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
9926 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
9927 else
9928 def = NULL;
9929 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
9930 gsi_remove (&rgsi, true);
9931 release_defs (stmt);
9933 while (def);
9936 return true;
9939 /* Generate vector permute statements from a list of loads in DR_CHAIN.
9940 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
9941 permute statements for the SLP node NODE. Store the number of vector
9942 permute instructions in *N_PERMS and the number of vector load
9943 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
9944 that were not needed. */
9946 bool
9947 vect_transform_slp_perm_load (vec_info *vinfo,
9948 slp_tree node, const vec<tree> &dr_chain,
9949 gimple_stmt_iterator *gsi, poly_uint64 vf,
9950 bool analyze_only, unsigned *n_perms,
9951 unsigned int *n_loads, bool dce_chain)
9953 return vect_transform_slp_perm_load_1 (vinfo, node,
9954 SLP_TREE_LOAD_PERMUTATION (node),
9955 dr_chain, gsi, vf, analyze_only,
9956 dump_enabled_p (), n_perms, n_loads,
9957 dce_chain);
9960 /* Produce the next vector result for SLP permutation NODE by adding a vector
9961 statement at GSI. If MASK_VEC is nonnull, add:
9963 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
9965 otherwise add:
9967 <new SSA name> = FIRST_DEF. */
9969 static void
9970 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
9971 slp_tree node, tree first_def, tree second_def,
9972 tree mask_vec, poly_uint64 identity_offset)
9974 tree vectype = SLP_TREE_VECTYPE (node);
9976 /* ??? We SLP match existing vector element extracts but
9977 allow punning which we need to re-instantiate at uses
9978 but have no good way of explicitly representing. */
9979 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
9980 && !types_compatible_p (TREE_TYPE (first_def), vectype))
9982 gassign *conv_stmt
9983 = gimple_build_assign (make_ssa_name (vectype),
9984 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
9985 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
9986 first_def = gimple_assign_lhs (conv_stmt);
9988 gassign *perm_stmt;
9989 tree perm_dest = make_ssa_name (vectype);
9990 if (mask_vec)
9992 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)),
9993 TYPE_SIZE (vectype))
9994 && !types_compatible_p (TREE_TYPE (second_def), vectype))
9996 gassign *conv_stmt
9997 = gimple_build_assign (make_ssa_name (vectype),
9998 build1 (VIEW_CONVERT_EXPR,
9999 vectype, second_def));
10000 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10001 second_def = gimple_assign_lhs (conv_stmt);
10003 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10004 first_def, second_def,
10005 mask_vec);
10007 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10009 /* For identity permutes we still need to handle the case
10010 of offsetted extracts or concats. */
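      /* Both cases as hypothetical GIMPLE (the vector types are
	 illustrative only): extracting the high half of a V8SI
	 FIRST_DEF into a V4SI result at IDENTITY_OFFSET 4 becomes

	   perm_dest_1 = BIT_FIELD_REF <first_def_2, 128, 128>;

	 while concatenating two V4SI inputs into a V8SI result becomes

	   perm_dest_1 = {first_def_2, second_def_3};  */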
10011 unsigned HOST_WIDE_INT c;
10012 auto first_def_nunits
10013 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10014 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10016 unsigned HOST_WIDE_INT elsz
10017 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10018 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10019 TYPE_SIZE (vectype),
10020 bitsize_int (identity_offset * elsz));
10021 perm_stmt = gimple_build_assign (perm_dest, lowpart);
10023 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10024 first_def_nunits, &c) && c == 2)
10026 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10027 NULL_TREE, second_def);
10028 perm_stmt = gimple_build_assign (perm_dest, ctor);
10030 else
10031 gcc_unreachable ();
10033 else
10035 /* We need a copy here in case the def was external. */
10036 perm_stmt = gimple_build_assign (perm_dest, first_def);
10038 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10039 /* Store the vector statement in NODE. */
10040 node->push_vec_def (perm_stmt);
10043 /* Subroutine of vectorizable_slp_permutation. Check whether the target
10044 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10045 If GSI is nonnull, emit the permutation there.
10047 When GSI is null, the only purpose of NODE is to give properties
10048 of the result, such as the vector type and number of SLP lanes.
10049 The node does not need to be a VEC_PERM_EXPR.
10051 If the target supports the operation, return the number of individual
10052 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10053 dump file if DUMP_P is true. */
10055 static int
10056 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10057 slp_tree node, lane_permutation_t &perm,
10058 vec<slp_tree> &children, bool dump_p)
10060 tree vectype = SLP_TREE_VECTYPE (node);
10062 /* ??? We currently only support all same vector input types
10063 while the SLP IL should really do a concat + select and thus accept
10064 arbitrary mismatches. */
10065 slp_tree child;
10066 unsigned i;
10067 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10068 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10069 tree op_vectype = NULL_TREE;
10070 FOR_EACH_VEC_ELT (children, i, child)
10071 if (SLP_TREE_VECTYPE (child))
10073 op_vectype = SLP_TREE_VECTYPE (child);
10074 break;
10076 if (!op_vectype)
10077 op_vectype = vectype;
10078 FOR_EACH_VEC_ELT (children, i, child)
10080 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10081 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10082 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10083 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10085 if (dump_p)
10086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10087 "Unsupported vector types in lane permutation\n");
10088 return -1;
10090 if (SLP_TREE_LANES (child) != SLP_TREE_LANES (node))
10091 repeating_p = false;
10094 gcc_assert (perm.length () == SLP_TREE_LANES (node));
10096 /* Load-lanes permute. This permute only acts as a forwarder to
10097 select the correct vector def of the load-lanes load which
10098 has the permuted vectors in its vector defs like
10099 { v0, w0, r0, v1, w1, r1 ... } for a ld3. */
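  /* A worked instance (lane counts illustrative): for the ld3 above
     with SLP_TREE_LANES (node) == 4 and SLP_TREE_LANES (child) == 12,
     a first permute index of 4 gives vec_idx == 1 and vec_num == 3,
     so the loop below forwards defs [i * 3 + 1], i.e. w0, w1, ...  */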
10100 if (node->ldst_lanes)
10102 gcc_assert (children.length () == 1);
10103 if (!gsi)
10104 /* This is a trivial op always supported. */
10105 return 1;
10106 slp_tree child = children[0];
10107 unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10108 / SLP_TREE_LANES (node));
10109 unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
10110 for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10112 tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10113 node->push_vec_def (def);
10115 return 1;
10118 /* REPEATING_P is true if every output vector is guaranteed to use the
10119 same permute vector. We can handle that case for both variable-length
10120 and constant-length vectors, but we only handle other cases for
10121 constant-length vectors.
10123 Set:
10125 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10126 mask vector that we want to build.
10128 - NCOPIES to the number of copies of PERM that we need in order
10129 to build the necessary permute mask vectors.
10131 - NOUTPUTS_PER_MASK to the number of output vectors we want to create
10132 for each permute mask vector. This is only relevant when GSI is
10133 nonnull. */
10134 uint64_t npatterns;
10135 unsigned nelts_per_pattern;
10136 uint64_t ncopies;
10137 unsigned noutputs_per_mask;
10138 if (repeating_p)
10140 /* We need a single permute mask vector that has the form:
10142 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10144 In other words, the original n-element permute in PERM is
10145 "unrolled" to fill a full vector. The stepped vector encoding
10146 that we use for permutes requires 3n elements. */
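      /* E.g. (illustrative): a two-lane reverse { op0[1], op0[0] } is
	 encoded with NPATTERNS == 2 and NELTS_PER_PATTERN == 3 as
	 { 1, 0, 3, 2, 5, 4 }, from which the full swap-adjacent-lanes
	 mask is expanded for whatever the actual vector length is.  */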
10147 npatterns = SLP_TREE_LANES (node);
10148 nelts_per_pattern = ncopies = 3;
10149 noutputs_per_mask = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10151 else
10153 /* Calculate every element of every permute mask vector explicitly,
10154 instead of relying on the pattern described above. */
10155 if (!nunits.is_constant (&npatterns)
10156 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10157 return -1;
10158 nelts_per_pattern = ncopies = 1;
10159 if (loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo))
10160 if (!LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10161 return -1;
10162 noutputs_per_mask = 1;
10164 unsigned olanes = ncopies * SLP_TREE_LANES (node);
10165 gcc_assert (repeating_p || multiple_p (olanes, nunits));
10167 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10168 from the { SLP operand, scalar lane } permutation as recorded in the
10169 SLP node as intermediate step. This part should already work
10170 with SLP children with arbitrary number of lanes. */
10171 auto_vec<std::pair<std::pair<unsigned, unsigned>, unsigned> > vperm;
10172 auto_vec<unsigned> active_lane;
10173 vperm.create (olanes);
10174 active_lane.safe_grow_cleared (children.length (), true);
10175 for (unsigned i = 0; i < ncopies; ++i)
10177 for (unsigned pi = 0; pi < perm.length (); ++pi)
10179 std::pair<unsigned, unsigned> p = perm[pi];
10180 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10181 if (repeating_p)
10182 vperm.quick_push ({{p.first, 0}, p.second + active_lane[p.first]});
10183 else
10185 /* We checked above that the vectors are constant-length. */
10186 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype).to_constant ();
10187 unsigned vi = (active_lane[p.first] + p.second) / vnunits;
10188 unsigned vl = (active_lane[p.first] + p.second) % vnunits;
10189 vperm.quick_push ({{p.first, vi}, vl});
10192 /* Advance to the next group. */
10193 for (unsigned j = 0; j < children.length (); ++j)
10194 active_lane[j] += SLP_TREE_LANES (children[j]);
10197 if (dump_p)
10199 dump_printf_loc (MSG_NOTE, vect_location,
10200 "vectorizing permutation");
10201 for (unsigned i = 0; i < perm.length (); ++i)
10202 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10203 if (repeating_p)
10204 dump_printf (MSG_NOTE, " (repeat %d)\n", SLP_TREE_LANES (node));
10205 dump_printf (MSG_NOTE, "\n");
10206 dump_printf_loc (MSG_NOTE, vect_location, "as");
10207 for (unsigned i = 0; i < vperm.length (); ++i)
10209 if (i != 0
10210 && (repeating_p
10211 ? multiple_p (i, npatterns)
10212 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10213 dump_printf (MSG_NOTE, ",");
10214 dump_printf (MSG_NOTE, " vops%u[%u][%u]",
10215 vperm[i].first.first, vperm[i].first.second,
10216 vperm[i].second);
10218 dump_printf (MSG_NOTE, "\n");
10221 /* We can only handle two-vector permutes, everything else should
10222 be lowered on the SLP level. The following is closely inspired
10223 by vect_transform_slp_perm_load and is supposed to eventually
10224 replace it.
10225 ??? As intermediate step do code-gen in the SLP tree representation
10226 somehow? */
10227 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10228 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10229 unsigned int index = 0;
10230 poly_uint64 mask_element;
10231 vec_perm_builder mask;
10232 mask.new_vector (nunits, npatterns, nelts_per_pattern);
10233 unsigned int count = mask.encoded_nelts ();
10234 mask.quick_grow (count);
10235 vec_perm_indices indices;
10236 unsigned nperms = 0;
10237 for (unsigned i = 0; i < vperm.length (); ++i)
10239 mask_element = vperm[i].second;
10240 if (first_vec.first == -1U
10241 || first_vec == vperm[i].first)
10242 first_vec = vperm[i].first;
10243 else if (second_vec.first == -1U
10244 || second_vec == vperm[i].first)
10246 second_vec = vperm[i].first;
10247 mask_element += nunits;
10249 else
10251 if (dump_p)
10252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10253 "permutation requires at "
10254 "least three vectors\n");
10255 gcc_assert (!gsi);
10256 return -1;
10259 mask[index++] = mask_element;
10261 if (index == count)
10263 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10264 TYPE_VECTOR_SUBPARTS (op_vectype));
10265 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10266 && constant_multiple_p (mask[0], nunits));
10267 machine_mode vmode = TYPE_MODE (vectype);
10268 machine_mode op_vmode = TYPE_MODE (op_vectype);
10269 unsigned HOST_WIDE_INT c;
10270 if ((!identity_p
10271 && !can_vec_perm_const_p (vmode, op_vmode, indices))
10272 || (identity_p
10273 && !known_le (nunits,
10274 TYPE_VECTOR_SUBPARTS (op_vectype))
10275 && (!constant_multiple_p (nunits,
10276 TYPE_VECTOR_SUBPARTS (op_vectype),
10277 &c) || c != 2)))
10279 if (dump_p)
10281 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10282 vect_location,
10283 "unsupported vect permute { ");
10284 for (i = 0; i < count; ++i)
10286 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10287 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10289 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10291 gcc_assert (!gsi);
10292 return -1;
10295 if (!identity_p)
10296 nperms++;
10297 if (gsi)
10299 if (second_vec.first == -1U)
10300 second_vec = first_vec;
10302 slp_tree
10303 first_node = children[first_vec.first],
10304 second_node = children[second_vec.first];
10306 tree mask_vec = NULL_TREE;
10307 if (!identity_p)
10308 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10310 for (unsigned int vi = 0; vi < noutputs_per_mask; ++vi)
10312 tree first_def
10313 = vect_get_slp_vect_def (first_node,
10314 first_vec.second + vi);
10315 tree second_def
10316 = vect_get_slp_vect_def (second_node,
10317 second_vec.second + vi);
10318 vect_add_slp_permutation (vinfo, gsi, node, first_def,
10319 second_def, mask_vec, mask[0]);
10323 index = 0;
10324 first_vec = std::make_pair (-1U, -1U);
10325 second_vec = std::make_pair (-1U, -1U);
10329 return nperms;
10332 /* Vectorize the SLP permutations in NODE as specified
10333 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
10334 child number and lane number.
10335 Interleaving of two two-lane two-child SLP subtrees (not supported):
10336 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
10337 A blend of two four-lane two-child SLP subtrees:
10338 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
10339 Highpart of a four-lane one-child SLP subtree (not supported):
10340 [ { 0, 2 }, { 0, 3 } ]
10341 Where currently only a subset is supported by the code generated below. */
10343 static bool
10344 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10345 slp_tree node, stmt_vector_for_cost *cost_vec)
10347 tree vectype = SLP_TREE_VECTYPE (node);
10348 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
10349 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
10350 SLP_TREE_CHILDREN (node),
10351 dump_enabled_p ());
10352 if (nperms < 0)
10353 return false;
10355 if (!gsi)
10356 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
10358 return true;
10361 /* Vectorize SLP NODE. */
10363 static void
10364 vect_schedule_slp_node (vec_info *vinfo,
10365 slp_tree node, slp_instance instance)
10367 gimple_stmt_iterator si;
10368 int i;
10369 slp_tree child;
10371 /* Vectorize externals and constants. */
10372 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
10373 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
10375 /* ??? vectorizable_shift can end up using a scalar operand which is
10376 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
10377 node in this case. */
10378 if (!SLP_TREE_VECTYPE (node))
10379 return;
10381 /* There are two reasons vector defs might already exist. The first
10382 is that we are vectorizing an existing vector def. The second is
10383 when performing BB vectorization shared constant/external nodes
10384 are not split apart during partitioning so during the code-gen
10385 DFS walk we can end up visiting them twice. */
10386 if (! SLP_TREE_VEC_DEFS (node).exists ())
10387 vect_create_constant_vectors (vinfo, node);
10388 return;
10391 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
10393 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
10395 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
10396 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
10398 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10399 && STMT_VINFO_DATA_REF (stmt_info))
10401 /* Vectorized loads go before the first scalar load to make it
10402 ready early, vectorized stores go before the last scalar
10403 stmt which is where all uses are ready. */
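      /* A hypothetical GIMPLE picture of the above:

	   x_1 = a[0];    <-- vector loads are emitted before this,
	   x_2 = a[1];        the first scalar load of the group
	   ...
	   b[0] = y_3;
	   b[1] = z_4;    <-- the vector store is emitted before this,
			      the last scalar store, where y_3 and z_4
			      are both available.  */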
10404 stmt_vec_info last_stmt_info = NULL;
10405 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
10406 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
10407 else /* DR_IS_WRITE */
10408 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
10409 si = gsi_for_stmt (last_stmt_info->stmt);
10411 else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10412 && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
10413 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
10414 || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
10416 /* For PHI node vectorization we do not use the insertion iterator. */
10417 si = gsi_none ();
10419 else
10421 /* Emit other stmts after the children vectorized defs which is
10422 earliest possible. */
10423 gimple *last_stmt = NULL;
10424 bool seen_vector_def = false;
10425 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10426 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
10428 /* For fold-left reductions we are retaining the scalar
10429 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
10430 set so the representation isn't perfect. Resort to the
10431 last scalar def here. */
10432 if (SLP_TREE_VEC_DEFS (child).is_empty ())
10434 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
10435 == cycle_phi_info_type);
10436 gphi *phi = as_a <gphi *>
10437 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
10438 if (!last_stmt
10439 || vect_stmt_dominates_stmt_p (last_stmt, phi))
10440 last_stmt = phi;
10442 /* We are emitting all vectorized stmts in the same place and
10443 the last one is the last.
10444 ??? Unless we have a load permutation applied and that
10445 figures to re-use an earlier generated load. */
10446 unsigned j;
10447 tree vdef;
10448 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10450 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10451 if (!last_stmt
10452 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10453 last_stmt = vstmt;
10456 else if (!SLP_TREE_VECTYPE (child))
10458 /* For externals used unvectorized we have to look at all scalar defs. */
10459 unsigned j;
10460 tree def;
10461 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
10462 if (TREE_CODE (def) == SSA_NAME
10463 && !SSA_NAME_IS_DEFAULT_DEF (def))
10465 gimple *stmt = SSA_NAME_DEF_STMT (def);
10466 if (!last_stmt
10467 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
10468 last_stmt = stmt;
10471 else
10473 /* For externals we have to look at all defs since their
10474 insertion place is decided per vector. But beware
10475 of pre-existing vectors where we need to make sure
10476 we do not insert before the region boundary. */
10477 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
10478 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
10479 seen_vector_def = true;
10480 else
10482 unsigned j;
10483 tree vdef;
10484 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
10485 if (TREE_CODE (vdef) == SSA_NAME
10486 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
10488 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
10489 if (!last_stmt
10490 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
10491 last_stmt = vstmt;
10495 /* This can happen when all children are pre-existing vectors or
10496 constants. */
10497 if (!last_stmt)
10498 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
10499 if (!last_stmt)
10501 gcc_assert (seen_vector_def);
10502 si = gsi_after_labels (vinfo->bbs[0]);
10504 else if (is_ctrl_altering_stmt (last_stmt))
10506 /* We split regions to vectorize at control-altering stmts
10507 with a definition, so this must be an external which
10508 we can insert at the start of the region. */
10509 si = gsi_after_labels (vinfo->bbs[0]);
10511 else if (is_a <bb_vec_info> (vinfo)
10512 && SLP_TREE_CODE (node) != VEC_PERM_EXPR
10513 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
10514 && gimple_could_trap_p (stmt_info->stmt))
10516 /* We've constrained possibly trapping operations to all come
10517 from the same basic block; even if vectorized defs would allow
10518 earlier scheduling, still force vectorized stmts to the original block.
10519 This is only necessary for BB vectorization since for loop vect
10520 all operations are in a single BB and scalar-stmt-based
10521 placement doesn't play well with epilogue vectorization. */
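/* Illustrative example (hypothetical): in a basic block containing
     t0 = x0 / y0;  t1 = x1 / y1;
   the division may trap, so even when the vectorized operands would
   be available in an earlier block the vector division is emitted
   after the labels of the original block instead of being hoisted.  */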
10522 gcc_assert (dominated_by_p (CDI_DOMINATORS,
10523 gimple_bb (stmt_info->stmt),
10524 gimple_bb (last_stmt)));
10525 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
10527 else if (is_a <gphi *> (last_stmt))
10528 si = gsi_after_labels (gimple_bb (last_stmt));
10529 else
10531 si = gsi_for_stmt (last_stmt);
10532 gsi_next (&si);
10534 /* Avoid scheduling internal defs outside of the loop when
10535 we might have only implicitly tracked loop mask/len defs. */
10536 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
10537 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10538 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
10540 gimple_stmt_iterator si2
10541 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
10542 if ((gsi_end_p (si2)
10543 && (LOOP_VINFO_LOOP (loop_vinfo)->header
10544 != gimple_bb (last_stmt))
10545 && dominated_by_p (CDI_DOMINATORS,
10546 LOOP_VINFO_LOOP (loop_vinfo)->header,
10547 gimple_bb (last_stmt)))
10548 || (!gsi_end_p (si2)
10549 && last_stmt != *si2
10550 && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
10551 si = si2;
10556 /* Handle purely internal nodes. */
10557 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
10559 if (dump_enabled_p ())
10560 dump_printf_loc (MSG_NOTE, vect_location,
10561 "------>vectorizing SLP permutation node\n");
10562 /* ??? The transform kind is stored in STMT_VINFO_TYPE which might
10563 be shared with different SLP nodes (but usually it's the same
10564 operation, apart from the case where the stmt is only there to denote
10565 the actual scalar lane defs ...). So do not call vect_transform_stmt
10566 but open-code it here (partly). */
10567 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
10568 gcc_assert (done);
10569 stmt_vec_info slp_stmt_info;
10570 unsigned int i;
10571 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
10572 if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
10574 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
10575 instance, i, true, NULL);
10576 gcc_assert (done);
10579 else
10581 if (dump_enabled_p ())
10582 dump_printf_loc (MSG_NOTE, vect_location,
10583 "------>vectorizing SLP node starting from: %G",
10584 stmt_info->stmt);
10585 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
10589 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
10590 For loop vectorization this is done in vectorizable_call, but for SLP
10591 it needs to be deferred until the end of vect_schedule_slp, because multiple
10592 SLP instances may refer to the same scalar stmt. */
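/* For illustration only (a hypothetical example, not from the sources):
   after a call such as
     x_1 = sqrtf (a_2);
   has been vectorized, the scalar stmt is rewritten to
     x_1 = 0.0f;
   which keeps the SSA form valid until DCE removes the dead assignment;
   a call without a lhs is replaced by a GIMPLE_NOP instead.  */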
10594 static void
10595 vect_remove_slp_scalar_calls (vec_info *vinfo,
10596 slp_tree node, hash_set<slp_tree> &visited)
10598 gimple *new_stmt;
10599 gimple_stmt_iterator gsi;
10600 int i;
10601 slp_tree child;
10602 tree lhs;
10603 stmt_vec_info stmt_info;
10605 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10606 return;
10608 if (visited.add (node))
10609 return;
10611 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10612 vect_remove_slp_scalar_calls (vinfo, child, visited);
10614 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
10616 if (!stmt_info)
10617 continue;
10618 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
10619 if (!stmt || gimple_bb (stmt) == NULL)
10620 continue;
10621 if (is_pattern_stmt_p (stmt_info)
10622 || !PURE_SLP_STMT (stmt_info))
10623 continue;
10624 lhs = gimple_call_lhs (stmt);
10625 if (lhs)
10626 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
10627 else
10629 new_stmt = gimple_build_nop ();
10630 unlink_stmt_vdef (stmt_info->stmt);
10632 gsi = gsi_for_stmt (stmt);
10633 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
10634 if (lhs)
10635 SSA_NAME_DEF_STMT (lhs) = new_stmt;
10639 static void
10640 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
10642 hash_set<slp_tree> visited;
10643 vect_remove_slp_scalar_calls (vinfo, node, visited);
10646 /* Vectorize the instance root. */
10648 void
10649 vectorize_slp_instance_root_stmt (slp_tree node, slp_instance instance)
10651 gassign *rstmt = NULL;
10653 if (instance->kind == slp_inst_kind_ctor)
10655 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
10657 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
10658 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10659 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
10660 TREE_TYPE (vect_lhs)))
10661 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
10662 vect_lhs);
10663 rstmt = gimple_build_assign (root_lhs, vect_lhs);
10665 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
10667 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10668 tree child_def;
10669 int j;
10670 vec<constructor_elt, va_gc> *v;
10671 vec_alloc (v, nelts);
10673 /* A CTOR can handle V16HI composition from VNx8HI so we
10674 do not need to convert vector elements if the types
10675 do not match. */
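/* E.g. (illustrative only): two V8HI vector defs can simply be
   appended as CONSTRUCTOR elements of a V16HI result; no per-element
   conversion is performed even when the piece vector types do not
   exactly match the result vector type.  */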
10676 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
10677 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
10678 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
10679 tree rtype
10680 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
10681 tree r_constructor = build_constructor (rtype, v);
10682 rstmt = gimple_build_assign (lhs, r_constructor);
10685 else if (instance->kind == slp_inst_kind_bb_reduc)
10687 /* Largely inspired by reduction chain epilogue handling in
10688 vect_create_epilog_for_reduction. */
10689 vec<tree> vec_defs = vNULL;
10690 vect_get_slp_defs (node, &vec_defs);
10691 enum tree_code reduc_code
10692 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
10693 /* ??? We actually have to reflect signs somewhere. */
10694 if (reduc_code == MINUS_EXPR)
10695 reduc_code = PLUS_EXPR;
10696 gimple_seq epilogue = NULL;
10697 /* We may end up with more than one vector result; reduce them
10698 to one vector. */
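/* A rough sketch of the epilogue built below, assuming two vector
   defs and a PLUS_EXPR reduction (illustrative only):
     tmp_1 = vec_def0 + vec_def1;
     res_2 = .REDUC_PLUS (tmp_1);
   with the computation punned to the unsigned vector type first when
   signed overflow would be undefined, and any remaining scalar defs
   accumulated into the reduced result at the end.  */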
10699 tree vec_def = vec_defs[0];
10700 tree vectype = TREE_TYPE (vec_def);
10701 tree compute_vectype = vectype;
10702 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
10703 && TYPE_OVERFLOW_UNDEFINED (vectype)
10704 && operation_can_overflow (reduc_code));
10705 if (pun_for_overflow_p)
10707 compute_vectype = unsigned_type_for (vectype);
10708 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10709 compute_vectype, vec_def);
10711 for (unsigned i = 1; i < vec_defs.length (); ++i)
10713 tree def = vec_defs[i];
10714 if (pun_for_overflow_p)
10715 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
10716 compute_vectype, def);
10717 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
10718 vec_def, def);
10720 vec_defs.release ();
10721 /* ??? Support schemes other than a direct internal fn. */
10722 internal_fn reduc_fn;
10723 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
10724 || reduc_fn == IFN_LAST)
10725 gcc_unreachable ();
10726 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
10727 TREE_TYPE (compute_vectype), vec_def);
10728 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
10730 tree rem_def = NULL_TREE;
10731 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
10733 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
10734 if (!rem_def)
10735 rem_def = def;
10736 else
10737 rem_def = gimple_build (&epilogue, reduc_code,
10738 TREE_TYPE (scalar_def),
10739 rem_def, def);
10741 scalar_def = gimple_build (&epilogue, reduc_code,
10742 TREE_TYPE (scalar_def),
10743 scalar_def, rem_def);
10745 scalar_def = gimple_convert (&epilogue,
10746 TREE_TYPE (vectype), scalar_def);
10747 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10748 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
10749 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
10750 update_stmt (gsi_stmt (rgsi));
10751 return;
10753 else
10754 gcc_unreachable ();
10756 gcc_assert (rstmt);
10758 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
10759 gsi_replace (&rgsi, rstmt, true);
10762 struct slp_scc_info
10764 bool on_stack;
10765 int dfs;
10766 int lowlink;
10769 /* Schedule the SLP INSTANCE, doing a DFS walk and collecting SCCs. */
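/* The walk below is a variant of Tarjan's SCC algorithm: each node
   gets a DFS number and a lowlink, nodes stay on STACK while their SCC
   is open, and when lowlink == dfs the node closes an SCC which is then
   scheduled as a whole.  Cycles in the SLP graph go through PHI nodes,
   so (as an informal example) for
     x_phi -> x_add -> x_phi
   the PHI can be code generated first and its backedge value is filled
   in afterwards by the PHI fixup at the end of the function.  */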
10771 static void
10772 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
10773 hash_map<slp_tree, slp_scc_info> &scc_info,
10774 int &maxdfs, vec<slp_tree> &stack)
10776 bool existed_p;
10777 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
10778 gcc_assert (!existed_p);
10779 info->dfs = maxdfs;
10780 info->lowlink = maxdfs;
10781 maxdfs++;
10783 /* Leaf. */
10784 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
10786 info->on_stack = false;
10787 vect_schedule_slp_node (vinfo, node, instance);
10788 return;
10791 info->on_stack = true;
10792 stack.safe_push (node);
10794 unsigned i;
10795 slp_tree child;
10796 /* DFS recurse. */
10797 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
10799 if (!child)
10800 continue;
10801 slp_scc_info *child_info = scc_info.get (child);
10802 if (!child_info)
10804 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
10805 /* Recursion might have re-allocated the hash map entries, invalidating INFO. */
10806 info = scc_info.get (node);
10807 child_info = scc_info.get (child);
10808 info->lowlink = MIN (info->lowlink, child_info->lowlink);
10810 else if (child_info->on_stack)
10811 info->lowlink = MIN (info->lowlink, child_info->dfs);
10813 if (info->lowlink != info->dfs)
10814 return;
10816 auto_vec<slp_tree, 4> phis_to_fixup;
10818 /* Singleton. */
10819 if (stack.last () == node)
10821 stack.pop ();
10822 info->on_stack = false;
10823 vect_schedule_slp_node (vinfo, node, instance);
10824 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
10825 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
10826 phis_to_fixup.quick_push (node);
10828 else
10830 /* SCC. */
10831 int last_idx = stack.length () - 1;
10832 while (stack[last_idx] != node)
10833 last_idx--;
10834 /* We can break the cycle at PHIs which have at least one
10835 code-generated child. Then we could re-start the DFS walk until
10836 all nodes in the SCC are covered (we might have new entries
10837 for only back-reachable nodes). But it's simpler to just
10838 iterate and schedule those that are ready. */
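/* Informal example: for an SCC { x_phi, x_add } where x_add uses x_phi
   and x_phi's backedge argument is x_add, the PHI already has its
   initial-value child code generated, so it counts as ready and is
   scheduled first; x_add then becomes ready in the following iteration
   and TODO drops to zero.  */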
10839 unsigned todo = stack.length () - last_idx;
10842 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
10844 slp_tree entry = stack[idx];
10845 if (!entry)
10846 continue;
10847 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
10848 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
10849 bool ready = !phi;
10850 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
10851 if (!child)
10853 gcc_assert (phi);
10854 ready = true;
10855 break;
10857 else if (scc_info.get (child)->on_stack)
10859 if (!phi)
10861 ready = false;
10862 break;
10865 else
10867 if (phi)
10869 ready = true;
10870 break;
10873 if (ready)
10875 vect_schedule_slp_node (vinfo, entry, instance);
10876 scc_info.get (entry)->on_stack = false;
10877 stack[idx] = NULL;
10878 todo--;
10879 if (phi)
10880 phis_to_fixup.safe_push (entry);
10884 while (todo != 0);
10886 /* Pop the SCC. */
10887 stack.truncate (last_idx);
10890 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
10891 slp_tree phi_node;
10892 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
10894 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
10895 edge_iterator ei;
10896 edge e;
10897 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
10899 unsigned dest_idx = e->dest_idx;
10900 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
10901 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
10902 continue;
10903 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
10904 /* Simply fill all args. */
10905 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
10906 != vect_first_order_recurrence)
10907 for (unsigned i = 0; i < n; ++i)
10909 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
10910 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10911 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
10912 e, gimple_phi_arg_location (phi, dest_idx));
10914 else
10916 /* Unless it is a first-order recurrence, which needs
10917 args filled in for both the PHI node and the permutes. */
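/* Illustrative sketch: with two vector defs the recurrence is realized
   as a PHI feeding a chain of VEC_PERM_EXPRs, roughly
     vphi   = PHI <init, vdef_1>
     perm_0 = VEC_PERM <vphi,   vdef_0, sel>
     perm_1 = VEC_PERM <vdef_0, vdef_1, sel>
   so besides the PHI backedge also the permutes' operands have to be
   filled in with the child's vector defs here.  */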
10918 gimple *perm
10919 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
10920 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
10921 add_phi_arg (as_a <gphi *> (rphi),
10922 vect_get_slp_vect_def (child, n - 1),
10923 e, gimple_phi_arg_location (phi, dest_idx));
10924 for (unsigned i = 0; i < n; ++i)
10926 gimple *perm
10927 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
10928 if (i > 0)
10929 gimple_assign_set_rhs1 (perm,
10930 vect_get_slp_vect_def (child, i - 1));
10931 gimple_assign_set_rhs2 (perm,
10932 vect_get_slp_vect_def (child, i));
10933 update_stmt (perm);
10940 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
10942 void
10943 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
10945 slp_instance instance;
10946 unsigned int i;
10948 hash_map<slp_tree, slp_scc_info> scc_info;
10949 int maxdfs = 0;
10950 FOR_EACH_VEC_ELT (slp_instances, i, instance)
10952 slp_tree node = SLP_INSTANCE_TREE (instance);
10953 if (dump_enabled_p ())
10955 dump_printf_loc (MSG_NOTE, vect_location,
10956 "Vectorizing SLP tree:\n");
10957 /* ??? Dump all? */
10958 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
10959 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
10960 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
10961 vect_print_slp_graph (MSG_NOTE, vect_location,
10962 SLP_INSTANCE_TREE (instance));
10964 /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
10965 a PHI is the node breaking the cycle. */
10966 auto_vec<slp_tree> stack;
10967 if (!scc_info.get (node))
10968 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
10970 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
10971 vectorize_slp_instance_root_stmt (node, instance);
10973 if (dump_enabled_p ())
10974 dump_printf_loc (MSG_NOTE, vect_location,
10975 "vectorizing stmts using SLP.\n");
10978 FOR_EACH_VEC_ELT (slp_instances, i, instance)
10980 slp_tree root = SLP_INSTANCE_TREE (instance);
10981 stmt_vec_info store_info;
10982 unsigned int j;
10984 /* Remove scalar call stmts. Do not do this for basic-block
10985 vectorization as not all uses may be vectorized.
10986 ??? Why should this be necessary? DCE should be able to
10987 remove the stmts itself.
10988 ??? For BB vectorization we can as well remove scalar
10989 stmts starting from the SLP tree root if they have no
10990 uses. */
10991 if (is_a <loop_vec_info> (vinfo))
10992 vect_remove_slp_scalar_calls (vinfo, root);
10994 /* Remove the vectorized stores' original scalar stmts. */
10995 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
10997 if (!STMT_VINFO_DATA_REF (store_info)
10998 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
10999 break;
11001 store_info = vect_orig_stmt (store_info);
11002 /* Free the attached stmt_vec_info and remove the stmt. */
11003 vinfo->remove_stmt (store_info);
11005 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it,
11006 so we do not crash in vect_free_slp_tree later. */
11007 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11008 SLP_TREE_REPRESENTATIVE (root) = NULL;