/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2025 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#define INCLUDE_ALGORITHM
#include "system.h"
#include "coretypes.h"
#include "backend.h"
#include "target.h"
#include "rtl.h"
#include "tree.h"
#include "gimple.h"
#include "tree-pass.h"
#include "ssa.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"		/* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "cfgloop.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "dbgcnt.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "cfganal.h"
#include "tree-eh.h"
#include "tree-cfg.h"
#include "alloc-pool.h"
#include "sreal.h"
#include "predict.h"
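
/* Forward declarations of static helpers defined later in this file.  */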
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
					    load_permutation_t &,
					    const vec<tree> &,
					    gimple_stmt_iterator *,
					    poly_uint64, bool, bool,
					    unsigned *,
					    unsigned * = nullptr,
					    bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
					    slp_tree, lane_permutation_t &,
					    vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
					  slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);
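
/* Object pool from which SLP tree nodes are allocated, and the head of a
   list chaining all nodes still allocated so that vect_slp_fini can
   release them.  */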
static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}
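
/* _slp_tree nodes are carved out of slp_tree_pool via these class-specific
   operator new/delete overloads.  */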
void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}

/* Initialize an SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  this->ldst_lanes = false;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}

/* Tear down an SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}

/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}

/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}

/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}

/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}

/* Create an SLP node with code CODE and room for NOPS children.  */

slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
			  vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}

/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;

/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}

/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}

/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}

/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (stmt_info && is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}

/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}

/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
				      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
	return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
	result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}

/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
				tree elt_type, unsigned int *nvectors_out,
				tree *vector_type_out,
				tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
	{
	  /* Get the natural vector type for this SLP group size.  */
	  tree int_type = build_nonstandard_integer_type
	    (GET_MODE_BITSIZE (int_mode), 1);
	  tree vector_type
	    = get_vectype_for_scalar_type (vinfo, int_type, count);
	  poly_int64 half_nelts;
	  if (vector_type
	      && VECTOR_MODE_P (TYPE_MODE (vector_type))
	      && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
			   GET_MODE_SIZE (base_vector_mode))
	      && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
			     2, &half_nelts))
	    {
	      /* Try fusing consecutive sequences of COUNT / NVECTORS elements
		 together into elements of type INT_TYPE and using the result
		 to build NVECTORS vectors.  */
	      poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
	      vec_perm_builder sel1 (nelts, 2, 3);
	      vec_perm_builder sel2 (nelts, 2, 3);

	      for (unsigned int i = 0; i < 3; ++i)
		{
		  sel1.quick_push (i);
		  sel1.quick_push (i + nelts);
		  sel2.quick_push (half_nelts + i);
		  sel2.quick_push (half_nelts + i + nelts);
		}
	      vec_perm_indices indices1 (sel1, 2, nelts);
	      vec_perm_indices indices2 (sel2, 2, nelts);
	      machine_mode vmode = TYPE_MODE (vector_type);
	      if (can_vec_perm_const_p (vmode, vmode, indices1)
		  && can_vec_perm_const_p (vmode, vmode, indices2))
		{
		  if (nvectors_out)
		    *nvectors_out = nvectors;
		  if (vector_type_out)
		    *vector_type_out = vector_type;
		  if (permutes)
		    {
		      permutes[0] = vect_gen_perm_mask_checked (vector_type,
								indices1);
		      permutes[1] = vect_gen_perm_mask_checked (vector_type,
								indices2);
		    }
		  return true;
		}
	    }
	}
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
	return false;
      nvectors *= 2;
      /* We need to be able to fuse COUNT / NVECTORS elements together.  */
      if (!multiple_p (count, nvectors))
	return false;
    }
}

/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
	  || ((dta == vect_external_def || dta == vect_constant_def)
	      && (dtb == vect_external_def || dtb == vect_constant_def)));
}
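
/* Operand index maps returned by vect_get_operand_map below.  Each map
   starts with the number of SLP child nodes, followed by the gimple
   argument index associated with each child (see the comment before
   vect_get_operand_map for the meaning of the negative indexes).  */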
static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int no_arg_map[] = { 0 };
static const int arg0_map[] = { 1, 0 };
static const int arg1_map[] = { 1, 1 };
static const int arg2_arg3_map[] = { 2, 2, 3 };
static const int arg1_arg3_map[] = { 2, 1, 3 };
static const int arg1_arg4_arg5_map[] = { 3, 1, 4, 5 };
static const int arg1_arg3_arg4_map[] = { 3, 1, 3, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_arg3_map[] = { 3, -3, 2, 3 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};

/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
		      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
	  && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
	gcc_unreachable ();
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
	  && swap)
	return op1_op0_map;
      if (gather_scatter_p)
	return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
		? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
	switch (gimple_call_internal_fn (call))
	  {
	  case IFN_MASK_LOAD:
	    return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;

	  case IFN_GATHER_LOAD:
	    return arg1_map;

	  case IFN_MASK_GATHER_LOAD:
	  case IFN_MASK_LEN_GATHER_LOAD:
	    return arg1_arg4_arg5_map;

	  case IFN_SCATTER_STORE:
	    return arg1_arg3_map;

	  case IFN_MASK_SCATTER_STORE:
	  case IFN_MASK_LEN_SCATTER_STORE:
	    return arg1_arg3_arg4_map;

	  case IFN_MASK_STORE:
	    return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

	  case IFN_MASK_CALL:
	    {
	      unsigned nargs = gimple_call_num_args (call);
	      if (nargs >= 2 && nargs <= 7)
		return mask_call_maps[nargs-2];
	      else
		return nullptr;
	    }

	  case IFN_CLZ:
	  case IFN_CTZ:
	    return arg0_map;

	  case IFN_GOMP_SIMD_LANE:
	    return no_arg_map;

	  default:
	    break;
	  }
    }
  return nullptr;
}

/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
				  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}

/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of the parent node of this one, return 1; if everything
   is ok return 0.  */
static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
			     bool *skip_args,
			     vec<stmt_vec_info> stmts, unsigned stmt_num,
			     vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!stmt_info)
    {
      for (auto oi : *oprnds_info)
	{
	  oi->def_stmts.quick_push (NULL);
	  oi->ops.quick_push (NULL_TREE);
	}
      return 0;
    }

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
			    STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
	{
	  internal_fn ifn = gimple_call_internal_fn (stmt);
	  commutative_op = first_commutative_argument (ifn);
	}
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
	commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
	{
	  gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
	  if (!is_a <loop_vec_info> (vinfo)
	      || !vect_check_gather_scatter (stmt_info,
					     as_a <loop_vec_info> (vinfo),
					     first ? &oprnd_info->first_gs_info
					     : &gs_info))
	    return -1;

	  if (first)
	    {
	      oprnd_info->first_gs_p = true;
	      oprnd = oprnd_info->first_gs_info.offset;
	    }
	  else
	    {
	      gs_op = i;
	      oprnd = gs_info.offset;
	    }
	}
      else if (opno < 0)
	oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
	{
	  oprnd = gimple_arg (stmt_info->stmt, opno);
	  if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
	    {
	      edge e = gimple_phi_arg_edge (stmt, opno);
	      backedge = (is_a <bb_vec_info> (vinfo)
			  ? e->flags & EDGE_DFS_BACK
			  : dominated_by_p (CDI_DOMINATORS, e->src,
					    gimple_bb (stmt_info->stmt)));
	    }
	}
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
	oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: can't analyze def for %T\n",
			     oprnd);

	  return -1;
	}

      if (skip_args[i])
	{
	  oprnd_info->def_stmts.quick_push (NULL);
	  oprnd_info->ops.quick_push (NULL_TREE);
	  oprnd_info->first_dt = vect_uninitialized_def;
	  continue;
	}

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
	  && is_pattern_stmt_p (def_stmt_info))
	{
	  if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
	      != def_stmt_info)
	    oprnd_info->any_pattern = true;
	  else
	    /* If we promote this to external use the original stmt def.  */
	    oprnd_info->ops.last ()
	      = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
	}

      /* If there's an extern def on a backedge make sure we can
	 code-generate at the region start.
	 ??? This is another case that could be fixed by adjusting
	 how we split the function but at the moment we'd have conflicting
	 goals there.  */
      if (backedge
	  && dts[i] == vect_external_def
	  && is_a <bb_vec_info> (vinfo)
	  && TREE_CODE (oprnd) == SSA_NAME
	  && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
	  && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
			      gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: extern def %T only defined "
			     "on backedge\n", oprnd);
	  return -1;
	}

      if (first)
	{
	  tree type = TREE_TYPE (oprnd);
	  dt = dts[i];

	  /* For the swapping logic below force vect_reduction_def
	     for the reduction op in an SLP reduction group.  */
	  if (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
	      && def_stmt_info)
	    dts[i] = dt = vect_reduction_def;

	  /* Check the types of the definition.  */
	  switch (dt)
	    {
	    case vect_external_def:
	    case vect_constant_def:
	    case vect_internal_def:
	    case vect_reduction_def:
	    case vect_double_reduction_def:
	    case vect_induction_def:
	    case vect_nested_cycle:
	    case vect_first_order_recurrence:
	      break;

	    default:
	      /* FORNOW: Not supported.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: illegal type of def %T\n",
				 oprnd);
	      return -1;
	    }

	  oprnd_info->first_dt = dt;
	  oprnd_info->first_op_type = type;
	}
    }
  if (first)
    return 0;

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
	{
	  ++i;
	  continue;
	}

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: different operand types\n");
	  return 1;
	}

      if ((gs_op == i) != oprnd_info->first_gs_p)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: mixed gather and non-gather\n");
	  return 1;
	}
      else if (gs_op == i)
	{
	  if (!operand_equal_p (oprnd_info->first_gs_info.base,
				gs_info.base))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather base\n");
	      return 1;
	    }
	  if (oprnd_info->first_gs_info.scale != gs_info.scale)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different gather scale\n");
	      return 1;
	    }
	}

      /* Not first stmt of the group, check that the def-stmt/s match
	 the def-stmt/s of the first stmt.  Allow different definition
	 types for reduction chains: the first stmt must be a
	 vect_reduction_def (a phi node), and the rest
	 end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
	   && !(oprnd_info->first_dt == vect_reduction_def
		&& !STMT_VINFO_DATA_REF (stmt_info)
		&& REDUC_GROUP_FIRST_ELEMENT (stmt_info)
		&& def_stmt_info
		&& !STMT_VINFO_DATA_REF (def_stmt_info)
		&& (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
	  || (!STMT_VINFO_DATA_REF (stmt_info)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	      && ((!def_stmt_info
		   || STMT_VINFO_DATA_REF (def_stmt_info)
		   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
		  != (oprnd_info->first_dt != vect_reduction_def))))
	{
	  /* Try swapping operands if we got a mismatch.  For BB
	     vectorization only in case it will clearly improve things.  */
	  if (i == commutative_op && !swapped
	      && (!is_a <bb_vec_info> (vinfo)
		  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
					     dts[i+1])
		      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
			  || vect_def_types_match
			       ((*oprnds_info)[i+1]->first_dt, dts[i])))))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "trying swapped operands\n");
	      std::swap (dts[i], dts[i+1]);
	      std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
			 (*oprnds_info)[i+1]->def_stmts[stmt_num]);
	      std::swap ((*oprnds_info)[i]->ops[stmt_num],
			 (*oprnds_info)[i+1]->ops[stmt_num]);
	      /* After swapping some operands we lost track whether an
		 operand has any pattern defs so be conservative here.  */
	      if ((*oprnds_info)[i]->any_pattern
		  || (*oprnds_info)[i+1]->any_pattern)
		(*oprnds_info)[i]->any_pattern
		  = (*oprnds_info)[i+1]->any_pattern = true;
	      swapped = true;
	      continue;
	    }

	  if (is_a <bb_vec_info> (vinfo)
	      && !oprnd_info->any_pattern
	      && number_of_oprnds > 1)
	    {
	      /* Now for commutative ops we should see whether we can
		 make the other operand match.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "treating operand as external\n");
	      oprnd_info->first_dt = dt = vect_external_def;
	    }
	  else
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different types\n");
	      return 1;
	    }
	}

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
	oprnd_info->first_dt = vect_external_def;
      /* For an SLP reduction chain we want to duplicate the reduction to
	 each of the chain members.  That gets us a sane SLP graph (still
	 the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
		|| dt == vect_reduction_def)
	       && oprnd_info->first_dt == vect_reduction_def
	       && !STMT_VINFO_DATA_REF (stmt_info)
	       && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
	       && !STMT_VINFO_DATA_REF (def_stmt_info)
	       && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
		   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	{
	  oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
	  oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
	}

      ++i;
    }

  /* Swap operands.  */
  if (swapped)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "swapped operands to match def types in %G",
			 stmt_info->stmt);
    }

  return 0;
}

/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
			       TREE_TYPE (gimple_call_lhs (call2))))
	return false;
      for (unsigned int i = 0; i < nargs; ++i)
	if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
				 TREE_TYPE (gimple_call_arg (call2, i))))
	  return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
			    gimple_call_fn (call2), 0))
	return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
	return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
	if (mapi < nkept && map[mapi] == int (i))
	  mapi += 1;
	else if (!operand_equal_p (gimple_call_arg (call1, i),
				   gimple_call_arg (call2, i)))
	  return false;
    }

  return true;
}

/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
			unsigned int group_size,
			tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unsupported data-type in %G\n",
			 stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "Build SLP failed: unrolling required "
			 "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}

/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits, bool *matches,
		       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
				       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
				  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype);
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      swap[i] = 0;
      matches[i] = false;
      if (!stmt_info)
	{
	  matches[i] = true;
	  continue;
	}

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, that may
	 throw or that have volatile ops.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
	  || stmt_can_throw_internal (cfun, stmt)
	  || gimple_has_volatile_ops (stmt))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: unvectorizable statement %G",
			     stmt);
	  /* ??? For BB vectorization we want to commutate operands in a way
	     to shuffle all unvectorizable defs into one operand and have
	     the other still vectorized.  The following doesn't reliably
	     work for this though but it's the easiest we can do here.  */
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
	  && (!call_stmt
	      || !gimple_call_internal_p (stmt)
	      || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "Build SLP failed: not GIMPLE_ASSIGN nor "
			     "GIMPLE_CALL %G", stmt);
	  if (is_a <bb_vec_info> (vinfo) && i != 0)
	    continue;
	  /* Fatal mismatch.  */
	  matches[0] = false;
	  return false;
	}

      if (call_stmt)
	{
	  combined_fn cfn = gimple_call_combined_fn (call_stmt);
	  if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
	    rhs_code = cfn;
	  else
	    rhs_code = CALL_EXPR;

	  if (cfn == CFN_MASK_LOAD
	      || cfn == CFN_GATHER_LOAD
	      || cfn == CFN_MASK_GATHER_LOAD
	      || cfn == CFN_MASK_LEN_GATHER_LOAD
	      || cfn == CFN_SCATTER_STORE
	      || cfn == CFN_MASK_SCATTER_STORE
	      || cfn == CFN_MASK_LEN_SCATTER_STORE)
	    ldst_p = true;
	  else if (cfn == CFN_MASK_STORE)
	    {
	      ldst_p = true;
	      rhs_code = CFN_MASK_STORE;
	    }
	  else if (cfn == CFN_GOMP_SIMD_LANE)
	    ;
	  else if ((cfn != CFN_LAST
		    && cfn != CFN_MASK_CALL
		    && internal_fn_p (cfn)
		    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
		   || gimple_call_tail_p (call_stmt)
		   || gimple_call_noreturn_p (call_stmt)
		   || gimple_call_chain (call_stmt))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: unsupported call type %G",
				 (gimple *) call_stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      else if (gimple_code (stmt) == GIMPLE_PHI)
	{
	  rhs_code = ERROR_MARK;
	  phi_p = true;
	}
      else
	{
	  rhs_code = gimple_assign_rhs_code (stmt);
	  ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
	}

      /* Check the operation.  */
      if (i == 0)
	{
	  first_lhs = lhs;
	  first_stmt_code = rhs_code;
	  first_stmt_ldst_p = ldst_p;
	  first_stmt_phi_p = phi_p;
	  first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

	  /* Shift arguments should be equal in all the packed stmts for a
	     vector shift with scalar shift operand.  */
	  if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
	      || rhs_code == LROTATE_EXPR
	      || rhs_code == RROTATE_EXPR)
	    {
	      /* First see if we have a vector/vector shift.  */
	      if (!directly_supported_p (rhs_code, vectype, optab_vector))
		{
		  /* No vector/vector shift, try for a vector/scalar shift.  */
		  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Build SLP failed: "
					 "op not supported by target.\n");
		      if (is_a <bb_vec_info> (vinfo) && i != 0)
			continue;
		      /* Fatal mismatch.  */
		      matches[0] = false;
		      return false;
		    }
		  need_same_oprnds = true;
		  first_op1 = gimple_assign_rhs2 (stmt);
		}
	    }
	  else if (rhs_code == WIDEN_LSHIFT_EXPR)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_assign_rhs2 (stmt);
	    }
	  else if (!ldst_p
		   && rhs_code == BIT_FIELD_REF)
	    {
	      tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
	      if (!is_a <bb_vec_info> (vinfo)
		  || TREE_CODE (vec) != SSA_NAME
		  /* When the element types are not compatible we pun the
		     source to the target vectype which requires equal size.  */
		  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
		       || !types_compatible_p (TREE_TYPE (vectype),
					       TREE_TYPE (TREE_TYPE (vec))))
		      && !operand_equal_p (TYPE_SIZE (vectype),
					   TYPE_SIZE (TREE_TYPE (vec)))))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: "
				     "BIT_FIELD_REF not supported\n");
		  /* Fatal mismatch.  */
		  matches[0] = false;
		  return false;
		}
	    }
	  else if (rhs_code == CFN_DIV_POW2)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	  else if (rhs_code == CFN_GOMP_SIMD_LANE)
	    {
	      need_same_oprnds = true;
	      first_op1 = gimple_call_arg (call_stmt, 1);
	    }
	}
      else
	{
	  if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
	      /* For SLP reduction groups the index isn't necessarily
		 uniform but only that of the first stmt matters.  */
	      && !(first_reduc_idx != -1
		   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
		   && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different reduc_idx "
				   "%d instead of %d in %G",
				   STMT_VINFO_REDUC_IDX (stmt_info),
				   first_reduc_idx, stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }
	  if (!ldst_p
	      && first_stmt_code != rhs_code
	      && alt_stmt_code == ERROR_MARK)
	    alt_stmt_code = rhs_code;
	  if ((!ldst_p
	       && first_stmt_code != rhs_code
	       && (first_stmt_code != IMAGPART_EXPR
		   || rhs_code != REALPART_EXPR)
	       && (first_stmt_code != REALPART_EXPR
		   || rhs_code != IMAGPART_EXPR)
	       /* Handle mismatches in plus/minus by computing both
		  and merging the results.  */
	       && !((first_stmt_code == PLUS_EXPR
		     || first_stmt_code == MINUS_EXPR)
		    && (alt_stmt_code == PLUS_EXPR
			|| alt_stmt_code == MINUS_EXPR)
		    && rhs_code == alt_stmt_code)
	       && !(first_stmt_code.is_tree_code ()
		    && rhs_code.is_tree_code ()
		    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
			== tcc_comparison)
		    && (swap_tree_comparison (tree_code (first_stmt_code))
			== tree_code (rhs_code))))
	      || (ldst_p
		  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
		      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
	      || (ldst_p
		  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
		      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
	      || first_stmt_ldst_p != ldst_p
	      || first_stmt_phi_p != phi_p)
	    {
	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "Build SLP failed: different operation "
				   "in stmt %G", stmt);
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "original stmt %G", first_stmt_info->stmt);
		}
	      /* Mismatch.  */
	      continue;
	    }

	  if (!ldst_p
	      && first_stmt_code == BIT_FIELD_REF
	      && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
		  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BIT_FIELD_REF "
				 "arguments in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (call_stmt
	      && first_stmt_code != CFN_MASK_LOAD
	      && first_stmt_code != CFN_MASK_STORE)
	    {
	      if (!is_a <gcall *> (stmts[0]->stmt)
		  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
					  call_stmt))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different calls in %G",
				     stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
	      && (gimple_bb (first_stmt_info->stmt)
		  != gimple_bb (stmt_info->stmt)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different BB for PHI "
				 "or possibly trapping operation in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }

	  if (need_same_oprnds)
	    {
	      tree other_op1 = gimple_arg (stmt, 1);
	      if (!operand_equal_p (first_op1, other_op1, 0))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different shift "
				     "arguments in %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (first_lhs
	      && lhs
	      && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: different vector type "
				 "in %G", stmt);
	      /* Mismatch.  */
	      continue;
	    }
	}

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  gcc_assert (ldst_p);
	  if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
	    {
	      /* Store.  */
	      gcc_assert (rhs_code == CFN_MASK_STORE
			  || REFERENCE_CLASS_P (lhs)
			  || DECL_P (lhs));
	    }
	  else
	    {
	      /* Load.  */
	      first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
	      if (prev_first_load)
		{
		  /* Check that there are no loads from different interleaving
		     chains in the same node.  */
		  if (prev_first_load != first_load)
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION,
					 vect_location,
					 "Build SLP failed: different "
					 "interleaving chains in one node %G",
					 stmt);
		      /* Mismatch.  */
		      continue;
		    }
		}
	      else
		prev_first_load = first_load;
	    }
	}
      /* Non-grouped store or load.  */
      else if (ldst_p)
	{
	  if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
	      && rhs_code != CFN_GATHER_LOAD
	      && rhs_code != CFN_MASK_GATHER_LOAD
	      && rhs_code != CFN_MASK_LEN_GATHER_LOAD
	      && rhs_code != CFN_SCATTER_STORE
	      && rhs_code != CFN_MASK_SCATTER_STORE
	      && rhs_code != CFN_MASK_LEN_SCATTER_STORE
	      && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      /* Not grouped loads are handled as externals for BB
		 vectorization.  For loop vectorization we can handle
		 splats the same we handle single element interleaving.  */
	      && (is_a <bb_vec_info> (vinfo)
		  || stmt_info != first_stmt_info))
	    {
	      /* Not grouped load.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: not grouped load %G", stmt);

	      if (i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }
	}
      /* Not memory operation.  */
      else
	{
	  if (!phi_p
	      && rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
	      && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
	      && rhs_code != VIEW_CONVERT_EXPR
	      && rhs_code != CALL_EXPR
	      && rhs_code != BIT_FIELD_REF
	      && rhs_code != SSA_NAME)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: operation unsupported %G",
				 stmt);
	      if (is_a <bb_vec_info> (vinfo) && i != 0)
		continue;
	      /* Fatal mismatch.  */
	      matches[0] = false;
	      return false;
	    }

	  if (rhs_code == COND_EXPR)
	    {
	      tree cond_expr = gimple_assign_rhs1 (stmt);
	      enum tree_code cond_code = TREE_CODE (cond_expr);
	      enum tree_code swap_code = ERROR_MARK;
	      enum tree_code invert_code = ERROR_MARK;

	      if (i == 0)
		first_cond_code = TREE_CODE (cond_expr);
	      else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
		{
		  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
		  swap_code = swap_tree_comparison (cond_code);
		  invert_code = invert_tree_comparison (cond_code, honor_nans);
		}

	      if (first_cond_code == cond_code)
		;
	      /* Isomorphic can be achieved by swapping.  */
	      else if (first_cond_code == swap_code)
		swap[i] = 1;
	      /* Isomorphic can be achieved by inverting.  */
	      else if (first_cond_code == invert_code)
		swap[i] = 2;
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				     "Build SLP failed: different"
				     " operation %G", stmt);
		  /* Mismatch.  */
		  continue;
		}
	    }

	  if (rhs_code.is_tree_code ()
	      && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
	      && (swap_tree_comparison ((tree_code)first_stmt_code)
		  == (tree_code)rhs_code))
	    swap[i] = 1;
	}

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
	  || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
	      && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    {
      *two_operators = true;
    }

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
	    (soft_fail_nunits_vectype).is_constant (&const_nunits)
	  || const_nunits > group_size)
	matches[0] = false;
      else
	{
	  /* With constant vector elements simulate a mismatch at the
	     point we need to split.  */
	  unsigned tail = group_size & (const_nunits - 1);
	  memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
	}
      return false;
    }

  return true;
}

/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <stmt_vec_info>, slp_tree,
		  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;

/* Release BST_MAP.  */

static void
release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
{
  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second);
  delete bst_map;
}

/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}

/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
			  vec<std::pair<tree_code, gimple *> > &worklist,
			  vec<chain_op_t> &chain,
			  enum tree_code code, gimple *start,
			  gimple *&code_stmt, gimple *&alt_code_stmt,
			  vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
	  && gimple_assign_rhs_code (stmt) == code)
	code_stmt = stmt;
      else if (!alt_code_stmt
	       && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
	alt_code_stmt = stmt;
      if (chain_stmts)
	chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
	{
	  tree op = gimple_op (stmt, opnum);
	  vect_def_type dt;
	  stmt_vec_info def_stmt_info;
	  bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
	  gcc_assert (res);
	  if (dt == vect_internal_def
	      && is_pattern_stmt_p (def_stmt_info))
	    op = gimple_get_lhs (def_stmt_info->stmt);
	  gimple *use_stmt;
	  use_operand_p use_p;
	  if (dt == vect_internal_def
	      && single_imm_use (op, &use_p, &use_stmt)
	      && is_gimple_assign (def_stmt_info->stmt)
	      && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
		  || (code == PLUS_EXPR
		      && (gimple_assign_rhs_code (def_stmt_info->stmt)
			  == MINUS_EXPR))))
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      worklist.safe_push (std::make_pair (op_def_code,
						  def_stmt_info->stmt));
	    }
	  else
	    {
	      tree_code op_def_code = this_code;
	      if (op_def_code == MINUS_EXPR && opnum == 1)
		op_def_code = PLUS_EXPR;
	      if (in_code == MINUS_EXPR)
		op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
	      chain.safe_push (chain_op_t (op_def_code, dt, op));
	    }
	}
    }
}

static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
		       vec<stmt_vec_info> stmts, unsigned int group_size,
		       poly_uint64 *max_nunits,
		       bool *matches, unsigned *limit, unsigned *tree_size,
		       scalar_stmts_to_slp_tree_map_t *bst_map);
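
/* Build (or re-use a cached) SLP tree for the stmt group STMTS.  This is a
   wrapper around vect_build_slp_tree_2 that looks up and records results
   in BST_MAP and enforces the SLP discovery limit LIMIT.  */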
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
		     vec<stmt_vec_info> stmts, unsigned int group_size,
		     poly_uint64 *max_nunits,
		     bool *matches, unsigned *limit, unsigned *tree_size,
		     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
			 !(*leader)->failed ? "" : "failed ",
			 (void *) *leader);
      if (!(*leader)->failed)
	{
	  SLP_TREE_REF_COUNT (*leader)++;
	  vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
	  stmts.release ();
	  return *leader;
	}
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery limit exceeded\n");
	  memset (matches, 0, sizeof (bool) * group_size);
	  return NULL;
	}
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
					 &this_max_nunits,
					 matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
	 as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
	{
	  unsigned i;
	  for (i = 0; i < group_size; ++i)
	    if (!matches[i])
	      break;
	  gcc_assert (i < group_size);
	}
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "SLP discovery for node %p succeeded\n",
			 (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}

/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
				   slp_tree op0, slp_tree op1,
				   stmt_vec_info oper1, stmt_vec_info oper2,
				   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
1905 /* Recursively build an SLP tree starting from NODE.
1906 Fail (and return NULL) if def-stmts are not
1907 isomorphic, require data permutation or are of unsupported types of
1908 operation. Otherwise, return the built SLP tree.
1909 On failure MATCHES records for which lanes of the group a
1910 mismatch was found. */
1912 static slp_tree
1913 vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
1914 vec<stmt_vec_info> stmts, unsigned int group_size,
1915 poly_uint64 *max_nunits,
1916 bool *matches, unsigned *limit, unsigned *tree_size,
1917 scalar_stmts_to_slp_tree_map_t *bst_map)
1919 unsigned nops, i, this_tree_size = 0;
1920 poly_uint64 this_max_nunits = *max_nunits;
1922 matches[0] = false;
1924 stmt_vec_info stmt_info = stmts[0];
1925 if (!is_a<gcall *> (stmt_info->stmt)
1926 && !is_a<gassign *> (stmt_info->stmt)
1927 && !is_a<gphi *> (stmt_info->stmt))
1928 return NULL;
1930 nops = gimple_num_args (stmt_info->stmt);
1931 if (const int *map = vect_get_operand_map (stmt_info->stmt,
1932 STMT_VINFO_GATHER_SCATTER_P
1933 (stmt_info)))
1934 nops = map[0];
1936 /* If the SLP node is a PHI (induction or reduction), terminate
1937 the recursion. */
1938 bool *skip_args = XALLOCAVEC (bool, nops);
1939 memset (skip_args, 0, sizeof (bool) * nops);
1940 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
1941 if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
1943 tree scalar_type = TREE_TYPE (PHI_RESULT (stmt));
1944 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
1945 group_size);
1946 if (!vect_record_max_nunits (vinfo, stmt_info, group_size, vectype,
1947 max_nunits))
1948 return NULL;
1950 vect_def_type def_type = STMT_VINFO_DEF_TYPE (stmt_info);
1951 if (def_type == vect_induction_def)
1953 /* Induction PHIs are not cycles but walk the initial
1954 value. Only for inner loops though; for outer loops
1955 we need to pick up the value from the actual PHIs
1956 to more easily support peeling and epilogue vectorization. */
1957 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1958 if (!nested_in_vect_loop_p (loop, stmt_info))
1959 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1960 else
1961 loop = loop->inner;
1962 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1964 else if (def_type == vect_reduction_def
1965 || def_type == vect_double_reduction_def
1966 || def_type == vect_nested_cycle
1967 || def_type == vect_first_order_recurrence)
1969 /* Else def types have to match. */
1970 stmt_vec_info other_info;
1971 bool all_same = true;
1972 FOR_EACH_VEC_ELT (stmts, i, other_info)
1974 if (STMT_VINFO_DEF_TYPE (other_info) != def_type)
1975 return NULL;
1976 if (other_info != stmt_info)
1977 all_same = false;
1979 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1980 /* Reduction initial values are not explicitly represented. */
1981 if (def_type != vect_first_order_recurrence
1982 && gimple_bb (stmt_info->stmt) == loop->header)
1983 skip_args[loop_preheader_edge (loop)->dest_idx] = true;
1984 /* Reduction chain backedge defs are filled manually.
1985 ??? Need a better way to identify a SLP reduction chain PHI.
1986 Or a better overall way to SLP match those. */
1987 if (stmts.length () > 1
1988 && all_same && def_type == vect_reduction_def)
1989 skip_args[loop_latch_edge (loop)->dest_idx] = true;
1991 else if (def_type != vect_internal_def)
1992 return NULL;
1996 bool two_operators = false;
1997 unsigned char *swap = XALLOCAVEC (unsigned char, group_size);
1998 tree vectype = NULL_TREE;
1999 if (!vect_build_slp_tree_1 (vinfo, swap, stmts, group_size,
2000 &this_max_nunits, matches, &two_operators,
2001 &vectype))
2002 return NULL;
2004 /* If the SLP node is a load, terminate the recursion unless masked. */
2005 if (STMT_VINFO_DATA_REF (stmt_info)
2006 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
2008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
2009 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)));
2010 else
2012 *max_nunits = this_max_nunits;
2013 (*tree_size)++;
2014 node = vect_create_new_slp_node (node, stmts, 0);
2015 SLP_TREE_VECTYPE (node) = vectype;
2016 /* And compute the load permutation. Whether it is actually
2017 a permutation depends on the unrolling factor which is
2018 decided later. */
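/* Illustrative example (added for exposition, not part of the original
   source): for a group headed by the load of a[0] whose SLP lanes read
   a[1], a[0], a[3], a[2], the positions in the interleaving chain give
   load_permutation = { 1, 0, 3, 2 } and any_permute becomes true.  */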
2019 vec<unsigned> load_permutation;
2020 int j;
2021 stmt_vec_info load_info;
2022 load_permutation.create (group_size);
2023 stmt_vec_info first_stmt_info
2024 = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2025 ? DR_GROUP_FIRST_ELEMENT (stmt_info) : stmt_info;
2026 bool any_permute = false;
2027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
2029 int load_place;
2030 if (! load_info)
2032 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2033 load_place = j;
2034 else
2035 load_place = 0;
2037 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2038 load_place = vect_get_place_in_interleaving_chain
2039 (load_info, first_stmt_info);
2040 else
2041 load_place = 0;
2042 gcc_assert (load_place != -1);
2043 any_permute |= load_place != j;
2044 load_permutation.quick_push (load_place);
2047 if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
2049 gcc_assert (gimple_call_internal_p (stmt, IFN_MASK_LOAD));
2050 bool has_gaps = false;
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2052 for (stmt_vec_info si = DR_GROUP_NEXT_ELEMENT (first_stmt_info);
2053 si; si = DR_GROUP_NEXT_ELEMENT (si))
2054 if (DR_GROUP_GAP (si) != 1)
2055 has_gaps = true;
2056 /* We cannot handle permuted masked loads directly, see
2057 PR114375. We cannot handle strided masked loads or masked
2058 loads with gaps unless the mask is uniform. */
2059 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info)
2060 && (DR_GROUP_GAP (first_stmt_info) != 0
2061 || (has_gaps
2062 && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info))))
2063 || STMT_VINFO_STRIDED_P (stmt_info))
2065 load_permutation.release ();
2066 matches[0] = false;
2067 return NULL;
2070 /* For permuted masked loads do an unpermuted masked load of
2071 the whole group followed by a SLP permute node. */
2072 if (any_permute
2073 || (STMT_VINFO_GROUPED_ACCESS (stmt_info)
2074 && DR_GROUP_SIZE (first_stmt_info) != group_size))
2076 /* Discover the whole unpermuted load. */
2077 vec<stmt_vec_info> stmts2;
2078 unsigned dr_group_size = STMT_VINFO_GROUPED_ACCESS (stmt_info)
2079 ? DR_GROUP_SIZE (first_stmt_info) : 1;
2080 stmts2.create (dr_group_size);
2081 stmts2.quick_grow_cleared (dr_group_size);
2082 unsigned i = 0;
2083 for (stmt_vec_info si = first_stmt_info;
2084 si; si = DR_GROUP_NEXT_ELEMENT (si))
2086 if (si != first_stmt_info)
2087 for (unsigned k = 1; k < DR_GROUP_GAP (si); ++k)
2088 stmts2[i++] = NULL;
2089 stmts2[i++] = si;
2091 bool *matches2 = XALLOCAVEC (bool, dr_group_size);
2092 slp_tree unperm_load
2093 = vect_build_slp_tree (vinfo, stmts2, dr_group_size,
2094 &this_max_nunits, matches2, limit,
2095 &this_tree_size, bst_map);
2096 /* When we are able to do the full masked load emit that
2097 followed by 'node' being the desired final permutation. */
2098 if (unperm_load)
2100 gcc_assert
2101 (!SLP_TREE_LOAD_PERMUTATION (unperm_load).exists ());
2102 lane_permutation_t lperm;
2103 lperm.create (group_size);
2104 for (unsigned j = 0; j < load_permutation.length (); ++j)
2105 lperm.quick_push
2106 (std::make_pair (0, load_permutation[j]));
2107 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2108 SLP_TREE_CHILDREN (node).safe_push (unperm_load);
2109 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2110 load_permutation.release ();
2111 return node;
2113 stmts2.release ();
2114 load_permutation.release ();
2115 matches[0] = false;
2116 return NULL;
2118 load_permutation.release ();
2120 else
2122 if (!any_permute
2123 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2124 && group_size == DR_GROUP_SIZE (first_stmt_info))
2125 load_permutation.release ();
2126 SLP_TREE_LOAD_PERMUTATION (node) = load_permutation;
2127 return node;
2131 else if (gimple_assign_single_p (stmt_info->stmt)
2132 && !gimple_vuse (stmt_info->stmt)
2133 && gimple_assign_rhs_code (stmt_info->stmt) == BIT_FIELD_REF)
2135 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2136 the same SSA name vector of a type compatible with vectype. */
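/* Illustrative sketch (added for exposition, not part of the original
   source): for scalar stmts such as
     x0 = BIT_FIELD_REF <v, 32, 0>;
     x1 = BIT_FIELD_REF <v, 32, 32>;
   extracting consecutive lanes of the SSA name v, the node built below is a
   VEC_PERM_EXPR with lane permutation { 0[0], 0[1], ... } over a child that
   simply records v as an existing vector def.  */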
2137 vec<std::pair<unsigned, unsigned> > lperm = vNULL;
2138 tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0);
2139 stmt_vec_info estmt_info;
2140 FOR_EACH_VEC_ELT (stmts, i, estmt_info)
2142 gassign *estmt = as_a <gassign *> (estmt_info->stmt);
2143 tree bfref = gimple_assign_rhs1 (estmt);
2144 HOST_WIDE_INT lane;
2145 if (!known_eq (bit_field_size (bfref),
2146 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype))))
2147 || !constant_multiple_p (bit_field_offset (bfref),
2148 bit_field_size (bfref), &lane))
2150 lperm.release ();
2151 matches[0] = false;
2152 return NULL;
2154 lperm.safe_push (std::make_pair (0, (unsigned)lane));
2156 slp_tree vnode = vect_create_new_slp_node (vNULL);
2157 if (operand_equal_p (TYPE_SIZE (vectype), TYPE_SIZE (TREE_TYPE (vec))))
2158 /* ??? We record vectype here but hide any punning that might
2159 eventually be necessary, relying on code generation to materialize
2160 VIEW_CONVERT_EXPRs as needed. We should instead make
2161 this explicit somehow. */
2162 SLP_TREE_VECTYPE (vnode) = vectype;
2163 else
2165 /* For different size but compatible elements we can still
2166 use VEC_PERM_EXPR without punning. */
2167 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec))
2168 && types_compatible_p (TREE_TYPE (vectype),
2169 TREE_TYPE (TREE_TYPE (vec))));
2170 SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
2172 auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
2173 unsigned HOST_WIDE_INT const_nunits;
2174 if (nunits.is_constant (&const_nunits))
2175 SLP_TREE_LANES (vnode) = const_nunits;
2176 SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
2177 /* We are always building a permutation node even if it is an identity
2178 permute to shield the rest of the vectorizer from the odd node
2179 representing an actual vector without any scalar ops.
2180 ??? We could hide it completely by making the permute node
2181 external? */
2182 node = vect_create_new_slp_node (node, stmts, 1);
2183 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
2184 SLP_TREE_LANE_PERMUTATION (node) = lperm;
2185 SLP_TREE_VECTYPE (node) = vectype;
2186 SLP_TREE_CHILDREN (node).quick_push (vnode);
2187 return node;
2189 /* When discovery reaches an associatable operation see whether we can
2190 improve that to match up lanes in a way superior to the operand
2191 swapping code which at most looks at two defs.
2192 ??? For BB vectorization we cannot do the brute-force search
2193 for matching as we can succeed by means of builds from scalars
2194 and have no good way to "cost" one build against another. */
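/* Illustrative sketch (added for exposition, not part of the original
   source): for two lanes
     a0 = (x + y) - z;
     a1 = (x - z) + y;
   each lane linearizes to a length-three chain, { +x, +y, -z } and
   { +x, -z, +y } respectively.  After pre-sorting and per-lane operand
   swapping both chains match up as { x, y, z } with identical plus/minus
   codes, something the two-operand swapping fallback cannot discover.  */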
2195 else if (is_a <loop_vec_info> (vinfo)
2196 /* Do not bother for single-lane SLP. */
2197 && group_size > 1
2198 /* ??? We don't handle !vect_internal_def defs below. */
2199 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2200 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2201 mapping as long as that exists on the stmt_info level. */
2202 && STMT_VINFO_REDUC_IDX (stmt_info) == -1
2203 && is_gimple_assign (stmt_info->stmt)
2204 && (associative_tree_code (gimple_assign_rhs_code (stmt_info->stmt))
2205 || gimple_assign_rhs_code (stmt_info->stmt) == MINUS_EXPR)
2206 && ((FLOAT_TYPE_P (vectype) && flag_associative_math)
2207 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype))
2208 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype)))))
2210 /* See if we have a chain of (mixed) adds or subtracts or other
2211 associatable ops. */
2212 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
2213 if (code == MINUS_EXPR)
2214 code = PLUS_EXPR;
2215 stmt_vec_info other_op_stmt_info = NULL;
2216 stmt_vec_info op_stmt_info = NULL;
2217 unsigned chain_len = 0;
2218 auto_vec<chain_op_t> chain;
2219 auto_vec<std::pair<tree_code, gimple *> > worklist;
2220 auto_vec<vec<chain_op_t> > chains (group_size);
2221 auto_vec<slp_tree, 4> children;
2222 bool hard_fail = true;
2223 for (unsigned lane = 0; lane < group_size; ++lane)
2225 if (!stmts[lane])
2227 /* ??? Below we require lane zero is present. */
2228 if (lane == 0)
2230 hard_fail = false;
2231 break;
2233 chains.quick_push (vNULL);
2234 continue;
2236 /* For each lane linearize the addition/subtraction (or other
2237 uniform associatable operation) expression tree. */
2238 gimple *op_stmt = NULL, *other_op_stmt = NULL;
2239 vect_slp_linearize_chain (vinfo, worklist, chain, code,
2240 stmts[lane]->stmt, op_stmt, other_op_stmt,
2241 NULL);
2242 if (!op_stmt_info && op_stmt)
2243 op_stmt_info = vinfo->lookup_stmt (op_stmt);
2244 if (!other_op_stmt_info && other_op_stmt)
2245 other_op_stmt_info = vinfo->lookup_stmt (other_op_stmt);
2246 if (chain.length () == 2)
2248 /* In a chain of just two elements resort to the regular
2249 operand swapping scheme. Likewise if we run into a
2250 length mismatch process regularly as well; since we did not
2251 process the other lanes we cannot report a good hint as to which
2252 lanes to try swapping in the parent. */
2253 hard_fail = false;
2254 break;
2256 else if (chain_len == 0)
2257 chain_len = chain.length ();
2258 else if (chain.length () != chain_len)
2260 /* ??? Here we could slip in magic to compensate with
2261 neutral operands. */
2262 matches[lane] = false;
2263 if (lane != group_size - 1)
2264 matches[0] = false;
2265 break;
2267 chains.quick_push (chain.copy ());
2268 chain.truncate (0);
2270 if (chains.length () == group_size)
2272 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2273 if (!op_stmt_info)
2275 hard_fail = false;
2276 goto out;
2278 /* Now we have a set of chains with the same length. */
2279 /* 1. pre-sort according to def_type and operation. */
2280 for (unsigned lane = 0; lane < group_size; ++lane)
2281 chains[lane].stablesort (dt_sort_cmp, vinfo);
2282 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE, vect_location,
2285 "pre-sorted chains of %s\n",
2286 get_tree_code_name (code));
2287 for (unsigned lane = 0; lane < group_size; ++lane)
2289 if (!stmts[lane])
2290 dump_printf (MSG_NOTE, "--");
2291 else
2292 for (unsigned opnum = 0; opnum < chain_len; ++opnum)
2293 dump_printf (MSG_NOTE, "%s %T ",
2294 get_tree_code_name (chains[lane][opnum].code),
2295 chains[lane][opnum].op);
2296 dump_printf (MSG_NOTE, "\n");
2299 /* 2. try to build children nodes, associating as necessary. */
2300 /* 2a. prepare and perform early checks to avoid eating into
2301 discovery limit unnecessarily. */
2302 vect_def_type *dts = XALLOCAVEC (vect_def_type, chain_len);
2303 for (unsigned n = 0; n < chain_len; ++n)
2305 vect_def_type dt = chains[0][n].dt;
2306 unsigned lane;
2307 for (lane = 0; lane < group_size; ++lane)
2308 if (stmts[lane] && chains[lane][n].dt != dt)
2310 if (dt == vect_constant_def
2311 && chains[lane][n].dt == vect_external_def)
2312 dt = vect_external_def;
2313 else if (dt == vect_external_def
2314 && chains[lane][n].dt == vect_constant_def)
2316 else
2317 break;
2319 if (lane != group_size)
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE, vect_location,
2323 "giving up on chain due to mismatched "
2324 "def types\n");
2325 matches[lane] = false;
2326 if (lane != group_size - 1)
2327 matches[0] = false;
2328 goto out;
2330 dts[n] = dt;
2331 if (dt == vect_constant_def
2332 || dt == vect_external_def)
2334 /* Check whether we can build the invariant. If we can't
2335 we never will be able to. */
2336 tree type = TREE_TYPE (chains[0][n].op);
2337 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ()
2338 && (TREE_CODE (type) == BOOLEAN_TYPE
2339 || !can_duplicate_and_interleave_p (vinfo, group_size,
2340 type)))
2342 matches[0] = false;
2343 goto out;
2346 else if (dt != vect_internal_def)
2348 /* Not sure, we might need something special.
2349 gcc.dg/vect/pr96854.c,
2350 gfortran.dg/vect/fast-math-pr37021.f90
2351 and gfortran.dg/vect/pr61171.f trigger. */
2352 /* Soft-fail for now. */
2353 hard_fail = false;
2354 goto out;
2357 /* 2b. do the actual build. */
2358 for (unsigned n = 0; n < chain_len; ++n)
2360 vect_def_type dt = dts[n];
2361 unsigned lane;
2362 if (dt == vect_constant_def
2363 || dt == vect_external_def)
2365 vec<tree> ops;
2366 ops.create (group_size);
2367 for (lane = 0; lane < group_size; ++lane)
2368 if (stmts[lane])
2369 ops.quick_push (chains[lane][n].op);
2370 else
2371 ops.quick_push (NULL_TREE);
2372 slp_tree child = vect_create_new_slp_node (ops);
2373 SLP_TREE_DEF_TYPE (child) = dt;
2374 children.safe_push (child);
2376 else
2378 vec<stmt_vec_info> op_stmts;
2379 op_stmts.create (group_size);
2380 slp_tree child = NULL;
2381 /* Brute-force our way. We have to consider a lane
2382 failing after fixing an earlier fail higher up in the
2383 SLP discovery recursion. So track the current
2384 permute per lane. */
2385 unsigned *perms = XALLOCAVEC (unsigned, group_size);
2386 memset (perms, 0, sizeof (unsigned) * group_size);
2389 op_stmts.truncate (0);
2390 for (lane = 0; lane < group_size; ++lane)
2391 if (stmts[lane])
2392 op_stmts.quick_push
2393 (vinfo->lookup_def (chains[lane][n].op));
2394 else
2395 op_stmts.quick_push (NULL);
2396 child = vect_build_slp_tree (vinfo, op_stmts,
2397 group_size, &this_max_nunits,
2398 matches, limit,
2399 &this_tree_size, bst_map);
2400 /* ??? We're likely getting too many fatal mismatches
2401 here so maybe we want to ignore them (but then we
2402 have no idea which lanes fatally mismatched). */
2403 if (child || !matches[0])
2404 break;
2405 /* Swap another lane we have not yet matched up into
2406 lanes that did not match. If we run out of
2407 permute possibilities for a lane terminate the
2408 search. */
2409 bool term = false;
2410 for (lane = 1; lane < group_size; ++lane)
2411 if (!matches[lane])
2413 if (n + perms[lane] + 1 == chain_len)
2415 term = true;
2416 break;
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE, vect_location,
2420 "swapping operand %d and %d "
2421 "of lane %d\n",
2422 n, n + perms[lane] + 1, lane);
2423 std::swap (chains[lane][n],
2424 chains[lane][n + perms[lane] + 1]);
2425 perms[lane]++;
2427 if (term)
2428 break;
2430 while (1);
2431 if (!child)
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE, vect_location,
2435 "failed to match up op %d\n", n);
2436 op_stmts.release ();
2437 if (lane != group_size - 1)
2438 matches[0] = false;
2439 else
2440 matches[lane] = false;
2441 goto out;
2443 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_NOTE, vect_location,
2446 "matched up op %d to\n", n);
2447 vect_print_slp_tree (MSG_NOTE, vect_location, child);
2449 children.safe_push (child);
2452 /* 3. build SLP nodes to combine the chain. */
2453 for (unsigned lane = 0; lane < group_size; ++lane)
2454 if (stmts[lane] && chains[lane][0].code != code)
2456 /* See if there's any alternate all-PLUS entry. */
2457 unsigned n;
2458 for (n = 1; n < chain_len; ++n)
2460 for (lane = 0; lane < group_size; ++lane)
2461 if (stmts[lane] && chains[lane][n].code != code)
2462 break;
2463 if (lane == group_size)
2464 break;
2466 if (n != chain_len)
2468 /* Swap that in at first position. */
2469 std::swap (children[0], children[n]);
2470 for (lane = 0; lane < group_size; ++lane)
2471 if (stmts[lane])
2472 std::swap (chains[lane][0], chains[lane][n]);
2474 else
2476 /* ??? When this triggers and we end up with two
2477 vect_constant/external_def up-front things break (ICE)
2478 spectacularly finding an insertion place for the
2479 all-constant op. We should have a fully
2480 vect_internal_def operand though(?) so we can swap
2481 that into first place and then prepend the all-zero
2482 constant. */
2483 if (dump_enabled_p ())
2484 dump_printf_loc (MSG_NOTE, vect_location,
2485 "inserting constant zero to compensate "
2486 "for (partially) negated first "
2487 "operand\n");
2488 chain_len++;
2489 for (lane = 0; lane < group_size; ++lane)
2490 if (stmts[lane])
2491 chains[lane].safe_insert
2492 (0, chain_op_t (code, vect_constant_def, NULL_TREE));
2493 vec<tree> zero_ops;
2494 zero_ops.create (group_size);
2495 zero_ops.quick_push (build_zero_cst (TREE_TYPE (vectype)));
2496 for (lane = 1; lane < group_size; ++lane)
2497 if (stmts[lane])
2498 zero_ops.quick_push (zero_ops[0]);
2499 else
2500 zero_ops.quick_push (NULL_TREE);
2501 slp_tree zero = vect_create_new_slp_node (zero_ops);
2502 SLP_TREE_DEF_TYPE (zero) = vect_constant_def;
2503 children.safe_insert (0, zero);
2505 break;
2507 for (unsigned i = 1; i < children.length (); ++i)
2509 slp_tree op0 = children[i - 1];
2510 slp_tree op1 = children[i];
2511 bool this_two_op = false;
2512 for (unsigned lane = 0; lane < group_size; ++lane)
2513 if (stmts[lane] && chains[lane][i].code != chains[0][i].code)
2515 this_two_op = true;
2516 break;
2518 slp_tree child;
2519 if (i == children.length () - 1)
2520 child = vect_create_new_slp_node (node, stmts, 2);
2521 else
2522 child = vect_create_new_slp_node (2, ERROR_MARK);
2523 if (this_two_op)
2525 vec<std::pair<unsigned, unsigned> > lperm;
2526 lperm.create (group_size);
2527 for (unsigned lane = 0; lane < group_size; ++lane)
2528 lperm.quick_push (std::make_pair
2529 (chains[lane][i].code != chains[0][i].code, lane));
2530 vect_slp_build_two_operator_nodes (child, vectype, op0, op1,
2531 (chains[0][i].code == code
2532 ? op_stmt_info
2533 : other_op_stmt_info),
2534 (chains[0][i].code == code
2535 ? other_op_stmt_info
2536 : op_stmt_info),
2537 lperm);
2539 else
2541 SLP_TREE_DEF_TYPE (child) = vect_internal_def;
2542 SLP_TREE_VECTYPE (child) = vectype;
2543 SLP_TREE_LANES (child) = group_size;
2544 SLP_TREE_CHILDREN (child).quick_push (op0);
2545 SLP_TREE_CHILDREN (child).quick_push (op1);
2546 SLP_TREE_REPRESENTATIVE (child)
2547 = (chains[0][i].code == code
2548 ? op_stmt_info : other_op_stmt_info);
2550 children[i] = child;
2552 *tree_size += this_tree_size + 1;
2553 *max_nunits = this_max_nunits;
2554 while (!chains.is_empty ())
2555 chains.pop ().release ();
2556 return node;
2558 out:
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE, vect_location,
2561 "failed to line up SLP graph by re-associating "
2562 "operations in lanes%s\n",
2563 !hard_fail ? " trying regular discovery" : "");
2564 while (!children.is_empty ())
2565 vect_free_slp_tree (children.pop ());
2566 while (!chains.is_empty ())
2567 chains.pop ().release ();
2568 /* Hard-fail, otherwise we might run into quadratic processing of the
2569 chains starting one stmt into the chain again. */
2570 if (hard_fail)
2571 return NULL;
2572 /* Fall thru to normal processing. */
2575 /* Get at the operands, verifying they are compatible. */
2576 vec<slp_oprnd_info> oprnds_info = vect_create_oprnd_info (nops, group_size);
2577 slp_oprnd_info oprnd_info;
2578 FOR_EACH_VEC_ELT (stmts, i, stmt_info)
2580 int res = vect_get_and_check_slp_defs (vinfo, swap[i], skip_args,
2581 stmts, i, &oprnds_info);
2582 if (res != 0)
2583 matches[(res == -1) ? 0 : i] = false;
2584 if (!matches[0])
2585 break;
2587 for (i = 0; i < group_size; ++i)
2588 if (!matches[i])
2590 vect_free_oprnd_info (oprnds_info);
2591 return NULL;
2593 swap = NULL;
2595 bool has_two_operators_perm = false;
2596 auto_vec<unsigned> two_op_perm_indices[2];
2597 vec<stmt_vec_info> two_op_scalar_stmts[2] = {vNULL, vNULL};
2599 if (two_operators && oprnds_info.length () == 2 && group_size > 2)
2601 unsigned idx = 0;
2602 hash_map<gimple *, unsigned> seen;
2603 vec<slp_oprnd_info> new_oprnds_info
2604 = vect_create_oprnd_info (1, group_size);
2605 bool success = true;
2607 enum tree_code code = ERROR_MARK;
2608 if (oprnds_info[0]->def_stmts[0]
2609 && is_a<gassign *> (oprnds_info[0]->def_stmts[0]->stmt))
2610 code = gimple_assign_rhs_code (oprnds_info[0]->def_stmts[0]->stmt);
2612 for (unsigned j = 0; j < group_size; ++j)
2614 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2616 stmt_vec_info stmt_info = oprnd_info->def_stmts[j];
2617 if (!stmt_info || !stmt_info->stmt
2618 || !is_a<gassign *> (stmt_info->stmt)
2619 || gimple_assign_rhs_code (stmt_info->stmt) != code
2620 || skip_args[i])
2622 success = false;
2623 break;
2626 bool exists;
2627 unsigned &stmt_idx
2628 = seen.get_or_insert (stmt_info->stmt, &exists);
2630 if (!exists)
2632 new_oprnds_info[0]->def_stmts.safe_push (stmt_info);
2633 new_oprnds_info[0]->ops.safe_push (oprnd_info->ops[j]);
2634 stmt_idx = idx;
2635 idx++;
2638 two_op_perm_indices[i].safe_push (stmt_idx);
2641 if (!success)
2642 break;
2645 if (success && idx == group_size)
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE, vect_location,
2650 "Replace two_operators operands:\n");
2652 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2654 dump_printf_loc (MSG_NOTE, vect_location,
2655 "Operand %u:\n", i);
2656 for (unsigned j = 0; j < group_size; j++)
2657 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2658 j, oprnd_info->def_stmts[j]->stmt);
2661 dump_printf_loc (MSG_NOTE, vect_location,
2662 "With a single operand:\n");
2663 for (unsigned j = 0; j < group_size; j++)
2664 dump_printf_loc (MSG_NOTE, vect_location, "\tstmt %u %G",
2665 j, new_oprnds_info[0]->def_stmts[j]->stmt);
2668 two_op_scalar_stmts[0].safe_splice (oprnds_info[0]->def_stmts);
2669 two_op_scalar_stmts[1].safe_splice (oprnds_info[1]->def_stmts);
2671 new_oprnds_info[0]->first_op_type = oprnds_info[0]->first_op_type;
2672 new_oprnds_info[0]->first_dt = oprnds_info[0]->first_dt;
2673 new_oprnds_info[0]->any_pattern = oprnds_info[0]->any_pattern;
2674 new_oprnds_info[0]->first_gs_p = oprnds_info[0]->first_gs_p;
2675 new_oprnds_info[0]->first_gs_info = oprnds_info[0]->first_gs_info;
2677 vect_free_oprnd_info (oprnds_info);
2678 oprnds_info = new_oprnds_info;
2679 nops = 1;
2680 has_two_operators_perm = true;
2682 else
2683 vect_free_oprnd_info (new_oprnds_info);
2686 auto_vec<slp_tree, 4> children;
2688 stmt_info = stmts[0];
2690 /* Create SLP_TREE nodes for the definition node/s. */
2691 FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
2693 slp_tree child = nullptr;
2694 unsigned int j;
2696 /* We're skipping certain operands from processing, for example
2697 outer loop reduction initial defs. */
2698 if (skip_args[i])
2700 children.safe_push (NULL);
2701 continue;
2704 if (oprnd_info->first_dt == vect_uninitialized_def)
2706 /* COND_EXPRs end up with one operand too many if the condition
2707 is an SSA name. */
2708 gcc_assert (i == 3 && nops == 4);
2709 continue;
2712 if (is_a <bb_vec_info> (vinfo)
2713 && oprnd_info->first_dt == vect_internal_def
2714 && !oprnd_info->any_pattern)
2716 /* For BB vectorization, if all defs are the same do not
2717 bother to continue the build along the single-lane
2718 graph but use a splat of the scalar value. */
2719 stmt_vec_info first_def = oprnd_info->def_stmts[0];
2720 for (j = 1; j < group_size; ++j)
2721 if (oprnd_info->def_stmts[j] != first_def)
2722 break;
2723 if (j == group_size
2724 /* But avoid doing this for loads where we may be
2725 able to CSE things, unless the stmt is not
2726 vectorizable. */
2727 && (!STMT_VINFO_VECTORIZABLE (first_def)
2728 || !gimple_vuse (first_def->stmt)))
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_NOTE, vect_location,
2732 "Using a splat of the uniform operand %G",
2733 first_def->stmt);
2734 oprnd_info->first_dt = vect_external_def;
2738 if (oprnd_info->first_dt == vect_external_def
2739 || oprnd_info->first_dt == vect_constant_def)
2741 if (!GET_MODE_SIZE (vinfo->vector_mode).is_constant ())
2743 tree op0;
2744 tree uniform_val = op0 = oprnd_info->ops[0];
2745 for (j = 1; j < oprnd_info->ops.length (); ++j)
2746 if (oprnd_info->ops[j]
2747 && !operand_equal_p (uniform_val, oprnd_info->ops[j]))
2749 uniform_val = NULL_TREE;
2750 break;
2752 if (!uniform_val
2753 && !can_duplicate_and_interleave_p (vinfo,
2754 oprnd_info->ops.length (),
2755 TREE_TYPE (op0)))
2757 matches[j] = false;
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760 "Build SLP failed: invalid type of def "
2761 "for variable-length SLP %T\n", op0);
2762 goto fail;
2765 slp_tree invnode = vect_create_new_slp_node (oprnd_info->ops);
2766 SLP_TREE_DEF_TYPE (invnode) = oprnd_info->first_dt;
2767 oprnd_info->ops = vNULL;
2768 children.safe_push (invnode);
2769 continue;
2772 /* When we have a masked load with uniform mask discover this
2773 as a single-lane mask with a splat permute. This way we can
2774 recognize this as a masked load-lane by stripping the splat. */
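/* Illustrative example (added for exposition, not part of the original
   source): for a four-lane group whose .MASK_LOAD calls all use the same
   mask m, the mask operand is discovered as a single-lane child { m } and
   the node built below is a VEC_PERM_EXPR splat with lane permutation
   { 0[0], 0[0], 0[0], 0[0] }.  */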
2775 if (is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
2776 && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
2777 IFN_MASK_LOAD)
2778 && STMT_VINFO_GROUPED_ACCESS (stmt_info)
2779 && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info)))
2781 vec<stmt_vec_info> def_stmts2;
2782 def_stmts2.create (1);
2783 def_stmts2.quick_push (oprnd_info->def_stmts[0]);
2784 child = vect_build_slp_tree (vinfo, def_stmts2, 1,
2785 &this_max_nunits,
2786 matches, limit,
2787 &this_tree_size, bst_map);
2788 if (child)
2790 slp_tree pnode = vect_create_new_slp_node (1, VEC_PERM_EXPR);
2791 SLP_TREE_VECTYPE (pnode) = SLP_TREE_VECTYPE (child);
2792 SLP_TREE_LANES (pnode) = group_size;
2793 SLP_TREE_SCALAR_STMTS (pnode).create (group_size);
2794 SLP_TREE_LANE_PERMUTATION (pnode).create (group_size);
2795 for (unsigned k = 0; k < group_size; ++k)
2797 SLP_TREE_SCALAR_STMTS (pnode)
2798 .quick_push (oprnd_info->def_stmts[0]);
2799 SLP_TREE_LANE_PERMUTATION (pnode)
2800 .quick_push (std::make_pair (0u, 0u));
2802 SLP_TREE_CHILDREN (pnode).quick_push (child);
2803 pnode->max_nunits = child->max_nunits;
2804 children.safe_push (pnode);
2805 oprnd_info->def_stmts = vNULL;
2806 continue;
2808 else
2809 def_stmts2.release ();
2812 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2813 group_size, &this_max_nunits,
2814 matches, limit,
2815 &this_tree_size, bst_map)) != NULL)
2817 oprnd_info->def_stmts = vNULL;
2818 children.safe_push (child);
2819 continue;
2822 /* If the SLP build for operand zero failed and operands zero
2823 and one can be commuted, try that for the scalar stmts
2824 that failed the match. */
2825 if (i == 0
2826 /* A first scalar stmt mismatch signals a fatal mismatch. */
2827 && matches[0]
2828 /* ??? For COND_EXPRs we can swap the comparison operands
2829 as well as the arms under some constraints. */
2830 && nops == 2
2831 && oprnds_info[1]->first_dt == vect_internal_def
2832 && is_gimple_assign (stmt_info->stmt)
2833 /* Swapping operands for reductions breaks assumptions later on. */
2834 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)
2836 /* See whether we can swap the matching or the non-matching
2837 stmt operands. */
2838 bool swap_not_matching = true;
2841 for (j = 0; j < group_size; ++j)
2843 if (matches[j] != !swap_not_matching)
2844 continue;
2845 stmt_vec_info stmt_info = stmts[j];
2846 /* Verify if we can swap operands of this stmt. */
2847 gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt);
2848 if (!stmt
2849 || !commutative_tree_code (gimple_assign_rhs_code (stmt)))
2851 if (!swap_not_matching)
2852 goto fail;
2853 swap_not_matching = false;
2854 break;
2858 while (j != group_size);
2860 /* Swap mismatched definition stmts. */
2861 if (dump_enabled_p ())
2862 dump_printf_loc (MSG_NOTE, vect_location,
2863 "Re-trying with swapped operands of stmts ");
2864 for (j = 0; j < group_size; ++j)
2865 if (matches[j] == !swap_not_matching)
2867 std::swap (oprnds_info[0]->def_stmts[j],
2868 oprnds_info[1]->def_stmts[j]);
2869 std::swap (oprnds_info[0]->ops[j],
2870 oprnds_info[1]->ops[j]);
2871 if (dump_enabled_p ())
2872 dump_printf (MSG_NOTE, "%d ", j);
2874 if (dump_enabled_p ())
2875 dump_printf (MSG_NOTE, "\n");
2876 /* After swapping some operands we lost track of whether an
2877 operand has any pattern defs so be conservative here. */
2878 if (oprnds_info[0]->any_pattern || oprnds_info[1]->any_pattern)
2879 oprnds_info[0]->any_pattern = oprnds_info[1]->any_pattern = true;
2880 /* And try again with scratch 'matches' ... */
2881 bool *tem = XALLOCAVEC (bool, group_size);
2882 if ((child = vect_build_slp_tree (vinfo, oprnd_info->def_stmts,
2883 group_size, &this_max_nunits,
2884 tem, limit,
2885 &this_tree_size, bst_map)) != NULL)
2887 oprnd_info->def_stmts = vNULL;
2888 children.safe_push (child);
2889 continue;
2892 fail:
2894 /* If the SLP build failed and we analyze a basic-block
2895 simply treat nodes we fail to build as externally defined
2896 (and thus build vectors from the scalar defs).
2897 The cost model will reject expensive cases outright.
2898 ??? This doesn't treat cases where permutation ultimately
2899 fails (or we don't try permutation below). Ideally we'd
2900 even compute a permutation that will end up with the maximum
2901 SLP tree size... */
2902 if (is_a <bb_vec_info> (vinfo)
2903 /* ??? Rejecting patterns this way doesn't work. We'd have to
2904 do extra work to cancel the pattern so the uses see the
2905 scalar version. */
2906 && !is_pattern_stmt_p (stmt_info)
2907 && !oprnd_info->any_pattern)
2909 /* But if there's a leading vector sized set of matching stmts
2910 fail here so we can split the group. This matches the condition
2911 vect_analyze_slp_instance uses. */
2912 /* ??? We might want to split here and combine the results to support
2913 multiple vector sizes better. */
2914 for (j = 0; j < group_size; ++j)
2915 if (!matches[j])
2916 break;
2917 if (!known_ge (j, TYPE_VECTOR_SUBPARTS (vectype))
2918 && vect_slp_can_convert_to_external (oprnd_info->def_stmts))
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_NOTE, vect_location,
2922 "Building vector operands from scalars\n");
2923 this_tree_size++;
2924 child = vect_create_new_slp_node (oprnd_info->ops);
2925 children.safe_push (child);
2926 oprnd_info->ops = vNULL;
2927 continue;
2931 gcc_assert (child == NULL);
2932 FOR_EACH_VEC_ELT (children, j, child)
2933 if (child)
2934 vect_free_slp_tree (child);
2935 vect_free_oprnd_info (oprnds_info);
2936 return NULL;
2939 vect_free_oprnd_info (oprnds_info);
2941 /* If all children of a node are built up from uniform scalars, or if
2942 building them requires more than one possibly expensive vector
2943 construction, then just throw that away and cause the node to be
2944 built up from scalars. The exception is the SLP node for the vector store. */
2945 if (is_a <bb_vec_info> (vinfo)
2946 && !STMT_VINFO_GROUPED_ACCESS (stmt_info)
2947 /* ??? Rejecting patterns this way doesn't work. We'd have to
2948 do extra work to cancel the pattern so the uses see the
2949 scalar version. */
2950 && !is_pattern_stmt_p (stmt_info))
2952 slp_tree child;
2953 unsigned j;
2954 bool all_uniform_p = true;
2955 unsigned n_vector_builds = 0;
2956 FOR_EACH_VEC_ELT (children, j, child)
2958 if (!child)
2960 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
2961 all_uniform_p = false;
2962 else if (!vect_slp_tree_uniform_p (child))
2964 all_uniform_p = false;
2965 if (SLP_TREE_DEF_TYPE (child) == vect_external_def)
2966 n_vector_builds++;
2969 if (all_uniform_p
2970 || n_vector_builds > 1
2971 || (n_vector_builds == children.length ()
2972 && is_a <gphi *> (stmt_info->stmt)))
2974 /* Roll back. */
2975 matches[0] = false;
2976 FOR_EACH_VEC_ELT (children, j, child)
2977 if (child)
2978 vect_free_slp_tree (child);
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE, vect_location,
2982 "Building parent vector operands from "
2983 "scalars instead\n");
2984 return NULL;
2988 *tree_size += this_tree_size + 1;
2989 *max_nunits = this_max_nunits;
2991 if (two_operators)
2993 /* ??? We'd likely want to either cache in bst_map something like
2994 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2995 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2996 explicit stmts to put in so the keying on 'stmts' doesn't
2997 work (but we have the same issue with nodes that use 'ops'). */
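/* Illustrative example (added for exposition, not part of the original
   source): for scalar stmts { a0 + b0, a1 - b1, a2 + b2, a3 - b3 } the
   node 'one' built below gets code PLUS_EXPR, 'two' gets MINUS_EXPR, and
   the covering VEC_PERM_EXPR selects lanes { 0[0], 1[1], 0[2], 1[3] } to
   recreate the original mixed plus/minus lane configuration.  */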
2999 if (has_two_operators_perm)
3001 slp_tree child = children[0];
3002 children.truncate (0);
3003 for (i = 0; i < 2; i++)
3005 slp_tree pnode
3006 = vect_create_new_slp_node (two_op_scalar_stmts[i], 2);
3007 SLP_TREE_CODE (pnode) = VEC_PERM_EXPR;
3008 SLP_TREE_VECTYPE (pnode) = vectype;
3009 SLP_TREE_CHILDREN (pnode).quick_push (child);
3010 SLP_TREE_CHILDREN (pnode).quick_push (child);
3011 lane_permutation_t& perm = SLP_TREE_LANE_PERMUTATION (pnode);
3012 children.safe_push (pnode);
3014 for (unsigned j = 0; j < stmts.length (); j++)
3015 perm.safe_push (std::make_pair (0, two_op_perm_indices[i][j]));
3018 SLP_TREE_REF_COUNT (child) += 4;
3021 slp_tree one = new _slp_tree;
3022 slp_tree two = new _slp_tree;
3023 SLP_TREE_DEF_TYPE (one) = vect_internal_def;
3024 SLP_TREE_DEF_TYPE (two) = vect_internal_def;
3025 SLP_TREE_VECTYPE (one) = vectype;
3026 SLP_TREE_VECTYPE (two) = vectype;
3027 SLP_TREE_CHILDREN (one).safe_splice (children);
3028 SLP_TREE_CHILDREN (two).safe_splice (children);
3029 slp_tree child;
3030 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two), i, child)
3031 SLP_TREE_REF_COUNT (child)++;
3033 /* Here we record the original defs since this
3034 node represents the final lane configuration. */
3035 node = vect_create_new_slp_node (node, stmts, 2);
3036 SLP_TREE_VECTYPE (node) = vectype;
3037 SLP_TREE_CODE (node) = VEC_PERM_EXPR;
3038 SLP_TREE_CHILDREN (node).quick_push (one);
3039 SLP_TREE_CHILDREN (node).quick_push (two);
3040 gassign *stmt = as_a <gassign *> (stmts[0]->stmt);
3041 enum tree_code code0 = gimple_assign_rhs_code (stmt);
3042 enum tree_code ocode = ERROR_MARK;
3043 stmt_vec_info ostmt_info;
3044 unsigned j = 0;
3045 FOR_EACH_VEC_ELT (stmts, i, ostmt_info)
3047 gassign *ostmt = as_a <gassign *> (ostmt_info->stmt);
3048 if (gimple_assign_rhs_code (ostmt) != code0)
3050 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (1, i));
3051 ocode = gimple_assign_rhs_code (ostmt);
3052 j = i;
3054 else
3055 SLP_TREE_LANE_PERMUTATION (node).safe_push (std::make_pair (0, i));
3058 SLP_TREE_CODE (one) = code0;
3059 SLP_TREE_CODE (two) = ocode;
3060 SLP_TREE_LANES (one) = stmts.length ();
3061 SLP_TREE_LANES (two) = stmts.length ();
3062 SLP_TREE_REPRESENTATIVE (one) = stmts[0];
3063 SLP_TREE_REPRESENTATIVE (two) = stmts[j];
3065 return node;
3068 node = vect_create_new_slp_node (node, stmts, nops);
3069 SLP_TREE_VECTYPE (node) = vectype;
3070 SLP_TREE_CHILDREN (node).splice (children);
3071 return node;
3074 /* Dump a single SLP tree NODE. */
3076 static void
3077 vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
3078 slp_tree node)
3080 unsigned i, j;
3081 slp_tree child;
3082 stmt_vec_info stmt_info;
3083 tree op;
3085 dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
3086 dump_user_location_t user_loc = loc.get_user_location ();
3087 dump_printf_loc (metadata, user_loc,
3088 "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
3089 ", refcnt=%u)",
3090 SLP_TREE_DEF_TYPE (node) == vect_external_def
3091 ? " (external)"
3092 : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
3093 ? " (constant)"
3094 : ""), (void *) node,
3095 estimated_poly_value (node->max_nunits),
3096 SLP_TREE_REF_COUNT (node));
3097 if (SLP_TREE_VECTYPE (node))
3098 dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
3099 dump_printf (metadata, "\n");
3100 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
3102 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
3103 dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
3104 else
3105 dump_printf_loc (metadata, user_loc, "op template: %G",
3106 SLP_TREE_REPRESENTATIVE (node)->stmt);
3108 if (SLP_TREE_SCALAR_STMTS (node).exists ())
3109 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3110 if (stmt_info)
3111 dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
3112 STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
3113 i, stmt_info->stmt);
3114 else
3115 dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
3116 else
3118 dump_printf_loc (metadata, user_loc, "\t{ ");
3119 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
3120 dump_printf (metadata, "%T%s ", op,
3121 i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
3122 dump_printf (metadata, "}\n");
3124 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
3126 dump_printf_loc (metadata, user_loc, "\tload permutation {");
3127 FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
3128 dump_printf (dump_kind, " %u", j);
3129 dump_printf (dump_kind, " }\n");
3131 if (SLP_TREE_LANE_PERMUTATION (node).exists ())
3133 dump_printf_loc (metadata, user_loc, "\tlane permutation {");
3134 for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
3135 dump_printf (dump_kind, " %u[%u]",
3136 SLP_TREE_LANE_PERMUTATION (node)[i].first,
3137 SLP_TREE_LANE_PERMUTATION (node)[i].second);
3138 dump_printf (dump_kind, " }%s\n",
3139 node->ldst_lanes ? " (load-lanes)" : "");
3141 if (SLP_TREE_CHILDREN (node).is_empty ())
3142 return;
3143 dump_printf_loc (metadata, user_loc, "\tchildren");
3144 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3145 dump_printf (dump_kind, " %p", (void *)child);
3146 dump_printf (dump_kind, "%s\n",
3147 node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
3148 ? " (store-lanes)" : "");
3151 DEBUG_FUNCTION void
3152 debug (slp_tree node)
3154 debug_dump_context ctx;
3155 vect_print_slp_tree (MSG_NOTE,
3156 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3157 node);
3160 /* Recursive helper for the dot producer below. */
3162 static void
3163 dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
3165 if (visited.add (node))
3166 return;
3168 fprintf (f, "\"%p\" [label=\"", (void *)node);
3169 vect_print_slp_tree (MSG_NOTE,
3170 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3171 node);
3172 fprintf (f, "\"];\n");
3175 for (slp_tree child : SLP_TREE_CHILDREN (node))
3176 fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);
3178 for (slp_tree child : SLP_TREE_CHILDREN (node))
3179 if (child)
3180 dot_slp_tree (f, child, visited);
3183 DEBUG_FUNCTION void
3184 dot_slp_tree (const char *fname, slp_tree node)
3186 FILE *f = fopen (fname, "w");
3187 fprintf (f, "digraph {\n");
3188 fflush (f);
3190 debug_dump_context ctx (f);
3191 hash_set<slp_tree> visited;
3192 dot_slp_tree (f, node, visited);
3194 fflush (f);
3195 fprintf (f, "}\n");
3196 fclose (f);
3199 DEBUG_FUNCTION void
3200 dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
3202 FILE *f = fopen (fname, "w");
3203 fprintf (f, "digraph {\n");
3204 fflush (f);
3206 debug_dump_context ctx (f);
3207 hash_set<slp_tree> visited;
3208 for (auto inst : slp_instances)
3209 dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
3211 fflush (f);
3212 fprintf (f, "}\n");
3213 fclose (f);
3216 /* Dump a slp tree NODE using flags specified in DUMP_KIND. */
3218 static void
3219 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3220 slp_tree node, hash_set<slp_tree> &visited)
3222 unsigned i;
3223 slp_tree child;
3225 if (visited.add (node))
3226 return;
3228 vect_print_slp_tree (dump_kind, loc, node);
3230 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3231 if (child)
3232 vect_print_slp_graph (dump_kind, loc, child, visited);
3235 static void
3236 vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
3237 slp_tree entry)
3239 hash_set<slp_tree> visited;
3240 vect_print_slp_graph (dump_kind, loc, entry, visited);
3243 DEBUG_FUNCTION void
3244 debug (slp_instance instance)
3246 debug_dump_context ctx;
3247 vect_print_slp_graph (MSG_NOTE,
3248 dump_location_t::from_location_t (UNKNOWN_LOCATION),
3249 SLP_INSTANCE_TREE (instance));
3252 /* Mark the tree rooted at NODE with PURE_SLP. */
3254 static void
3255 vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
3256 hash_set<slp_tree> &visited)
3258 int i;
3259 stmt_vec_info stmt_info;
3260 slp_tree child;
3262 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3263 return;
3265 if (visited.add (node))
3266 return;
3268 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3269 if (stmt_info)
3271 STMT_SLP_TYPE (stmt_info) = pure_slp;
3272 /* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
3273 when there is the mask_conversion pattern applied we have lost the
3274 alternate lanes of the uniform mask which nevertheless
3275 have separate pattern defs. To not confuse hybrid
3276 analysis we mark those as covered as well here. */
3277 if (node->ldst_lanes)
3278 if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
3279 if (gimple_call_internal_p (call, IFN_MASK_LOAD)
3280 || gimple_call_internal_p (call, IFN_MASK_STORE))
3282 tree mask = gimple_call_arg (call,
3283 internal_fn_mask_index
3284 (gimple_call_internal_fn (call)));
3285 if (TREE_CODE (mask) == SSA_NAME)
3286 if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
3288 mask_info = vect_stmt_to_vectorize (mask_info);
3289 STMT_SLP_TYPE (mask_info) = pure_slp;
3294 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3295 if (child)
3296 vect_mark_slp_stmts (vinfo, child, visited);
3299 static void
3300 vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
3302 hash_set<slp_tree> visited;
3303 vect_mark_slp_stmts (vinfo, node, visited);
3306 /* Mark the statements of the tree rooted at NODE as relevant (vect_used). */
3308 static void
3309 vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
3311 int i;
3312 stmt_vec_info stmt_info;
3313 slp_tree child;
3315 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3316 return;
3318 if (visited.add (node))
3319 return;
3321 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
3322 if (stmt_info)
3324 gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
3325 || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
3326 STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
3329 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3330 if (child)
3331 vect_mark_slp_stmts_relevant (child, visited);
3334 static void
3335 vect_mark_slp_stmts_relevant (slp_tree node)
3337 hash_set<slp_tree> visited;
3338 vect_mark_slp_stmts_relevant (node, visited);
3342 /* Gather loads in the SLP graph NODE and populate the INST loads array. */
3344 static void
3345 vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
3346 hash_set<slp_tree> &visited)
3348 if (!node || visited.add (node))
3349 return;
3351 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
3352 return;
3354 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
3356 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
3357 if (STMT_VINFO_DATA_REF (stmt_info)
3358 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
3359 loads.safe_push (node);
3362 unsigned i;
3363 slp_tree child;
3364 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3365 vect_gather_slp_loads (loads, child, visited);
3369 /* Find the last scalar stmt in NODE. */
3371 stmt_vec_info
3372 vect_find_last_scalar_stmt_in_slp (slp_tree node)
3374 stmt_vec_info last = NULL;
3375 stmt_vec_info stmt_vinfo;
3377 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3378 if (stmt_vinfo)
3380 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3381 last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
3384 return last;
3387 /* Find the first stmt in NODE. */
3389 stmt_vec_info
3390 vect_find_first_scalar_stmt_in_slp (slp_tree node)
3392 stmt_vec_info first = NULL;
3393 stmt_vec_info stmt_vinfo;
3395 for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
3396 if (stmt_vinfo)
3398 stmt_vinfo = vect_orig_stmt (stmt_vinfo);
3399 if (!first
3400 || get_later_stmt (stmt_vinfo, first) == first)
3401 first = stmt_vinfo;
3404 return first;
3407 /* Splits a group of stores, currently beginning at FIRST_VINFO, into
3408 two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
3409 (also containing the first GROUP1_SIZE stmts, since stores are
3410 consecutive), the second containing the remainder.
3411 Return the first stmt in the second group. */
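/* Illustrative example (added for exposition, not part of the original
   source): splitting a group of seven consecutive stores with
   GROUP1_SIZE = 4 leaves stores 0..3 in the group headed by FIRST_VINFO
   and returns the stmt for store 4 as head of a new group containing
   stores 4..6; the DR_GROUP_GAPs of both group heads are bumped so each
   group also skips over the other group's elements.  */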
3413 static stmt_vec_info
3414 vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
3416 gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
3417 gcc_assert (group1_size > 0);
3418 int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
3419 gcc_assert (group2_size > 0);
3420 DR_GROUP_SIZE (first_vinfo) = group1_size;
3422 stmt_vec_info stmt_info = first_vinfo;
3423 for (unsigned i = group1_size; i > 1; i--)
3425 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
3426 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3428 /* STMT is now the last element of the first group. */
3429 stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
3430 DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;
3432 DR_GROUP_SIZE (group2) = group2_size;
3433 for (stmt_info = group2; stmt_info;
3434 stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
3436 DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
3437 gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
3440 /* For the second group, the DR_GROUP_GAP is that before the original group,
3441 plus skipping over the first vector. */
3442 DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;
3444 /* DR_GROUP_GAP of the first group now has to skip over the second group too. */
3445 DR_GROUP_GAP (first_vinfo) += group2_size;
3447 if (dump_enabled_p ())
3448 dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
3449 group1_size, group2_size);
3451 return group2;
3454 /* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
3455 statements and a vector of NUNITS elements. */
3457 static poly_uint64
3458 calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
3460 return exact_div (common_multiple (nunits, group_size), group_size);
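/* Worked example (added for exposition, not part of the original source):
   for GROUP_SIZE = 3 and NUNITS = 4 the least common multiple is 12, so
   the unrolling factor is 12 / 3 = 4; four copies of the three-statement
   group exactly fill three four-element vectors.  When GROUP_SIZE is a
   multiple of NUNITS the factor is 1 and no unrolling is needed.  */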
3463 /* Helper that checks to see if a node is a load node. */
3465 static inline bool
3466 vect_is_slp_load_node (slp_tree root)
3468 return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
3469 && SLP_TREE_DEF_TYPE (root) == vect_internal_def
3470 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
3471 && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
3475 /* Helper function of optimize_load_redistribution that performs the operation
3476 recursively. */
3478 static slp_tree
3479 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t *bst_map,
3480 vec_info *vinfo, unsigned int group_size,
3481 hash_map<slp_tree, slp_tree> *load_map,
3482 slp_tree root)
3484 if (slp_tree *leader = load_map->get (root))
3485 return *leader;
3487 slp_tree node;
3488 unsigned i;
3490 /* For now, we don't know anything about externals so do not do anything. */
3491 if (!root || SLP_TREE_DEF_TYPE (root) != vect_internal_def)
3492 return NULL;
3493 else if (SLP_TREE_CODE (root) == VEC_PERM_EXPR)
3495 /* First convert this node into a load node and add it to the leaves
3496 list, flattening the permute from a lane permutation to a load
3497 permutation. If it's unneeded it will be elided later. */
3498 vec<stmt_vec_info> stmts;
3499 stmts.create (SLP_TREE_LANES (root));
3500 lane_permutation_t lane_perm = SLP_TREE_LANE_PERMUTATION (root);
3501 for (unsigned j = 0; j < lane_perm.length (); j++)
3503 std::pair<unsigned, unsigned> perm = lane_perm[j];
3504 node = SLP_TREE_CHILDREN (root)[perm.first];
3506 if (!vect_is_slp_load_node (node)
3507 || SLP_TREE_CHILDREN (node).exists ())
3509 stmts.release ();
3510 goto next;
3513 stmts.quick_push (SLP_TREE_SCALAR_STMTS (node)[perm.second]);
3516 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE, vect_location,
3518 "converting stmts on permute node %p\n",
3519 (void *) root);
3521 bool *matches = XALLOCAVEC (bool, group_size);
3522 poly_uint64 max_nunits = 1;
3523 unsigned tree_size = 0, limit = 1;
3524 node = vect_build_slp_tree (vinfo, stmts, group_size, &max_nunits,
3525 matches, &limit, &tree_size, bst_map);
3526 if (!node)
3527 stmts.release ();
3529 load_map->put (root, node);
3530 return node;
3533 next:
3534 load_map->put (root, NULL);
3536 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3538 slp_tree value
3539 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3540 node);
3541 if (value)
3543 SLP_TREE_REF_COUNT (value)++;
3544 SLP_TREE_CHILDREN (root)[i] = value;
3545 /* ??? We know the original leaves of the replaced nodes will
3546 be referenced by bst_map, only the permutes created by
3547 pattern matching are not. */
3548 if (SLP_TREE_REF_COUNT (node) == 1)
3549 load_map->remove (node);
3550 vect_free_slp_tree (node);
3554 return NULL;
3557 /* Temporary workaround for loads not being CSEd during SLP build. This
3558 function will traverse the SLP tree rooted in ROOT for INSTANCE and find
3559 VEC_PERM nodes that blend vectors from multiple nodes that all read from the
3560 same DR such that the final operation is equal to a permuted load. Such
3561 NODES are then directly converted into LOADS themselves. The nodes are
3562 CSEd using BST_MAP. */
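/* Illustrative sketch (an assumed scenario added for exposition, not part
   of the original source): if a VEC_PERM node selects lanes
   { 0[0], 1[0], 0[1], 1[1] } from two load nodes reading { a[0], a[2] }
   and { a[1], a[3] } of the same DR group, the routines above rebuild it
   as a single load node for { a[0], a[1], a[2], a[3] }, which can then be
   CSEd via BST_MAP.  */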
3564 static void
3565 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t *bst_map,
3566 vec_info *vinfo, unsigned int group_size,
3567 hash_map<slp_tree, slp_tree> *load_map,
3568 slp_tree root)
3570 slp_tree node;
3571 unsigned i;
3573 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root), i , node)
3575 slp_tree value
3576 = optimize_load_redistribution_1 (bst_map, vinfo, group_size, load_map,
3577 node);
3578 if (value)
3580 SLP_TREE_REF_COUNT (value)++;
3581 SLP_TREE_CHILDREN (root)[i] = value;
3582 /* ??? We know the original leaves of the replaced nodes will
3583 be referenced by bst_map, only the permutes created by
3584 pattern matching are not. */
3585 if (SLP_TREE_REF_COUNT (node) == 1)
3586 load_map->remove (node);
3587 vect_free_slp_tree (node);
3592 /* Helper function of vect_match_slp_patterns.
3594 Attempts to match patterns against the slp tree rooted in REF_NODE using
3595 VINFO. Patterns are matched in post-order traversal.
3597 If matching is successful the value in REF_NODE is updated in place;
3598 the return value indicates whether any pattern matched. */
3600 static bool
3601 vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
3602 slp_tree_to_load_perm_map_t *perm_cache,
3603 slp_compat_nodes_map_t *compat_cache,
3604 hash_set<slp_tree> *visited)
3606 unsigned i;
3607 slp_tree node = *ref_node;
3608 bool found_p = false;
3609 if (!node || visited->add (node))
3610 return false;
3612 slp_tree child;
3613 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
3614 found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
3615 vinfo, perm_cache, compat_cache,
3616 visited);
3618 for (unsigned x = 0; x < num__slp_patterns; x++)
3620 vect_pattern *pattern
3621 = slp_patterns[x] (perm_cache, compat_cache, ref_node);
3622 if (pattern)
3624 pattern->build (vinfo);
3625 delete pattern;
3626 found_p = true;
3630 return found_p;
3633 /* Applies pattern matching to the SLP tree of INSTANCE using
3634 vec_info VINFO.
3636 The tree is modified in place and true is returned if any pattern
3637 matched. Patterns are tried in order and multiple patterns may match. */
3639 static bool
3640 vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
3641 hash_set<slp_tree> *visited,
3642 slp_tree_to_load_perm_map_t *perm_cache,
3643 slp_compat_nodes_map_t *compat_cache)
3645 DUMP_VECT_SCOPE ("vect_match_slp_patterns");
3646 slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_NOTE, vect_location,
3650 "Analyzing SLP tree %p for patterns\n",
3651 (void *) SLP_INSTANCE_TREE (instance));
3653 return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
3654 visited);
3657 /* STMT_INFO is a store group of size GROUP_SIZE that we are considering
3658 vectorizing with VECTYPE, which might be NULL. MASKED_P indicates whether
3659 the stores are masked.
3660 Return true if we could use IFN_STORE_LANES instead and if that appears
3661 to be the better approach. */
3663 static bool
3664 vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
3665 tree vectype, bool masked_p,
3666 unsigned int group_size,
3667 unsigned int new_group_size)
3669 if (!vectype)
3671 tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
3672 vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
3674 if (!vectype)
3675 return false;
3676 /* Allow the split if one of the two new groups would operate on full
3677 vectors *within* rather than across one scalar loop iteration.
3678 This is purely a heuristic, but it should work well for group
3679 sizes of 3 and 4, where the possible splits are:
3681 3->2+1: OK if the vector has exactly two elements
3682 4->2+2: Likewise
3683 4->3+1: Less clear-cut. */
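/* Worked example (added for exposition, not part of the original source):
   with a two-element vector type, a 4->2+2 split makes
   GROUP_SIZE - NEW_GROUP_SIZE = 2 a multiple of the vector length, so we
   return false and prefer the split; a 4->3+1 split fails both multiple_p
   checks and the answer then depends on whether IFN_STORE_LANES is
   supported for the group.  */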
3684 if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
3685 || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
3686 return false;
3687 return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
3690 /* Analyze an SLP instance starting from a group of grouped stores. Call
3691 vect_build_slp_tree to build a tree of packed stmts if possible.
3692 Return FALSE if it's impossible to SLP any stmt in the loop. */
3694 static bool
3695 vect_analyze_slp_instance (vec_info *vinfo,
3696 scalar_stmts_to_slp_tree_map_t *bst_map,
3697 stmt_vec_info stmt_info, slp_instance_kind kind,
3698 unsigned max_tree_size, unsigned *limit,
3699 bool force_single_lane);
3701 /* Build an interleaving scheme for the store sources RHS_NODES from
3702 SCALAR_STMTS. */
3704 static slp_tree
3705 vect_build_slp_store_interleaving (vec<slp_tree> &rhs_nodes,
3706 vec<stmt_vec_info> &scalar_stmts,
3707 poly_uint64 max_nunits)
3709 unsigned int group_size = scalar_stmts.length ();
3710 slp_tree node = vect_create_new_slp_node (scalar_stmts,
3711 SLP_TREE_CHILDREN
3712 (rhs_nodes[0]).length ());
3713 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
3714 node->max_nunits = max_nunits;
3715 for (unsigned l = 0;
3716 l < SLP_TREE_CHILDREN (rhs_nodes[0]).length (); ++l)
3718 /* And a permute merging all RHS SLP trees. */
3719 slp_tree perm = vect_create_new_slp_node (rhs_nodes.length (),
3720 VEC_PERM_EXPR);
3721 SLP_TREE_CHILDREN (node).quick_push (perm);
3722 SLP_TREE_LANE_PERMUTATION (perm).create (group_size);
3723 SLP_TREE_VECTYPE (perm) = SLP_TREE_VECTYPE (node);
3724 perm->max_nunits = max_nunits;
3725 SLP_TREE_LANES (perm) = group_size;
3726 /* ??? We should set this NULL but that's not expected. */
3727 SLP_TREE_REPRESENTATIVE (perm)
3728 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes[0])[l]);
3729 for (unsigned j = 0; j < rhs_nodes.length (); ++j)
3731 SLP_TREE_CHILDREN (perm)
3732 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[l]);
3733 SLP_TREE_CHILDREN (rhs_nodes[j])[l]->refcnt++;
3734 for (unsigned k = 0;
3735 k < SLP_TREE_SCALAR_STMTS (rhs_nodes[j]).length (); ++k)
3737 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3738 or SLP_TREE_SCALAR_OPS but then we might have
3739 a mix of both in our children. */
3740 SLP_TREE_LANE_PERMUTATION (perm)
3741 .quick_push (std::make_pair (j, k));
3745 /* Now we have a single permute node but we cannot code-generate
3746 the case with more than two inputs.
3747 Perform pairwise reduction, reducing the two inputs
3748 with the least number of lanes to one and then repeat until
3749 we end up with two inputs. That scheme makes sure we end
3750 up with permutes satisfying the restriction of requiring at
3751 most two vector inputs to produce a single vector output
3752 when the number of lanes is even. */
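     /* For example, with four two-lane inputs { A, B, C, D } the first
        iteration merges A and B into a four-lane intermediate permute AB,
        leaving { AB, C, D }; the next iteration picks C and D as the two
        nodes with the least lanes and merges them, leaving { AB, CD },
        which satisfies the two-input restriction.  */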
3753 while (SLP_TREE_CHILDREN (perm).length () > 2)
3755 /* When we have three equal sized groups left the pairwise
3756 reduction does not result in a scheme that avoids using
3757 three vectors. Instead merge the first two groups
3758 to the final size with do-not-care elements (chosen
3759 from the first group) and then merge with the third.
3760 { A0, B0, x, A1, B1, x, ... }
3761 -> { A0, B0, C0, A1, B1, C1, ... }
3762 This handles group size of three (and at least
3763 power-of-two multiples of that). */
3764 if (SLP_TREE_CHILDREN (perm).length () == 3
3765 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3766 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[1]))
3767 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[0])
3768 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[2])))
3770 int ai = 0;
3771 int bi = 1;
3772 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3773 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3774 unsigned n = SLP_TREE_LANES (perm);
3776 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3777 SLP_TREE_LANES (permab) = n;
3778 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3779 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3780 permab->max_nunits = max_nunits;
3781 /* ??? Should be NULL but that's not expected. */
3782 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3783 SLP_TREE_CHILDREN (permab).quick_push (a);
3784 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3785 SLP_TREE_LANE_PERMUTATION (permab)
3786 .quick_push (std::make_pair (0, k));
3787 SLP_TREE_CHILDREN (permab).quick_push (b);
3788 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3789 SLP_TREE_LANE_PERMUTATION (permab)
3790 .quick_push (std::make_pair (1, k));
3791 /* Push the do-not-care lanes. */
3792 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3793 SLP_TREE_LANE_PERMUTATION (permab)
3794 .quick_push (std::make_pair (0, k));
3796 /* Put the merged node into 'perm', in place of a. */
3797 SLP_TREE_CHILDREN (perm)[ai] = permab;
3798 /* Adjust the references to b in the permutation
3799 of perm and to the later children which we'll
3800 remove. */
3801 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3803 std::pair<unsigned, unsigned> &p
3804 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3805 if (p.first == (unsigned) bi)
3807 p.first = ai;
3808 p.second += SLP_TREE_LANES (a);
3810 else if (p.first > (unsigned) bi)
3811 p.first--;
3813 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3814 break;
3817 /* Pick the two nodes with the least number of lanes,
3818 prefer the earliest candidate and maintain ai < bi. */
3819 int ai = -1;
3820 int bi = -1;
3821 for (unsigned ci = 0; ci < SLP_TREE_CHILDREN (perm).length (); ++ci)
3823 if (ai == -1)
3824 ai = ci;
3825 else if (bi == -1)
3826 bi = ci;
3827 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3828 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai]))
3829 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ci])
3830 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi])))
3832 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[ai])
3833 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm)[bi]))
3834 bi = ci;
3835 else
3837 ai = bi;
3838 bi = ci;
3843 /* Produce a merge of nodes ai and bi. */
3844 slp_tree a = SLP_TREE_CHILDREN (perm)[ai];
3845 slp_tree b = SLP_TREE_CHILDREN (perm)[bi];
3846 unsigned n = SLP_TREE_LANES (a) + SLP_TREE_LANES (b);
3847 slp_tree permab = vect_create_new_slp_node (2, VEC_PERM_EXPR);
3848 SLP_TREE_LANES (permab) = n;
3849 SLP_TREE_LANE_PERMUTATION (permab).create (n);
3850 SLP_TREE_VECTYPE (permab) = SLP_TREE_VECTYPE (perm);
3851 permab->max_nunits = max_nunits;
3852 /* ??? Should be NULL but that's not expected. */
3853 SLP_TREE_REPRESENTATIVE (permab) = SLP_TREE_REPRESENTATIVE (perm);
3854 SLP_TREE_CHILDREN (permab).quick_push (a);
3855 for (unsigned k = 0; k < SLP_TREE_LANES (a); ++k)
3856 SLP_TREE_LANE_PERMUTATION (permab)
3857 .quick_push (std::make_pair (0, k));
3858 SLP_TREE_CHILDREN (permab).quick_push (b);
3859 for (unsigned k = 0; k < SLP_TREE_LANES (b); ++k)
3860 SLP_TREE_LANE_PERMUTATION (permab)
3861 .quick_push (std::make_pair (1, k));
3863 /* Put the merged node into 'perm', in place of a. */
3864 SLP_TREE_CHILDREN (perm)[ai] = permab;
3865 /* Adjust the references to b in the permutation
3866 of perm and to the later children which we'll
3867 remove. */
3868 for (unsigned k = 0; k < SLP_TREE_LANES (perm); ++k)
3870 std::pair<unsigned, unsigned> &p
3871 = SLP_TREE_LANE_PERMUTATION (perm)[k];
3872 if (p.first == (unsigned) bi)
3874 p.first = ai;
3875 p.second += SLP_TREE_LANES (a);
3877 else if (p.first > (unsigned) bi)
3878 p.first--;
3880 SLP_TREE_CHILDREN (perm).ordered_remove (bi);
3884 return node;
3887 /* Analyze an SLP instance starting from SCALAR_STMTS which are a group
3888 of KIND. Return true if successful. */
3890 static bool
3891 vect_build_slp_instance (vec_info *vinfo,
3892 slp_instance_kind kind,
3893 vec<stmt_vec_info> &scalar_stmts,
3894 vec<stmt_vec_info> &root_stmt_infos,
3895 vec<tree> &remain,
3896 unsigned max_tree_size, unsigned *limit,
3897 scalar_stmts_to_slp_tree_map_t *bst_map,
3898 /* ??? We need stmt_info for group splitting. */
3899 stmt_vec_info stmt_info_,
3900 bool force_single_lane)
3902 /* If there's no budget left bail out early. */
3903 if (*limit == 0)
3904 return false;
3906 if (kind == slp_inst_kind_ctor)
3908 if (dump_enabled_p ())
3909 dump_printf_loc (MSG_NOTE, vect_location,
3910 "Analyzing vectorizable constructor: %G\n",
3911 root_stmt_infos[0]->stmt);
3913 else if (kind == slp_inst_kind_gcond)
3915 if (dump_enabled_p ())
3916 dump_printf_loc (MSG_NOTE, vect_location,
3917 "Analyzing vectorizable control flow: %G",
3918 root_stmt_infos[0]->stmt);
3921 if (dump_enabled_p ())
3923 dump_printf_loc (MSG_NOTE, vect_location,
3924 "Starting SLP discovery for\n");
3925 for (unsigned i = 0; i < scalar_stmts.length (); ++i)
3926 dump_printf_loc (MSG_NOTE, vect_location,
3927 " %G", scalar_stmts[i]->stmt);
3930 /* Build the tree for the SLP instance. */
3931 unsigned int group_size = scalar_stmts.length ();
3932 bool *matches = XALLOCAVEC (bool, group_size);
3933 poly_uint64 max_nunits = 1;
3934 unsigned tree_size = 0;
3935 unsigned i;
3937 slp_tree node = NULL;
3938 if (group_size > 1 && force_single_lane)
3940 matches[0] = true;
3941 matches[1] = false;
3943 else
3944 node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
3945 &max_nunits, matches, limit,
3946 &tree_size, bst_map);
3947 if (node != NULL)
3949 /* Calculate the unrolling factor based on the smallest type. */
3950 poly_uint64 unrolling_factor
3951 = calculate_unrolling_factor (max_nunits, group_size);
3953 if (maybe_ne (unrolling_factor, 1U)
3954 && is_a <bb_vec_info> (vinfo))
3956 unsigned HOST_WIDE_INT const_max_nunits;
3957 if (!max_nunits.is_constant (&const_max_nunits)
3958 || const_max_nunits > group_size)
3960 if (dump_enabled_p ())
3961 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3962 "Build SLP failed: store group "
3963 "size not a multiple of the vector size "
3964 "in basic block SLP\n");
3965 vect_free_slp_tree (node);
3966 return false;
3968 /* Fatal mismatch. */
3969 if (dump_enabled_p ())
3970 dump_printf_loc (MSG_NOTE, vect_location,
3971 "SLP discovery succeeded but node needs "
3972 "splitting\n");
3973 memset (matches, true, group_size);
3974 matches[group_size / const_max_nunits * const_max_nunits] = false;
3975 vect_free_slp_tree (node);
3977 else
3979 /* Create a new SLP instance. */
3980 slp_instance new_instance = XNEW (class _slp_instance);
3981 SLP_INSTANCE_TREE (new_instance) = node;
3982 SLP_INSTANCE_LOADS (new_instance) = vNULL;
3983 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
3984 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
3985 SLP_INSTANCE_KIND (new_instance) = kind;
3986 new_instance->reduc_phis = NULL;
3987 new_instance->cost_vec = vNULL;
3988 new_instance->subgraph_entries = vNULL;
3990 if (dump_enabled_p ())
3991 dump_printf_loc (MSG_NOTE, vect_location,
3992 "SLP size %u vs. limit %u.\n",
3993 tree_size, max_tree_size);
3995 /* Fixup SLP reduction chains. */
3996 if (kind == slp_inst_kind_reduc_chain)
3998 /* If this is a reduction chain with a conversion in front
3999 amend the SLP tree with a node for that. */
4000 gimple *scalar_def
4001 = vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
4002 if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
4004 /* Get at the conversion stmt - we know it's the single use
4005 of the last stmt of the reduction chain. */
4006 use_operand_p use_p;
4007 bool r = single_imm_use (gimple_assign_lhs (scalar_def),
4008 &use_p, &scalar_def);
4009 gcc_assert (r);
4010 stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
4011 next_info = vect_stmt_to_vectorize (next_info);
4012 scalar_stmts = vNULL;
4013 scalar_stmts.create (group_size);
4014 for (unsigned i = 0; i < group_size; ++i)
4015 scalar_stmts.quick_push (next_info);
4016 slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
4017 SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
4018 SLP_TREE_CHILDREN (conv).quick_push (node);
4019 SLP_INSTANCE_TREE (new_instance) = conv;
4020 /* We also have to fake this conversion stmt as SLP reduction
4021 group so we don't have to mess with too much code
4022 elsewhere. */
4023 REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
4024 REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
4026 /* Fill the backedge child of the PHI SLP node. The
4027 general matching code cannot find it because the
4028 scalar code does not reflect how we vectorize the
4029 reduction. */
4030 use_operand_p use_p;
4031 imm_use_iterator imm_iter;
4032 class loop *loop = LOOP_VINFO_LOOP (as_a <loop_vec_info> (vinfo));
4033 FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
4034 gimple_get_lhs (scalar_def))
4035 /* There are exactly two non-debug uses, the reduction
4036 PHI and the loop-closed PHI node. */
4037 if (!is_gimple_debug (USE_STMT (use_p))
4038 && gimple_bb (USE_STMT (use_p)) == loop->header)
4040 auto_vec<stmt_vec_info, 64> phis (group_size);
4041 stmt_vec_info phi_info
4042 = vinfo->lookup_stmt (USE_STMT (use_p));
4043 for (unsigned i = 0; i < group_size; ++i)
4044 phis.quick_push (phi_info);
4045 slp_tree *phi_node = bst_map->get (phis);
4046 unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
4047 SLP_TREE_CHILDREN (*phi_node)[dest_idx]
4048 = SLP_INSTANCE_TREE (new_instance);
4049 SLP_INSTANCE_TREE (new_instance)->refcnt++;
4053 vinfo->slp_instances.safe_push (new_instance);
4055 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4056 the number of scalar stmts in the root in a few places.
4057 Verify that assumption holds. */
4058 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4059 .length () == group_size);
4061 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 "Final SLP tree for instance %p:\n",
4065 (void *) new_instance);
4066 vect_print_slp_graph (MSG_NOTE, vect_location,
4067 SLP_INSTANCE_TREE (new_instance));
4070 return true;
4073 /* Failed to SLP. */
4075 stmt_vec_info stmt_info = stmt_info_;
4076 /* Try to break the group up into pieces. */
4077 if (*limit > 0 && kind == slp_inst_kind_store)
4079 /* ??? We could delay all the actual splitting of store-groups
4080 until after SLP discovery of the original group completed.
4081 Then we can recurse to vect_build_slp_instance directly. */
4082 for (i = 0; i < group_size; i++)
4083 if (!matches[i])
4084 break;
4086 /* For basic block SLP, try to break the group up into multiples of
4087 a vector size. */
4088 if (is_a <bb_vec_info> (vinfo)
4089 && (i > 1 && i < group_size))
4091 /* Free the allocated memory. */
4092 scalar_stmts.release ();
4094 tree scalar_type
4095 = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
4096 tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
4097 1 << floor_log2 (i));
4098 unsigned HOST_WIDE_INT const_nunits;
4099 if (vectype
4100 && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
4102 /* Split into two groups at the first vector boundary. */
4103 gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
4104 unsigned group1_size = i & ~(const_nunits - 1);
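   /* E.g. a mismatch at i == 6 with const_nunits == 4 yields
      group1_size == 4: the first four stores form one group and
      the remainder is split off and re-analyzed below.  */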
4106 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_NOTE, vect_location,
4108 "Splitting SLP group at stmt %u\n", i);
4109 stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
4110 group1_size);
4111 bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
4112 kind, max_tree_size,
4113 limit, false);
4114 /* Split the rest at the failure point and possibly
4115 re-analyze the remaining matching part if it has
4116 at least two lanes. */
4117 if (group1_size < i
4118 && (i + 1 < group_size
4119 || i - group1_size > 1))
4121 stmt_vec_info rest2 = rest;
4122 rest = vect_split_slp_store_group (rest, i - group1_size);
4123 if (i - group1_size > 1)
4124 res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
4125 kind, max_tree_size,
4126 limit, false);
4128 /* Re-analyze the non-matching tail if it has at least
4129 two lanes. */
4130 if (i + 1 < group_size)
4131 res |= vect_analyze_slp_instance (vinfo, bst_map,
4132 rest, kind, max_tree_size,
4133 limit, false);
4134 return res;
4138 /* For loop vectorization split the RHS into arbitrary pieces of
4139 size >= 1. */
4140 else if (is_a <loop_vec_info> (vinfo)
4141 && (group_size != 1 && i < group_size))
4143 gcall *call = dyn_cast <gcall *> (stmt_info->stmt);
4144 bool masked_p = call
4145 && gimple_call_internal_p (call)
4146 && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
4147 /* There are targets that cannot do even/odd interleaving schemes
4148 so they absolutely need to use load/store-lanes. For now
4149 force single-lane SLP for them - they would be happy with
4150 uniform power-of-two lanes (but depending on element size),
4151 but even if we can use 'i' as indicator we would need to
4152 backtrack when later lanes fail to discover with the same
4153 granularity. We cannot turn strided or scatter stores
4154 into store-lanes. */
4155 /* ??? If this is not in sync with what get_load_store_type
4156 later decides the SLP representation is not good for other
4157 store vectorization methods. */
4158 bool want_store_lanes
4159 = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4160 && ! STMT_VINFO_STRIDED_P (stmt_info)
4161 && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4162 && compare_step_with_zero (vinfo, stmt_info) > 0
4163 && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
4164 masked_p, group_size, 1));
4165 if (want_store_lanes || force_single_lane)
4166 i = 1;
4168 /* A fatal discovery fail doesn't always mean single-lane SLP
4169 isn't a possibility, so try. */
4170 if (i == 0)
4171 i = 1;
4173 if (dump_enabled_p ())
4174 dump_printf_loc (MSG_NOTE, vect_location,
4175 "Splitting SLP group at stmt %u\n", i);
4177 /* Analyze the stored values and pinch them together with
4178 a permute node so we can preserve the whole store group. */
4179 auto_vec<slp_tree> rhs_nodes;
4180 poly_uint64 max_nunits = 1;
4182 unsigned int rhs_common_nlanes = 0;
4183 unsigned int start = 0, end = i;
4184 while (start < group_size)
4186 gcc_assert (end - start >= 1);
4187 vec<stmt_vec_info> substmts;
4188 substmts.create (end - start);
4189 for (unsigned j = start; j < end; ++j)
4190 substmts.quick_push (scalar_stmts[j]);
4191 max_nunits = 1;
4192 node = vect_build_slp_tree (vinfo, substmts, end - start,
4193 &max_nunits,
4194 matches, limit, &tree_size, bst_map);
4195 if (node)
4197 rhs_nodes.safe_push (node);
4198 vect_update_max_nunits (&max_nunits, node->max_nunits);
4199 if (start == 0)
4200 rhs_common_nlanes = SLP_TREE_LANES (node);
4201 else if (rhs_common_nlanes != SLP_TREE_LANES (node))
4202 rhs_common_nlanes = 0;
4203 start = end;
4204 if (want_store_lanes || force_single_lane)
4205 end = start + 1;
4206 else
4207 end = group_size;
4209 else
4211 substmts.release ();
4212 if (end - start == 1)
4214 /* Single-lane discovery failed. Free resources. */
4215 for (auto node : rhs_nodes)
4216 vect_free_slp_tree (node);
4217 scalar_stmts.release ();
4218 if (dump_enabled_p ())
4219 dump_printf_loc (MSG_NOTE, vect_location,
4220 "SLP discovery failed\n");
4221 return false;
4224 /* ??? It really happens that we soft-fail SLP
4225 build at a mismatch but the matching part hard-fails
4226 later. As we know we arrived here with a group
4227 larger than one, try a group of size one! */
4228 if (!matches[0])
4229 end = start + 1;
4230 else
4231 for (unsigned j = start; j < end; j++)
4232 if (!matches[j - start])
4234 end = j;
4235 break;
4240 /* Now re-assess whether we want store lanes in case the
4241 discovery ended up producing all single-lane RHSs. */
4242 if (! want_store_lanes
4243 && rhs_common_nlanes == 1
4244 && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
4245 && ! STMT_VINFO_STRIDED_P (stmt_info)
4246 && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
4247 && compare_step_with_zero (vinfo, stmt_info) > 0
4248 && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
4249 group_size, masked_p)
4250 != IFN_LAST))
4251 want_store_lanes = true;
4253 /* Now we assume we can build the root SLP node from all stores. */
4254 if (want_store_lanes)
4256 /* For store-lanes feed the store node with all RHS nodes
4257 in order. */
4258 node = vect_create_new_slp_node (scalar_stmts,
4259 SLP_TREE_CHILDREN
4260 (rhs_nodes[0]).length ());
4261 SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
4262 node->max_nunits = max_nunits;
4263 node->ldst_lanes = true;
4264 SLP_TREE_CHILDREN (node)
4265 .reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
4266 + rhs_nodes.length () - 1);
4267 /* First store value and possibly mask. */
4268 SLP_TREE_CHILDREN (node)
4269 .splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
4270 /* Rest of the store values. All mask nodes are the same,
4271 this should be guaranteed by dataref group discovery. */
4272 for (unsigned j = 1; j < rhs_nodes.length (); ++j)
4273 SLP_TREE_CHILDREN (node)
4274 .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
4275 for (slp_tree child : SLP_TREE_CHILDREN (node))
4276 child->refcnt++;
4278 else
4279 node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
4280 max_nunits);
4282 while (!rhs_nodes.is_empty ())
4283 vect_free_slp_tree (rhs_nodes.pop ());
4285 /* Create a new SLP instance. */
4286 slp_instance new_instance = XNEW (class _slp_instance);
4287 SLP_INSTANCE_TREE (new_instance) = node;
4288 SLP_INSTANCE_LOADS (new_instance) = vNULL;
4289 SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
4290 SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
4291 SLP_INSTANCE_KIND (new_instance) = kind;
4292 new_instance->reduc_phis = NULL;
4293 new_instance->cost_vec = vNULL;
4294 new_instance->subgraph_entries = vNULL;
4296 if (dump_enabled_p ())
4297 dump_printf_loc (MSG_NOTE, vect_location,
4298 "SLP size %u vs. limit %u.\n",
4299 tree_size, max_tree_size);
4301 vinfo->slp_instances.safe_push (new_instance);
4303 /* ??? We've replaced the old SLP_INSTANCE_GROUP_SIZE with
4304 the number of scalar stmts in the root in a few places.
4305 Verify that assumption holds. */
4306 gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
4307 .length () == group_size);
4309 if (dump_enabled_p ())
4311 dump_printf_loc (MSG_NOTE, vect_location,
4312 "Final SLP tree for instance %p:\n",
4313 (void *) new_instance);
4314 vect_print_slp_graph (MSG_NOTE, vect_location,
4315 SLP_INSTANCE_TREE (new_instance));
4317 return true;
4319 else
4320 /* Free the allocated memory. */
4321 scalar_stmts.release ();
4323 /* Even though the first vector did not all match, we might be able to SLP
4324 (some) of the remainder. FORNOW ignore this possibility. */
4326 else
4327 /* Free the allocated memory. */
4328 scalar_stmts.release ();
4330 /* Failed to SLP. */
4331 if (dump_enabled_p ())
4332 dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");
4333 return false;
4337 /* Analyze an SLP instance starting from a group of grouped stores. Call
4338 vect_build_slp_tree to build a tree of packed stmts if possible.
4339 Return FALSE if it's impossible to SLP any stmt in the loop. */
4341 static bool
4342 vect_analyze_slp_instance (vec_info *vinfo,
4343 scalar_stmts_to_slp_tree_map_t *bst_map,
4344 stmt_vec_info stmt_info,
4345 slp_instance_kind kind,
4346 unsigned max_tree_size, unsigned *limit,
4347 bool force_single_lane)
4349 vec<stmt_vec_info> scalar_stmts;
4351 if (is_a <bb_vec_info> (vinfo))
4352 vect_location = stmt_info->stmt;
4354 stmt_vec_info next_info = stmt_info;
4355 if (kind == slp_inst_kind_store)
4357 /* Collect the stores and store them in scalar_stmts. */
4358 scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
4359 while (next_info)
4361 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4362 next_info = DR_GROUP_NEXT_ELEMENT (next_info);
4365 else if (kind == slp_inst_kind_reduc_chain)
4367 /* Collect the reduction stmts and store them in scalar_stmts. */
4368 scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
4369 while (next_info)
4371 scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
4372 next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
4374 /* Mark the first element of the reduction chain as reduction to properly
4375 transform the node. In the reduction analysis phase only the last
4376 element of the chain is marked as reduction. */
4377 STMT_VINFO_DEF_TYPE (stmt_info)
4378 = STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
4379 STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
4380 = STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
4382 else
4383 gcc_unreachable ();
4385 vec<stmt_vec_info> roots = vNULL;
4386 vec<tree> remain = vNULL;
4387 /* Build the tree for the SLP instance. */
4388 bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
4389 roots, remain,
4390 max_tree_size, limit, bst_map,
4391 kind == slp_inst_kind_store
4392 ? stmt_info : NULL, force_single_lane);
4394 /* ??? If this is slp_inst_kind_store and the above succeeded here's
4395 where we should do store group splitting. */
4397 return res;
4400 /* qsort comparator ordering SLP load nodes. */
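 /* Nodes from the same dataref group sort adjacent, with larger lane counts
    first and ties broken by the load permutation; different groups sort by
    the UID of their first group element.  */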
4402 static int
4403 vllp_cmp (const void *a_, const void *b_)
4405 const slp_tree a = *(const slp_tree *)a_;
4406 const slp_tree b = *(const slp_tree *)b_;
4407 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
4408 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
4409 if (STMT_VINFO_GROUPED_ACCESS (a0)
4410 && STMT_VINFO_GROUPED_ACCESS (b0)
4411 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4413 /* Same group, order by the number of lanes used. */
4414 if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
4415 return 1;
4416 else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
4417 return -1;
4418 else
4420 /* Try to order loads using the same lanes together, breaking
4421 the tie with the lane number that first differs. */
4422 if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4423 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4424 return 0;
4425 else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
4426 && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
4427 return 1;
4428 else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
4429 && SLP_TREE_LOAD_PERMUTATION (b).exists ())
4430 return -1;
4431 else
4433 for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
4434 if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4435 != SLP_TREE_LOAD_PERMUTATION (b)[i])
4437 /* In-order lane first, that's what the above case for
4438 no permutation does. */
4439 if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
4440 return -1;
4441 else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
4442 return 1;
4443 else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
4444 < SLP_TREE_LOAD_PERMUTATION (b)[i])
4445 return -1;
4446 else
4447 return 1;
4449 return 0;
4453 else /* Different groups or non-groups. */
4455 /* Order groups as their first element to keep them together. */
4456 if (STMT_VINFO_GROUPED_ACCESS (a0))
4457 a0 = DR_GROUP_FIRST_ELEMENT (a0);
4458 if (STMT_VINFO_GROUPED_ACCESS (b0))
4459 b0 = DR_GROUP_FIRST_ELEMENT (b0);
4460 if (a0 == b0)
4461 return 0;
4462 /* Tie using UID. */
4463 else if (gimple_uid (STMT_VINFO_STMT (a0))
4464 < gimple_uid (STMT_VINFO_STMT (b0)))
4465 return -1;
4466 else
4468 gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
4469 != gimple_uid (STMT_VINFO_STMT (b0)));
4470 return 1;
4475 /* Process the set of LOADS that are all from the same dataref group. */
4477 static void
4478 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4479 scalar_stmts_to_slp_tree_map_t *bst_map,
4480 const array_slice<slp_tree> &loads,
4481 bool force_single_lane)
4483 /* At this point we want to lower without a fixed VF or vector
4484 size in mind, which means we cannot yet compute whether we
4485 need three or more vectors for a load permutation. So always
4486 lower. */
4487 stmt_vec_info first
4488 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
4489 unsigned group_lanes = DR_GROUP_SIZE (first);
4491 /* Verify if all load permutations can be implemented with a suitably
4492 large element load-lanes operation. */
4493 unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
4494 if (STMT_VINFO_STRIDED_P (first)
4495 || compare_step_with_zero (loop_vinfo, first) <= 0
4496 || exact_log2 (ld_lanes_lanes) == -1
4497 /* ??? For now only support the single-lane case as there is
4498 missing support on the store-lane side and code generation
4499 isn't up to the task yet. */
4500 || ld_lanes_lanes != 1
4501 || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
4502 group_lanes / ld_lanes_lanes,
4503 false) == IFN_LAST)
4504 ld_lanes_lanes = 0;
4505 else
4506 /* Verify the loads access the same number of lanes aligned to
4507 ld_lanes_lanes. */
4508 for (slp_tree load : loads)
4510 if (SLP_TREE_LANES (load) != ld_lanes_lanes)
4512 ld_lanes_lanes = 0;
4513 break;
4515 unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
4516 if (first % ld_lanes_lanes != 0)
4518 ld_lanes_lanes = 0;
4519 break;
4521 for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
4522 if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
4524 ld_lanes_lanes = 0;
4525 break;
4529 /* Only a power-of-two number of lanes matches interleaving with N levels.
4530 ??? An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
4531 at each step. */
4532 if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
4533 return;
4535 for (slp_tree load : loads)
4537 /* Leave masked or gather loads alone for now. */
4538 if (!SLP_TREE_CHILDREN (load).is_empty ())
4539 continue;
4541 /* We want to pattern-match special cases here and keep those
4542 alone. Candidates are splats and load-lane. */
4544 /* We need to lower only loads of less than half of the group's
4545 lanes, including duplicate lanes. Note this leaves nodes
4546 with a non-1:1 load permutation around instead of canonicalizing
4547 those into a load and a permute node. Removing this early
4548 check would do such canonicalization. */
4549 if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
4550 && ld_lanes_lanes == 0)
4551 continue;
4553 /* Build the permute to get the original load permutation order. */
4554 bool contiguous = true;
4555 lane_permutation_t final_perm;
4556 final_perm.create (SLP_TREE_LANES (load));
4557 for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
4559 final_perm.quick_push
4560 (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
4561 if (i != 0
4562 && (SLP_TREE_LOAD_PERMUTATION (load)[i]
4563 != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
4564 contiguous = false;
4567 /* When the load permutation accesses a contiguous unpermuted,
4568 power-of-two aligned and sized chunk leave the load alone.
4569 We can likely (re-)load it more efficiently rather than
4570 extracting it from the larger load.
4571 ??? Long-term some of the lowering should move to where
4572 the vector types involved are fixed. */
4573 if (!force_single_lane
4574 && ld_lanes_lanes == 0
4575 && contiguous
4576 && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
4577 && pow2p_hwi (SLP_TREE_LANES (load))
4578 && pow2p_hwi (group_lanes)
4579 && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
4580 && group_lanes % SLP_TREE_LANES (load) == 0)
4582 final_perm.release ();
4583 continue;
4586 /* First build (and possibly re-use) a load node for the
4587 unpermuted group. Gaps in the middle and on the end are
4588 represented with NULL stmts. */
4589 vec<stmt_vec_info> stmts;
4590 stmts.create (group_lanes);
4591 for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
4593 if (s != first)
4594 for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
4595 stmts.quick_push (NULL);
4596 stmts.quick_push (s);
4598 for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
4599 stmts.quick_push (NULL);
4600 poly_uint64 max_nunits = 1;
4601 bool *matches = XALLOCAVEC (bool, group_lanes);
4602 unsigned limit = 1;
4603 unsigned tree_size = 0;
4604 slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
4605 group_lanes,
4606 &max_nunits, matches, &limit,
4607 &tree_size, bst_map);
4608 gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());
4610 if (ld_lanes_lanes != 0)
4612 /* ??? If this is not in sync with what get_load_store_type
4613 later decides the SLP representation is not good for other
4614 store vectorization methods. */
4615 l0->ldst_lanes = true;
4616 load->ldst_lanes = true;
4619 while (1)
4621 unsigned group_lanes = SLP_TREE_LANES (l0);
4622 if (ld_lanes_lanes != 0
4623 || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
4624 break;
4626 /* Try to lower by reducing the group to half its size using an
4627 interleaving scheme. For this try to compute whether all
4628 elements needed for this load are in even or odd elements of
4629 an even/odd decomposition with N consecutive elements.
4630 Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
4631 with N == 2. */
4632 /* ??? Only an even number of lanes can be handled this way, but the
4633 fallback below could work for any number. We have to make sure
4634 to round up in that case. */
4635 gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
4636 unsigned even = 0, odd = 0;
4637 if ((group_lanes & 1) == 0)
4639 even = (1 << ceil_log2 (group_lanes)) - 1;
4640 odd = even;
4641 for (auto l : final_perm)
4643 even &= ~l.second;
4644 odd &= l.second;
4648 /* Now build an even or odd extraction from the unpermuted load. */
4649 lane_permutation_t perm;
4650 perm.create ((group_lanes + 1) / 2);
4651 unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
4652 unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
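     /* For example, with group_lanes == 8 a load of lanes { 0, 2, 4, 6 }
        leaves even == 1 and odd == 0 above, so even_level == 1 and the
        first branch below builds the classic even extract { 0, 2, 4, 6 };
        a load of lanes { 0, 1, 4, 5 } leaves even == 2, so even_level == 2
        and we extract pairs, again { 0, 1, 4, 5 }.  */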
4653 if (even_level
4654 && group_lanes % (2 * even_level) == 0
4655 /* ??? When code generating permutes we do not try to pun
4656 to larger component modes so level != 1 isn't a natural
4657 even/odd extract. Prefer one if possible. */
4658 && (even_level == 1 || !odd_level || odd_level != 1))
4660 /* { 0, 1, ... 4, 5 ..., } */
4661 for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
4662 for (unsigned j = 0; j < even_level; ++j)
4663 perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
4665 else if (odd_level)
4667 /* { ..., 2, 3, ... 6, 7 } */
4668 gcc_assert (group_lanes % (2 * odd_level) == 0);
4669 for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
4670 for (unsigned j = 0; j < odd_level; ++j)
4671 perm.quick_push
4672 (std::make_pair (0, (2 * i + 1) * odd_level + j));
4674 else
4676 /* As a fallback extract all used lanes and fill to half the
4677 group size by repeating the last element.
4678 ??? This is quite a bad strategy for re-use - we could
4679 brute force our way to find more optimal filling lanes to
4680 maximize re-use when looking at all loads from the group. */
4681 auto_bitmap l;
4682 for (auto p : final_perm)
4683 bitmap_set_bit (l, p.second);
4684 unsigned i = 0;
4685 bitmap_iterator bi;
4686 EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
4687 perm.quick_push (std::make_pair (0, i));
4688 while (perm.length () < (group_lanes + 1) / 2)
4689 perm.quick_push (perm.last ());
4692 /* Update final_perm with the intermediate permute. */
4693 for (unsigned i = 0; i < final_perm.length (); ++i)
4695 unsigned l = final_perm[i].second;
4696 unsigned j;
4697 for (j = 0; j < perm.length (); ++j)
4698 if (perm[j].second == l)
4700 final_perm[i].second = j;
4701 break;
4703 gcc_assert (j < perm.length ());
4706 /* And create scalar stmts. */
4707 vec<stmt_vec_info> perm_stmts;
4708 perm_stmts.create (perm.length ());
4709 for (unsigned i = 0; i < perm.length (); ++i)
4710 perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);
4712 slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
4713 SLP_TREE_CHILDREN (p).quick_push (l0);
4714 SLP_TREE_LANE_PERMUTATION (p) = perm;
4715 SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
4716 SLP_TREE_LANES (p) = perm.length ();
4717 SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
4718 /* ??? As we have scalar stmts for this intermediate permute we
4719 could CSE it via bst_map but we do not want to pick up
4720 another SLP node with a load permutation. We instead should
4721 have a "local" CSE map here. */
4722 SLP_TREE_SCALAR_STMTS (p) = perm_stmts;
4724 /* We now have a node for (group_lanes + 1) / 2 lanes. */
4725 l0 = p;
4728 /* And finally from the ordered reduction node create the
4729 permute to shuffle the lanes into the original load-permutation
4730 order. We replace the original load node with this. */
4731 SLP_TREE_CODE (load) = VEC_PERM_EXPR;
4732 SLP_TREE_LOAD_PERMUTATION (load).release ();
4733 SLP_TREE_LANE_PERMUTATION (load) = final_perm;
4734 SLP_TREE_CHILDREN (load).create (1);
4735 SLP_TREE_CHILDREN (load).quick_push (l0);
4739 /* Transform SLP loads in the SLP graph created by SLP discovery to
4740 group loads from the same group and lower load permutations that
4741 are unlikely to be supported into a series of permutes.
4742 In the degenerate case of having only single-lane SLP instances
4743 this should result in a series of permute nodes emulating an
4744 interleaving scheme. */
4746 static void
4747 vect_lower_load_permutations (loop_vec_info loop_vinfo,
4748 scalar_stmts_to_slp_tree_map_t *bst_map,
4749 bool force_single_lane)
4751 /* Gather and sort loads across all instances. */
4752 hash_set<slp_tree> visited;
4753 auto_vec<slp_tree> loads;
4754 for (auto inst : loop_vinfo->slp_instances)
4755 vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);
4756 if (loads.is_empty ())
4757 return;
4758 loads.qsort (vllp_cmp);
4760 /* Now process each dataref group separately. */
4761 unsigned firsti = 0;
4762 for (unsigned i = 1; i < loads.length (); ++i)
4764 slp_tree first = loads[firsti];
4765 slp_tree next = loads[i];
4766 stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
4767 stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
4768 if (STMT_VINFO_GROUPED_ACCESS (a0)
4769 && STMT_VINFO_GROUPED_ACCESS (b0)
4770 && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
4771 continue;
4772 /* Now we have one or multiple SLP loads of the same group from
4773 firsti to i - 1. */
4774 if (STMT_VINFO_GROUPED_ACCESS (a0))
4775 vect_lower_load_permutations (loop_vinfo, bst_map,
4776 make_array_slice (&loads[firsti],
4777 i - firsti),
4778 force_single_lane);
4779 firsti = i;
4781 if (firsti < loads.length ()
4782 && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
4783 vect_lower_load_permutations (loop_vinfo, bst_map,
4784 make_array_slice (&loads[firsti],
4785 loads.length () - firsti),
4786 force_single_lane);
4789 /* Check if there are stmts in the loop that can be vectorized using SLP.
4790 Build SLP trees of packed scalar stmts if SLP is possible. */
4792 opt_result
4793 vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
4794 bool force_single_lane)
4796 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
4797 unsigned int i;
4798 stmt_vec_info first_element;
4799 slp_instance instance;
4801 DUMP_VECT_SCOPE ("vect_analyze_slp");
4803 unsigned limit = max_tree_size;
4805 scalar_stmts_to_slp_tree_map_t *bst_map
4806 = new scalar_stmts_to_slp_tree_map_t ();
4808 /* Find SLP sequences starting from groups of grouped stores. */
4809 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
4810 vect_analyze_slp_instance (vinfo, bst_map, first_element,
4811 slp_inst_kind_store, max_tree_size, &limit,
4812 force_single_lane);
4814 /* For loops also start SLP discovery from non-grouped stores. */
4815 if (loop_vinfo)
4817 data_reference_p dr;
4818 FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
4819 if (DR_IS_WRITE (dr))
4821 stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
4822 /* Grouped stores are already handled above. */
4823 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
4824 continue;
4825 vec<stmt_vec_info> stmts;
4826 vec<stmt_vec_info> roots = vNULL;
4827 vec<tree> remain = vNULL;
4828 stmts.create (1);
4829 stmts.quick_push (stmt_info);
4830 vect_build_slp_instance (vinfo, slp_inst_kind_store,
4831 stmts, roots, remain, max_tree_size,
4832 &limit, bst_map, NULL, force_single_lane);
4836 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
4838 for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
4840 vect_location = bb_vinfo->roots[i].roots[0]->stmt;
4841 /* Apply patterns. */
4842 for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
4843 bb_vinfo->roots[i].stmts[j]
4844 = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
4845 if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
4846 bb_vinfo->roots[i].stmts,
4847 bb_vinfo->roots[i].roots,
4848 bb_vinfo->roots[i].remain,
4849 max_tree_size, &limit, bst_map, NULL,
4850 false))
4852 bb_vinfo->roots[i].stmts = vNULL;
4853 bb_vinfo->roots[i].roots = vNULL;
4854 bb_vinfo->roots[i].remain = vNULL;
4859 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
4861 /* Find SLP sequences starting from reduction chains. */
4862 FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
4863 if (! STMT_VINFO_RELEVANT_P (first_element)
4864 && ! STMT_VINFO_LIVE_P (first_element))
4866 else if (force_single_lane
4867 || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
4868 slp_inst_kind_reduc_chain,
4869 max_tree_size, &limit,
4870 force_single_lane))
4872 /* Dissolve reduction chain group. */
4873 stmt_vec_info vinfo = first_element;
4874 stmt_vec_info last = NULL;
4875 while (vinfo)
4877 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
4878 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
4879 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
4880 last = vinfo;
4881 vinfo = next;
4883 STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
4884 /* It can be still vectorized as part of an SLP reduction. */
4885 loop_vinfo->reductions.safe_push (last);
4888 /* Find SLP sequences starting from groups of reductions. */
4889 if (loop_vinfo->reductions.length () > 0)
4891 /* Collect reduction statements we can combine into
4892 a SLP reduction. */
4893 vec<stmt_vec_info> scalar_stmts;
4894 scalar_stmts.create (loop_vinfo->reductions.length ());
4895 for (auto next_info : loop_vinfo->reductions)
4897 next_info = vect_stmt_to_vectorize (next_info);
4898 if ((STMT_VINFO_RELEVANT_P (next_info)
4899 || STMT_VINFO_LIVE_P (next_info))
4900 /* ??? Make sure we didn't skip a conversion around a
4901 reduction path. In that case we'd have to reverse
4902 engineer that conversion stmt following the chain using
4903 reduc_idx and from the PHI using reduc_def. */
4904 && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
4905 || (STMT_VINFO_DEF_TYPE (next_info)
4906 == vect_double_reduction_def)))
4908 /* Do not discover SLP reductions combining lane-reducing
4909 ops, that will fail later. */
4910 if (!force_single_lane
4911 && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
4912 scalar_stmts.quick_push (next_info);
4913 else
4915 /* Do SLP discovery for single-lane reductions. */
4916 vec<stmt_vec_info> stmts;
4917 vec<stmt_vec_info> roots = vNULL;
4918 vec<tree> remain = vNULL;
4919 stmts.create (1);
4920 stmts.quick_push (next_info);
4921 vect_build_slp_instance (vinfo,
4922 slp_inst_kind_reduc_group,
4923 stmts, roots, remain,
4924 max_tree_size, &limit,
4925 bst_map, NULL,
4926 force_single_lane);
4930 /* Save for re-processing on failure. */
4931 vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
4932 vec<stmt_vec_info> roots = vNULL;
4933 vec<tree> remain = vNULL;
4934 if (scalar_stmts.length () <= 1
4935 || !vect_build_slp_instance (loop_vinfo,
4936 slp_inst_kind_reduc_group,
4937 scalar_stmts, roots, remain,
4938 max_tree_size, &limit, bst_map,
4939 NULL, force_single_lane))
4941 if (scalar_stmts.length () <= 1)
4942 scalar_stmts.release ();
4943 /* Do SLP discovery for single-lane reductions. */
4944 for (auto stmt_info : saved_stmts)
4946 vec<stmt_vec_info> stmts;
4947 vec<stmt_vec_info> roots = vNULL;
4948 vec<tree> remain = vNULL;
4949 stmts.create (1);
4950 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4951 vect_build_slp_instance (vinfo,
4952 slp_inst_kind_reduc_group,
4953 stmts, roots, remain,
4954 max_tree_size, &limit,
4955 bst_map, NULL, force_single_lane);
4958 saved_stmts.release ();
4961 /* Make sure to vectorize only-live stmts, usually inductions. */
4962 for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
4963 for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
4964 gsi_next (&gsi))
4966 gphi *lc_phi = *gsi;
4967 tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
4968 stmt_vec_info stmt_info;
4969 if (TREE_CODE (def) == SSA_NAME
4970 && !virtual_operand_p (def)
4971 && (stmt_info = loop_vinfo->lookup_def (def))
4972 && ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
4973 && STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
4974 && STMT_VINFO_LIVE_P (stmt_info)
4975 && (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
4976 || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
4977 && STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
4979 vec<stmt_vec_info> stmts;
4980 vec<stmt_vec_info> roots = vNULL;
4981 vec<tree> remain = vNULL;
4982 stmts.create (1);
4983 stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
4984 vect_build_slp_instance (vinfo,
4985 slp_inst_kind_reduc_group,
4986 stmts, roots, remain,
4987 max_tree_size, &limit,
4988 bst_map, NULL, force_single_lane);
4992 /* Find SLP sequences starting from gconds. */
4993 for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
4995 auto cond_info = loop_vinfo->lookup_stmt (cond);
4997 cond_info = vect_stmt_to_vectorize (cond_info);
4998 vec<stmt_vec_info> roots = vNULL;
4999 roots.safe_push (cond_info);
5000 gimple *stmt = STMT_VINFO_STMT (cond_info);
5001 tree args0 = gimple_cond_lhs (stmt);
5002 tree args1 = gimple_cond_rhs (stmt);
5004 /* These should be enforced by cond lowering. */
5005 gcc_assert (gimple_cond_code (stmt) == NE_EXPR);
5006 gcc_assert (zerop (args1));
5008 /* An argument without a loop def will be codegened from vectorizing the
5009 root gcond itself. As such we don't need to try to build an SLP tree
5010 from it. If both arguments have a def the resulting SLP tree here is
5011 highly likely to be incompatible, but we rely on it being split
5012 later on. */
5013 auto varg = loop_vinfo->lookup_def (args0);
5014 vec<stmt_vec_info> stmts;
5015 vec<tree> remain = vNULL;
5016 stmts.create (1);
5017 stmts.quick_push (vect_stmt_to_vectorize (varg));
5019 if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
5020 stmts, roots, remain,
5021 max_tree_size, &limit,
5022 bst_map, NULL, force_single_lane))
5023 roots.release ();
5026 /* Find and create SLP instances for inductions that have been forced
5027 live due to early break. */
5028 edge latch_e = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
5029 for (auto stmt_info : LOOP_VINFO_EARLY_BREAKS_LIVE_IVS (loop_vinfo))
5031 vec<stmt_vec_info> stmts;
5032 vec<stmt_vec_info> roots = vNULL;
5033 vec<tree> remain = vNULL;
5034 gphi *lc_phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
5035 tree def = gimple_phi_arg_def_from_edge (lc_phi, latch_e);
5036 stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
5037 stmts.create (1);
5038 stmts.quick_push (vect_stmt_to_vectorize (lc_info));
5039 vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
5040 stmts, roots, remain,
5041 max_tree_size, &limit,
5042 bst_map, NULL, force_single_lane);
5046 hash_set<slp_tree> visited_patterns;
5047 slp_tree_to_load_perm_map_t perm_cache;
5048 slp_compat_nodes_map_t compat_cache;
5050 /* See if any patterns can be found in the SLP tree. */
5051 bool pattern_found = false;
5052 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5053 pattern_found |= vect_match_slp_patterns (instance, vinfo,
5054 &visited_patterns, &perm_cache,
5055 &compat_cache);
5057 /* If any were found optimize permutations of loads. */
5058 if (pattern_found)
5060 hash_map<slp_tree, slp_tree> load_map;
5061 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5063 slp_tree root = SLP_INSTANCE_TREE (instance);
5064 optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
5065 &load_map, root);
5069 /* Check whether we should force some SLP instances to use load/store-lanes
5070 and do so by forcing SLP re-discovery with single lanes. We used
5071 to cancel SLP when this applied to all instances in a loop but now
5072 we decide this per SLP instance. It's important to do this only
5073 after SLP pattern recognition. */
5074 if (is_a <loop_vec_info> (vinfo))
5075 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5076 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
5077 && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
5079 slp_tree slp_root = SLP_INSTANCE_TREE (instance);
5080 int group_size = SLP_TREE_LANES (slp_root);
5081 tree vectype = SLP_TREE_VECTYPE (slp_root);
5083 stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
5084 gimple *rep = STMT_VINFO_STMT (rep_info);
5085 bool masked = (is_gimple_call (rep)
5086 && gimple_call_internal_p (rep)
5087 && internal_fn_mask_index
5088 (gimple_call_internal_fn (rep)) != -1);
5089 if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
5090 || slp_root->ldst_lanes
5091 || (vect_store_lanes_supported (vectype, group_size, masked)
5092 == IFN_LAST))
5093 continue;
5095 auto_vec<slp_tree> loads;
5096 hash_set<slp_tree> visited;
5097 vect_gather_slp_loads (loads, slp_root, visited);
5099 /* Check whether any load in the SLP instance is possibly
5100 permuted. */
5101 bool loads_permuted = false;
5102 slp_tree load_node;
5103 unsigned j;
5104 FOR_EACH_VEC_ELT (loads, j, load_node)
5106 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
5107 continue;
5108 unsigned k;
5109 stmt_vec_info load_info;
5110 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
5111 if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
5113 loads_permuted = true;
5114 break;
5118 /* If the loads and stores can use load/store-lanes force re-discovery
5119 with single lanes. */
5120 if (loads_permuted)
5122 bool can_use_lanes = true;
5123 FOR_EACH_VEC_ELT (loads, j, load_node)
5124 if (STMT_VINFO_GROUPED_ACCESS
5125 (SLP_TREE_REPRESENTATIVE (load_node)))
5127 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
5128 (SLP_TREE_REPRESENTATIVE (load_node));
5129 rep = STMT_VINFO_STMT (stmt_vinfo);
5130 masked = (is_gimple_call (rep)
5131 && gimple_call_internal_p (rep)
5132 && internal_fn_mask_index
5133 (gimple_call_internal_fn (rep)));
5134 /* Use SLP for strided accesses (or if we can't
5135 load-lanes). */
5136 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
5137 || compare_step_with_zero (vinfo, stmt_vinfo) <= 0
5138 || vect_load_lanes_supported
5139 (STMT_VINFO_VECTYPE (stmt_vinfo),
5140 DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
5141 /* ??? During SLP re-discovery with a single lane
5142 a masked grouped load will appear permuted and
5143 discovery will fail. We have to rework this
5144 on the discovery side - for now avoid ICEing. */
5145 || masked)
5147 can_use_lanes = false;
5148 break;
5152 if (can_use_lanes)
5154 if (dump_enabled_p ())
5155 dump_printf_loc (MSG_NOTE, vect_location,
5156 "SLP instance %p can use load/store-lanes,"
5157 " re-discovering with single-lanes\n",
5158 (void *) instance);
5160 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);
5162 vect_free_slp_instance (instance);
5163 limit = max_tree_size;
5164 bool res = vect_analyze_slp_instance (vinfo, bst_map,
5165 stmt_info,
5166 slp_inst_kind_store,
5167 max_tree_size, &limit,
5168 true);
5169 gcc_assert (res);
5170 auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
5171 LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
5176 /* When we end up with load permutations that we cannot possibly handle,
5177 like those requiring three vector inputs, lower them using
5178 interleaving-like schemes. */
5179 if (loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
5181 vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
5182 if (dump_enabled_p ())
5184 dump_printf_loc (MSG_NOTE, vect_location,
5185 "SLP graph after lowering permutations:\n");
5186 hash_set<slp_tree> visited;
5187 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5188 vect_print_slp_graph (MSG_NOTE, vect_location,
5189 SLP_INSTANCE_TREE (instance), visited);
5193 release_scalar_stmts_to_slp_tree_map (bst_map);
5195 if (pattern_found && dump_enabled_p ())
5197 dump_printf_loc (MSG_NOTE, vect_location,
5198 "Pattern matched SLP tree\n");
5199 hash_set<slp_tree> visited;
5200 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
5201 vect_print_slp_graph (MSG_NOTE, vect_location,
5202 SLP_INSTANCE_TREE (instance), visited);
5205 return opt_result::success ();
5208 /* Estimates the cost of inserting layout changes into the SLP graph.
5209 It can also say that the insertion is impossible. */
5211 struct slpg_layout_cost
5213 slpg_layout_cost () = default;
5214 slpg_layout_cost (sreal, bool);
5216 static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
5217 bool is_possible () const { return depth != sreal::max (); }
5219 bool operator== (const slpg_layout_cost &) const;
5220 bool operator!= (const slpg_layout_cost &) const;
5222 bool is_better_than (const slpg_layout_cost &, bool) const;
5224 void add_parallel_cost (const slpg_layout_cost &);
5225 void add_serial_cost (const slpg_layout_cost &);
5226 void split (unsigned int);
5228 /* The longest sequence of layout changes needed during any traversal
5229 of the partition dag, weighted by execution frequency.
5231 This is the most important metric when optimizing for speed, since
5232 it helps to ensure that we keep the number of operations on
5233 critical paths to a minimum. */
5234 sreal depth = 0;
5236 /* An estimate of the total number of operations needed. It is weighted by
5237 execution frequency when optimizing for speed but not when optimizing for
5238 size. In order to avoid double-counting, a node with a fanout of N will
5239 distribute 1/N of its total cost to each successor.
5241 This is the most important metric when optimizing for size, since
5242 it helps to keep the total number of operations to a minimum. */
5243 sreal total = 0;
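   /* For example, combining { depth 2, total 3 } with { depth 1, total 2 }
      gives { depth 2, total 5 } when the two happen in parallel
      (add_parallel_cost) and { depth 3, total 5 } when they happen in
      series (add_serial_cost).  */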
5246 /* Construct costs for a node with weight WEIGHT. A higher weight
5247 indicates more frequent execution. IS_FOR_SIZE is true if we are
5248 optimizing for size rather than speed. */
5250 slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
5251 : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
5255 bool
5256 slpg_layout_cost::operator== (const slpg_layout_cost &other) const
5258 return depth == other.depth && total == other.total;
5261 bool
5262 slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
5264 return !operator== (other);
5267 /* Return true if these costs are better than OTHER. IS_FOR_SIZE is
5268 true if we are optimizing for size rather than speed. */
5270 bool
5271 slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
5272 bool is_for_size) const
5274 if (is_for_size)
5276 if (total != other.total)
5277 return total < other.total;
5278 return depth < other.depth;
5280 else
5282 if (depth != other.depth)
5283 return depth < other.depth;
5284 return total < other.total;
5288 /* Increase the costs to account for something with cost INPUT_COST
5289 happening in parallel with the current costs. */
5291 void
5292 slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
5294 depth = std::max (depth, input_cost.depth);
5295 total += input_cost.total;
5298 /* Increase the costs to account for something with cost INPUT_COST
5299 happening in series with the current costs. */
5301 void
5302 slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
5304 depth += other.depth;
5305 total += other.total;
5308 /* Split the total cost among TIMES successors or predecessors. */
5310 void
5311 slpg_layout_cost::split (unsigned int times)
5313 if (times > 1)
5314 total /= times;
5317 /* Information about one node in the SLP graph, for use during
5318 vect_optimize_slp_pass. */
5320 struct slpg_vertex
5322 slpg_vertex (slp_tree node_) : node (node_) {}
5324 /* The node itself. */
5325 slp_tree node;
5327 /* Which partition the node belongs to, or -1 if none. Nodes outside of
5328 partitions are flexible; they can have whichever layout consumers
5329 want them to have. */
5330 int partition = -1;
5332 /* The number of nodes that directly use the result of this one
5333 (i.e. the number of nodes that count this one as a child). */
5334 unsigned int out_degree = 0;
5336 /* The execution frequency of the node. */
5337 sreal weight = 0;
5339 /* The total execution frequency of all nodes that directly use the
5340 result of this one. */
5341 sreal out_weight = 0;
5344 /* Information about one partition of the SLP graph, for use during
5345 vect_optimize_slp_pass. */
5347 struct slpg_partition_info
5349 /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
5350 of m_partitioned_nodes. */
5351 unsigned int node_begin = 0;
5352 unsigned int node_end = 0;
5354 /* Which layout we've chosen to use for this partition, or -1 if
5355 we haven't picked one yet. */
5356 int layout = -1;
5358 /* The number of predecessors and successors in the partition dag.
5359 The predecessors always have lower partition numbers and the
5360 successors always have higher partition numbers.
5362 Note that the directions of these edges are not necessarily the
5363 same as in the data flow graph. For example, if an SCC has separate
5364 partitions for an inner loop and an outer loop, the inner loop's
5365 partition will have at least two incoming edges from the outer loop's
5366 partition: one for a live-in value and one for a live-out value.
5367 In data flow terms, one of these edges would also be from the outer loop
5368 to the inner loop, but the other would be in the opposite direction. */
5369 unsigned int in_degree = 0;
5370 unsigned int out_degree = 0;
5373 /* Information about the costs of using a particular layout for a
5374 particular partition. It can also say that the combination is
5375 impossible. */
5377 struct slpg_partition_layout_costs
5379 bool is_possible () const { return internal_cost.is_possible (); }
5380 void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }
5382 /* The costs inherited from predecessor partitions. */
5383 slpg_layout_cost in_cost;
5385 /* The inherent cost of the layout within the node itself. For example,
5386 this is nonzero for a load if choosing a particular layout would require
5387 the load to permute the loaded elements. It is nonzero for a
5388 VEC_PERM_EXPR if the permutation cannot be eliminated or converted
5389 to full-vector moves. */
5390 slpg_layout_cost internal_cost;
5392 /* The costs inherited from successor partitions. */
5393 slpg_layout_cost out_cost;
5396 /* This class tries to optimize the layout of vectors in order to avoid
5397 unnecessary shuffling. At the moment, the set of possible layouts is
5398 restricted to bijective permutations.
5400 The goal of the pass depends on whether we're optimizing for size or
5401 for speed. When optimizing for size, the goal is to reduce the overall
5402 number of layout changes (including layout changes implied by things
5403 like load permutations). When optimizing for speed, the goal is to
5404 reduce the maximum latency attributable to layout changes on any
5405 non-cyclical path through the data flow graph.
5407 For example, when optimizing a loop nest for speed, we will prefer
5408 to make layout changes outside of a loop rather than inside of a loop,
5409 and will prefer to make layout changes in parallel rather than serially,
5410 even if that increases the overall number of layout changes.
5412 The high-level procedure is:
5414 (1) Build a graph in which edges go from uses (parents) to definitions
5415 (children).
5417 (2) Divide the graph into a dag of strongly-connected components (SCCs).
5419 (3) When optimizing for speed, partition the nodes in each SCC based
5420 on their containing cfg loop. When optimizing for size, treat
5421 each SCC as a single partition.
5423 This gives us a dag of partitions. The goal is now to assign a
5424 layout to each partition.
5426 (4) Construct a set of vector layouts that are worth considering.
5427 Record which nodes must keep their current layout.
5429 (5) Perform a forward walk over the partition dag (from loads to stores)
5430 accumulating the "forward" cost of using each layout. When visiting
5431 each partition, assign a tentative choice of layout to the partition
5432 and use that choice when calculating the cost of using a different
5433 layout in successor partitions.
5435 (6) Perform a backward walk over the partition dag (from stores to loads),
5436 accumulating the "backward" cost of using each layout. When visiting
5437 each partition, make a final choice of layout for that partition based
5438 on the accumulated forward costs (from (5)) and backward costs
5439 (from (6)).
5441 (7) Apply the chosen layouts to the SLP graph.
5443 For example, consider the SLP statements:
5445 S1: a_1 = load
5446 loop:
5447 S2: a_2 = PHI<a_1, a_3>
5448 S3: b_1 = load
5449 S4: a_3 = a_2 + b_1
5450 exit:
5451 S5: a_4 = PHI<a_3>
5452 S6: store a_4
5454 S2 and S4 form an SCC and are part of the same loop. Every other
5455 statement is in a singleton SCC. In this example there is a one-to-one
5456 mapping between SCCs and partitions and the partition dag looks like this:
5458    S1     S3
5459      \   /
5460      S2+S4
5461        |
5462       S5
5463        |
5464       S6
5466 S2, S3 and S4 will have a higher execution frequency than the other
5467 statements, so when optimizing for speed, the goal is to avoid any
5468 layout changes:
5470 - within S3
5471 - within S2+S4
5472 - on the S3->S2+S4 edge
5474 For example, if S3 was originally a reversing load, the goal of the
5475 pass is to make it an unreversed load and change the layout on the
5476 S1->S2+S4 and S2+S4->S5 edges to compensate. (Changing the layout
5477 on S1->S2+S4 and S5->S6 would also be acceptable.)
5479 The difference between SCCs and partitions becomes important if we
5480 add an outer loop:
5482 S1: a_1 = ...
5483 loop1:
5484 S2: a_2 = PHI<a_1, a_6>
5485 S3: b_1 = load
5486 S4: a_3 = a_2 + b_1
5487 loop2:
5488 S5: a_4 = PHI<a_3, a_5>
5489 S6: c_1 = load
5490 S7: a_5 = a_4 + c_1
5491 exit2:
5492 S8: a_6 = PHI<a_5>
5493 S9: store a_6
5494 exit1:
5496 Here, S2, S4, S5, S7 and S8 form a single SCC. However, when optimizing
5497 for speed, we usually do not want restrictions in the outer loop to "infect"
5498 the decision for the inner loop. For example, if an outer-loop node
5499 in the SCC contains a statement with a fixed layout, that should not
5500 prevent the inner loop from using a different layout. Conversely,
5501 the inner loop should not dictate a layout to the outer loop: if the
5502 outer loop does a lot of computation, then it may not be efficient to
5503 do all of that computation in the inner loop's preferred layout.
5505 So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
5506 and S5+S7 (inner). We also try to arrange partitions so that:
5508 - the partition for an outer loop comes before the partition for
5509 an inner loop
5511 - if a sibling loop A dominates a sibling loop B, A's partition
5512 comes before B's
5514 This gives the following partition dag for the example above:
5516    S1        S3
5517      \      /
5518     S2+S4+S8   S6
5519      |   \\    /
5520      |    S5+S7
5521      |
5522      S9
5524 There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
5525 one for a reversal of the edge S7->S8.
5527 The backward walk picks a layout for S5+S7 before S2+S4+S8. The choice
5528 for S2+S4+S8 therefore has to balance the cost of using the outer loop's
5529 preferred layout against the cost of changing the layout on entry to the
5530 inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).
5532 Although this works well when optimizing for speed, it has the downside
5533 when optimizing for size that the choice of layout for S5+S7 is completely
5534 independent of S9, which lessens the chance of reducing the overall number
5535 of permutations. We therefore do not partition SCCs when optimizing
5536 for size.
5538 To give a concrete example of the difference between optimizing
5539 for size and speed, consider:
5541 a[0] = (b[1] << c[3]) - d[1];
5542 a[1] = (b[0] << c[2]) - d[0];
5543 a[2] = (b[3] << c[1]) - d[3];
5544 a[3] = (b[2] << c[0]) - d[2];
5546 There are three different layouts here: one for a, one for b and d,
5547 and one for c. When optimizing for speed it is better to permute each
5548 of b, c and d into the order required by a, since those permutations
5549 happen in parallel. But when optimizing for size, it is better to:
5551 - permute c into the same order as b
5552 - do the arithmetic
5553 - permute the result into the order required by a
5555 This gives 2 permutations rather than 3. */
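/* To relate the example above to the two metrics used below (purely
   illustrative): permuting b, c and d each into a's order gives three
   permutations that can execute in parallel, so the depth added to any
   path is one permutation while the total grows by roughly three.
   Permuting c into b's order and then permuting the result into a's
   order gives only two permutations in total, but a path from c to a
   passes through both of them, so the added depth is two.  The speed
   heuristic therefore prefers the first arrangement and the size
   heuristic the second.  */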
5557 class vect_optimize_slp_pass
5559 public:
5560 vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
5561 void run ();
5563 private:
5564 /* Graph building. */
5565 struct loop *containing_loop (slp_tree);
5566 bool is_cfg_latch_edge (graph_edge *);
5567 void build_vertices (hash_set<slp_tree> &, slp_tree);
5568 void build_vertices ();
5569 void build_graph ();
5571 /* Partitioning. */
5572 void create_partitions ();
5573 template<typename T> void for_each_partition_edge (unsigned int, T);
5575 /* Layout selection. */
5576 bool is_compatible_layout (slp_tree, unsigned int);
5577 int change_layout_cost (slp_tree, unsigned int, unsigned int);
5578 slpg_partition_layout_costs &partition_layout_costs (unsigned int,
5579 unsigned int);
5580 void change_vec_perm_layout (slp_tree, lane_permutation_t &,
5581 int, unsigned int);
5582 int internal_node_cost (slp_tree, int, unsigned int);
5583 void start_choosing_layouts ();
5585 /* Cost propagation. */
5586 slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
5587 unsigned int, unsigned int);
5588 slpg_layout_cost total_in_cost (unsigned int);
5589 slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
5590 slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
5591 void forward_pass ();
5592 void backward_pass ();
5594 /* Rematerialization. */
5595 slp_tree get_result_with_layout (slp_tree, unsigned int);
5596 void materialize ();
5598 /* Clean-up. */
5599 void remove_redundant_permutations ();
5601 /* Masked load lanes discovery. */
5602 void decide_masked_load_lanes ();
5604 void dump ();
5606 vec_info *m_vinfo;
5608 /* True if we should optimize the graph for size, false if we should
5609 optimize it for speed. (It wouldn't be easy to make this decision
5610 more locally.) */
5611 bool m_optimize_size;
5613 /* A graph of all SLP nodes, with edges leading from uses to definitions.
5614 In other words, a node's predecessors are its slp_tree parents and
5615 a node's successors are its slp_tree children. */
5616 graph *m_slpg = nullptr;
5618 /* The vertices of M_SLPG, indexed by slp_tree::vertex. */
5619 auto_vec<slpg_vertex> m_vertices;
5621 /* The list of all leaves of M_SLPG, such as external definitions, constants,
5622 and loads. */
5623 auto_vec<int> m_leafs;
5625 /* This array has one entry for every vector layout that we're considering.
5626 Element 0 is null and indicates "no change". Other entries describe
5627 permutations that are inherent in the current graph and that we would
5628 like to reverse if possible.
5630 For example, a permutation { 1, 2, 3, 0 } means that something has
5631 effectively been permuted in that way, such as a load group
5632 { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
5633 We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
5634 in order to put things "back" in order. */
5635 auto_vec<vec<unsigned> > m_perms;
5637 /* A partitioning of the nodes for which a layout must be chosen.
5638 Each partition represents an <SCC, cfg loop> pair; that is,
5639 nodes in different SCCs belong to different partitions, and nodes
5640 within an SCC can be further partitioned according to a containing
5641 cfg loop. Partition <SCC1, L1> comes before <SCC2, L2> if:
5643 - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
5644 from leaves (such as loads) to roots (such as stores).
5646 - SCC1 == SCC2 and L1's header strictly dominates L2's header. */
5647 auto_vec<slpg_partition_info> m_partitions;
5649 /* The list of all nodes for which a layout must be chosen. Nodes for
5650 partition P come before the nodes for partition P+1. Nodes within a
5651 partition are in reverse postorder. */
5652 auto_vec<unsigned int> m_partitioned_nodes;
5654 /* Index P * num-layouts + L contains the cost of using layout L
5655 for partition P. */
5656 auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;
5658 /* Index N * num-layouts + L, if nonnull, is a node that provides the
5659 original output of node N adjusted to have layout L. */
5660 auto_vec<slp_tree> m_node_layouts;
5663 /* Fill the vertices and leafs vector with all nodes in the SLP graph.
5664 Also record whether we should optimize anything for speed rather
5665 than size. */
5667 void
5668 vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
5669 slp_tree node)
5671 unsigned i;
5672 slp_tree child;
5674 if (visited.add (node))
5675 return;
5677 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
5679 basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
5680 if (optimize_bb_for_speed_p (bb))
5681 m_optimize_size = false;
5684 node->vertex = m_vertices.length ();
5685 m_vertices.safe_push (slpg_vertex (node));
5687 bool leaf = true;
5688 bool force_leaf = false;
5689 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
5690 if (child)
5692 leaf = false;
5693 build_vertices (visited, child);
5695 else
5696 force_leaf = true;
5697 /* Since SLP discovery works along use-def edges, all cycles have an
5698 entry - but there's the exception of cycles where we do not handle
5699 the entry explicitly (but with a NULL SLP node), like some reductions
5700 and inductions. Force those SLP PHIs to act as leaves to make them
5701 backwards reachable. */
5702 if (leaf || force_leaf)
5703 m_leafs.safe_push (node->vertex);
5706 /* Fill the vertices and leafs vector with all nodes in the SLP graph. */
5708 void
5709 vect_optimize_slp_pass::build_vertices ()
5711 hash_set<slp_tree> visited;
5712 unsigned i;
5713 slp_instance instance;
5714 m_vertices.truncate (0);
5715 m_leafs.truncate (0);
5716 FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
5717 build_vertices (visited, SLP_INSTANCE_TREE (instance));
5720 /* Apply the bijective permutation PERM to VEC, or its inverse if REVERSE. */
5722 template <class T>
5723 static void
5724 vect_slp_permute (vec<unsigned> perm,
5725 vec<T> &vec, bool reverse)
5727 auto_vec<T, 64> saved;
5728 saved.create (vec.length ());
5729 for (unsigned i = 0; i < vec.length (); ++i)
5730 saved.quick_push (vec[i]);
5732 if (reverse)
5734 for (unsigned i = 0; i < vec.length (); ++i)
5735 vec[perm[i]] = saved[i];
5736 for (unsigned i = 0; i < vec.length (); ++i)
5737 gcc_assert (vec[perm[i]] == saved[i]);
5739 else
5741 for (unsigned i = 0; i < vec.length (); ++i)
5742 vec[i] = saved[perm[i]];
5743 for (unsigned i = 0; i < vec.length (); ++i)
5744 gcc_assert (vec[i] == saved[perm[i]]);
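/* For example, with PERM == { 1, 2, 3, 0 } and VEC == { a, b, c, d }:
   the forward direction (REVERSE == false) gathers elements through PERM
   and yields { b, c, d, a }, while the reverse direction (REVERSE == true)
   scatters elements through PERM, i.e. applies the inverse permutation,
   and yields { d, a, b, c }.  */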
5748 /* Return the cfg loop that contains NODE. */
5750 struct loop *
5751 vect_optimize_slp_pass::containing_loop (slp_tree node)
5753 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
5754 if (!rep)
5755 return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
5756 return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
5759 /* Return true if UD (an edge from a use to a definition) is associated
5760 with a loop latch edge in the cfg. */
5762 bool
5763 vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
5765 slp_tree use = m_vertices[ud->src].node;
5766 slp_tree def = m_vertices[ud->dest].node;
5767 if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
5768 || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
5769 || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
5770 return false;
5772 stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
5773 return (is_a<gphi *> (use_rep->stmt)
5774 && bb_loop_header_p (gimple_bb (use_rep->stmt))
5775 && containing_loop (def) == containing_loop (use));
5778 /* Build the graph. Mark edges that correspond to cfg loop latch edges with
5779 a nonnull data field. */
5781 void
5782 vect_optimize_slp_pass::build_graph ()
5784 m_optimize_size = true;
5785 build_vertices ();
5787 m_slpg = new_graph (m_vertices.length ());
5788 for (slpg_vertex &v : m_vertices)
5789 for (slp_tree child : SLP_TREE_CHILDREN (v.node))
5790 if (child)
5792 graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
5793 if (is_cfg_latch_edge (ud))
5794 ud->data = this;
5798 /* Return true if E corresponds to a loop latch edge in the cfg. */
5800 static bool
5801 skip_cfg_latch_edges (graph_edge *e)
5803 return e->data;
5806 /* Create the node partitions. */
5808 void
5809 vect_optimize_slp_pass::create_partitions ()
5811 /* Calculate a postorder of the graph, ignoring edges that correspond
5812 to natural latch edges in the cfg. Reading the vector from the end
5813 to the beginning gives the reverse postorder. */
5814 auto_vec<int> initial_rpo;
5815 graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
5816 false, NULL, skip_cfg_latch_edges);
5817 gcc_assert (initial_rpo.length () == m_vertices.length ());
5819 /* Calculate the strongly connected components of the graph. */
5820 auto_vec<int> scc_grouping;
5821 unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);
5823 /* Create a new index order in which all nodes from the same SCC are
5824 consecutive. Use scc_pos to record the index of the first node in
5825 each SCC. */
5826 auto_vec<unsigned int> scc_pos (num_sccs);
5827 int last_component = -1;
5828 unsigned int node_count = 0;
5829 for (unsigned int node_i : scc_grouping)
5831 if (last_component != m_slpg->vertices[node_i].component)
5833 last_component = m_slpg->vertices[node_i].component;
5834 gcc_assert (last_component == int (scc_pos.length ()));
5835 scc_pos.quick_push (node_count);
5837 node_count += 1;
5839 gcc_assert (node_count == initial_rpo.length ()
5840 && last_component + 1 == int (num_sccs));
5842 /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
5843 inside each SCC following the RPO we calculated above. The fact that
5844 we ignored natural latch edges when calculating the RPO should ensure
5845 that, for natural loop nests:
5847 - the first node that we encounter in a cfg loop is the loop header phi
5848 - the loop header phis are in dominance order
5850 Arranging for this is an optimization (see below) rather than a
5851 correctness issue. Unnatural loops with a tangled mess of backedges
5852 will still work correctly, but might give poorer results.
5854 Also update scc_pos so that it gives 1 + the index of the last node
5855 in the SCC. */
5856 m_partitioned_nodes.safe_grow (node_count);
5857 for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
5859 unsigned int node_i = initial_rpo[old_i];
5860 unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
5861 m_partitioned_nodes[new_i] = node_i;
5864 /* When optimizing for speed, partition each SCC based on the containing
5865 cfg loop. The order we constructed above should ensure that, for natural
5866 cfg loops, we'll create sub-SCC partitions for outer loops before
5867 the corresponding sub-SCC partitions for inner loops. Similarly,
5868 when one sibling loop A dominates another sibling loop B, we should
5869 create a sub-SCC partition for A before a sub-SCC partition for B.
5871 As above, nothing depends for correctness on whether this achieves
5872 a natural nesting, but we should get better results when it does. */
5873 m_partitions.reserve (m_vertices.length ());
5874 unsigned int next_partition_i = 0;
5875 hash_map<struct loop *, int> loop_partitions;
5876 unsigned int rpo_begin = 0;
5877 unsigned int num_partitioned_nodes = 0;
5878 for (unsigned int rpo_end : scc_pos)
5880 loop_partitions.empty ();
5881 unsigned int partition_i = next_partition_i;
5882 for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
5884 /* Handle externals and constants optimistically throughout.
5885 But treat existing vectors as fixed since we do not handle
5886 permuting them. */
5887 unsigned int node_i = m_partitioned_nodes[rpo_i];
5888 auto &vertex = m_vertices[node_i];
5889 if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
5890 && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
5891 || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
5892 vertex.partition = -1;
5893 else
5895 bool existed;
5896 if (m_optimize_size)
5897 existed = next_partition_i > partition_i;
5898 else
5900 struct loop *loop = containing_loop (vertex.node);
5901 auto &entry = loop_partitions.get_or_insert (loop, &existed);
5902 if (!existed)
5903 entry = next_partition_i;
5904 partition_i = entry;
5906 if (!existed)
5908 m_partitions.quick_push (slpg_partition_info ());
5909 next_partition_i += 1;
5911 vertex.partition = partition_i;
5912 num_partitioned_nodes += 1;
5913 m_partitions[partition_i].node_end += 1;
5916 rpo_begin = rpo_end;
5919 /* Assign ranges of consecutive node indices to each partition,
5920 in partition order. Start with node_end being the same as
5921 node_begin so that the next loop can use it as a counter. */
5922 unsigned int node_begin = 0;
5923 for (auto &partition : m_partitions)
5925 partition.node_begin = node_begin;
5926 node_begin += partition.node_end;
5927 partition.node_end = partition.node_begin;
5929 gcc_assert (node_begin == num_partitioned_nodes);
5931 /* Finally build the list of nodes in partition order. */
5932 m_partitioned_nodes.truncate (num_partitioned_nodes);
5933 for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
5935 int partition_i = m_vertices[node_i].partition;
5936 if (partition_i >= 0)
5938 unsigned int order_i = m_partitions[partition_i].node_end++;
5939 m_partitioned_nodes[order_i] = node_i;
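/* A small illustration of the bookkeeping above, with hypothetical sizes:
   if three partitions receive 2, 1 and 3 nodes respectively, the counting
   loop leaves their node_end fields at 2, 1 and 3.  The loop over
   m_partitions then rewrites the <node_begin, node_end> pairs to <0, 0>,
   <2, 2> and <3, 3>, and the final loop bumps node_end again while filling
   m_partitioned_nodes, producing the ranges [0, 2), [2, 3) and [3, 6).  */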
5944 /* Look for edges from earlier partitions into node NODE_I and edges from
5945 node NODE_I into later partitions. Call:
5947 FN (ud, other_node_i)
5949 for each such use-to-def edge ud, where other_node_i is the node at the
5950 other end of the edge. */
5952 template<typename T>
5953 void
5954 vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
5956 int partition_i = m_vertices[node_i].partition;
5957 for (graph_edge *pred = m_slpg->vertices[node_i].pred;
5958 pred; pred = pred->pred_next)
5960 int src_partition_i = m_vertices[pred->src].partition;
5961 if (src_partition_i >= 0 && src_partition_i != partition_i)
5962 fn (pred, pred->src);
5964 for (graph_edge *succ = m_slpg->vertices[node_i].succ;
5965 succ; succ = succ->succ_next)
5967 int dest_partition_i = m_vertices[succ->dest].partition;
5968 if (dest_partition_i >= 0 && dest_partition_i != partition_i)
5969 fn (succ, succ->dest);
5973 /* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
5974 that NODE would operate on. This test is independent of NODE's actual
5975 operation. */
5977 bool
5978 vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
5979 unsigned int layout_i)
5981 if (layout_i == 0)
5982 return true;
5984 if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
5985 return false;
5987 return true;
5990 /* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
5991 to layout TO_LAYOUT_I for a node like NODE. Return -1 if either of the
5992 layouts is incompatible with NODE or if the change is not possible for
5993 some other reason.
5995 The properties taken from NODE include the number of lanes and the
5996 vector type. The actual operation doesn't matter. */
5999 vect_optimize_slp_pass::change_layout_cost (slp_tree node,
6000 unsigned int from_layout_i,
6001 unsigned int to_layout_i)
6003 if (!is_compatible_layout (node, from_layout_i)
6004 || !is_compatible_layout (node, to_layout_i))
6005 return -1;
6007 if (from_layout_i == to_layout_i)
6008 return 0;
6010 auto_vec<slp_tree, 1> children (1);
6011 children.quick_push (node);
6012 auto_lane_permutation_t perm (SLP_TREE_LANES (node));
6013 if (from_layout_i > 0)
6014 for (unsigned int i : m_perms[from_layout_i])
6015 perm.quick_push ({ 0, i });
6016 else
6017 for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
6018 perm.quick_push ({ 0, i });
6019 if (to_layout_i > 0)
6020 vect_slp_permute (m_perms[to_layout_i], perm, true);
6021 auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
6022 children, false);
6023 if (count >= 0)
6024 return MAX (count, 1);
6026 /* ??? In principle we could try changing via layout 0, giving two
6027 layout changes rather than 1. Doing that would require
6028 corresponding support in get_result_with_layout. */
6029 return -1;
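/* For example, if NODE has four lanes, m_perms[FROM_LAYOUT_I] is
   { 1, 2, 3, 0 } and TO_LAYOUT_I is 0, the code above asks the target to
   cost a single-input VEC_PERM_EXPR that selects lanes { 1, 2, 3, 0 } of
   NODE.  Whether that is supported, and how many permute operations it
   needs, is entirely target-dependent; the only guarantee is that a
   supported change between distinct layouts costs at least 1.  */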
6032 /* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I. */
6034 inline slpg_partition_layout_costs &
6035 vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
6036 unsigned int layout_i)
6038 return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
6041 /* Change PERM in one of two ways:
6043 - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
6044 chosen for child I of NODE.
6046 - if IN_LAYOUT_I >= 0, accept all input operands with that layout.
6048 In both cases, arrange for the output to have layout OUT_LAYOUT_I. */
6050 void
6051 vect_optimize_slp_pass::
6052 change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
6053 int in_layout_i, unsigned int out_layout_i)
6055 for (auto &entry : perm)
6057 int this_in_layout_i = in_layout_i;
6058 if (this_in_layout_i < 0)
6060 slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
6061 unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
6062 if (in_partition_i == -1u)
6063 continue;
6064 this_in_layout_i = m_partitions[in_partition_i].layout;
6066 if (this_in_layout_i > 0)
6067 entry.second = m_perms[this_in_layout_i][entry.second];
6069 if (out_layout_i > 0)
6070 vect_slp_permute (m_perms[out_layout_i], perm, true);
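/* For example, take a two-lane VEC_PERM_EXPR whose PERM is
   { { 0, 0 }, { 1, 1 } } and suppose m_perms[1] == { 1, 0 }.  With
   IN_LAYOUT_I == 1 and OUT_LAYOUT_I == 0, each selected lane is remapped
   through m_perms[1], giving { { 0, 1 }, { 1, 0 } }, and no further
   output adjustment is needed.  If OUT_LAYOUT_I were instead 1, the whole
   permutation vector would additionally be reordered by the inverse of
   m_perms[1].  */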
6073 /* Check whether the target allows NODE to be rearranged so that the node's
6074 output has layout OUT_LAYOUT_I. Return the cost of the change if so,
6075 in the same arbitrary units as for change_layout_cost. Return -1 otherwise.
6077 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
6078 NODE can adapt to the layout changes that have (perhaps provisionally)
6079 been chosen for NODE's children, so that no extra permutations are
6080 needed on either the input or the output of NODE.
6082 If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
6083 that all inputs will be forced into layout IN_LAYOUT_I beforehand.
6085 IN_LAYOUT_I has no meaning for other types of node.
6087 Keeping the node as-is is always valid. If the target doesn't appear
6088 to support the node as-is, but might realistically support other layouts,
6089 then layout 0 instead has the cost of a worst-case permutation. On the
6090 one hand, this ensures that every node has at least one valid layout,
6091 avoiding what would otherwise be an awkward special case. On the other,
6092 it still encourages the pass to change an invalid pre-existing layout
6093 choice into a valid one. */
6096 vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
6097 unsigned int out_layout_i)
6099 const int fallback_cost = 1;
6101 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6103 auto_lane_permutation_t tmp_perm;
6104 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6106 /* Check that the child nodes support the chosen layout. Checking
6107 the first child is enough, since any second child would have the
6108 same shape. */
6109 auto first_child = SLP_TREE_CHILDREN (node)[0];
6110 if (in_layout_i > 0
6111 && !is_compatible_layout (first_child, in_layout_i))
6112 return -1;
6114 change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
6115 int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
6116 node, tmp_perm,
6117 SLP_TREE_CHILDREN (node),
6118 false);
6119 if (count < 0)
6121 if (in_layout_i == 0 && out_layout_i == 0)
6123 /* Use the fallback cost if the node could in principle support
6124 some nonzero layout for both the inputs and the outputs.
6125 Otherwise assume that the node will be rejected later
6126 and rebuilt from scalars. */
6127 if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
6128 return fallback_cost;
6129 return 0;
6131 return -1;
6134 /* We currently have no way of telling whether the new layout is cheaper
6135 or more expensive than the old one. But at least in principle,
6136 it should be worth making zero permutations (whole-vector shuffles)
6137 cheaper than real permutations, in case the pass is able to remove
6138 the latter. */
6139 return count == 0 ? 0 : 1;
6142 stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
6143 if (rep
6144 && STMT_VINFO_DATA_REF (rep)
6145 && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
6146 && SLP_TREE_LOAD_PERMUTATION (node).exists ())
6148 auto_load_permutation_t tmp_perm;
6149 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6150 if (out_layout_i > 0)
6151 vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);
6153 poly_uint64 vf = 1;
6154 if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
6155 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6156 unsigned int n_perms;
6157 if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
6158 nullptr, vf, true, false, &n_perms))
6160 auto rep = SLP_TREE_REPRESENTATIVE (node);
6161 if (out_layout_i == 0)
6163 /* Use the fallback cost if the load is an N-to-N permutation.
6164 Otherwise assume that the node will be rejected later
6165 and rebuilt from scalars. */
6166 if (STMT_VINFO_GROUPED_ACCESS (rep)
6167 && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
6168 == SLP_TREE_LANES (node)))
6169 return fallback_cost;
6170 return 0;
6172 return -1;
6175 /* See the comment above the corresponding VEC_PERM_EXPR handling. */
6176 return n_perms == 0 ? 0 : 1;
6179 return 0;
6182 /* Decide which element layouts we should consider using. Calculate the
6183 weights associated with inserting layout changes on partition edges.
6184 Also mark partitions that cannot change layout, by setting their
6185 layout to zero. */
6187 void
6188 vect_optimize_slp_pass::start_choosing_layouts ()
6190 /* Used to assign unique permutation indices. */
6191 using perm_hash = unbounded_hashmap_traits<
6192 vec_free_hash_base<int_hash_base<unsigned>>,
6193 int_hash<int, -1, -2>
6195 hash_map<vec<unsigned>, int, perm_hash> layout_ids;
6197 /* Layout 0 is "no change". */
6198 m_perms.safe_push (vNULL);
6200 /* Create layouts from existing permutations. */
6201 auto_load_permutation_t tmp_perm;
6202 for (unsigned int node_i : m_partitioned_nodes)
6204 /* Leafs also double as entries to the reverse graph. Allow the
6205 layout of those to be changed. */
6206 auto &vertex = m_vertices[node_i];
6207 auto &partition = m_partitions[vertex.partition];
6208 if (!m_slpg->vertices[node_i].succ)
6209 partition.layout = 0;
6211 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
6212 slp_tree node = vertex.node;
6213 stmt_vec_info dr_stmt = SLP_TREE_REPRESENTATIVE (node);
6214 slp_tree child;
6215 unsigned HOST_WIDE_INT imin, imax = 0;
6216 bool any_permute = false;
6217 tmp_perm.truncate (0);
6218 if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
6220 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
6221 unpermuted, record a layout that reverses this permutation.
6223 We would need more work to cope with loads that are internally
6224 permuted and also have inputs (such as masks for
6225 IFN_MASK_LOADs). */
6226 gcc_assert (partition.layout == 0 && !m_slpg->vertices[node_i].succ);
6227 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt))
6229 partition.layout = -1;
6230 continue;
6232 dr_stmt = DR_GROUP_FIRST_ELEMENT (dr_stmt);
6233 imin = DR_GROUP_SIZE (dr_stmt) + 1;
6234 tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
6236 else if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
6237 && SLP_TREE_CHILDREN (node).length () == 1
6238 && (child = SLP_TREE_CHILDREN (node)[0])
6239 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child))
6240 .is_constant (&imin)))
6242 /* If the child has the same vector size as this node,
6243 reversing the permutation can make the permutation a no-op.
6244 In other cases it can change a true permutation into a
6245 full-vector extract. */
6246 tmp_perm.reserve (SLP_TREE_LANES (node));
6247 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6248 tmp_perm.quick_push (SLP_TREE_LANE_PERMUTATION (node)[j].second);
6250 else
6251 continue;
6253 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6255 unsigned idx = tmp_perm[j];
6256 imin = MIN (imin, idx);
6257 imax = MAX (imax, idx);
6258 if (idx - tmp_perm[0] != j)
6259 any_permute = true;
6261 /* If the span doesn't match, we'd disrupt VF computation; avoid
6262 that for now. */
6263 if (imax - imin + 1 != SLP_TREE_LANES (node))
6264 continue;
6265 /* If there's no permute, there's no need to split one out. In this case
6266 we can consider turning a load into a permuted load, if that
6267 turns out to be cheaper than alternatives. */
6268 if (!any_permute)
6270 partition.layout = -1;
6271 continue;
6274 /* For now only handle true permutes, like
6275 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
6276 when permuting constants and invariants, keeping the permute
6277 bijective. */
6278 auto_sbitmap load_index (SLP_TREE_LANES (node));
6279 bitmap_clear (load_index);
6280 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6281 bitmap_set_bit (load_index, tmp_perm[j] - imin);
6282 unsigned j;
6283 for (j = 0; j < SLP_TREE_LANES (node); ++j)
6284 if (!bitmap_bit_p (load_index, j))
6285 break;
6286 if (j != SLP_TREE_LANES (node))
6287 continue;
6289 vec<unsigned> perm = vNULL;
6290 perm.safe_grow (SLP_TREE_LANES (node), true);
6291 for (unsigned j = 0; j < SLP_TREE_LANES (node); ++j)
6292 perm[j] = tmp_perm[j] - imin;
6294 if (int (m_perms.length ()) >= param_vect_max_layout_candidates)
6296 /* Continue to use existing layouts, but don't add any more. */
6297 int *entry = layout_ids.get (perm);
6298 partition.layout = entry ? *entry : 0;
6299 perm.release ();
6301 else
6303 bool existed;
6304 int &layout_i = layout_ids.get_or_insert (perm, &existed);
6305 if (existed)
6306 perm.release ();
6307 else
6309 layout_i = m_perms.length ();
6310 m_perms.safe_push (perm);
6312 partition.layout = layout_i;
6316 /* Initially assume that every layout is possible and has zero cost
6317 in every partition. */
6318 m_partition_layout_costs.safe_grow_cleared (m_partitions.length ()
6319 * m_perms.length ());
6321 /* We have to mark outgoing permutations facing non-associating-reduction
6322 graph entries (which are not themselves represented in the graph) as needing
6323 to be materialized. slp_inst_kind_bb_reduc currently only covers associatable reductions. */
6324 for (slp_instance instance : m_vinfo->slp_instances)
6325 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor)
6327 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6328 m_partitions[m_vertices[node_i].partition].layout = 0;
6330 else if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_reduc_chain)
6332 stmt_vec_info stmt_info
6333 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance));
6334 stmt_vec_info reduc_info = info_for_reduction (m_vinfo, stmt_info);
6335 if (needs_fold_left_reduction_p (TREE_TYPE
6336 (gimple_get_lhs (stmt_info->stmt)),
6337 STMT_VINFO_REDUC_CODE (reduc_info)))
6339 unsigned int node_i = SLP_INSTANCE_TREE (instance)->vertex;
6340 m_partitions[m_vertices[node_i].partition].layout = 0;
6344 /* Check which layouts each node and partition can handle. Calculate the
6345 weights associated with inserting layout changes on edges. */
6346 for (unsigned int node_i : m_partitioned_nodes)
6348 auto &vertex = m_vertices[node_i];
6349 auto &partition = m_partitions[vertex.partition];
6350 slp_tree node = vertex.node;
6352 if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
6354 vertex.weight = vect_slp_node_weight (node);
6356 /* We do not handle stores with a permutation, so all
6357 incoming permutations must have been materialized.
6359 We also don't handle masked grouped loads, which lack a
6360 permutation vector. In this case the memory locations
6361 form an implicit second input to the loads, on top of the
6362 explicit mask input, and the memory input's layout cannot
6363 be changed.
6365 On the other hand, we do support permuting gather loads and
6366 masked gather loads, where each scalar load is independent
6367 of the others. This can be useful if the address/index input
6368 benefits from permutation. */
6369 if (STMT_VINFO_DATA_REF (rep)
6370 && STMT_VINFO_GROUPED_ACCESS (rep)
6371 && !SLP_TREE_LOAD_PERMUTATION (node).exists ())
6372 partition.layout = 0;
6374 /* We cannot change the layout of an operation that is
6375 not independent of lanes. Note this is an explicit
6376 negative list since that's much shorter than the respective
6377 positive one, but it's critical to keep maintaining it. */
6378 if (is_gimple_call (STMT_VINFO_STMT (rep)))
6379 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep)))
6381 case CFN_COMPLEX_ADD_ROT90:
6382 case CFN_COMPLEX_ADD_ROT270:
6383 case CFN_COMPLEX_MUL:
6384 case CFN_COMPLEX_MUL_CONJ:
6385 case CFN_VEC_ADDSUB:
6386 case CFN_VEC_FMADDSUB:
6387 case CFN_VEC_FMSUBADD:
6388 partition.layout = 0;
6389 default:;
6393 auto process_edge = [&](graph_edge *ud, unsigned int other_node_i)
6395 auto &other_vertex = m_vertices[other_node_i];
6397 /* Count the number of edges from earlier partitions and the number
6398 of edges to later partitions. */
6399 if (other_vertex.partition < vertex.partition)
6400 partition.in_degree += 1;
6401 else
6402 partition.out_degree += 1;
6404 /* If the current node uses the result of OTHER_NODE_I, accumulate
6405 the effects of that. */
6406 if (ud->src == int (node_i))
6408 other_vertex.out_weight += vertex.weight;
6409 other_vertex.out_degree += 1;
6412 for_each_partition_edge (node_i, process_edge);
6416 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6417 its current (provisional) choice of layout. The inputs do not necessarily
6418 have the same layout as each other. */
6420 slpg_layout_cost
6421 vect_optimize_slp_pass::total_in_cost (unsigned int node_i)
6423 auto &vertex = m_vertices[node_i];
6424 slpg_layout_cost cost;
6425 auto add_cost = [&](graph_edge *, unsigned int other_node_i)
6427 auto &other_vertex = m_vertices[other_node_i];
6428 if (other_vertex.partition < vertex.partition)
6430 auto &other_partition = m_partitions[other_vertex.partition];
6431 auto &other_costs = partition_layout_costs (other_vertex.partition,
6432 other_partition.layout);
6433 slpg_layout_cost this_cost = other_costs.in_cost;
6434 this_cost.add_serial_cost (other_costs.internal_cost);
6435 this_cost.split (other_partition.out_degree);
6436 cost.add_parallel_cost (this_cost);
6439 for_each_partition_edge (node_i, add_cost);
6440 return cost;
6443 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6444 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6445 slpg_layout_cost::impossible () if the change isn't possible. */
6447 slpg_layout_cost
6448 vect_optimize_slp_pass::
6449 edge_layout_cost (graph_edge *ud, unsigned int node1_i, unsigned int layout1_i,
6450 unsigned int layout2_i)
6452 auto &def_vertex = m_vertices[ud->dest];
6453 auto &use_vertex = m_vertices[ud->src];
6454 auto def_layout_i = ud->dest == int (node1_i) ? layout1_i : layout2_i;
6455 auto use_layout_i = ud->dest == int (node1_i) ? layout2_i : layout1_i;
6456 auto factor = change_layout_cost (def_vertex.node, def_layout_i,
6457 use_layout_i);
6458 if (factor < 0)
6459 return slpg_layout_cost::impossible ();
6461 /* We have a choice of putting the layout change at the site of the
6462 definition or at the site of the use. Prefer the former when
6463 optimizing for size or when the execution frequency of the
6464 definition is no greater than the combined execution frequencies of
6465 the uses. When putting the layout change at the site of the definition,
6466 divvy up the cost among all consumers. */
6467 if (m_optimize_size || def_vertex.weight <= def_vertex.out_weight)
6469 slpg_layout_cost cost = { def_vertex.weight * factor, m_optimize_size };
6470 cost.split (def_vertex.out_degree);
6471 return cost;
6473 return { use_vertex.weight * factor, m_optimize_size };
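/* An illustration with made-up weights: if FACTOR == 1, the definition
   has weight 2 and out_degree 4, and its uses have a combined out_weight
   of 10, the change is placed at the definition, giving depth 2 and
   total 2 before the split, i.e. a total of 0.5 attributed to each
   consumer.  If the definition instead had weight 20 (greater than the
   combined use weight) and we were optimizing for speed, the change
   would be placed at the use and would cost that use's weight times
   FACTOR.  */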
6476 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6477 partition; FROM_NODE_I could be the definition node or the use node.
6478 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6479 Return the cost of any necessary fix-ups on edge UD, or return
6480 slpg_layout_cost::impossible () if the change isn't possible.
6482 At this point, FROM_NODE_I's partition has chosen the cheapest
6483 layout based on the information available so far, but this choice
6484 is only provisional. */
6486 slpg_layout_cost
6487 vect_optimize_slp_pass::forward_cost (graph_edge *ud, unsigned int from_node_i,
6488 unsigned int to_layout_i)
6490 auto &from_vertex = m_vertices[from_node_i];
6491 unsigned int from_partition_i = from_vertex.partition;
6492 slpg_partition_info &from_partition = m_partitions[from_partition_i];
6493 gcc_assert (from_partition.layout >= 0);
6495 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6496 with its current layout preference. */
6497 slpg_layout_cost cost = slpg_layout_cost::impossible ();
6498 auto edge_cost = edge_layout_cost (ud, from_node_i,
6499 from_partition.layout, to_layout_i);
6500 if (edge_cost.is_possible ())
6502 auto &from_costs = partition_layout_costs (from_partition_i,
6503 from_partition.layout);
6504 cost = from_costs.in_cost;
6505 cost.add_serial_cost (from_costs.internal_cost);
6506 cost.split (from_partition.out_degree);
6507 cost.add_serial_cost (edge_cost);
6509 else if (from_partition.layout == 0)
6510 /* We must allow the source partition to have layout 0 as a fallback,
6511 in case all other options turn out to be impossible. */
6512 return cost;
6514 /* Take the minimum of that cost and the cost that applies if
6515 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6516 auto &direct_layout_costs = partition_layout_costs (from_partition_i,
6517 to_layout_i);
6518 if (direct_layout_costs.is_possible ())
6520 slpg_layout_cost direct_cost = direct_layout_costs.in_cost;
6521 direct_cost.add_serial_cost (direct_layout_costs.internal_cost);
6522 direct_cost.split (from_partition.out_degree);
6523 if (!cost.is_possible ()
6524 || direct_cost.is_better_than (cost, m_optimize_size))
6525 cost = direct_cost;
6528 return cost;
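/* An illustration with made-up numbers: suppose FROM_PARTITION
   provisionally chose layout 1, its in_cost plus internal_cost for that
   layout is depth 3 / total 3, its out_degree is 1, and fixing up this
   edge from layout 1 to TO_LAYOUT_I costs depth 1 / total 1.  Keeping
   layout 1 then costs depth 4 / total 4.  If the costs recorded for
   FROM_PARTITION using TO_LAYOUT_I directly amount to only
   depth 2 / total 2, the cheaper direct cost is returned instead.  */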
6531 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6532 partition; TO_NODE_I could be the definition node or the use node.
6533 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6534 return the cost of any necessary fix-ups on edge UD, or
6535 slpg_layout_cost::impossible () if the choice cannot be made.
6537 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6539 slpg_layout_cost
6540 vect_optimize_slp_pass::backward_cost (graph_edge *ud, unsigned int to_node_i,
6541 unsigned int from_layout_i)
6543 auto &to_vertex = m_vertices[to_node_i];
6544 unsigned int to_partition_i = to_vertex.partition;
6545 slpg_partition_info &to_partition = m_partitions[to_partition_i];
6546 gcc_assert (to_partition.layout >= 0);
6548 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6549 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6550 any other inputs keep their current choice of layout. */
6551 auto &to_costs = partition_layout_costs (to_partition_i,
6552 to_partition.layout);
6553 if (ud->src == int (to_node_i)
6554 && SLP_TREE_CODE (to_vertex.node) == VEC_PERM_EXPR)
6556 auto &from_partition = m_partitions[m_vertices[ud->dest].partition];
6557 auto old_layout = from_partition.layout;
6558 from_partition.layout = from_layout_i;
6559 int factor = internal_node_cost (to_vertex.node, -1,
6560 to_partition.layout);
6561 from_partition.layout = old_layout;
6562 if (factor >= 0)
6564 slpg_layout_cost cost = to_costs.out_cost;
6565 cost.add_serial_cost ({ to_vertex.weight * factor,
6566 m_optimize_size });
6567 cost.split (to_partition.in_degree);
6568 return cost;
6572 /* Compute the cost if we insert any necessary layout change on edge UD. */
6573 auto edge_cost = edge_layout_cost (ud, to_node_i,
6574 to_partition.layout, from_layout_i);
6575 if (edge_cost.is_possible ())
6577 slpg_layout_cost cost = to_costs.out_cost;
6578 cost.add_serial_cost (to_costs.internal_cost);
6579 cost.split (to_partition.in_degree);
6580 cost.add_serial_cost (edge_cost);
6581 return cost;
6584 return slpg_layout_cost::impossible ();
6587 /* Make a forward pass through the partitions, accumulating input costs.
6588 Make a tentative (provisional) choice of layout for each partition,
6589 ensuring that this choice still allows later partitions to keep
6590 their original layout. */
6592 void
6593 vect_optimize_slp_pass::forward_pass ()
6595 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
6596 ++partition_i)
6598 auto &partition = m_partitions[partition_i];
6600 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6601 the incoming cost that would apply if every predecessor partition
6602 keeps its current layout. This is used within the loop below. */
6603 slpg_layout_cost in_cost;
6604 slp_tree single_node = nullptr;
6605 if (partition.node_end == partition.node_begin + 1)
6607 unsigned int node_i = m_partitioned_nodes[partition.node_begin];
6608 single_node = m_vertices[node_i].node;
6609 if (SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6610 in_cost = total_in_cost (node_i);
6613 /* Go through the possible layouts. Decide which ones are valid
6614 for this partition and record which of the valid layouts has
6615 the lowest cost. */
6616 unsigned int min_layout_i = 0;
6617 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6618 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6620 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6621 if (!layout_costs.is_possible ())
6622 continue;
6624 /* If the recorded layout is already 0 then the layout cannot
6625 change. */
6626 if (partition.layout == 0 && layout_i != 0)
6628 layout_costs.mark_impossible ();
6629 continue;
6632 bool is_possible = true;
6633 for (unsigned int order_i = partition.node_begin;
6634 order_i < partition.node_end; ++order_i)
6636 unsigned int node_i = m_partitioned_nodes[order_i];
6637 auto &vertex = m_vertices[node_i];
6639 /* Reject the layout if it is individually incompatible
6640 with any node in the partition. */
6641 if (!is_compatible_layout (vertex.node, layout_i))
6643 is_possible = false;
6644 break;
6647 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6649 auto &other_vertex = m_vertices[other_node_i];
6650 if (other_vertex.partition < vertex.partition)
6652 /* Accumulate the incoming costs from earlier
6653 partitions, plus the cost of any layout changes
6654 on UD itself. */
6655 auto cost = forward_cost (ud, other_node_i, layout_i);
6656 if (!cost.is_possible ())
6657 is_possible = false;
6658 else
6659 layout_costs.in_cost.add_parallel_cost (cost);
6661 else
6662 /* Reject the layout if it would make layout 0 impossible
6663 for later partitions. This amounts to testing that the
6664 target supports reversing the layout change on edges
6665 to later partitions.
6667 In principle, it might be possible to push a layout
6668 change all the way down a graph, so that it never
6669 needs to be reversed and so that the target doesn't
6670 need to support the reverse operation. But it would
6671 be awkward to bail out if we hit a partition that
6672 does not support the new layout, especially since
6673 we are not dealing with a lattice. */
6674 is_possible &= edge_layout_cost (ud, other_node_i, 0,
6675 layout_i).is_possible ();
6677 for_each_partition_edge (node_i, add_cost);
6679 /* Accumulate the cost of using LAYOUT_I within NODE,
6680 both for the inputs and the outputs. */
6681 int factor = internal_node_cost (vertex.node, layout_i,
6682 layout_i);
6683 if (factor < 0)
6685 is_possible = false;
6686 break;
6688 else if (factor)
6689 layout_costs.internal_cost.add_serial_cost
6690 ({ vertex.weight * factor, m_optimize_size });
6692 if (!is_possible)
6694 layout_costs.mark_impossible ();
6695 continue;
6698 /* Combine the incoming and partition-internal costs. */
6699 slpg_layout_cost combined_cost = layout_costs.in_cost;
6700 combined_cost.add_serial_cost (layout_costs.internal_cost);
6702 /* If this partition consists of a single VEC_PERM_EXPR, see
6703 if the VEC_PERM_EXPR can be changed to support output layout
6704 LAYOUT_I while keeping all the provisional choices of input
6705 layout. */
6706 if (single_node
6707 && SLP_TREE_CODE (single_node) == VEC_PERM_EXPR)
6709 int factor = internal_node_cost (single_node, -1, layout_i);
6710 if (factor >= 0)
6712 auto weight = m_vertices[single_node->vertex].weight;
6713 slpg_layout_cost internal_cost
6714 = { weight * factor, m_optimize_size };
6716 slpg_layout_cost alt_cost = in_cost;
6717 alt_cost.add_serial_cost (internal_cost);
6718 if (alt_cost.is_better_than (combined_cost, m_optimize_size))
6720 combined_cost = alt_cost;
6721 layout_costs.in_cost = in_cost;
6722 layout_costs.internal_cost = internal_cost;
6727 /* Record the layout with the lowest cost. Prefer layout 0 in
6728 the event of a tie between it and another layout. */
6729 if (!min_layout_cost.is_possible ()
6730 || combined_cost.is_better_than (min_layout_cost,
6731 m_optimize_size))
6733 min_layout_i = layout_i;
6734 min_layout_cost = combined_cost;
6738 /* This loop's handling of earlier partitions should ensure that
6739 choosing the original layout for the current partition is no
6740 less valid than it was in the original graph, even with the
6741 provisional layout choices for those earlier partitions. */
6742 gcc_assert (min_layout_cost.is_possible ());
6743 partition.layout = min_layout_i;
6747 /* Make a backward pass through the partitions, accumulating output costs.
6748 Make a final choice of layout for each partition. */
6750 void
6751 vect_optimize_slp_pass::backward_pass ()
6753 for (unsigned int partition_i = m_partitions.length (); partition_i-- > 0;)
6755 auto &partition = m_partitions[partition_i];
6757 unsigned int min_layout_i = 0;
6758 slpg_layout_cost min_layout_cost = slpg_layout_cost::impossible ();
6759 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
6761 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
6762 if (!layout_costs.is_possible ())
6763 continue;
6765 /* Accumulate the costs from successor partitions. */
6766 bool is_possible = true;
6767 for (unsigned int order_i = partition.node_begin;
6768 order_i < partition.node_end; ++order_i)
6770 unsigned int node_i = m_partitioned_nodes[order_i];
6771 auto &vertex = m_vertices[node_i];
6772 auto add_cost = [&](graph_edge *ud, unsigned int other_node_i)
6774 auto &other_vertex = m_vertices[other_node_i];
6775 auto &other_partition = m_partitions[other_vertex.partition];
6776 if (other_vertex.partition > vertex.partition)
6778 /* Accumulate the outgoing costs from later
6779 partitions, plus the cost of any layout changes
6780 on UD itself. */
6781 auto cost = backward_cost (ud, other_node_i, layout_i);
6782 if (!cost.is_possible ())
6783 is_possible = false;
6784 else
6785 layout_costs.out_cost.add_parallel_cost (cost);
6787 else
6788 /* Make sure that earlier partitions can (if necessary
6789 or beneficial) keep the layout that they chose in
6790 the forward pass. This ensures that there is at
6791 least one valid choice of layout. */
6792 is_possible &= edge_layout_cost (ud, other_node_i,
6793 other_partition.layout,
6794 layout_i).is_possible ();
6796 for_each_partition_edge (node_i, add_cost);
6798 if (!is_possible)
6800 layout_costs.mark_impossible ();
6801 continue;
6804 /* Locally combine the costs from the forward and backward passes.
6805 (This combined cost is not passed on, since that would lead
6806 to double counting.) */
6807 slpg_layout_cost combined_cost = layout_costs.in_cost;
6808 combined_cost.add_serial_cost (layout_costs.internal_cost);
6809 combined_cost.add_serial_cost (layout_costs.out_cost);
6811 /* Record the layout with the lowest cost. Prefer layout 0 in
6812 the event of a tie between it and another layout. */
6813 if (!min_layout_cost.is_possible ()
6814 || combined_cost.is_better_than (min_layout_cost,
6815 m_optimize_size))
6817 min_layout_i = layout_i;
6818 min_layout_cost = combined_cost;
6822 gcc_assert (min_layout_cost.is_possible ());
6823 partition.layout = min_layout_i;
6827 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6828 NODE already has the layout that was selected for its partition. */
6830 slp_tree
6831 vect_optimize_slp_pass::get_result_with_layout (slp_tree node,
6832 unsigned int to_layout_i)
6834 unsigned int result_i = node->vertex * m_perms.length () + to_layout_i;
6835 slp_tree result = m_node_layouts[result_i];
6836 if (result)
6837 return result;
6839 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
6840 || (SLP_TREE_DEF_TYPE (node) == vect_external_def
6841 /* We can't permute vector defs in place. */
6842 && SLP_TREE_VEC_DEFS (node).is_empty ()))
6844 /* If the vector is uniform or unchanged, there's nothing to do. */
6845 if (to_layout_i == 0 || vect_slp_tree_uniform_p (node))
6846 result = node;
6847 else
6849 auto scalar_ops = SLP_TREE_SCALAR_OPS (node).copy ();
6850 result = vect_create_new_slp_node (scalar_ops);
6851 vect_slp_permute (m_perms[to_layout_i], scalar_ops, true);
6854 else
6856 unsigned int partition_i = m_vertices[node->vertex].partition;
6857 unsigned int from_layout_i = m_partitions[partition_i].layout;
6858 if (from_layout_i == to_layout_i)
6859 return node;
6861 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6862 permutation instead of a serial one. Leave the new permutation
6863 in TMP_PERM on success. */
6864 auto_lane_permutation_t tmp_perm;
6865 unsigned int num_inputs = 1;
6866 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6868 tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));
6869 if (from_layout_i != 0)
6870 vect_slp_permute (m_perms[from_layout_i], tmp_perm, false);
6871 if (to_layout_i != 0)
6872 vect_slp_permute (m_perms[to_layout_i], tmp_perm, true);
6873 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6874 tmp_perm,
6875 SLP_TREE_CHILDREN (node),
6876 false) >= 0)
6877 num_inputs = SLP_TREE_CHILDREN (node).length ();
6878 else
6879 tmp_perm.truncate (0);
6882 if (dump_enabled_p ())
6884 if (tmp_perm.length () > 0)
6885 dump_printf_loc (MSG_NOTE, vect_location,
6886 "duplicating permutation node %p with"
6887 " layout %d\n",
6888 (void *) node, to_layout_i);
6889 else
6890 dump_printf_loc (MSG_NOTE, vect_location,
6891 "inserting permutation node in place of %p\n",
6892 (void *) node);
6895 unsigned int num_lanes = SLP_TREE_LANES (node);
6896 result = vect_create_new_slp_node (num_inputs, VEC_PERM_EXPR);
6897 if (SLP_TREE_SCALAR_STMTS (node).length ())
6899 auto &stmts = SLP_TREE_SCALAR_STMTS (result);
6900 stmts.safe_splice (SLP_TREE_SCALAR_STMTS (node));
6901 if (from_layout_i != 0)
6902 vect_slp_permute (m_perms[from_layout_i], stmts, false);
6903 if (to_layout_i != 0)
6904 vect_slp_permute (m_perms[to_layout_i], stmts, true);
6906 SLP_TREE_REPRESENTATIVE (result) = SLP_TREE_REPRESENTATIVE (node);
6907 SLP_TREE_LANES (result) = num_lanes;
6908 SLP_TREE_VECTYPE (result) = SLP_TREE_VECTYPE (node);
6909 result->vertex = -1;
6911 auto &lane_perm = SLP_TREE_LANE_PERMUTATION (result);
6912 if (tmp_perm.length ())
6914 lane_perm.safe_splice (tmp_perm);
6915 SLP_TREE_CHILDREN (result).safe_splice (SLP_TREE_CHILDREN (node));
6917 else
6919 lane_perm.create (num_lanes);
6920 for (unsigned j = 0; j < num_lanes; ++j)
6921 lane_perm.quick_push ({ 0, j });
6922 if (from_layout_i != 0)
6923 vect_slp_permute (m_perms[from_layout_i], lane_perm, false);
6924 if (to_layout_i != 0)
6925 vect_slp_permute (m_perms[to_layout_i], lane_perm, true);
6926 SLP_TREE_CHILDREN (result).safe_push (node);
6928 for (slp_tree child : SLP_TREE_CHILDREN (result))
6929 child->refcnt++;
6931 m_node_layouts[result_i] = result;
6932 return result;
6935 /* Apply the chosen vector layouts to the SLP graph. */
6937 void
6938 vect_optimize_slp_pass::materialize ()
6940 /* We no longer need the costs, so avoid having two O(N * P) arrays
6941 live at the same time. */
6942 m_partition_layout_costs.release ();
6943 m_node_layouts.safe_grow_cleared (m_vertices.length () * m_perms.length ());
6945 auto_sbitmap fully_folded (m_vertices.length ());
6946 bitmap_clear (fully_folded);
6947 for (unsigned int node_i : m_partitioned_nodes)
6949 auto &vertex = m_vertices[node_i];
6950 slp_tree node = vertex.node;
6951 int layout_i = m_partitions[vertex.partition].layout;
6952 gcc_assert (layout_i >= 0);
6954 /* Rearrange the scalar statements to match the chosen layout. */
6955 if (layout_i > 0)
6956 vect_slp_permute (m_perms[layout_i],
6957 SLP_TREE_SCALAR_STMTS (node), true);
6959 /* Update load and lane permutations. */
6960 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
6962 /* First try to absorb the input vector layouts. If that fails,
6963 force the inputs to have layout LAYOUT_I too. We checked that
6964 that was possible before deciding to use nonzero output layouts.
6965 (Note that at this stage we don't really have any guarantee that
6966 the target supports the original VEC_PERM_EXPR.) */
6967 auto &perm = SLP_TREE_LANE_PERMUTATION (node);
6968 auto_lane_permutation_t tmp_perm;
6969 tmp_perm.safe_splice (perm);
6970 change_vec_perm_layout (node, tmp_perm, -1, layout_i);
6971 if (vectorizable_slp_permutation_1 (m_vinfo, nullptr, node,
6972 tmp_perm,
6973 SLP_TREE_CHILDREN (node),
6974 false) >= 0)
6976 if (dump_enabled_p ()
6977 && !std::equal (tmp_perm.begin (), tmp_perm.end (),
6978 perm.begin ()))
6979 dump_printf_loc (MSG_NOTE, vect_location,
6980 "absorbing input layouts into %p\n",
6981 (void *) node);
6982 std::copy (tmp_perm.begin (), tmp_perm.end (), perm.begin ());
6983 bitmap_set_bit (fully_folded, node_i);
6985 else
6987 /* Not MSG_MISSED because it would make no sense to users. */
6988 if (dump_enabled_p ())
6989 dump_printf_loc (MSG_NOTE, vect_location,
6990 "failed to absorb input layouts into %p\n",
6991 (void *) node);
6992 change_vec_perm_layout (nullptr, perm, layout_i, layout_i);
6995 else
6997 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node).exists ());
6998 auto &load_perm = SLP_TREE_LOAD_PERMUTATION (node);
6999 if (layout_i > 0)
7000 /* ??? When we handle non-bijective permutes the idea
7001 is that we can force the load-permutation to be
7002 { min, min + 1, min + 2, ... max }. But then the
7003 scalar defs might no longer match the lane content
7004 which means wrong-code with live lane vectorization.
7005 So we possibly have to have NULL entries for those. */
7006 vect_slp_permute (m_perms[layout_i], load_perm, true);
7010 /* Do this before any nodes disappear, since it involves a walk
7011 over the leaves. */
7012 remove_redundant_permutations ();
7014 /* Replace each child with a correctly laid-out version. */
7015 for (unsigned int node_i : m_partitioned_nodes)
7017 /* Skip nodes that have already been handled above. */
7018 if (bitmap_bit_p (fully_folded, node_i))
7019 continue;
7021 auto &vertex = m_vertices[node_i];
7022 int in_layout_i = m_partitions[vertex.partition].layout;
7023 gcc_assert (in_layout_i >= 0);
7025 unsigned j;
7026 slp_tree child;
7027 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex.node), j, child)
7029 if (!child)
7030 continue;
7032 slp_tree new_child = get_result_with_layout (child, in_layout_i);
7033 if (new_child != child)
7035 vect_free_slp_tree (child);
7036 SLP_TREE_CHILDREN (vertex.node)[j] = new_child;
7037 new_child->refcnt += 1;
7043 /* Elide load permutations that are not necessary. Such permutations might
7044 be pre-existing, rather than created by the layout optimizations. */
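/* For example, a load permutation that is the identity { 0, 1, 2, 3 }
   performs no reordering and can usually be dropped, while something
   like { 2, 3, 0, 1 } has to be kept.  The exact conditions differ for
   basic-block and loop vectorization below.  */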
7046 void
7047 vect_optimize_slp_pass::remove_redundant_permutations ()
7049 for (unsigned int node_i : m_leafs)
7051 slp_tree node = m_vertices[node_i].node;
7052 if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
7053 continue;
7055 /* In basic block vectorization we allow any subchain of an interleaving
7056 chain.
7057 FORNOW: not in loop SLP because of realignment complications. */
7058 if (is_a <bb_vec_info> (m_vinfo))
7060 bool subchain_p = true;
7061 stmt_vec_info next_load_info = NULL;
7062 stmt_vec_info load_info;
7063 unsigned j;
7064 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7066 if (j != 0
7067 && (next_load_info != load_info
7068 || ! load_info
7069 || DR_GROUP_GAP (load_info) != 1))
7071 subchain_p = false;
7072 break;
7074 next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
7076 if (subchain_p)
7078 SLP_TREE_LOAD_PERMUTATION (node).release ();
7079 continue;
7082 else
7084 loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
7085 stmt_vec_info load_info;
7086 bool this_load_permuted = false;
7087 unsigned j;
7088 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
7089 if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
7091 this_load_permuted = true;
7092 break;
7094 /* When this isn't a grouped access we know it's a single element
7095 and contiguous. */
7096 if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
7098 if (!this_load_permuted
7099 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7100 || SLP_TREE_LANES (node) == 1))
7101 SLP_TREE_LOAD_PERMUTATION (node).release ();
7102 continue;
7104 stmt_vec_info first_stmt_info
7105 = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
7106 if (!this_load_permuted
7107 /* The load requires permutation when unrolling exposes
7108 a gap either because the group is larger than the SLP
7109 group-size or because there is a gap between the groups. */
7110 && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
7111 || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
7112 && DR_GROUP_GAP (first_stmt_info) == 0)))
7114 SLP_TREE_LOAD_PERMUTATION (node).release ();
7115 continue;
7121 /* Print the partition graph and layout information to the dump file. */
7123 void
7124 vect_optimize_slp_pass::dump ()
7126 dump_printf_loc (MSG_NOTE, vect_location,
7127 "SLP optimize permutations:\n");
7128 for (unsigned int layout_i = 1; layout_i < m_perms.length (); ++layout_i)
7130 dump_printf_loc (MSG_NOTE, vect_location, " %d: { ", layout_i);
7131 const char *sep = "";
7132 for (unsigned int idx : m_perms[layout_i])
7134 dump_printf (MSG_NOTE, "%s%d", sep, idx);
7135 sep = ", ";
7137 dump_printf (MSG_NOTE, " }\n");
7139 dump_printf_loc (MSG_NOTE, vect_location,
7140 "SLP optimize partitions:\n");
7141 for (unsigned int partition_i = 0; partition_i < m_partitions.length ();
7142 ++partition_i)
7144 auto &partition = m_partitions[partition_i];
7145 dump_printf_loc (MSG_NOTE, vect_location, " -------------\n");
7146 dump_printf_loc (MSG_NOTE, vect_location,
7147 " partition %d (layout %d):\n",
7148 partition_i, partition.layout);
7149 dump_printf_loc (MSG_NOTE, vect_location, " nodes:\n");
7150 for (unsigned int order_i = partition.node_begin;
7151 order_i < partition.node_end; ++order_i)
7153 auto &vertex = m_vertices[m_partitioned_nodes[order_i]];
7154 dump_printf_loc (MSG_NOTE, vect_location, " - %p:\n",
7155 (void *) vertex.node);
7156 dump_printf_loc (MSG_NOTE, vect_location,
7157 " weight: %f\n",
7158 vertex.weight.to_double ());
7159 if (vertex.out_degree)
7160 dump_printf_loc (MSG_NOTE, vect_location,
7161 " out weight: %f (degree %d)\n",
7162 vertex.out_weight.to_double (),
7163 vertex.out_degree);
7164 if (SLP_TREE_CODE (vertex.node) == VEC_PERM_EXPR)
7165 dump_printf_loc (MSG_NOTE, vect_location,
7166 " op: VEC_PERM_EXPR\n");
7167 else if (auto rep = SLP_TREE_REPRESENTATIVE (vertex.node))
7168 dump_printf_loc (MSG_NOTE, vect_location,
7169 " op template: %G", rep->stmt);
7171 dump_printf_loc (MSG_NOTE, vect_location, " edges:\n");
7172 for (unsigned int order_i = partition.node_begin;
7173 order_i < partition.node_end; ++order_i)
7175 unsigned int node_i = m_partitioned_nodes[order_i];
7176 auto &vertex = m_vertices[node_i];
7177 auto print_edge = [&](graph_edge *, unsigned int other_node_i)
7179 auto &other_vertex = m_vertices[other_node_i];
7180 if (other_vertex.partition < vertex.partition)
7181 dump_printf_loc (MSG_NOTE, vect_location,
7182 " - %p [%d] --> %p\n",
7183 (void *) other_vertex.node,
7184 other_vertex.partition,
7185 (void *) vertex.node);
7186 else
7187 dump_printf_loc (MSG_NOTE, vect_location,
7188 " - %p --> [%d] %p\n",
7189 (void *) vertex.node,
7190 other_vertex.partition,
7191 (void *) other_vertex.node);
7193 for_each_partition_edge (node_i, print_edge);
7196 for (unsigned int layout_i = 0; layout_i < m_perms.length (); ++layout_i)
7198 auto &layout_costs = partition_layout_costs (partition_i, layout_i);
7199 if (layout_costs.is_possible ())
7201 dump_printf_loc (MSG_NOTE, vect_location,
7202 " layout %d:%s\n", layout_i,
7203 partition.layout == int (layout_i)
7204 ? " (*)" : "");
7205 slpg_layout_cost combined_cost = layout_costs.in_cost;
7206 combined_cost.add_serial_cost (layout_costs.internal_cost);
7207 combined_cost.add_serial_cost (layout_costs.out_cost);
7208 #define TEMPLATE "{depth: %f, total: %f}"
7209 dump_printf_loc (MSG_NOTE, vect_location,
7210 " " TEMPLATE "\n",
7211 layout_costs.in_cost.depth.to_double (),
7212 layout_costs.in_cost.total.to_double ());
7213 dump_printf_loc (MSG_NOTE, vect_location,
7214 " + " TEMPLATE "\n",
7215 layout_costs.internal_cost.depth.to_double (),
7216 layout_costs.internal_cost.total.to_double ());
7217 dump_printf_loc (MSG_NOTE, vect_location,
7218 " + " TEMPLATE "\n",
7219 layout_costs.out_cost.depth.to_double (),
7220 layout_costs.out_cost.total.to_double ());
7221 dump_printf_loc (MSG_NOTE, vect_location,
7222 " = " TEMPLATE "\n",
7223 combined_cost.depth.to_double (),
7224 combined_cost.total.to_double ());
7225 #undef TEMPLATE
7227 else
7228 dump_printf_loc (MSG_NOTE, vect_location,
7229 " layout %d: rejected\n", layout_i);
7234 /* Masked load lanes discovery. */
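/* In essence: look for grouped IFN_MASK_LOAD calls whose mask is a
   uniform splat (represented as a single-input VEC_PERM_EXPR) and whose
   consumers are all single-lane permutes.  For those a load-lanes style
   access can be used, so mark the nodes with ldst_lanes and elide the
   now redundant mask splat.  */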
7236 void
7237 vect_optimize_slp_pass::decide_masked_load_lanes ()
7239 for (auto v : m_vertices)
7241 slp_tree node = v.node;
7242 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
7243 || SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7244 continue;
7245 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7246 if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
7247 /* The mask has to be uniform. */
7248 || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
7249 || ! is_a <gcall *> (STMT_VINFO_STMT (stmt_info))
7250 || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
7251 IFN_MASK_LOAD))
7252 continue;
7253 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
7254 if (STMT_VINFO_STRIDED_P (stmt_info)
7255 || compare_step_with_zero (m_vinfo, stmt_info) <= 0
7256 || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
7257 DR_GROUP_SIZE (stmt_info),
7258 true) == IFN_LAST)
7259 continue;
7261 /* Uniform masks need to be suitably represented. */
7262 slp_tree mask = SLP_TREE_CHILDREN (node)[0];
7263 if (SLP_TREE_CODE (mask) != VEC_PERM_EXPR
7264 || SLP_TREE_CHILDREN (mask).length () != 1)
7265 continue;
7266 bool match = true;
7267 for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
7268 if (perm.first != 0 || perm.second != 0)
7270 match = false;
7271 break;
7273 if (!match)
7274 continue;
7276 /* Now see if the consumer side matches. */
7277 for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7278 pred; pred = pred->pred_next)
7280 slp_tree pred_node = m_vertices[pred->src].node;
7281 /* All consumers should be permutes with a single outgoing lane. */
7282 if (SLP_TREE_CODE (pred_node) != VEC_PERM_EXPR
7283 || SLP_TREE_LANES (pred_node) != 1)
7285 match = false;
7286 break;
7288 gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
7290 if (!match)
7291 continue;
7292 /* Now we can mark the nodes as to use load lanes. */
7293 node->ldst_lanes = true;
7294 for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
7295 pred; pred = pred->pred_next)
7296 m_vertices[pred->src].node->ldst_lanes = true;
7297 /* The catch is we have to massage the mask. During analysis we
7298 arranged for uniform masks to be represented by a splat VEC_PERM
7299 which we can now simply elide as we cannot easily re-do SLP
7300 discovery here. */
7301 slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
7302 SLP_TREE_REF_COUNT (new_mask)++;
7303 SLP_TREE_CHILDREN (node)[0] = new_mask;
7304 vect_free_slp_tree (mask);
7308 /* Main entry point for the SLP graph optimization pass. */
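/* Roughly: build the SLP graph and its partitions, compute candidate
   layouts, propagate layout costs forward and backward over the
   partitions, materialize the chosen layouts (or, when only the
   identity layout exists, just remove redundant permutations), and
   finally decide about masked load-lanes on a rebuilt graph.  */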
7310 void
7311 vect_optimize_slp_pass::run ()
7313 build_graph ();
7314 create_partitions ();
7315 start_choosing_layouts ();
7316 if (m_perms.length () > 1)
7318 forward_pass ();
7319 backward_pass ();
7320 if (dump_enabled_p ())
7321 dump ();
7322 materialize ();
7323 while (!m_perms.is_empty ())
7324 m_perms.pop ().release ();
7326 else
7327 remove_redundant_permutations ();
7328 free_graph (m_slpg);
7329 build_graph ();
7330 decide_masked_load_lanes ();
7331 free_graph (m_slpg);
7334 /* Apply CSE to NODE and its children using BST_MAP. */
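/* Nodes with an identical set of scalar stmts are merged onto a single
   leader recorded in BST_MAP.  A nullptr placeholder is entered before
   recursing into the children so that cycles in the graph are detected
   instead of recursing forever.  */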
7336 static void
7337 vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree& node)
7339 bool put_p = false;
7340 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
7341 /* Besides some VEC_PERM_EXPR, two-operator nodes also
7342 lack scalar stmts and thus CSE doesn't work via bst_map. Ideally
7343 we'd have sth that works for all internal and external nodes. */
7344 && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
7346 slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
7347 if (leader)
7349 /* We've visited this node already. */
7350 if (!*leader || *leader == node)
7351 return;
7353 if (dump_enabled_p ())
7354 dump_printf_loc (MSG_NOTE, vect_location,
7355 "re-using SLP tree %p for %p\n",
7356 (void *)*leader, (void *)node);
7357 vect_free_slp_tree (node);
7358 (*leader)->refcnt += 1;
7359 node = *leader;
7360 return;
7363 /* Avoid creating a cycle by populating the map only after recursion. */
7364 bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
7365 node->refcnt += 1;
7366 put_p = true;
7367 /* And recurse. */
7370 for (slp_tree &child : SLP_TREE_CHILDREN (node))
7371 if (child)
7372 vect_cse_slp_nodes (bst_map, child);
7374 /* Now record the node for CSE in other siblings. */
7375 if (put_p)
7376 *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
7379 /* Optimize the SLP graph of VINFO. */
7381 void
7382 vect_optimize_slp (vec_info *vinfo)
7384 if (vinfo->slp_instances.is_empty ())
7385 return;
7386 vect_optimize_slp_pass (vinfo).run ();
7388 /* Apply CSE again to nodes after permute optimization. */
7389 scalar_stmts_to_slp_tree_map_t *bst_map
7390 = new scalar_stmts_to_slp_tree_map_t ();
7392 for (auto inst : vinfo->slp_instances)
7393 vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));
7395 release_scalar_stmts_to_slp_tree_map (bst_map);
7398 /* Gather loads reachable from the individual SLP graph entries. */
7400 void
7401 vect_gather_slp_loads (vec_info *vinfo)
7403 unsigned i;
7404 slp_instance instance;
7405 FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
7407 hash_set<slp_tree> visited;
7408 vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
7409 SLP_INSTANCE_TREE (instance), visited);
7413 /* For NODE update VF based on the number of lanes and the vector types
7414 used. */
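/* A rough example (assuming fixed-size vectors): a node with max_nunits
   8 but only 2 lanes needs the loop unrolled 4 times to fill a vector,
   so it contributes 4 to VF; the resulting VF is a common multiple of
   all such contributions.  */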
7416 static void
7417 vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
7418 hash_set<slp_tree> &visited)
7420 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
7421 return;
7422 if (visited.add (node))
7423 return;
7425 for (slp_tree child : SLP_TREE_CHILDREN (node))
7426 vect_update_slp_vf_for_node (child, vf, visited);
7428 /* We do not visit SLP nodes for constants or externals - those neither
7429 have a vector type set yet (vectorizable_* does this) nor do they
7430 have max_nunits set. Instead we rely on the internal nodes' max_nunits
7431 to cover constant/external operands.
7432 Note that when we stop using fixed-size vectors, externs and constants
7433 shouldn't influence the (minimum) vectorization factor, instead
7434 vectorizable_* should honor the vectorization factor when trying to
7435 assign vector types to constants and externals and cause iteration
7436 to a higher vectorization factor when required. */
7437 poly_uint64 node_vf
7438 = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
7439 vf = force_common_multiple (vf, node_vf);
7441 /* For permute nodes that are fed from externs or constants we have to
7442 consider their number of lanes as well. Likewise for store-lanes. */
7443 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
7444 || node->ldst_lanes)
7445 for (slp_tree child : SLP_TREE_CHILDREN (node))
7446 if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
7448 poly_uint64 child_vf
7449 = calculate_unrolling_factor (node->max_nunits,
7450 SLP_TREE_LANES (child));
7451 vf = force_common_multiple (vf, child_vf);
7455 /* For each possible SLP instance decide whether to SLP it and calculate the
7456 overall unrolling factor needed to SLP the loop. Return TRUE if we decided
7457 to SLP at least one instance. */
7459 bool
7460 vect_make_slp_decision (loop_vec_info loop_vinfo)
7462 unsigned int i;
7463 poly_uint64 unrolling_factor = 1;
7464 const vec<slp_instance> &slp_instances
7465 = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
7466 slp_instance instance;
7467 int decided_to_slp = 0;
7469 DUMP_VECT_SCOPE ("vect_make_slp_decision");
7471 hash_set<slp_tree> visited;
7472 FOR_EACH_VEC_ELT (slp_instances, i, instance)
7474 /* FORNOW: SLP if you can. */
7475 /* All unroll factors have the form:
7477 GET_MODE_SIZE (vinfo->vector_mode) * X
7479 for some rational X, so they must have a common multiple. */
7480 vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
7481 unrolling_factor, visited);
7483 /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts. Later we
7484 call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
7485 loop-based vectorization. Such stmts will be marked as HYBRID. */
7486 vect_mark_slp_stmts (loop_vinfo, SLP_INSTANCE_TREE (instance));
7487 decided_to_slp++;
7490 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;
7492 if (decided_to_slp && dump_enabled_p ())
7494 dump_printf_loc (MSG_NOTE, vect_location,
7495 "Decided to SLP %d instances. Unrolling factor ",
7496 decided_to_slp);
7497 dump_dec (MSG_NOTE, unrolling_factor);
7498 dump_printf (MSG_NOTE, "\n");
7501 return (decided_to_slp > 0);
7504 /* Private data for vect_detect_hybrid_slp. */
7505 struct vdhs_data
7507 loop_vec_info loop_vinfo;
7508 vec<stmt_vec_info> *worklist;
7511 /* Walker for walk_gimple_op. */
7513 static tree
7514 vect_detect_hybrid_slp (tree *tp, int *, void *data)
7516 walk_stmt_info *wi = (walk_stmt_info *)data;
7517 vdhs_data *dat = (vdhs_data *)wi->info;
7519 if (wi->is_lhs)
7520 return NULL_TREE;
7522 stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
7523 if (!def_stmt_info)
7524 return NULL_TREE;
7525 def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
7526 if (PURE_SLP_STMT (def_stmt_info))
7528 if (dump_enabled_p ())
7529 dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
7530 def_stmt_info->stmt);
7531 STMT_SLP_TYPE (def_stmt_info) = hybrid;
7532 dat->worklist->safe_push (def_stmt_info);
7535 return NULL_TREE;
7538 /* Check whether STMT_INFO is (indirectly) consumed only by SLP and mark it
7539 pure_slp if so; otherwise push it to WORKLIST. */
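/* A stmt remains a hybrid candidate (and is pushed to WORKLIST) when any
   def it produces has a use outside of the loop or a use in a stmt that
   is not SLP vectorized; only when every use is covered by SLP can it be
   marked pure_slp.  */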
7541 static void
7542 maybe_push_to_hybrid_worklist (vec_info *vinfo,
7543 vec<stmt_vec_info> &worklist,
7544 stmt_vec_info stmt_info)
7546 if (dump_enabled_p ())
7547 dump_printf_loc (MSG_NOTE, vect_location,
7548 "Processing hybrid candidate : %G", stmt_info->stmt);
7549 stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
7550 imm_use_iterator iter2;
7551 ssa_op_iter iter1;
7552 use_operand_p use_p;
7553 def_operand_p def_p;
7554 bool any_def = false;
7555 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
7557 any_def = true;
7558 FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
7560 if (is_gimple_debug (USE_STMT (use_p)))
7561 continue;
7562 stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
7563 /* An out-of-loop use means this is a loop_vect sink. */
7564 if (!use_info)
7566 if (dump_enabled_p ())
7567 dump_printf_loc (MSG_NOTE, vect_location,
7568 "Found loop_vect sink: %G", stmt_info->stmt);
7569 worklist.safe_push (stmt_info);
7570 return;
7572 else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_NOTE, vect_location,
7576 "Found loop_vect use: %G", use_info->stmt);
7577 worklist.safe_push (stmt_info);
7578 return;
7582 /* No def means this is a loop_vect sink. Gimple conditionals also don't have a
7583 def but shouldn't be considered sinks. */
7584 if (!any_def && STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
7586 if (dump_enabled_p ())
7587 dump_printf_loc (MSG_NOTE, vect_location,
7588 "Found loop_vect sink: %G", stmt_info->stmt);
7589 worklist.safe_push (stmt_info);
7590 return;
7592 if (dump_enabled_p ())
7593 dump_printf_loc (MSG_NOTE, vect_location,
7594 "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
7595 STMT_SLP_TYPE (stmt_info) = pure_slp;
7598 /* Find stmts that must be both vectorized and SLPed. */
7600 void
7601 vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
7603 DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");
7605 /* All stmts participating in SLP are marked pure_slp, all other
7606 stmts are loop_vect.
7607 First collect all loop_vect stmts into a worklist.
7608 SLP patterns cause not all original scalar stmts to appear in
7609 SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
7610 Rectify this here and do a backward walk over the IL only considering
7611 stmts as loop_vect when they are used by a loop_vect stmt, otherwise
7612 marking them as pure_slp. */
7613 auto_vec<stmt_vec_info> worklist;
7614 for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
7616 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
7617 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
7618 gsi_next (&gsi))
7620 gphi *phi = gsi.phi ();
7621 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
7622 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7623 maybe_push_to_hybrid_worklist (loop_vinfo,
7624 worklist, stmt_info);
7626 for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
7627 gsi_prev (&gsi))
7629 gimple *stmt = gsi_stmt (gsi);
7630 if (is_gimple_debug (stmt))
7631 continue;
7632 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
7633 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
7635 for (gimple_stmt_iterator gsi2
7636 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
7637 !gsi_end_p (gsi2); gsi_next (&gsi2))
7639 stmt_vec_info patt_info
7640 = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
7641 if (!STMT_SLP_TYPE (patt_info)
7642 && STMT_VINFO_RELEVANT (patt_info))
7643 maybe_push_to_hybrid_worklist (loop_vinfo,
7644 worklist, patt_info);
7646 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7648 if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
7649 maybe_push_to_hybrid_worklist (loop_vinfo,
7650 worklist, stmt_info);
7654 /* Now we have a worklist of non-SLP stmts, follow use->def chains and
7655 mark any SLP vectorized stmt as hybrid.
7656 ??? We're visiting def stmts N times (once for each non-SLP and
7657 once for each hybrid-SLP use). */
7658 walk_stmt_info wi;
7659 vdhs_data dat;
7660 dat.worklist = &worklist;
7661 dat.loop_vinfo = loop_vinfo;
7662 memset (&wi, 0, sizeof (wi));
7663 wi.info = (void *)&dat;
7664 while (!worklist.is_empty ())
7666 stmt_vec_info stmt_info = worklist.pop ();
7667 /* Since SSA operands are not set up for pattern stmts we need
7668 to use walk_gimple_op. */
7669 wi.is_lhs = 0;
7670 walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
7671 /* For gather/scatter make sure to walk the offset operand, which
7672 can be a scaling and conversion away. */
7673 gather_scatter_info gs_info;
7674 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
7675 && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
7677 int dummy;
7678 vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
7684 /* Initialize a bb_vec_info struct for the statements in BBS basic blocks. */
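/* Statements inside the region get their UID set to 0 here so that
   stmt_vec_info lookup can recognize them as part of the region; the
   destructor below resets the UIDs to -1 again.  */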
7686 _bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
7687 : vec_info (vec_info::bb, shared),
7688 roots (vNULL)
7690 /* The region we are operating on. bbs[0] is the entry, excluding
7691 its PHI nodes. In the future we might want to track an explicit
7692 entry edge to cover bbs[0] PHI nodes and have a region entry
7693 insert location. */
7694 bbs = _bbs.address ();
7695 nbbs = _bbs.length ();
7697 for (unsigned i = 0; i < nbbs; ++i)
7699 if (i != 0)
7700 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7701 gsi_next (&si))
7703 gphi *phi = si.phi ();
7704 gimple_set_uid (phi, 0);
7705 add_stmt (phi);
7707 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7708 !gsi_end_p (gsi); gsi_next (&gsi))
7710 gimple *stmt = gsi_stmt (gsi);
7711 gimple_set_uid (stmt, 0);
7712 if (is_gimple_debug (stmt))
7713 continue;
7714 add_stmt (stmt);
7720 /* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
7721 stmts in the basic blocks. */
7723 _bb_vec_info::~_bb_vec_info ()
7725 /* Reset region marker. */
7726 for (unsigned i = 0; i < nbbs; ++i)
7728 if (i != 0)
7729 for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
7730 gsi_next (&si))
7732 gphi *phi = si.phi ();
7733 gimple_set_uid (phi, -1);
7735 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
7736 !gsi_end_p (gsi); gsi_next (&gsi))
7738 gimple *stmt = gsi_stmt (gsi);
7739 gimple_set_uid (stmt, -1);
7743 for (unsigned i = 0; i < roots.length (); ++i)
7745 roots[i].stmts.release ();
7746 roots[i].roots.release ();
7747 roots[i].remain.release ();
7749 roots.release ();
7752 /* Subroutine of vect_slp_analyze_node_operations. Handle the root of NODE,
7753 given that its child nodes have already been processed, and that
7754 their def types currently match their SLP node's def type. */
7756 static bool
7757 vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
7758 slp_instance node_instance,
7759 stmt_vector_for_cost *cost_vec)
7761 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
7763 /* Calculate the number of vector statements to be created for the scalar
7764 stmts in this node. It is the number of scalar elements in one scalar
7765 iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
7766 elements in a vector. For a single-defuse-cycle, a lane-reducing op, or a
7767 PHI statement that starts a reduction comprised of only lane-reducing ops,
7768 the number is larger than the number of vector statements actually required. */
7769 SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);
7771 /* Handle purely internal nodes. */
7772 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
7774 if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
7775 return false;
7777 stmt_vec_info slp_stmt_info;
7778 unsigned int i;
7779 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
7781 if (slp_stmt_info
7782 && STMT_VINFO_LIVE_P (slp_stmt_info)
7783 && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
7784 node_instance, i,
7785 false, cost_vec))
7786 return false;
7788 return true;
7791 bool dummy;
7792 return vect_analyze_stmt (vinfo, stmt_info, &dummy,
7793 node, node_instance, cost_vec);
7796 /* Verify if we can externalize a set of internal defs. */
7798 static bool
7799 vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
7801 basic_block bb = NULL;
7802 for (stmt_vec_info stmt : stmts)
7803 if (!stmt)
7804 return false;
7805 /* Constant generation uses get_later_stmt which can only handle
7806 defs from the same BB. */
7807 else if (!bb)
7808 bb = gimple_bb (stmt->stmt);
7809 else if (gimple_bb (stmt->stmt) != bb)
7810 return false;
7811 return true;
7814 /* Try to build NODE from scalars, returning true on success.
7815 NODE_INSTANCE is the SLP instance that contains NODE. */
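/* "Building from scalars" means turning NODE into a vect_external_def
   whose SLP_TREE_SCALAR_OPS are the scalar lhs values of its stmts; the
   referring node then constructs the vector from those scalar defs at
   code-generation time.  */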
7817 static bool
7818 vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
7819 slp_instance node_instance)
7821 stmt_vec_info stmt_info;
7822 unsigned int i;
7824 if (!is_a <bb_vec_info> (vinfo)
7825 || node == SLP_INSTANCE_TREE (node_instance)
7826 || !SLP_TREE_SCALAR_STMTS (node).exists ()
7827 || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
7828 /* Force the mask use to be built from scalars instead. */
7829 || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
7830 || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
7831 return false;
7833 if (dump_enabled_p ())
7834 dump_printf_loc (MSG_NOTE, vect_location,
7835 "Building vector operands of %p from scalars instead\n",
7836 (void *) node);
7838 /* Don't remove and free the child nodes here, since they could be
7839 referenced by other structures. The analysis and scheduling phases
7840 (need to) ignore child nodes of anything that isn't vect_internal_def. */
7841 unsigned int group_size = SLP_TREE_LANES (node);
7842 SLP_TREE_DEF_TYPE (node) = vect_external_def;
7843 /* Invariants get their vector type from the uses. */
7844 SLP_TREE_VECTYPE (node) = NULL_TREE;
7845 SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
7846 SLP_TREE_LOAD_PERMUTATION (node).release ();
7847 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
7849 tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
7850 SLP_TREE_SCALAR_OPS (node)[i] = lhs;
7852 return true;
7855 /* Return true if all elements of the slice are the same. */
7856 bool
7857 vect_scalar_ops_slice::all_same_p () const
7859 for (unsigned int i = 1; i < length; ++i)
7860 if (!operand_equal_p (op (0), op (i)))
7861 return false;
7862 return true;
7865 hashval_t
7866 vect_scalar_ops_slice_hash::hash (const value_type &s)
7868 hashval_t hash = 0;
7869 for (unsigned i = 0; i < s.length; ++i)
7870 hash = iterative_hash_expr (s.op (i), hash);
7871 return hash;
7874 bool
7875 vect_scalar_ops_slice_hash::equal (const value_type &s1,
7876 const compare_type &s2)
7878 if (s1.length != s2.length)
7879 return false;
7880 for (unsigned i = 0; i < s1.length; ++i)
7881 if (!operand_equal_p (s1.op (i), s2.op (i)))
7882 return false;
7883 return true;
7886 /* Compute the prologue cost for invariant or constant operands represented
7887 by NODE. */
7889 static void
7890 vect_prologue_cost_for_slp (slp_tree node,
7891 stmt_vector_for_cost *cost_vec)
7893 /* There's a special case of an existing vector, which costs nothing. */
7894 if (SLP_TREE_SCALAR_OPS (node).length () == 0
7895 && !SLP_TREE_VEC_DEFS (node).is_empty ())
7896 return;
7897 /* Without looking at the actual initializer a vector of
7898 constants can be implemented as a load from the constant pool.
7899 When all elements are the same we can use a splat. */
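/* So a constant node like { 1, 2, 3, 4 } is costed as a vector_load,
   an external node with identical elements { x, x, x, x } as a
   scalar_to_vec splat, and a mixed external node { a, b, c, d } as a
   vec_construct.  */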
7900 tree vectype = SLP_TREE_VECTYPE (node);
7901 unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
7902 unsigned HOST_WIDE_INT const_nunits;
7903 unsigned nelt_limit;
7904 auto ops = &SLP_TREE_SCALAR_OPS (node);
7905 auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
7906 if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
7907 && ! multiple_p (const_nunits, group_size))
7909 nelt_limit = const_nunits;
7910 hash_set<vect_scalar_ops_slice_hash> vector_ops;
7911 for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
7912 if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
7913 starts.quick_push (i * nelt_limit);
7915 else
7917 /* If either the vector has variable length or the vectors
7918 are composed of repeated whole groups we only need to
7919 cost construction once. All vectors will be the same. */
7920 nelt_limit = group_size;
7921 starts.quick_push (0);
7923 /* ??? We're just tracking whether vectors in a single node are the same.
7924 Ideally we'd do something more global. */
7925 bool passed = false;
7926 for (unsigned int start : starts)
7928 vect_cost_for_stmt kind;
7929 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
7930 kind = vector_load;
7931 else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
7932 kind = scalar_to_vec;
7933 else
7934 kind = vec_construct;
7935 /* The target cost hook has no idea which part of the SLP node
7936 we are costing so avoid passing it down more than once. Pass
7937 it to the first vec_construct or scalar_to_vec part since for those
7938 the x86 backend tries to account for GPR to XMM register moves. */
7939 record_stmt_cost (cost_vec, 1, kind,
7940 (kind != vector_load && !passed) ? node : nullptr,
7941 vectype, 0, vect_prologue);
7942 if (kind != vector_load)
7943 passed = true;
7947 /* Analyze statements contained in SLP tree NODE after recursively analyzing
7948 the subtree. NODE_INSTANCE contains NODE and VINFO contains INSTANCE.
7950 Return true if the operations are supported. */
7952 static bool
7953 vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
7954 slp_instance node_instance,
7955 hash_set<slp_tree> &visited_set,
7956 vec<slp_tree> &visited_vec,
7957 stmt_vector_for_cost *cost_vec)
7959 int i, j;
7960 slp_tree child;
7962 /* Assume we can code-generate all invariants. */
7963 if (!node
7964 || SLP_TREE_DEF_TYPE (node) == vect_constant_def
7965 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
7966 return true;
7968 if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
7970 if (dump_enabled_p ())
7971 dump_printf_loc (MSG_NOTE, vect_location,
7972 "Failed cyclic SLP reference in %p\n", (void *) node);
7973 return false;
7975 gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);
7977 /* If we already analyzed the exact same set of scalar stmts we're done.
7978 We share the generated vector stmts for those. */
7979 if (visited_set.add (node))
7980 return true;
7981 visited_vec.safe_push (node);
7983 bool res = true;
7984 unsigned visited_rec_start = visited_vec.length ();
7985 unsigned cost_vec_rec_start = cost_vec->length ();
7986 bool seen_non_constant_child = false;
7987 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
7989 res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
7990 visited_set, visited_vec,
7991 cost_vec);
7992 if (!res)
7993 break;
7994 if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
7995 seen_non_constant_child = true;
7997 /* We're having difficulties scheduling nodes with just constant
7998 operands and no scalar stmts since we then cannot compute a stmt
7999 insertion place. */
8000 if (res
8001 && !seen_non_constant_child
8002 && SLP_TREE_SCALAR_STMTS (node).is_empty ())
8004 if (dump_enabled_p ())
8005 dump_printf_loc (MSG_NOTE, vect_location,
8006 "Cannot vectorize all-constant op node %p\n",
8007 (void *) node);
8008 res = false;
8011 if (res)
8012 res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
8013 cost_vec);
8014 /* If analysis failed we have to pop all recursively visited nodes
8015 plus ourselves. */
8016 if (!res)
8018 while (visited_vec.length () >= visited_rec_start)
8019 visited_set.remove (visited_vec.pop ());
8020 cost_vec->truncate (cost_vec_rec_start);
8023 /* When the node can be vectorized cost invariant nodes it references.
8024 This is not done in DFS order to allow the referring node's
8025 vectorizable_* calls to nail down the invariant nodes' vector type
8026 and possibly unshare it if it needs a different vector type than
8027 other referrers. */
8028 if (res)
8029 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
8030 if (child
8031 && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
8032 || SLP_TREE_DEF_TYPE (child) == vect_external_def)
8033 /* Perform usual caching, note code-generation still
8034 code-gens these nodes multiple times but we expect
8035 to CSE them later. */
8036 && !visited_set.add (child))
8038 visited_vec.safe_push (child);
8039 /* ??? After auditing more code paths make a "default"
8040 and push the vector type from NODE to all children
8041 if it is not already set. */
8042 /* Compute the number of vectors to be generated. */
8043 tree vector_type = SLP_TREE_VECTYPE (child);
8044 if (!vector_type)
8046 /* Masked loads can have an undefined (default SSA definition)
8047 else operand. We do not need to cost it. */
8048 vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
8049 if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
8050 == load_vec_info_type)
8051 && ((ops.length ()
8052 && TREE_CODE (ops[0]) == SSA_NAME
8053 && SSA_NAME_IS_DEFAULT_DEF (ops[0])
8054 && VAR_P (SSA_NAME_VAR (ops[0])))
8055 || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
8056 continue;
8058 /* For shifts with a scalar argument we don't need
8059 to cost or code-generate anything.
8060 ??? Represent this more explicitly. */
8061 gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
8062 == shift_vec_info_type)
8063 && j == 1);
8064 continue;
8067 SLP_TREE_NUMBER_OF_VEC_STMTS (child)
8068 = vect_get_num_copies (vinfo, child);
8069 /* And cost them. */
8070 vect_prologue_cost_for_slp (child, cost_vec);
8073 /* If this node or any of its children can't be vectorized, try pruning
8074 the tree here rather than felling the whole thing. */
8075 if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
8077 /* We'll need to revisit this for invariant costing and for setting
8078 the number of vectorized stmts. */
8079 res = true;
8082 return res;
8085 /* Given a definition DEF, analyze if it will have any live scalar use after
8086 performing SLP vectorization whose information is represented by BB_VINFO,
8087 and record the result into the hash map SCALAR_USE_MAP as a cache for
8088 later fast checks. If the recursion DEPTH exceeds a limit, stop the
8089 analysis and make a conservative assumption. Return 0 if there is no
8090 scalar use, 1 if there is, and -1 if the recursion was limited. */
8092 static int
8093 vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
8094 hash_map<tree, int> &scalar_use_map,
8095 int depth = 0)
8097 const int depth_limit = 2;
8098 imm_use_iterator use_iter;
8099 gimple *use_stmt;
8101 if (int *res = scalar_use_map.get (def))
8102 return *res;
8104 int scalar_use = 1;
8106 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
8108 if (is_gimple_debug (use_stmt))
8109 continue;
8111 stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);
8113 if (!use_stmt_info)
8114 break;
8116 if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8117 continue;
8119 /* Do not step forward when encountering a PHI statement, since it may
8120 involve a cyclic reference and cause infinite recursion. */
8121 if (gimple_code (use_stmt) == GIMPLE_PHI)
8122 break;
8124 /* When pattern recognition is involved, a statement whose definition is
8125 consumed in some pattern may not be included in the final replacement
8126 pattern statements, and so would be skipped when building the SLP graph.
8128 * Original
8129 char a_c = *(char *) a;
8130 char b_c = *(char *) b;
8131 unsigned short a_s = (unsigned short) a_c;
8132 int a_i = (int) a_s;
8133 int b_i = (int) b_c;
8134 int r_i = a_i - b_i;
8136 * After pattern replacement
8137 a_s = (unsigned short) a_c;
8138 a_i = (int) a_s;
8140 patt_b_s = (unsigned short) b_c; // b_i = (int) b_c
8141 patt_b_i = (int) patt_b_s; // b_i = (int) b_c
8143 patt_r_s = widen_minus(a_c, b_c); // r_i = a_i - b_i
8144 patt_r_i = (int) patt_r_s; // r_i = a_i - b_i
8146 The definitions of a_i (original statement) and b_i (pattern statement)
8147 are related to, but not actually part of, the widen_minus pattern.
8148 Vectorizing the pattern does not cause these definition statements to
8149 be marked as PURE_SLP. For this case, we need to recursively check
8150 whether their uses are all absorbed into vectorized code. But there
8151 is an exception: some use may participate in a vectorized
8152 operation via an external SLP node containing that use as an element.
8153 The parameter "scalar_use_map" tags such SSA names as having a scalar
8154 use in advance. */
8155 tree lhs = gimple_get_lhs (use_stmt);
8157 if (!lhs || TREE_CODE (lhs) != SSA_NAME)
8158 break;
8160 if (depth_limit && depth >= depth_limit)
8161 return -1;
8163 if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
8164 depth + 1)))
8165 break;
8168 if (end_imm_use_stmt_p (&use_iter))
8169 scalar_use = 0;
8171 /* If recursion is limited, do not cache result for non-root defs. */
8172 if (!depth || scalar_use >= 0)
8174 bool added = scalar_use_map.put (def, scalar_use);
8175 gcc_assert (!added);
8178 return scalar_use;
8181 /* Mark lanes of NODE that are live outside of the basic-block vectorized
8182 region and that can be vectorized using vectorizable_live_operation
8183 with STMT_VINFO_LIVE_P. Live operations that are not handled will
8184 cause the scalar code computing them to be retained. */
8186 static void
8187 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
8188 slp_instance instance,
8189 stmt_vector_for_cost *cost_vec,
8190 hash_map<tree, int> &scalar_use_map,
8191 hash_set<stmt_vec_info> &svisited,
8192 hash_set<slp_tree> &visited)
8194 if (visited.add (node))
8195 return;
8197 unsigned i;
8198 stmt_vec_info stmt_info;
8199 stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
8200 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8202 if (!stmt_info || svisited.contains (stmt_info))
8203 continue;
8204 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8205 if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
8206 && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
8207 /* Only the pattern root stmt computes the original scalar value. */
8208 continue;
8209 bool mark_visited = true;
8210 gimple *orig_stmt = orig_stmt_info->stmt;
8211 ssa_op_iter op_iter;
8212 def_operand_p def_p;
8213 FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
8215 if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
8216 scalar_use_map))
8218 STMT_VINFO_LIVE_P (stmt_info) = true;
8219 if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
8220 instance, i, false, cost_vec))
8221 /* ??? So we know we can vectorize the live stmt from one SLP
8222 node. If we cannot do so from all or none consistently
8223 we'd have to record which SLP node (and lane) we want to
8224 use for the live operation. So make sure we can
8225 code-generate from all nodes. */
8226 mark_visited = false;
8227 else
8228 STMT_VINFO_LIVE_P (stmt_info) = false;
8231 /* We have to verify whether we can insert the lane extract
8232 before all uses. The following is a conservative approximation.
8233 We cannot put this into vectorizable_live_operation because
8234 iterating over all use stmts from inside a FOR_EACH_IMM_USE_STMT
8235 doesn't work.
8236 Note that while the fact that we emit code for loads at the
8237 first load should make this a non-problem, leaves we construct
8238 from scalars are vectorized after the last scalar def.
8239 ??? If we'd actually compute the insert location during
8240 analysis we could use sth less conservative than the last
8241 scalar stmt in the node for the dominance check. */
8242 /* ??? What remains is "live" uses in vector CTORs in the same
8243 SLP graph which is where those uses can end up code-generated
8244 right after their definition instead of close to their original
8245 use. But that would restrict us to code-generate lane-extracts
8246 from the latest stmt in a node. So we compensate for this
8247 during code-generation, simply not replacing uses for those
8248 hopefully rare cases. */
8249 imm_use_iterator use_iter;
8250 gimple *use_stmt;
8251 stmt_vec_info use_stmt_info;
8253 if (STMT_VINFO_LIVE_P (stmt_info))
8254 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
8255 if (!is_gimple_debug (use_stmt)
8256 && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
8257 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
8258 && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
8260 if (dump_enabled_p ())
8261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8262 "Cannot determine insertion place for "
8263 "lane extract\n");
8264 STMT_VINFO_LIVE_P (stmt_info) = false;
8265 mark_visited = true;
8268 if (mark_visited)
8269 svisited.add (stmt_info);
8272 slp_tree child;
8273 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8274 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8275 vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
8276 scalar_use_map, svisited, visited);
8279 /* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
8280 are live outside of the basic-block vectorized region and that can be
8281 vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P. */
8283 static void
8284 vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
8286 if (bb_vinfo->slp_instances.is_empty ())
8287 return;
8289 hash_set<stmt_vec_info> svisited;
8290 hash_set<slp_tree> visited;
8291 hash_map<tree, int> scalar_use_map;
8292 auto_vec<slp_tree> worklist;
8294 for (slp_instance instance : bb_vinfo->slp_instances)
8296 if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
8297 for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
8298 if (TREE_CODE (op) == SSA_NAME)
8299 scalar_use_map.put (op, 1);
8300 if (!visited.add (SLP_INSTANCE_TREE (instance)))
8301 worklist.safe_push (SLP_INSTANCE_TREE (instance));
8306 slp_tree node = worklist.pop ();
8308 if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
8310 for (tree op : SLP_TREE_SCALAR_OPS (node))
8311 if (TREE_CODE (op) == SSA_NAME)
8312 scalar_use_map.put (op, 1);
8314 else
8316 for (slp_tree child : SLP_TREE_CHILDREN (node))
8317 if (child && !visited.add (child))
8318 worklist.safe_push (child);
8321 while (!worklist.is_empty ());
8323 visited.empty ();
8325 for (slp_instance instance : bb_vinfo->slp_instances)
8327 vect_location = instance->location ();
8328 vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
8329 instance, &instance->cost_vec,
8330 scalar_use_map, svisited, visited);
8334 /* Determine whether we can vectorize the reduction epilogue for INSTANCE. */
8336 static bool
8337 vectorizable_bb_reduc_epilogue (slp_instance instance,
8338 stmt_vector_for_cost *cost_vec)
8340 gassign *stmt = as_a <gassign *> (instance->root_stmts[0]->stmt);
8341 enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
8342 if (reduc_code == MINUS_EXPR)
8343 reduc_code = PLUS_EXPR;
8344 internal_fn reduc_fn;
8345 tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
8346 if (!vectype
8347 || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
8348 || reduc_fn == IFN_LAST
8349 || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
8350 || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
8351 TREE_TYPE (vectype)))
8353 if (dump_enabled_p ())
8354 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8355 "not vectorized: basic block reduction epilogue "
8356 "operation unsupported.\n");
8357 return false;
8360 /* There's no way to cost a horizontal vector reduction via REDUC_FN so
8361 cost log2 vector operations plus shuffles and one extraction. */
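/* E.g. for a reduction over 8 lanes this records 3 vector_stmt, 3
   vec_perm and one vec_to_scalar cost (floor_log2 (8) == 3).  */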
8362 unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
8363 record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
8364 vectype, 0, vect_body);
8365 record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
8366 vectype, 0, vect_body);
8367 record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
8368 vectype, 0, vect_body);
8370 /* Since we replace all stmts of a possibly longer scalar reduction
8371 chain, account for the extra scalar stmts for that. */
8372 record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
8373 instance->root_stmts[0], 0, vect_body);
8374 return true;
8377 /* Prune from ROOTS all stmts that are computed as part of lanes of NODE
8378 and recurse to children. */
8380 static void
8381 vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
8382 hash_set<slp_tree> &visited)
8384 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
8385 || visited.add (node))
8386 return;
8388 stmt_vec_info stmt;
8389 unsigned i;
8390 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
8391 if (stmt)
8392 roots.remove (vect_orig_stmt (stmt));
8394 slp_tree child;
8395 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8396 if (child)
8397 vect_slp_prune_covered_roots (child, roots, visited);
8400 /* Analyze statements in SLP instances of VINFO. Return true if the
8401 operations are supported. */
8403 bool
8404 vect_slp_analyze_operations (vec_info *vinfo)
8406 slp_instance instance;
8407 int i;
8409 DUMP_VECT_SCOPE ("vect_slp_analyze_operations");
8411 hash_set<slp_tree> visited;
8412 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8414 auto_vec<slp_tree> visited_vec;
8415 stmt_vector_for_cost cost_vec;
8416 cost_vec.create (2);
8417 if (is_a <bb_vec_info> (vinfo))
8418 vect_location = instance->location ();
8419 if (!vect_slp_analyze_node_operations (vinfo,
8420 SLP_INSTANCE_TREE (instance),
8421 instance, visited, visited_vec,
8422 &cost_vec)
8423 /* CTOR instances require vectorized defs for the SLP tree root. */
8424 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
8425 && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
8426 != vect_internal_def
8427 /* Make sure we vectorized with the expected type. */
8428 || !useless_type_conversion_p
8429 (TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
8430 (instance->root_stmts[0]->stmt))),
8431 TREE_TYPE (SLP_TREE_VECTYPE
8432 (SLP_INSTANCE_TREE (instance))))))
8433 /* Check we can vectorize the reduction. */
8434 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
8435 && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
8436 /* Check we can vectorize the gcond. */
8437 || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
8438 && !vectorizable_early_exit (vinfo,
8439 SLP_INSTANCE_ROOT_STMTS (instance)[0],
8440 NULL, NULL,
8441 SLP_INSTANCE_TREE (instance),
8442 &cost_vec)))
8444 cost_vec.release ();
8445 slp_tree node = SLP_INSTANCE_TREE (instance);
8446 stmt_vec_info stmt_info;
8447 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8448 stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8449 else
8450 stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
8451 if (is_a <loop_vec_info> (vinfo))
8453 if (dump_enabled_p ())
8454 dump_printf_loc (MSG_NOTE, vect_location,
8455 "unsupported SLP instance starting from: %G",
8456 stmt_info->stmt);
8457 return false;
8459 if (dump_enabled_p ())
8460 dump_printf_loc (MSG_NOTE, vect_location,
8461 "removing SLP instance operations starting from: %G",
8462 stmt_info->stmt);
8463 vect_free_slp_instance (instance);
8464 vinfo->slp_instances.ordered_remove (i);
8465 while (!visited_vec.is_empty ())
8466 visited.remove (visited_vec.pop ());
8468 else
8470 i++;
8471 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
8473 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
8474 cost_vec.release ();
8476 else
8477 /* For BB vectorization remember the SLP graph entry
8478 cost for later. */
8479 instance->cost_vec = cost_vec;
8483 /* Now look for SLP instances with a root that are covered by other
8484 instances and remove them. */
8485 hash_set<stmt_vec_info> roots;
8486 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8487 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8488 roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
8489 if (!roots.is_empty ())
8491 visited.empty ();
8492 for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
8493 vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
8494 visited);
8495 for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
8496 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
8497 && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
8499 stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
8500 if (dump_enabled_p ())
8501 dump_printf_loc (MSG_NOTE, vect_location,
8502 "removing SLP instance operations starting "
8503 "from: %G", root->stmt);
8504 vect_free_slp_instance (instance);
8505 vinfo->slp_instances.ordered_remove (i);
8507 else
8508 ++i;
8511 /* Compute vectorizable live stmts. */
8512 if (bb_vec_info bb_vinfo = dyn_cast <bb_vec_info> (vinfo))
8513 vect_bb_slp_mark_live_stmts (bb_vinfo);
8515 return !vinfo->slp_instances.is_empty ();
8518 /* Get the ultimate SLP instance leader of INSTANCE from INSTANCE_LEADER,
8519 transitively compressing any chain of leaders along the way. */
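/* This is essentially union-find with path compression: follow the
   leader chain to its end and rewrite every entry visited on the way to
   point directly at the ultimate leader.  */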
8521 static slp_instance
8522 get_ultimate_leader (slp_instance instance,
8523 hash_map<slp_instance, slp_instance> &instance_leader)
8525 auto_vec<slp_instance *, 8> chain;
8526 slp_instance *tem;
8527 while (*(tem = instance_leader.get (instance)) != instance)
8529 chain.safe_push (tem);
8530 instance = *tem;
8532 while (!chain.is_empty ())
8533 *chain.pop () = instance;
8534 return instance;
8537 namespace {
8538 /* Subroutine of vect_bb_partition_graph_r. Map KEY to INSTANCE in
8539 KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
8540 for KEY. Return true if KEY was already in KEY_TO_INSTANCE.
8542 INSTANCE_LEADER is as for get_ultimate_leader. */
8544 template<typename T>
8545 bool
8546 vect_map_to_instance (slp_instance instance, T key,
8547 hash_map<T, slp_instance> &key_to_instance,
8548 hash_map<slp_instance, slp_instance> &instance_leader)
8550 bool existed_p;
8551 slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
8552 if (!existed_p)
8554 else if (key_instance != instance)
8556 /* If we're running into a previously marked key make us the
8557 leader of the current ultimate leader. This keeps the
8558 leader chain acyclic and works even when the current instance
8559 connects two previously independent graph parts. */
8560 slp_instance key_leader
8561 = get_ultimate_leader (key_instance, instance_leader);
8562 if (key_leader != instance)
8563 instance_leader.put (key_leader, instance);
8565 key_instance = instance;
8566 return existed_p;
8570 /* Worker of vect_bb_partition_graph, recurse on NODE. */
8572 static void
8573 vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
8574 slp_instance instance, slp_tree node,
8575 hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
8576 hash_map<slp_tree, slp_instance> &node_to_instance,
8577 hash_map<slp_instance, slp_instance> &instance_leader)
8579 stmt_vec_info stmt_info;
8580 unsigned i;
8582 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8583 if (stmt_info)
8584 vect_map_to_instance (instance, stmt_info, stmt_to_instance,
8585 instance_leader);
8587 if (vect_map_to_instance (instance, node, node_to_instance,
8588 instance_leader))
8589 return;
8591 slp_tree child;
8592 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8593 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8594 vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
8595 node_to_instance, instance_leader);
8598 /* Partition the SLP graph into pieces that can be costed independently. */
8600 static void
8601 vect_bb_partition_graph (bb_vec_info bb_vinfo)
8603 DUMP_VECT_SCOPE ("vect_bb_partition_graph");
8605 /* First walk the SLP graph assigning each involved scalar stmt a
8606 corresponding SLP graph entry and upon visiting a previously
8607 marked stmt, make the stmt's leader the current SLP graph entry. */
8608 hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
8609 hash_map<slp_tree, slp_instance> node_to_instance;
8610 hash_map<slp_instance, slp_instance> instance_leader;
8611 slp_instance instance;
8612 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8614 instance_leader.put (instance, instance);
8615 vect_bb_partition_graph_r (bb_vinfo,
8616 instance, SLP_INSTANCE_TREE (instance),
8617 stmt_to_instance, node_to_instance,
8618 instance_leader);
8621 /* Then collect entries to each independent subgraph. */
8622 for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
8624 slp_instance leader = get_ultimate_leader (instance, instance_leader);
8625 leader->subgraph_entries.safe_push (instance);
8626 if (dump_enabled_p ()
8627 && leader != instance)
8628 dump_printf_loc (MSG_NOTE, vect_location,
8629 "instance %p is leader of %p\n",
8630 (void *) leader, (void *) instance);
8634 /* Compute the set of scalar stmts participating in internal and external
8635 nodes. */
8637 static void
8638 vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
8639 hash_set<slp_tree> &visited,
8640 hash_set<stmt_vec_info> &vstmts,
8641 hash_set<stmt_vec_info> &estmts)
8643 int i;
8644 stmt_vec_info stmt_info;
8645 slp_tree child;
8647 if (visited.add (node))
8648 return;
8650 if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
8652 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8653 if (stmt_info)
8654 vstmts.add (stmt_info);
8656 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8657 if (child)
8658 vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
8659 vstmts, estmts);
8661 else
8662 for (tree def : SLP_TREE_SCALAR_OPS (node))
8664 stmt_vec_info def_stmt = vinfo->lookup_def (def);
8665 if (def_stmt)
8666 estmts.add (def_stmt);
8671 /* Compute the scalar cost of the SLP node NODE and its children
8672 and record it in COST_VEC. Do not account defs that are marked in LIFE and
8673 update LIFE according to uses of NODE. */
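/* A set LIFE[i] means the scalar stmt for lane i has to be kept live
   anyway, so neither it nor the defs it requires from the SLP children
   are counted towards the scalar cost.  */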
8675 static void
8676 vect_bb_slp_scalar_cost (vec_info *vinfo,
8677 slp_tree node, vec<bool, va_heap> *life,
8678 stmt_vector_for_cost *cost_vec,
8679 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
8680 hash_set<stmt_vec_info> &scalar_stmts_in_externs,
8681 hash_set<slp_tree> &visited)
8683 unsigned i;
8684 stmt_vec_info stmt_info;
8685 slp_tree child;
8687 if (visited.add (node))
8688 return;
8690 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
8692 ssa_op_iter op_iter;
8693 def_operand_p def_p;
8695 if (!stmt_info
8696 || (*life)[i]
8697 /* Defs also used in external nodes are not in the
8698 vectorized_scalar_stmts set as they need to be preserved.
8699 Honor that. */
8700 || scalar_stmts_in_externs.contains (stmt_info))
8701 continue;
8703 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
8704 gimple *orig_stmt = orig_stmt_info->stmt;
8706 /* If there is a non-vectorized use of the defs then the scalar
8707 stmt is kept live in which case we do not account it or any
8708 required defs in the SLP children in the scalar cost. This
8709 way we make the vectorization more costly when compared to
8710 the scalar cost. */
8711 if (!STMT_VINFO_LIVE_P (stmt_info))
8713 auto_vec<gimple *, 8> worklist;
8714 hash_set<gimple *> *worklist_visited = NULL;
8715 worklist.quick_push (orig_stmt);
8718 gimple *work_stmt = worklist.pop ();
8719 FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
8721 imm_use_iterator use_iter;
8722 gimple *use_stmt;
8723 FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
8724 DEF_FROM_PTR (def_p))
8725 if (!is_gimple_debug (use_stmt))
8727 stmt_vec_info use_stmt_info
8728 = vinfo->lookup_stmt (use_stmt);
8729 if (!use_stmt_info
8730 || !vectorized_scalar_stmts.contains (use_stmt_info))
8732 if (use_stmt_info
8733 && STMT_VINFO_IN_PATTERN_P (use_stmt_info))
8735 /* For stmts participating in patterns we have
8736 to check their uses recursively.
8737 if (!worklist_visited)
8738 worklist_visited = new hash_set<gimple *> ();
8739 if (!worklist_visited->add (use_stmt))
8740 worklist.safe_push (use_stmt);
8741 continue;
8743 (*life)[i] = true;
8744 goto next_lane;
8749 while (!worklist.is_empty ());
8750 next_lane:
8751 if (worklist_visited)
8752 delete worklist_visited;
8753 if ((*life)[i])
8754 continue;
8757 /* Count scalar stmts only once. */
8758 if (gimple_visited_p (orig_stmt))
8759 continue;
8760 gimple_set_visited (orig_stmt, true);
8762 vect_cost_for_stmt kind;
8763 if (STMT_VINFO_DATA_REF (orig_stmt_info))
8765 data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
8766 tree base = get_base_address (DR_REF (dr));
8767 /* When the scalar access is to a non-global, not address-taken
8768 decl that is not BLKmode, assume we can access it with a single
8769 non-load/store instruction. */
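/* E.g. (illustrative) a read from a local, not address-taken 'int' variable
will likely end up in a register, so it is costed as a plain scalar_stmt
rather than as a scalar_load.  */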
8770 if (DECL_P (base)
8771 && !is_global_var (base)
8772 && !TREE_ADDRESSABLE (base)
8773 && DECL_MODE (base) != BLKmode)
8774 kind = scalar_stmt;
8775 else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
8776 kind = scalar_load;
8777 else
8778 kind = scalar_store;
8780 else if (vect_nop_conversion_p (orig_stmt_info))
8781 continue;
8782 /* For single-argument PHIs assume coalescing which means zero cost
8783 for the scalar and the vector PHIs. This avoids artificially
8784 favoring the vector path (but may pessimize it in some cases). */
8785 else if (is_a <gphi *> (orig_stmt_info->stmt)
8786 && gimple_phi_num_args
8787 (as_a <gphi *> (orig_stmt_info->stmt)) == 1)
8788 continue;
8789 else
8790 kind = scalar_stmt;
8791 record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
8792 SLP_TREE_VECTYPE (node), 0, vect_body);
8795 auto_vec<bool, 20> subtree_life;
8796 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
8798 if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
8800 /* Do not directly pass LIFE to the recursive call, copy it to
8801 confine changes in the callee to the current child/subtree. */
8802 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
8804 subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
8805 for (unsigned j = 0;
8806 j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
8808 auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
8809 if (perm.first == i)
8810 subtree_life[perm.second] = (*life)[j];
8813 else
8815 gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
8816 subtree_life.safe_splice (*life);
8818 vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
8819 vectorized_scalar_stmts,
8820 scalar_stmts_in_externs, visited);
8821 subtree_life.truncate (0);
8826 /* Comparator for the loop-index sorted cost vectors. */
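/* The comparator only looks at the loop number, so after sorting all cost
entries belonging to the same loop are contiguous and can be fed to a
per-loop cost model instance below.  */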
8828 static int
8829 li_cost_vec_cmp (const void *a_, const void *b_)
8831 auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
8832 auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
8833 if (a->first < b->first)
8834 return -1;
8835 else if (a->first == b->first)
8836 return 0;
8837 return 1;
8840 /* Check if vectorization of the basic block is profitable for the
8841 subgraph denoted by SLP_INSTANCES. */
8843 static bool
8844 vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
8845 vec<slp_instance> slp_instances,
8846 loop_p orig_loop)
8848 slp_instance instance;
8849 int i;
8850 unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
8851 unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;
8853 if (dump_enabled_p ())
8855 dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
8856 hash_set<slp_tree> visited;
8857 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8858 vect_print_slp_graph (MSG_NOTE, vect_location,
8859 SLP_INSTANCE_TREE (instance), visited);
8862 /* Compute the set of scalar stmts we know will go away 'locally' when
8863 vectorizing. This used to be tracked with just PURE_SLP_STMT but that's
8864 not accurate for nodes promoted extern late or for scalar stmts that
8865 are used both in extern defs and in vectorized defs. */
8866 hash_set<stmt_vec_info> vectorized_scalar_stmts;
8867 hash_set<stmt_vec_info> scalar_stmts_in_externs;
8868 hash_set<slp_tree> visited;
8869 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8871 vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
8872 SLP_INSTANCE_TREE (instance),
8873 visited,
8874 vectorized_scalar_stmts,
8875 scalar_stmts_in_externs);
8876 for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
8877 vectorized_scalar_stmts.add (rstmt);
8879 /* Scalar stmts used as defs in external nodes need to be preserved, so
8880 remove them from vectorized_scalar_stmts. */
8881 for (stmt_vec_info stmt : scalar_stmts_in_externs)
8882 vectorized_scalar_stmts.remove (stmt);
8884 /* Calculate scalar cost and sum the cost for the vector stmts
8885 previously collected. */
8886 stmt_vector_for_cost scalar_costs = vNULL;
8887 stmt_vector_for_cost vector_costs = vNULL;
8888 visited.empty ();
8889 FOR_EACH_VEC_ELT (slp_instances, i, instance)
8891 auto_vec<bool, 20> life;
8892 life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
8893 true);
8894 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
8895 record_stmt_cost (&scalar_costs,
8896 SLP_INSTANCE_ROOT_STMTS (instance).length (),
8897 scalar_stmt,
8898 SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
8899 vect_bb_slp_scalar_cost (bb_vinfo,
8900 SLP_INSTANCE_TREE (instance),
8901 &life, &scalar_costs, vectorized_scalar_stmts,
8902 scalar_stmts_in_externs, visited);
8903 vector_costs.safe_splice (instance->cost_vec);
8904 instance->cost_vec.release ();
8907 if (dump_enabled_p ())
8908 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
8910 /* When costing non-loop vectorization we need to consider each covered
8911 loop independently and make sure vectorization is profitable. For
8912 now we assume a loop may not be entered or may be executed an arbitrary
8913 number of iterations (??? static information can provide more
8914 precise info here), which means we can simply cost each containing
8915 loop's stmts separately. */
8917 /* First produce cost vectors sorted by loop index. */
8918 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8919 li_scalar_costs (scalar_costs.length ());
8920 auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
8921 li_vector_costs (vector_costs.length ());
8922 stmt_info_for_cost *cost;
8923 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
8925 unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8926 li_scalar_costs.quick_push (std::make_pair (l, cost));
8928 /* Use a random used loop as fallback in case the first vector_costs
8929 entry does not have a stmt_info associated with it. */
8930 unsigned l = li_scalar_costs[0].first;
8931 FOR_EACH_VEC_ELT (vector_costs, i, cost)
8933 /* We inherit the loop number from the previous COST; invariants, externals
8934 and extracts immediately follow the cost for the related stmt. */
8935 if (cost->stmt_info)
8936 l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
8937 li_vector_costs.quick_push (std::make_pair (l, cost));
8939 li_scalar_costs.qsort (li_cost_vec_cmp);
8940 li_vector_costs.qsort (li_cost_vec_cmp);
8942 /* Now cost the portions individually. */
8943 unsigned vi = 0;
8944 unsigned si = 0;
8945 bool profitable = true;
8946 while (si < li_scalar_costs.length ()
8947 && vi < li_vector_costs.length ())
8949 unsigned sl = li_scalar_costs[si].first;
8950 unsigned vl = li_vector_costs[vi].first;
8951 if (sl != vl)
8953 if (dump_enabled_p ())
8954 dump_printf_loc (MSG_NOTE, vect_location,
8955 "Scalar %d and vector %d loop part do not "
8956 "match up, skipping scalar part\n", sl, vl);
8957 /* Skip the scalar part, assuming zero cost on the vector side. */
8960 si++;
8962 while (si < li_scalar_costs.length ()
8963 && li_scalar_costs[si].first == sl);
8964 continue;
8967 class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
8970 add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
8971 si++;
8973 while (si < li_scalar_costs.length ()
8974 && li_scalar_costs[si].first == sl);
8975 scalar_target_cost_data->finish_cost (nullptr);
8976 scalar_cost = scalar_target_cost_data->body_cost ();
8978 /* Complete the target-specific vector cost calculation. */
8979 class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
8982 add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
8983 vi++;
8985 while (vi < li_vector_costs.length ()
8986 && li_vector_costs[vi].first == vl);
8987 vect_target_cost_data->finish_cost (scalar_target_cost_data);
8988 vec_prologue_cost = vect_target_cost_data->prologue_cost ();
8989 vec_inside_cost = vect_target_cost_data->body_cost ();
8990 vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
8991 delete scalar_target_cost_data;
8992 delete vect_target_cost_data;
8994 vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;
8996 if (dump_enabled_p ())
8998 dump_printf_loc (MSG_NOTE, vect_location,
8999 "Cost model analysis for part in loop %d:\n", sl);
9000 dump_printf (MSG_NOTE, " Vector cost: %d\n",
9001 vec_inside_cost + vec_outside_cost);
9002 dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
9005 /* Vectorization is profitable if its cost is not more than the cost of the
9006 scalar version.  Note that we err on the vector side for equal cost because
9007 the cost estimate is otherwise quite pessimistic (constant uses are
9008 free on the scalar side but cost a load on the vector side for
9009 example). */
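/* E.g. (illustrative) with scalar_cost == 10, vec_inside_cost == 10 and
vec_outside_cost == 0 the subgraph is still considered profitable.  */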
9010 if (vec_outside_cost + vec_inside_cost > scalar_cost)
9012 profitable = false;
9013 break;
9016 if (profitable && vi < li_vector_costs.length ())
9018 if (dump_enabled_p ())
9019 dump_printf_loc (MSG_NOTE, vect_location,
9020 "Excess vector cost for part in loop %d:\n",
9021 li_vector_costs[vi].first);
9022 profitable = false;
9025 /* Unset visited flag. This is delayed when the subgraph is profitable
9026 and we process the loop for remaining unvectorized if-converted code. */
9027 if (!orig_loop || !profitable)
9028 FOR_EACH_VEC_ELT (scalar_costs, i, cost)
9029 gimple_set_visited (cost->stmt_info->stmt, false);
9031 scalar_costs.release ();
9032 vector_costs.release ();
9034 return profitable;
9037 /* qsort comparator for lane defs. */
9039 static int
9040 vld_cmp (const void *a_, const void *b_)
9042 auto *a = (const std::pair<unsigned, tree> *)a_;
9043 auto *b = (const std::pair<unsigned, tree> *)b_;
9044 return a->first - b->first;
9047 /* Return true if USE_STMT is a vector lane insert into VEC and set
9048 *THIS_LANE to the lane number that is set. */
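/* For example (a sketch, SSA names illustrative):
     vec_3 = BIT_INSERT_EXPR <vec_2, val_7, 64>;
   with 32-bit vector elements inserts val_7 at bit offset 64, i.e. lane 2,
   so *THIS_LANE is set to 2.  */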
9050 static bool
9051 vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
9053 gassign *use_ass = dyn_cast <gassign *> (use_stmt);
9054 if (!use_ass
9055 || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
9056 || (vec
9057 ? gimple_assign_rhs1 (use_ass) != vec
9058 : ((vec = gimple_assign_rhs1 (use_ass)), false))
9059 || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
9060 TREE_TYPE (gimple_assign_rhs2 (use_ass)))
9061 || !constant_multiple_p
9062 (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
9063 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
9064 this_lane))
9065 return false;
9066 return true;
9069 /* Find any vectorizable constructors, lane-insert chains and reduction
9070 chains and record them as SLP graph roots in BB_VINFO. */
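/* For example (illustrative), a vector CONSTRUCTOR such as
     x_5 = {a_1, b_2, c_3, d_4};
   where every element is an SSA def inside the region is recorded as an
   slp_inst_kind_ctor root seeded with the four defining stmts.  */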
9072 static void
9073 vect_slp_check_for_roots (bb_vec_info bb_vinfo)
9075 for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
9076 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
9077 !gsi_end_p (gsi); gsi_next (&gsi))
9079 gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
9080 /* This can be used to start SLP discovery for BB early breaks
9081 when we get that far. */
9082 if (!assign)
9083 continue;
9085 tree rhs = gimple_assign_rhs1 (assign);
9086 enum tree_code code = gimple_assign_rhs_code (assign);
9087 use_operand_p use_p;
9088 gimple *use_stmt;
9089 if (code == CONSTRUCTOR)
9091 if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9092 || maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
9093 CONSTRUCTOR_NELTS (rhs))
9094 || VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
9095 || uniform_vector_p (rhs))
9096 continue;
9098 unsigned j;
9099 tree val;
9100 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9101 if (TREE_CODE (val) != SSA_NAME
9102 || !bb_vinfo->lookup_def (val))
9103 break;
9104 if (j != CONSTRUCTOR_NELTS (rhs))
9105 continue;
9107 vec<stmt_vec_info> roots = vNULL;
9108 roots.safe_push (bb_vinfo->lookup_stmt (assign));
9109 vec<stmt_vec_info> stmts;
9110 stmts.create (CONSTRUCTOR_NELTS (rhs));
9111 FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
9112 stmts.quick_push
9113 (vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
9114 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9115 stmts, roots));
9117 else if (code == BIT_INSERT_EXPR
9118 && VECTOR_TYPE_P (TREE_TYPE (rhs))
9119 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
9120 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
9121 && integer_zerop (gimple_assign_rhs3 (assign))
9122 && useless_type_conversion_p
9123 (TREE_TYPE (TREE_TYPE (rhs)),
9124 TREE_TYPE (gimple_assign_rhs2 (assign)))
9125 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
9127 /* We start matching at an insert to lane zero, but since the
9128 inserts need not be ordered we have to search both
9129 the def and the use chains. */
9130 tree vectype = TREE_TYPE (rhs);
9131 unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
9132 auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
9133 auto_sbitmap lanes (nlanes);
9134 bitmap_clear (lanes);
9135 bitmap_set_bit (lanes, 0);
9136 tree def = gimple_assign_lhs (assign);
9137 lane_defs.quick_push
9138 (std::make_pair (0, gimple_assign_rhs2 (assign)));
9139 unsigned lanes_found = 1;
9140 /* Start with the use chains, the last stmt will be the root. */
9141 stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
9142 vec<stmt_vec_info> roots = vNULL;
9143 roots.safe_push (last);
9146 use_operand_p use_p;
9147 gimple *use_stmt;
9148 if (!single_imm_use (def, &use_p, &use_stmt))
9149 break;
9150 unsigned this_lane;
9151 if (!bb_vinfo->lookup_stmt (use_stmt)
9152 || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
9153 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
9154 break;
9155 if (bitmap_bit_p (lanes, this_lane))
9156 break;
9157 lanes_found++;
9158 bitmap_set_bit (lanes, this_lane);
9159 gassign *use_ass = as_a <gassign *> (use_stmt);
9160 lane_defs.quick_push (std::make_pair
9161 (this_lane, gimple_assign_rhs2 (use_ass)));
9162 last = bb_vinfo->lookup_stmt (use_ass);
9163 roots.safe_push (last);
9164 def = gimple_assign_lhs (use_ass);
9166 while (lanes_found < nlanes);
9167 if (roots.length () > 1)
9168 std::swap(roots[0], roots[roots.length () - 1]);
9169 if (lanes_found < nlanes)
9171 /* Now search the def chain. */
9172 def = gimple_assign_rhs1 (assign);
9175 if (TREE_CODE (def) != SSA_NAME
9176 || !has_single_use (def))
9177 break;
9178 gimple *def_stmt = SSA_NAME_DEF_STMT (def);
9179 unsigned this_lane;
9180 if (!bb_vinfo->lookup_stmt (def_stmt)
9181 || !vect_slp_is_lane_insert (def_stmt,
9182 NULL_TREE, &this_lane)
9183 || !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
9184 break;
9185 if (bitmap_bit_p (lanes, this_lane))
9186 break;
9187 lanes_found++;
9188 bitmap_set_bit (lanes, this_lane);
9189 lane_defs.quick_push (std::make_pair
9190 (this_lane,
9191 gimple_assign_rhs2 (def_stmt)));
9192 roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
9193 def = gimple_assign_rhs1 (def_stmt);
9195 while (lanes_found < nlanes);
9197 if (lanes_found == nlanes)
9199 /* Sort lane_defs by the lane index and register the root. */
9200 lane_defs.qsort (vld_cmp);
9201 vec<stmt_vec_info> stmts;
9202 stmts.create (nlanes);
9203 for (unsigned i = 0; i < nlanes; ++i)
9204 stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
9205 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
9206 stmts, roots));
9208 else
9209 roots.release ();
9211 else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
9212 && (associative_tree_code (code) || code == MINUS_EXPR)
9213 /* ??? This pessimizes a two-element reduction. PR54400.
9214 ??? In-order reduction could be handled if we only
9215 traverse one operand chain in vect_slp_linearize_chain. */
9216 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
9217 /* Ops with constants at the tail can be stripped here. */
9218 && TREE_CODE (rhs) == SSA_NAME
9219 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
9220 /* Should be the chain end. */
9221 && (!single_imm_use (gimple_assign_lhs (assign),
9222 &use_p, &use_stmt)
9223 || !is_gimple_assign (use_stmt)
9224 || (gimple_assign_rhs_code (use_stmt) != code
9225 && ((code != PLUS_EXPR && code != MINUS_EXPR)
9226 || (gimple_assign_rhs_code (use_stmt)
9227 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
9229 /* We start the match at the end of a possible association
9230 chain. */
9231 auto_vec<chain_op_t> chain;
9232 auto_vec<std::pair<tree_code, gimple *> > worklist;
9233 auto_vec<gimple *> chain_stmts;
9234 gimple *code_stmt = NULL, *alt_code_stmt = NULL;
9235 if (code == MINUS_EXPR)
9236 code = PLUS_EXPR;
9237 internal_fn reduc_fn;
9238 if (!reduction_fn_for_scalar_code (code, &reduc_fn)
9239 || reduc_fn == IFN_LAST)
9240 continue;
9241 vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
9242 /* ??? */
9243 code_stmt, alt_code_stmt, &chain_stmts);
9244 if (chain.length () > 1)
9246 /* Sort the chain according to def_type and operation. */
9247 chain.sort (dt_sort_cmp, bb_vinfo);
9248 /* ??? Now we'd want to strip externals and constants
9249 but record those to be handled in the epilogue. */
9250 /* ??? For now do not allow mixing ops or externs/constants. */
9251 bool invalid = false;
9252 unsigned remain_cnt = 0;
9253 unsigned last_idx = 0;
9254 for (unsigned i = 0; i < chain.length (); ++i)
9256 if (chain[i].code != code)
9258 invalid = true;
9259 break;
9261 if (chain[i].dt != vect_internal_def
9262 /* Avoid stmts where the def is not the LHS, like
9263 ASMs. */
9264 || (gimple_get_lhs (bb_vinfo->lookup_def
9265 (chain[i].op)->stmt)
9266 != chain[i].op))
9267 remain_cnt++;
9268 else
9269 last_idx = i;
9271 /* Make sure to have an even number of lanes as we later do
9272 all-or-nothing discovery, not trying to split further. */
9273 if ((chain.length () - remain_cnt) & 1)
9274 remain_cnt++;
9275 if (!invalid && chain.length () - remain_cnt > 1)
9277 vec<stmt_vec_info> stmts;
9278 vec<tree> remain = vNULL;
9279 stmts.create (chain.length ());
9280 if (remain_cnt > 0)
9281 remain.create (remain_cnt);
9282 for (unsigned i = 0; i < chain.length (); ++i)
9284 stmt_vec_info stmt_info;
9285 if (chain[i].dt == vect_internal_def
9286 && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
9287 gimple_get_lhs (stmt_info->stmt) == chain[i].op)
9288 && (i != last_idx
9289 || (stmts.length () & 1)))
9290 stmts.quick_push (stmt_info);
9291 else
9292 remain.quick_push (chain[i].op);
9294 vec<stmt_vec_info> roots;
9295 roots.create (chain_stmts.length ());
9296 for (unsigned i = 0; i < chain_stmts.length (); ++i)
9297 roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
9298 bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
9299 stmts, roots, remain));
9306 /* Walk the grouped store chains and replace entries with their
9307 pattern variant if any. */
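/* E.g. (illustrative) if the first store of a group was replaced by a
   pattern stmt, the DR_GROUP_* fields still live on the original stmt;
   the walk below moves first/next element, size and gap over to the
   pattern stmts so later group walks see a consistent chain.  */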
9309 static void
9310 vect_fixup_store_groups_with_patterns (vec_info *vinfo)
9312 stmt_vec_info first_element;
9313 unsigned i;
9315 FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
9317 /* We also have CTORs in this array. */
9318 if (!STMT_VINFO_GROUPED_ACCESS (first_element))
9319 continue;
9320 if (STMT_VINFO_IN_PATTERN_P (first_element))
9322 stmt_vec_info orig = first_element;
9323 first_element = STMT_VINFO_RELATED_STMT (first_element);
9324 DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
9325 DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
9326 DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
9327 DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
9328 vinfo->grouped_stores[i] = first_element;
9330 stmt_vec_info prev = first_element;
9331 while (DR_GROUP_NEXT_ELEMENT (prev))
9333 stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
9334 if (STMT_VINFO_IN_PATTERN_P (elt))
9336 stmt_vec_info orig = elt;
9337 elt = STMT_VINFO_RELATED_STMT (elt);
9338 DR_GROUP_NEXT_ELEMENT (prev) = elt;
9339 DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
9340 DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
9342 DR_GROUP_FIRST_ELEMENT (elt) = first_element;
9343 prev = elt;
9348 /* Check if the region described by BB_VINFO can be vectorized, returning
9349 true if so. When returning false, set FATAL to true if the same failure
9350 would prevent vectorization at other vector sizes, false if it is still
9351 worth trying other sizes. N_STMTS is the number of statements in the
9352 region. */
9354 static bool
9355 vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
9356 vec<int> *dataref_groups)
9358 DUMP_VECT_SCOPE ("vect_slp_analyze_bb");
9360 slp_instance instance;
9361 int i;
9362 poly_uint64 min_vf = 2;
9364 /* The first group of checks is independent of the vector size. */
9365 fatal = true;
9367 /* Analyze the data references. */
9369 if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
9371 if (dump_enabled_p ())
9372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9373 "not vectorized: unhandled data-ref in basic "
9374 "block.\n");
9375 return false;
9378 if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
9380 if (dump_enabled_p ())
9381 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9382 "not vectorized: unhandled data access in "
9383 "basic block.\n");
9384 return false;
9387 vect_slp_check_for_roots (bb_vinfo);
9389 /* If there are no grouped stores and no constructors in the region
9390 there is no need to continue with pattern recog as vect_analyze_slp
9391 will fail anyway. */
9392 if (bb_vinfo->grouped_stores.is_empty ()
9393 && bb_vinfo->roots.is_empty ())
9395 if (dump_enabled_p ())
9396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9397 "not vectorized: no grouped stores in "
9398 "basic block.\n");
9399 return false;
9402 /* The rest of the analysis below depends on the chosen vector size in some way, so from here on a failure is not necessarily fatal. */
9403 fatal = false;
9405 vect_pattern_recog (bb_vinfo);
9407 /* Update store groups from pattern processing. */
9408 vect_fixup_store_groups_with_patterns (bb_vinfo);
9410 /* Check the SLP opportunities in the basic block, analyze and build SLP
9411 trees. */
9412 if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
9414 if (dump_enabled_p ())
9416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9417 "Failed to SLP the basic block.\n");
9418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9419 "not vectorized: failed to find SLP opportunities "
9420 "in basic block.\n");
9422 return false;
9425 /* Optimize permutations. */
9426 vect_optimize_slp (bb_vinfo);
9428 /* Gather the loads reachable from the SLP graph entries. */
9429 vect_gather_slp_loads (bb_vinfo);
9431 vect_record_base_alignments (bb_vinfo);
9433 /* Analyze and verify the alignment of data references and the
9434 dependence in the SLP instances. */
9435 for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
9437 vect_location = instance->location ();
9438 if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
9439 || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
9441 slp_tree node = SLP_INSTANCE_TREE (instance);
9442 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
9443 if (dump_enabled_p ())
9444 dump_printf_loc (MSG_NOTE, vect_location,
9445 "removing SLP instance operations starting from: %G",
9446 stmt_info->stmt);
9447 vect_free_slp_instance (instance);
9448 BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
9449 continue;
9452 /* Mark all the statements that we want to vectorize as pure SLP and
9453 relevant. */
9454 vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
9455 vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));
9456 unsigned j;
9457 stmt_vec_info root;
9458 /* Likewise consider instance root stmts as vectorized. */
9459 FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
9460 STMT_SLP_TYPE (root) = pure_slp;
9462 i++;
9464 if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
9465 return false;
9467 if (!vect_slp_analyze_operations (bb_vinfo))
9469 if (dump_enabled_p ())
9470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9471 "not vectorized: bad operation in basic block.\n");
9472 return false;
9475 vect_bb_partition_graph (bb_vinfo);
9477 return true;
9480 /* Subroutine of vect_slp_bbs.  Try to vectorize the statements for all
9481 basic blocks in BBS, returning true on success.
9482 The region has N_STMTS statements and has the datarefs given by DATAREFS. */
9484 static bool
9485 vect_slp_region (vec<basic_block> bbs, vec<data_reference_p> datarefs,
9486 vec<int> *dataref_groups, unsigned int n_stmts,
9487 loop_p orig_loop)
9489 bb_vec_info bb_vinfo;
9490 auto_vector_modes vector_modes;
9492 /* Autodetect the first vector mode we try. */
9493 machine_mode next_vector_mode = VOIDmode;
9494 targetm.vectorize.autovectorize_vector_modes (&vector_modes, false);
9495 unsigned int mode_i = 0;
9497 vec_info_shared shared;
9499 machine_mode autodetected_vector_mode = VOIDmode;
9500 while (1)
9502 bool vectorized = false;
9503 bool fatal = false;
9504 bb_vinfo = new _bb_vec_info (bbs, &shared);
9506 bool first_time_p = shared.datarefs.is_empty ();
9507 BB_VINFO_DATAREFS (bb_vinfo) = datarefs;
9508 if (first_time_p)
9509 bb_vinfo->shared->save_datarefs ();
9510 else
9511 bb_vinfo->shared->check_datarefs ();
9512 bb_vinfo->vector_mode = next_vector_mode;
9514 if (vect_slp_analyze_bb_1 (bb_vinfo, n_stmts, fatal, dataref_groups))
9516 if (dump_enabled_p ())
9518 dump_printf_loc (MSG_NOTE, vect_location,
9519 "***** Analysis succeeded with vector mode"
9520 " %s\n", GET_MODE_NAME (bb_vinfo->vector_mode));
9521 dump_printf_loc (MSG_NOTE, vect_location, "SLPing BB part\n");
9524 bb_vinfo->shared->check_datarefs ();
9526 bool force_clear = false;
9527 auto_vec<slp_instance> profitable_subgraphs;
9528 for (slp_instance instance : BB_VINFO_SLP_INSTANCES (bb_vinfo))
9530 if (instance->subgraph_entries.is_empty ())
9531 continue;
9533 dump_user_location_t saved_vect_location = vect_location;
9534 vect_location = instance->location ();
9535 if (!unlimited_cost_model (NULL)
9536 && !vect_bb_vectorization_profitable_p
9537 (bb_vinfo, instance->subgraph_entries, orig_loop))
9539 if (dump_enabled_p ())
9540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9541 "not vectorized: vectorization is not "
9542 "profitable.\n");
9543 vect_location = saved_vect_location;
9544 continue;
9547 vect_location = saved_vect_location;
9548 if (!dbg_cnt (vect_slp))
9550 force_clear = true;
9551 continue;
9554 profitable_subgraphs.safe_push (instance);
9557 /* When we're vectorizing an if-converted loop body make sure
9558 we vectorized all if-converted code. */
9559 if ((!profitable_subgraphs.is_empty () || force_clear) && orig_loop)
9561 gcc_assert (bb_vinfo->nbbs == 1);
9562 for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[0]);
9563 !gsi_end_p (gsi); gsi_next (&gsi))
9565 /* The costing above left us with DCEable vectorized scalar
9566 stmts having the visited flag set on profitable
9567 subgraphs. Do the delayed clearing of the flag here. */
9568 if (gimple_visited_p (gsi_stmt (gsi)))
9570 gimple_set_visited (gsi_stmt (gsi), false);
9571 continue;
9573 if (flag_vect_cost_model == VECT_COST_MODEL_UNLIMITED)
9574 continue;
9576 if (gassign *ass = dyn_cast <gassign *> (gsi_stmt (gsi)))
9577 if (gimple_assign_rhs_code (ass) == COND_EXPR)
9579 if (!profitable_subgraphs.is_empty ()
9580 && dump_enabled_p ())
9581 dump_printf_loc (MSG_NOTE, vect_location,
9582 "not profitable because of "
9583 "unprofitable if-converted scalar "
9584 "code\n");
9585 profitable_subgraphs.truncate (0);
9590 /* Finally schedule the profitable subgraphs. */
9591 for (slp_instance instance : profitable_subgraphs)
9593 if (!vectorized && dump_enabled_p ())
9594 dump_printf_loc (MSG_NOTE, vect_location,
9595 "Basic block will be vectorized "
9596 "using SLP\n");
9597 vectorized = true;
9599 /* Dump before scheduling as store vectorization will remove
9600 the original stores and mess with the instance tree
9601 so querying its location will eventually ICE. */
9602 if (flag_checking)
9603 for (slp_instance sub : instance->subgraph_entries)
9604 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub)));
9605 unsigned HOST_WIDE_INT bytes;
9606 if (dump_enabled_p ())
9607 for (slp_instance sub : instance->subgraph_entries)
9609 tree vtype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub));
9610 if (GET_MODE_SIZE (TYPE_MODE (vtype)).is_constant (&bytes))
9611 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9612 sub->location (),
9613 "basic block part vectorized using %wu "
9614 "byte vectors\n", bytes);
9615 else
9616 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
9617 sub->location (),
9618 "basic block part vectorized using "
9619 "variable length vectors\n");
9622 dump_user_location_t saved_vect_location = vect_location;
9623 vect_location = instance->location ();
9625 vect_schedule_slp (bb_vinfo, instance->subgraph_entries);
9627 vect_location = saved_vect_location;
9631 /* Generate the invariant statements. */
9632 if (!gimple_seq_empty_p (bb_vinfo->inv_pattern_def_seq))
9634 if (dump_enabled_p ())
9635 dump_printf_loc (MSG_NOTE, vect_location,
9636 "------>generating invariant statements\n");
9638 bb_vinfo->insert_seq_on_entry (NULL,
9639 bb_vinfo->inv_pattern_def_seq);
9642 else
9644 if (dump_enabled_p ())
9645 dump_printf_loc (MSG_NOTE, vect_location,
9646 "***** Analysis failed with vector mode %s\n",
9647 GET_MODE_NAME (bb_vinfo->vector_mode));
9650 if (mode_i == 0)
9651 autodetected_vector_mode = bb_vinfo->vector_mode;
9653 if (!fatal)
9654 while (mode_i < vector_modes.length ()
9655 && vect_chooses_same_modes_p (bb_vinfo, vector_modes[mode_i]))
9657 if (dump_enabled_p ())
9658 dump_printf_loc (MSG_NOTE, vect_location,
9659 "***** The result for vector mode %s would"
9660 " be the same\n",
9661 GET_MODE_NAME (vector_modes[mode_i]));
9662 mode_i += 1;
9665 delete bb_vinfo;
9667 if (mode_i < vector_modes.length ()
9668 && VECTOR_MODE_P (autodetected_vector_mode)
9669 && (related_vector_mode (vector_modes[mode_i],
9670 GET_MODE_INNER (autodetected_vector_mode))
9671 == autodetected_vector_mode)
9672 && (related_vector_mode (autodetected_vector_mode,
9673 GET_MODE_INNER (vector_modes[mode_i]))
9674 == vector_modes[mode_i]))
9676 if (dump_enabled_p ())
9677 dump_printf_loc (MSG_NOTE, vect_location,
9678 "***** Skipping vector mode %s, which would"
9679 " repeat the analysis for %s\n",
9680 GET_MODE_NAME (vector_modes[mode_i]),
9681 GET_MODE_NAME (autodetected_vector_mode));
9682 mode_i += 1;
9685 if (vectorized
9686 || mode_i == vector_modes.length ()
9687 || autodetected_vector_mode == VOIDmode
9688 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9689 vector sizes will fail do not bother iterating. */
9690 || fatal)
9691 return vectorized;
9693 /* Try the next biggest vector size. */
9694 next_vector_mode = vector_modes[mode_i++];
9695 if (dump_enabled_p ())
9696 dump_printf_loc (MSG_NOTE, vect_location,
9697 "***** Re-trying analysis with vector mode %s\n",
9698 GET_MODE_NAME (next_vector_mode));
9703 /* Main entry for the BB vectorizer. Analyze and transform BBS, returns
9704 true if anything in the basic-block was vectorized. */
9706 static bool
9707 vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
9709 vec<data_reference_p> datarefs = vNULL;
9710 auto_vec<int> dataref_groups;
9711 int insns = 0;
9712 int current_group = 0;
9714 for (unsigned i = 0; i < bbs.length (); i++)
9716 basic_block bb = bbs[i];
9717 for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
9718 gsi_next (&gsi))
9720 gimple *stmt = gsi_stmt (gsi);
9721 if (is_gimple_debug (stmt))
9722 continue;
9724 insns++;
9726 if (gimple_location (stmt) != UNKNOWN_LOCATION)
9727 vect_location = stmt;
9729 if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
9730 &dataref_groups, current_group))
9731 ++current_group;
9733 /* New BBs always start a new DR group. */
9734 ++current_group;
9737 return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
9740 /* Special entry for the BB vectorizer. Analyze and transform a single
9741 if-converted BB with ORIG_LOOP's body being the non-if-converted
9742 representation. Returns true if anything in the basic-block was
9743 vectorized. */
9745 bool
9746 vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
9748 auto_vec<basic_block> bbs;
9749 bbs.safe_push (bb);
9750 return vect_slp_bbs (bbs, orig_loop);
9753 /* Main entry for the BB vectorizer.  Analyze and transform the basic
9754 blocks of FUN, returning true if anything was vectorized. */
9756 bool
9757 vect_slp_function (function *fun)
9759 bool r = false;
9760 int *rpo = XNEWVEC (int, n_basic_blocks_for_fn (fun));
9761 auto_bitmap exit_bbs;
9762 bitmap_set_bit (exit_bbs, EXIT_BLOCK);
9763 edge entry = single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun));
9764 unsigned n = rev_post_order_and_mark_dfs_back_seme (fun, entry, exit_bbs,
9765 true, rpo, NULL);
9767 /* For the moment split the function into pieces to avoid making
9768 the iteration on the vector modes moot.  Split at points we know
9769 we do not handle well, which are CFG merges (SLP discovery doesn't
9770 handle non-loop-header PHIs) and loop exits.  Since pattern
9771 recog requires reverse iteration to visit uses before defs,
9772 simply chop the RPO into pieces. */
9773 auto_vec<basic_block> bbs;
9774 for (unsigned i = 0; i < n; i++)
9776 basic_block bb = BASIC_BLOCK_FOR_FN (fun, rpo[i]);
9777 bool split = false;
9779 /* Split when a BB is not dominated by the first block. */
9780 if (!bbs.is_empty ()
9781 && !dominated_by_p (CDI_DOMINATORS, bb, bbs[0]))
9783 if (dump_enabled_p ())
9784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9785 "splitting region at dominance boundary bb%d\n",
9786 bb->index);
9787 split = true;
9789 /* Split when the loop determined by the first block
9790 is exited. This is because we eventually insert
9791 invariants at region begin. */
9792 else if (!bbs.is_empty ()
9793 && bbs[0]->loop_father != bb->loop_father
9794 && !flow_loop_nested_p (bbs[0]->loop_father, bb->loop_father))
9796 if (dump_enabled_p ())
9797 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9798 "splitting region at loop %d exit at bb%d\n",
9799 bbs[0]->loop_father->num, bb->index);
9800 split = true;
9802 else if (!bbs.is_empty ()
9803 && bb->loop_father->header == bb
9804 && bb->loop_father->dont_vectorize)
9806 if (dump_enabled_p ())
9807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9808 "splitting region at dont-vectorize loop %d "
9809 "entry at bb%d\n",
9810 bb->loop_father->num, bb->index);
9811 split = true;
9814 if (split && !bbs.is_empty ())
9816 r |= vect_slp_bbs (bbs, NULL);
9817 bbs.truncate (0);
9820 if (bbs.is_empty ())
9822 /* We need to be able to insert at the head of the region, which
9823 we cannot do for a region starting with a returns-twice call. */
9824 if (gcall *first = safe_dyn_cast <gcall *> (first_stmt (bb)))
9825 if (gimple_call_flags (first) & ECF_RETURNS_TWICE)
9827 if (dump_enabled_p ())
9828 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9829 "skipping bb%d as start of region as it "
9830 "starts with returns-twice call\n",
9831 bb->index);
9832 continue;
9834 /* If the loop this BB belongs to is marked as not to be vectorized
9835 honor that also for BB vectorization. */
9836 if (bb->loop_father->dont_vectorize)
9837 continue;
9840 bbs.safe_push (bb);
9842 /* When a stmt ends this block and defines a value, inserting a
9843 vector containing its definition after it would require inserting
9844 on edges.  Avoid this for now. */
9845 if (gimple *last = *gsi_last_bb (bb))
9846 if (gimple_get_lhs (last)
9847 && is_ctrl_altering_stmt (last))
9849 if (dump_enabled_p ())
9850 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9851 "splitting region at control altering "
9852 "definition %G", last);
9853 r |= vect_slp_bbs (bbs, NULL);
9854 bbs.truncate (0);
9858 if (!bbs.is_empty ())
9859 r |= vect_slp_bbs (bbs, NULL);
9861 free (rpo);
9863 return r;
9866 /* Build a variable-length vector in which the elements in ELTS are repeated
9867 to fill NRESULTS vectors of type VECTOR_TYPE. Store the vectors in
9868 RESULTS and add any new instructions to SEQ.
9870 The approach we use is:
9872 (1) Find a vector mode VM with integer elements of mode IM.
9874 (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9875 ELTS' has mode IM. This involves creating NELTS' VIEW_CONVERT_EXPRs
9876 from small vectors to IM.
9878 (3) Duplicate each ELTS'[I] into a vector of mode VM.
9880 (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
9881 correct byte contents.
9883 (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.
9885 We try to find the largest IM for which this sequence works, in order
9886 to cut down on the number of interleaves. */
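/* For example (an illustrative sketch, assuming the target supports the
   modes involved): with ELTS = { a, b, c, d } of 32-bit elements, a
   variable-length VNx4SI VECTOR_TYPE and IM = DImode, NELTS' is 2 and
   ELTS' = { ab, cd } (each DI built by view-converting a two-element SI
   vector).  Each ELTS'[I] is duplicated across a VNx2DI vector and a single
   interleaving VEC_PERM_EXPR of the two duplicates yields
   { ab, cd, ab, cd, ... }, which view-converts back to the required
   { a, b, c, d, a, b, c, d, ... }.  */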
9888 void
9889 duplicate_and_interleave (vec_info *vinfo, gimple_seq *seq, tree vector_type,
9890 const vec<tree> &elts, unsigned int nresults,
9891 vec<tree> &results)
9893 unsigned int nelts = elts.length ();
9894 tree element_type = TREE_TYPE (vector_type);
9896 /* (1) Find a vector mode VM with integer elements of mode IM. */
9897 unsigned int nvectors = 1;
9898 tree new_vector_type;
9899 tree permutes[2];
9900 if (!can_duplicate_and_interleave_p (vinfo, nelts, element_type,
9901 &nvectors, &new_vector_type,
9902 permutes))
9903 gcc_unreachable ();
9905 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9906 unsigned int partial_nelts = nelts / nvectors;
9907 tree partial_vector_type = build_vector_type (element_type, partial_nelts);
9909 tree_vector_builder partial_elts;
9910 auto_vec<tree, 32> pieces (nvectors * 2);
9911 pieces.quick_grow_cleared (nvectors * 2);
9912 for (unsigned int i = 0; i < nvectors; ++i)
9914 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9915 ELTS' has mode IM. */
9916 partial_elts.new_vector (partial_vector_type, partial_nelts, 1);
9917 for (unsigned int j = 0; j < partial_nelts; ++j)
9918 partial_elts.quick_push (elts[i * partial_nelts + j]);
9919 tree t = gimple_build_vector (seq, &partial_elts);
9920 t = gimple_build (seq, VIEW_CONVERT_EXPR,
9921 TREE_TYPE (new_vector_type), t);
9923 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9924 pieces[i] = gimple_build_vector_from_val (seq, new_vector_type, t);
9927 /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
9928 correct byte contents.
9930 Conceptually, we need to repeat the following operation log2(nvectors)
9931 times, where hi_start = nvectors / 2:
9933 out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
9934 out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);
9936 However, if each input repeats every N elements and the VF is
9937 a multiple of N * 2, the HI result is the same as the LO result.
9938 This will be true for the first N1 iterations of the outer loop,
9939 followed by N2 iterations for which both the LO and HI results
9940 are needed. I.e.:
9942 N1 + N2 = log2(nvectors)
9944 Each "N1 iteration" doubles the number of redundant vectors and the
9945 effect of the process as a whole is to have a sequence of nvectors/2**N1
9946 vectors that repeats 2**N1 times. Rather than generate these redundant
9947 vectors, we halve the number of vectors for each N1 iteration. */
9948 unsigned int in_start = 0;
9949 unsigned int out_start = nvectors;
9950 unsigned int new_nvectors = nvectors;
9951 for (unsigned int in_repeat = 1; in_repeat < nvectors; in_repeat *= 2)
9953 unsigned int hi_start = new_nvectors / 2;
9954 unsigned int out_i = 0;
9955 for (unsigned int in_i = 0; in_i < new_nvectors; ++in_i)
9957 if ((in_i & 1) != 0
9958 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type),
9959 2 * in_repeat))
9960 continue;
9962 tree output = make_ssa_name (new_vector_type);
9963 tree input1 = pieces[in_start + (in_i / 2)];
9964 tree input2 = pieces[in_start + (in_i / 2) + hi_start];
9965 gassign *stmt = gimple_build_assign (output, VEC_PERM_EXPR,
9966 input1, input2,
9967 permutes[in_i & 1]);
9968 gimple_seq_add_stmt (seq, stmt);
9969 pieces[out_start + out_i] = output;
9970 out_i += 1;
9972 std::swap (in_start, out_start);
9973 new_nvectors = out_i;
9976 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9977 results.reserve (nresults);
9978 for (unsigned int i = 0; i < nresults; ++i)
9979 if (i < new_nvectors)
9980 results.quick_push (gimple_build (seq, VIEW_CONVERT_EXPR, vector_type,
9981 pieces[in_start + i]));
9982 else
9983 results.quick_push (results[i - new_nvectors]);
9987 /* For constant and loop invariant defs in OP_NODE this function creates
9988 vector defs that will be used in the vectorized stmts and stores them
9989 to SLP_TREE_VEC_DEFS of OP_NODE. */
9991 static void
9992 vect_create_constant_vectors (vec_info *vinfo, slp_tree op_node)
9994 unsigned HOST_WIDE_INT nunits;
9995 tree vec_cst;
9996 unsigned j, number_of_places_left_in_vector;
9997 tree vector_type;
9998 tree vop;
9999 int group_size = op_node->ops.length ();
10000 unsigned int vec_num, i;
10001 unsigned number_of_copies = 1;
10002 bool constant_p;
10003 gimple_seq ctor_seq = NULL;
10004 auto_vec<tree, 16> permute_results;
10006 /* We always want SLP_TREE_VECTYPE (op_node) to be set correctly here. */
10007 vector_type = SLP_TREE_VECTYPE (op_node);
10009 unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (op_node);
10010 SLP_TREE_VEC_DEFS (op_node).create (number_of_vectors);
10011 auto_vec<tree> voprnds (number_of_vectors);
10013 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
10014 created vectors. It is greater than 1 if unrolling is performed.
10016 For example, we have two scalar operands, s1 and s2 (e.g., group of
10017 strided accesses of size two), while NUNITS is four (i.e., four scalars
10018 of this type can be packed in a vector). The output vector will contain
10019 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
10020 will be 2).
10022 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
10023 containing the operands.
10025 For example, NUNITS is four as before, and the group size is 8
10026 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
10027 {s5, s6, s7, s8}. */
10029 /* When using duplicate_and_interleave, we just need one element for
10030 each scalar statement. */
10031 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
10032 nunits = group_size;
10034 number_of_copies = nunits * number_of_vectors / group_size;
10036 number_of_places_left_in_vector = nunits;
10037 constant_p = true;
10038 tree uniform_elt = NULL_TREE;
10039 tree_vector_builder elts (vector_type, nunits, 1);
10040 elts.quick_grow (nunits);
10041 stmt_vec_info insert_after = NULL;
10042 for (j = 0; j < number_of_copies; j++)
10044 tree op;
10045 for (i = group_size - 1; op_node->ops.iterate (i, &op); i--)
10047 /* Create 'vect_ = {op0,op1,...,opn}'. */
10048 tree orig_op = op;
10049 if (number_of_places_left_in_vector == nunits)
10050 uniform_elt = op;
10051 else if (uniform_elt && operand_equal_p (uniform_elt, op))
10052 op = elts[number_of_places_left_in_vector];
10053 else
10054 uniform_elt = NULL_TREE;
10055 number_of_places_left_in_vector--;
10056 if (!types_compatible_p (TREE_TYPE (vector_type), TREE_TYPE (op)))
10058 if (CONSTANT_CLASS_P (op))
10060 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10062 /* Can't use VIEW_CONVERT_EXPR for booleans because
10063 of possibly different sizes of scalar value and
10064 vector element. */
10065 if (integer_zerop (op))
10066 op = build_int_cst (TREE_TYPE (vector_type), 0);
10067 else if (integer_onep (op))
10068 op = build_all_ones_cst (TREE_TYPE (vector_type));
10069 else
10070 gcc_unreachable ();
10072 else
10073 op = fold_unary (VIEW_CONVERT_EXPR,
10074 TREE_TYPE (vector_type), op);
10075 gcc_assert (op && CONSTANT_CLASS_P (op));
10077 else
10079 tree new_temp = make_ssa_name (TREE_TYPE (vector_type));
10080 gimple *init_stmt;
10081 if (VECTOR_BOOLEAN_TYPE_P (vector_type))
10083 tree true_val
10084 = build_all_ones_cst (TREE_TYPE (vector_type));
10085 tree false_val
10086 = build_zero_cst (TREE_TYPE (vector_type));
10087 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op)));
10088 init_stmt = gimple_build_assign (new_temp, COND_EXPR,
10089 op, true_val,
10090 false_val);
10092 else
10094 op = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (vector_type),
10095 op);
10096 init_stmt
10097 = gimple_build_assign (new_temp, VIEW_CONVERT_EXPR,
10098 op);
10100 gimple_seq_add_stmt (&ctor_seq, init_stmt);
10101 op = new_temp;
10104 elts[number_of_places_left_in_vector] = op;
10105 if (!CONSTANT_CLASS_P (op))
10106 constant_p = false;
10107 /* For BB vectorization we have to compute an insert location
10108 when a def is inside the analyzed region since we cannot
10109 simply insert at the BB start in this case. */
10110 stmt_vec_info opdef;
10111 if (TREE_CODE (orig_op) == SSA_NAME
10112 && !SSA_NAME_IS_DEFAULT_DEF (orig_op)
10113 && is_a <bb_vec_info> (vinfo)
10114 && (opdef = vinfo->lookup_def (orig_op)))
10116 if (!insert_after)
10117 insert_after = opdef;
10118 else
10119 insert_after = get_later_stmt (insert_after, opdef);
10122 if (number_of_places_left_in_vector == 0)
10124 auto type_nunits = TYPE_VECTOR_SUBPARTS (vector_type);
10125 if (uniform_elt)
10126 vec_cst = gimple_build_vector_from_val (&ctor_seq, vector_type,
10127 elts[0]);
10128 else if (constant_p
10129 ? multiple_p (type_nunits, nunits)
10130 : known_eq (type_nunits, nunits))
10131 vec_cst = gimple_build_vector (&ctor_seq, &elts);
10132 else
10134 if (permute_results.is_empty ())
10135 duplicate_and_interleave (vinfo, &ctor_seq, vector_type,
10136 elts, number_of_vectors,
10137 permute_results);
10138 vec_cst = permute_results[number_of_vectors - j - 1];
10140 if (!gimple_seq_empty_p (ctor_seq))
10142 if (insert_after)
10144 gimple_stmt_iterator gsi;
10145 if (gimple_code (insert_after->stmt) == GIMPLE_PHI)
10147 gsi = gsi_after_labels (gimple_bb (insert_after->stmt));
10148 gsi_insert_seq_before (&gsi, ctor_seq,
10149 GSI_CONTINUE_LINKING);
10151 else if (!stmt_ends_bb_p (insert_after->stmt))
10153 gsi = gsi_for_stmt (insert_after->stmt);
10154 gsi_insert_seq_after (&gsi, ctor_seq,
10155 GSI_CONTINUE_LINKING);
10157 else
10159 /* When we want to insert after a def where the
10160 defining stmt throws then insert on the fallthru
10161 edge. */
10162 edge e = find_fallthru_edge
10163 (gimple_bb (insert_after->stmt)->succs);
10164 basic_block new_bb
10165 = gsi_insert_seq_on_edge_immediate (e, ctor_seq);
10166 gcc_assert (!new_bb);
10169 else
10170 vinfo->insert_seq_on_entry (NULL, ctor_seq);
10171 ctor_seq = NULL;
10173 voprnds.quick_push (vec_cst);
10174 insert_after = NULL;
10175 number_of_places_left_in_vector = nunits;
10176 constant_p = true;
10177 elts.new_vector (vector_type, nunits, 1);
10178 elts.quick_grow (nunits);
10183 /* Since the vectors are created in the reverse order, we should invert
10184 them. */
10185 vec_num = voprnds.length ();
10186 for (j = vec_num; j != 0; j--)
10188 vop = voprnds[j - 1];
10189 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10192 /* If VF is greater than the unrolling factor needed for the SLP
10193 group of stmts, the NUMBER_OF_VECTORS to be created is greater than
10194 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10195 to replicate the vectors. */
10196 while (number_of_vectors > SLP_TREE_VEC_DEFS (op_node).length ())
10197 for (i = 0; SLP_TREE_VEC_DEFS (op_node).iterate (i, &vop) && i < vec_num;
10198 i++)
10199 SLP_TREE_VEC_DEFS (op_node).quick_push (vop);
10202 /* Get the Ith vectorized definition from SLP_NODE. */
10204 tree
10205 vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
10207 return SLP_TREE_VEC_DEFS (slp_node)[i];
10210 /* Get the vectorized definitions of SLP_NODE in *VEC_DEFS. */
10212 void
10213 vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
10215 vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
10216 vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
10219 /* Get the vectorized definitions of the first N children of SLP_NODE into *VEC_OPRNDS; N of -1U means all children. */
10221 void
10222 vect_get_slp_defs (vec_info *,
10223 slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
10225 if (n == -1U)
10226 n = SLP_TREE_CHILDREN (slp_node).length ();
10228 for (unsigned i = 0; i < n; ++i)
10230 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
10231 vec<tree> vec_defs = vNULL;
10232 vect_get_slp_defs (child, &vec_defs);
10233 vec_oprnds->quick_push (vec_defs);
10237 /* A subroutine of vect_transform_slp_perm_load with two extra arguments:
10238 - PERM gives the permutation that the caller wants to use for NODE,
10239 which might be different from SLP_LOAD_PERMUTATION.
10240 - DUMP_P controls whether the function dumps information. */
10242 static bool
10243 vect_transform_slp_perm_load_1 (vec_info *vinfo, slp_tree node,
10244 load_permutation_t &perm,
10245 const vec<tree> &dr_chain,
10246 gimple_stmt_iterator *gsi, poly_uint64 vf,
10247 bool analyze_only, bool dump_p,
10248 unsigned *n_perms, unsigned int *n_loads,
10249 bool dce_chain)
10251 stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
10252 int vec_index = 0;
10253 tree vectype = SLP_TREE_VECTYPE (node);
10254 unsigned int group_size = SLP_TREE_SCALAR_STMTS (node).length ();
10255 unsigned int mask_element;
10256 unsigned dr_group_size;
10257 machine_mode mode;
10259 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info))
10260 dr_group_size = 1;
10261 else
10263 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
10264 dr_group_size = DR_GROUP_SIZE (stmt_info);
10267 mode = TYPE_MODE (vectype);
10268 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10269 unsigned int nstmts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10271 /* Initialize the vect stmts of NODE to properly insert the generated
10272 stmts later. */
10273 if (! analyze_only)
10274 for (unsigned i = SLP_TREE_VEC_DEFS (node).length (); i < nstmts; i++)
10275 SLP_TREE_VEC_DEFS (node).quick_push (NULL_TREE);
10277 /* Generate permutation masks for every NODE. Number of masks for each NODE
10278 is equal to GROUP_SIZE.
10279 E.g., we have a group of three nodes with three loads from the same
10280 location in each node, and the vector size is 4. I.e., we have an
10281 a0b0c0a1b1c1... sequence and we need to create the following vectors:
10282 for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
10283 for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
10286 The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
10287 The last mask is illegal since we assume two operands for permute
10288 operation, and the mask element values can't be outside that range.
10289 Hence, the last mask must be converted into {2,5,5,5}.
10290 For the first two permutations we need the first and the second input
10291 vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
10292 we need the second and the third vectors: {b1,c1,a2,b2} and
10293 {c2,a3,b3,c3}. */
10295 int vect_stmts_counter = 0;
10296 unsigned int index = 0;
10297 int first_vec_index = -1;
10298 int second_vec_index = -1;
10299 bool noop_p = true;
10300 *n_perms = 0;
10302 vec_perm_builder mask;
10303 unsigned int nelts_to_build;
10304 unsigned int nvectors_per_build;
10305 unsigned int in_nlanes;
10306 bool repeating_p = (group_size == dr_group_size
10307 && multiple_p (nunits, group_size));
10308 if (repeating_p)
10310 /* A single vector contains a whole number of copies of the node, so:
10311 (a) all permutes can use the same mask; and
10312 (b) the permutes only need a single vector input. */
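/* E.g. (an illustrative sketch) for a group of two loads that both use DR
   group element 0 the mask is encoded with period GROUP_SIZE == 2 and three
   elements per pattern, giving { 0, 0, 2, 2, 4, 4, ... } however long the
   runtime vector turns out to be.  */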
10313 mask.new_vector (nunits, group_size, 3);
10314 nelts_to_build = mask.encoded_nelts ();
10315 /* It's possible to obtain zero nstmts during analyze_only, so make
10316 it at least one to ensure the later computation for n_perms
10317 proceeds. */
10318 nvectors_per_build = nstmts > 0 ? nstmts : 1;
10319 in_nlanes = dr_group_size * 3;
10321 else
10323 /* We need to construct a separate mask for each vector statement. */
10324 unsigned HOST_WIDE_INT const_nunits, const_vf;
10325 if (!nunits.is_constant (&const_nunits)
10326 || !vf.is_constant (&const_vf))
10327 return false;
10328 mask.new_vector (const_nunits, const_nunits, 1);
10329 nelts_to_build = const_vf * group_size;
10330 nvectors_per_build = 1;
10331 in_nlanes = const_vf * dr_group_size;
10333 auto_sbitmap used_in_lanes (in_nlanes);
10334 bitmap_clear (used_in_lanes);
10335 auto_bitmap used_defs;
10337 unsigned int count = mask.encoded_nelts ();
10338 mask.quick_grow (count);
10339 vec_perm_indices indices;
10341 for (unsigned int j = 0; j < nelts_to_build; j++)
10343 unsigned int iter_num = j / group_size;
10344 unsigned int stmt_num = j % group_size;
10345 unsigned int i = (iter_num * dr_group_size + perm[stmt_num]);
10346 bitmap_set_bit (used_in_lanes, i);
10347 if (repeating_p)
10349 first_vec_index = 0;
10350 mask_element = i;
10352 else
10354 /* Enforced before the loop when !repeating_p. */
10355 unsigned int const_nunits = nunits.to_constant ();
10356 vec_index = i / const_nunits;
10357 mask_element = i % const_nunits;
10358 if (vec_index == first_vec_index
10359 || first_vec_index == -1)
10361 first_vec_index = vec_index;
10363 else if (vec_index == second_vec_index
10364 || second_vec_index == -1)
10366 second_vec_index = vec_index;
10367 mask_element += const_nunits;
10369 else
10371 if (dump_p)
10372 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10373 "permutation requires at "
10374 "least three vectors %G",
10375 stmt_info->stmt);
10376 gcc_assert (analyze_only);
10377 return false;
10380 gcc_assert (mask_element < 2 * const_nunits);
10383 if (mask_element != index)
10384 noop_p = false;
10385 mask[index++] = mask_element;
10387 if (index == count)
10389 if (!noop_p)
10391 indices.new_vector (mask, second_vec_index == -1 ? 1 : 2, nunits);
10392 if (!can_vec_perm_const_p (mode, mode, indices))
10394 if (dump_p)
10396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10397 "unsupported vect permute { ");
10398 for (i = 0; i < count; ++i)
10400 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10401 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10403 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10405 gcc_assert (analyze_only);
10406 return false;
10409 tree mask_vec = NULL_TREE;
10410 if (!analyze_only)
10411 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10413 if (second_vec_index == -1)
10414 second_vec_index = first_vec_index;
10416 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10418 ++*n_perms;
10419 if (analyze_only)
10420 continue;
10421 /* Generate the permute statement if necessary. */
10422 tree first_vec = dr_chain[first_vec_index + ri];
10423 tree second_vec = dr_chain[second_vec_index + ri];
10424 gassign *stmt = as_a<gassign *> (stmt_info->stmt);
10425 tree perm_dest
10426 = vect_create_destination_var (gimple_assign_lhs (stmt),
10427 vectype);
10428 perm_dest = make_ssa_name (perm_dest);
10429 gimple *perm_stmt
10430 = gimple_build_assign (perm_dest, VEC_PERM_EXPR, first_vec,
10431 second_vec, mask_vec);
10432 vect_finish_stmt_generation (vinfo, stmt_info, perm_stmt,
10433 gsi);
10434 if (dce_chain)
10436 bitmap_set_bit (used_defs, first_vec_index + ri);
10437 bitmap_set_bit (used_defs, second_vec_index + ri);
10440 /* Store the vector statement in NODE. */
10441 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = perm_dest;
10444 else if (!analyze_only)
10446 for (unsigned int ri = 0; ri < nvectors_per_build; ++ri)
10448 tree first_vec = dr_chain[first_vec_index + ri];
10449 /* If mask was NULL_TREE generate the requested
10450 identity transform. */
10451 if (dce_chain)
10452 bitmap_set_bit (used_defs, first_vec_index + ri);
10454 /* Store the vector statement in NODE. */
10455 SLP_TREE_VEC_DEFS (node)[vect_stmts_counter++] = first_vec;
10459 index = 0;
10460 first_vec_index = -1;
10461 second_vec_index = -1;
10462 noop_p = true;
10466 if (n_loads)
10468 if (repeating_p)
10469 *n_loads = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
10470 else
10472 /* Enforced above when !repeating_p. */
10473 unsigned int const_nunits = nunits.to_constant ();
10474 *n_loads = 0;
10475 bool load_seen = false;
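	  /* Scan the used lanes and count how many CONST_NUNITS-sized input
	     vectors contain at least one used lane; only those vector loads
	     are actually needed.  */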
10476 for (unsigned i = 0; i < in_nlanes; ++i)
10478 if (i % const_nunits == 0)
10480 if (load_seen)
10481 *n_loads += 1;
10482 load_seen = false;
10484 if (bitmap_bit_p (used_in_lanes, i))
10485 load_seen = true;
10487 if (load_seen)
10488 *n_loads += 1;
10492 if (dce_chain)
10493 for (unsigned i = 0; i < dr_chain.length (); ++i)
10494 if (!bitmap_bit_p (used_defs, i))
10496 tree def = dr_chain[i];
10499 gimple *stmt = SSA_NAME_DEF_STMT (def);
10500 if (is_gimple_assign (stmt)
10501 && (gimple_assign_rhs_code (stmt) == VIEW_CONVERT_EXPR
10502 || gimple_assign_rhs_code (stmt) == CONSTRUCTOR))
10503 def = single_ssa_tree_operand (stmt, SSA_OP_USE);
10504 else
10505 def = NULL;
10506 gimple_stmt_iterator rgsi = gsi_for_stmt (stmt);
10507 gsi_remove (&rgsi, true);
10508 release_defs (stmt);
10510 while (def);
10513 return true;
10516 /* Generate vector permute statements from a list of loads in DR_CHAIN.
10517 If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
10518 permute statements for the SLP node NODE. Store the number of vector
10519 permute instructions in *N_PERMS and the number of vector load
10520 instructions in *N_LOADS. If DCE_CHAIN is true, remove all definitions
10521 that were not needed. */
10523 bool
10524 vect_transform_slp_perm_load (vec_info *vinfo,
10525 slp_tree node, const vec<tree> &dr_chain,
10526 gimple_stmt_iterator *gsi, poly_uint64 vf,
10527 bool analyze_only, unsigned *n_perms,
10528 unsigned int *n_loads, bool dce_chain)
10530 return vect_transform_slp_perm_load_1 (vinfo, node,
10531 SLP_TREE_LOAD_PERMUTATION (node),
10532 dr_chain, gsi, vf, analyze_only,
10533 dump_enabled_p (), n_perms, n_loads,
10534 dce_chain);
10537 /* Produce the next vector result for SLP permutation NODE by adding a vector
10538 statement at GSI. If MASK_VEC is nonnull, add:
10540 <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>
10542 otherwise add:
10544 <new SSA name> = FIRST_DEF. */
10546 static void
10547 vect_add_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
10548 slp_tree node, tree first_def, tree second_def,
10549 tree mask_vec, poly_uint64 identity_offset)
10551 tree vectype = SLP_TREE_VECTYPE (node);
10553   /* ??? We SLP match existing vector element extracts but
10554      allow punning, which we need to re-instantiate at uses
10555      since we have no good way of explicitly representing it.  */
10556 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def)), TYPE_SIZE (vectype))
10557 && !types_compatible_p (TREE_TYPE (first_def), vectype))
10559 gassign *conv_stmt
10560 = gimple_build_assign (make_ssa_name (vectype),
10561 build1 (VIEW_CONVERT_EXPR, vectype, first_def));
10562 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10563 first_def = gimple_assign_lhs (conv_stmt);
10565 gassign *perm_stmt;
10566 tree perm_dest = make_ssa_name (vectype);
10567 if (mask_vec)
10569       if (operand_equal_p (TYPE_SIZE (TREE_TYPE (second_def)),
10570 TYPE_SIZE (vectype))
10571 && !types_compatible_p (TREE_TYPE (second_def), vectype))
10573 gassign *conv_stmt
10574 = gimple_build_assign (make_ssa_name (vectype),
10575 build1 (VIEW_CONVERT_EXPR,
10576 vectype, second_def));
10577 vect_finish_stmt_generation (vinfo, NULL, conv_stmt, gsi);
10578 second_def = gimple_assign_lhs (conv_stmt);
10580 perm_stmt = gimple_build_assign (perm_dest, VEC_PERM_EXPR,
10581 first_def, second_def,
10582 mask_vec);
10584 else if (!types_compatible_p (TREE_TYPE (first_def), vectype))
10586 /* For identity permutes we still need to handle the case
10587 of offsetted extracts or concats. */
10588 unsigned HOST_WIDE_INT c;
10589 auto first_def_nunits
10590 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def));
10591 if (known_le (TYPE_VECTOR_SUBPARTS (vectype), first_def_nunits))
10593 unsigned HOST_WIDE_INT elsz
10594 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def))));
10595 tree lowpart = build3 (BIT_FIELD_REF, vectype, first_def,
10596 TYPE_SIZE (vectype),
10597 bitsize_int (identity_offset * elsz));
10598 perm_stmt = gimple_build_assign (perm_dest, lowpart);
10600 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype),
10601 first_def_nunits, &c) && c == 2)
10603 tree ctor = build_constructor_va (vectype, 2, NULL_TREE, first_def,
10604 NULL_TREE, second_def);
10605 perm_stmt = gimple_build_assign (perm_dest, ctor);
10607 else
10608 gcc_unreachable ();
10610 else
10612 /* We need a copy here in case the def was external. */
10613 perm_stmt = gimple_build_assign (perm_dest, first_def);
10615 vect_finish_stmt_generation (vinfo, NULL, perm_stmt, gsi);
10616 /* Store the vector statement in NODE. */
10617 node->push_vec_def (perm_stmt);
10620 /* Subroutine of vectorizable_slp_permutation. Check whether the target
10621 can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
10622 If GSI is nonnull, emit the permutation there.
10624 When GSI is null, the only purpose of NODE is to give properties
10625 of the result, such as the vector type and number of SLP lanes.
10626 The node does not need to be a VEC_PERM_EXPR.
10628 If the target supports the operation, return the number of individual
10629 VEC_PERM_EXPRs needed, otherwise return -1. Print information to the
10630 dump file if DUMP_P is true. */
10632 static int
10633 vectorizable_slp_permutation_1 (vec_info *vinfo, gimple_stmt_iterator *gsi,
10634 slp_tree node, lane_permutation_t &perm,
10635 vec<slp_tree> &children, bool dump_p)
10637 tree vectype = SLP_TREE_VECTYPE (node);
10639   /* ??? We currently only support inputs that all have the same vector
10640      type, while the SLP IL should really do a concat + select and thus accept
10641 arbitrary mismatches. */
10642 slp_tree child;
10643 unsigned i;
10644 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10645 bool repeating_p = multiple_p (nunits, SLP_TREE_LANES (node));
10646 /* True if we're permuting a single input of 2N vectors down
10647 to N vectors. This case doesn't generalize beyond 2 since
10648 VEC_PERM_EXPR only takes 2 inputs. */
10649 bool pack_p = false;
10650 /* If we're permuting inputs of N vectors each into X*N outputs,
10651 this is the value of X, otherwise it is 1. */
10652 unsigned int unpack_factor = 1;
10653 tree op_vectype = NULL_TREE;
10654 FOR_EACH_VEC_ELT (children, i, child)
10655 if (SLP_TREE_VECTYPE (child))
10657 op_vectype = SLP_TREE_VECTYPE (child);
10658 break;
10660 if (!op_vectype)
10661 op_vectype = vectype;
10662 FOR_EACH_VEC_ELT (children, i, child)
10664 if ((SLP_TREE_DEF_TYPE (child) != vect_internal_def
10665 && !vect_maybe_update_slp_op_vectype (child, op_vectype))
10666 || !types_compatible_p (SLP_TREE_VECTYPE (child), op_vectype)
10667 || !types_compatible_p (TREE_TYPE (vectype), TREE_TYPE (op_vectype)))
10669 if (dump_p)
10670 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10671 "Unsupported vector types in lane permutation\n");
10672 return -1;
10674 auto op_nunits = TYPE_VECTOR_SUBPARTS (op_vectype);
10675 unsigned int this_unpack_factor;
10676 /* Detect permutations of external, pre-existing vectors. The external
10677 node's SLP_TREE_LANES stores the total number of units in the vector,
10678 or zero if the vector has variable length.
10680 We are expected to keep the original VEC_PERM_EXPR for such cases.
10681 There is no repetition to model. */
10682 if (SLP_TREE_DEF_TYPE (child) == vect_external_def
10683 && SLP_TREE_SCALAR_OPS (child).is_empty ())
10684 repeating_p = false;
10685 /* Check whether the input has twice as many lanes per vector. */
10686 else if (children.length () == 1
10687 && known_eq (SLP_TREE_LANES (child) * nunits,
10688 SLP_TREE_LANES (node) * op_nunits * 2))
10689 pack_p = true;
10690 /* Check whether the output has N times as many lanes per vector. */
10691 else if (constant_multiple_p (SLP_TREE_LANES (node) * op_nunits,
10692 SLP_TREE_LANES (child) * nunits,
10693 &this_unpack_factor)
10694 && (i == 0 || unpack_factor == this_unpack_factor))
10695 unpack_factor = this_unpack_factor;
10696 else
10697 repeating_p = false;
10700 gcc_assert (perm.length () == SLP_TREE_LANES (node));
10702 /* Load-lanes permute. This permute only acts as a forwarder to
10703 select the correct vector def of the load-lanes load which
10704 has the permuted vectors in its vector defs like
10705 { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
10706 accounted for in the costing for the actual load so we
10707 return zero here. */
10708 if (node->ldst_lanes)
10710 gcc_assert (children.length () == 1);
10711 if (!gsi)
10712 /* This is a trivial op always supported. */
10713 return 0;
10714 slp_tree child = children[0];
10715 unsigned vec_idx = (SLP_TREE_LANE_PERMUTATION (node)[0].second
10716 / SLP_TREE_LANES (node));
10717 unsigned vec_num = SLP_TREE_LANES (child) / SLP_TREE_LANES (node);
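      /* The load-lanes child has its vector defs interleaved per group as
	 described above ({ v0, w0, r0, v1, w1, r1, ... } for a ld3); VEC_NUM
	 is the number of vectors per group and VEC_IDX selects the group
	 member this forwarder extracts.  */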
10718 for (unsigned i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
10720 tree def = SLP_TREE_VEC_DEFS (child)[i * vec_num + vec_idx];
10721 node->push_vec_def (def);
10723 return 0;
10726   /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
10727 and if we can generate the vectors in a vector-length agnostic way.
10728 This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
10729 compile time.
10731 The significance of UNPACK_STEP is that, when PACK_P is false,
10732 output vector I operates on a window of UNPACK_STEP elements from each
10733 input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR). For example,
10734 when UNPACK_FACTOR is 2, the first output vector operates on lanes
10735 [0, NUNITS / 2 - 1] of each input vector and the second output vector
10736 operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.
10738 When REPEATING_P is true, NOUTPUTS holds the total number of outputs
10739 that we actually need to generate. */
10740 uint64_t noutputs = 0;
10741 poly_uint64 unpack_step = 0;
10742 loop_vec_info linfo = dyn_cast <loop_vec_info> (vinfo);
10743 if (!linfo
10744 || !multiple_p (nunits, unpack_factor, &unpack_step)
10745 || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo)
10746 * SLP_TREE_LANES (node), nunits, &noutputs))
10747 repeating_p = false;
10749 /* We can handle the conditions described for REPEATING_P above for
10750 both variable- and constant-length vectors. The fallback requires
10751 us to generate every element of every permute vector explicitly,
10752 which is only possible for constant-length permute vectors.
10754 Set:
10756 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10757 mask vectors that we want to build.
10759 - NCOPIES to the number of copies of PERM that we need in order
10760 to build the necessary permute mask vectors. */
10761 uint64_t npatterns;
10762 unsigned nelts_per_pattern;
10763 uint64_t ncopies;
10764 if (repeating_p)
10766 /* We need permute mask vectors that have the form:
10768 { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }
10770 In other words, the original n-element permute in PERM is
10771 "unrolled" to fill a full vector. The stepped vector encoding
10772 that we use for permutes requires 3n elements. */
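      /* For example, a two-lane swap of a single child,
	 PERM = [ { 0, 1 }, { 0, 0 } ], is encoded with NPATTERNS == 2 and
	 NELTS_PER_PATTERN == 3 as the series { 1, 0, 3, 2, 5, 4 }.  */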
10773 npatterns = SLP_TREE_LANES (node);
10774 nelts_per_pattern = ncopies = 3;
10776 else
10778 /* Calculate every element of every permute mask vector explicitly,
10779 instead of relying on the pattern described above. */
10780 if (!nunits.is_constant (&npatterns)
10781 || !TYPE_VECTOR_SUBPARTS (op_vectype).is_constant ())
10783 if (dump_p)
10784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10785 "unsupported permutation %p on variable-length"
10786 " vectors\n", (void *) node);
10787 return -1;
10789 nelts_per_pattern = ncopies = 1;
10790 if (linfo && !LOOP_VINFO_VECT_FACTOR (linfo).is_constant (&ncopies))
10792 if (dump_p)
10793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10794 "unsupported permutation %p for variable VF\n",
10795 (void *) node);
10796 return -1;
10798 pack_p = false;
10799 unpack_factor = 1;
10801 unsigned olanes = unpack_factor * ncopies * SLP_TREE_LANES (node);
10802 gcc_assert (repeating_p || multiple_p (olanes, nunits));
10804 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10805 from the { SLP operand, scalar lane } permutation as recorded in the
10806 SLP node as intermediate step. This part should already work
10807 with SLP children with arbitrary number of lanes. */
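  /* For example, the four-lane blend [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
     of two four-lane children expands (for the first copy) to
     { {0,0},0 }, { {1,0},1 }, { {0,0},2 }, { {1,0},3 }.  */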
10808 auto_vec<std::pair<std::pair<unsigned, unsigned>, poly_uint64>> vperm;
10809 auto_vec<poly_uint64> active_lane;
10810 vperm.create (olanes);
10811 active_lane.safe_grow_cleared (children.length (), true);
10812 for (unsigned int ui = 0; ui < unpack_factor; ++ui)
10814 for (unsigned j = 0; j < children.length (); ++j)
10815 active_lane[j] = ui * unpack_step;
10816 for (unsigned i = 0; i < ncopies; ++i)
10818 for (unsigned pi = 0; pi < perm.length (); ++pi)
10820 std::pair<unsigned, unsigned> p = perm[pi];
10821 tree vtype = SLP_TREE_VECTYPE (children[p.first]);
10822 if (repeating_p)
10823 vperm.quick_push ({{p.first, 0},
10824 p.second + active_lane[p.first]});
10825 else
10827 /* We checked above that the vectors are constant-length. */
10828 unsigned vnunits = TYPE_VECTOR_SUBPARTS (vtype)
10829 .to_constant ();
10830 unsigned lane = active_lane[p.first].to_constant ();
10831 unsigned vi = (lane + p.second) / vnunits;
10832 unsigned vl = (lane + p.second) % vnunits;
10833 vperm.quick_push ({{p.first, vi}, vl});
10836 /* Advance to the next group. */
10837 for (unsigned j = 0; j < children.length (); ++j)
10838 active_lane[j] += SLP_TREE_LANES (children[j]);
10842 if (dump_p)
10844 dump_printf_loc (MSG_NOTE, vect_location,
10845 "vectorizing permutation %p", (void *)node);
10846 for (unsigned i = 0; i < perm.length (); ++i)
10847 dump_printf (MSG_NOTE, " op%u[%u]", perm[i].first, perm[i].second);
10848 if (repeating_p)
10849 dump_printf (MSG_NOTE, " (repeat %d)", SLP_TREE_LANES (node));
10850 dump_printf (MSG_NOTE, "\n");
10851 dump_printf_loc (MSG_NOTE, vect_location, "as");
10852 for (unsigned i = 0; i < vperm.length (); ++i)
10854 if (i != 0
10855 && (repeating_p
10856 ? multiple_p (i, npatterns)
10857 : multiple_p (i, TYPE_VECTOR_SUBPARTS (vectype))))
10858 dump_printf (MSG_NOTE, ",");
10859 dump_printf (MSG_NOTE, " vops%u[%u][",
10860 vperm[i].first.first, vperm[i].first.second);
10861 dump_dec (MSG_NOTE, vperm[i].second);
10862 dump_printf (MSG_NOTE, "]");
10864 dump_printf (MSG_NOTE, "\n");
10867   /* We can only handle two-vector permutes; everything else should
10868 be lowered on the SLP level. The following is closely inspired
10869 by vect_transform_slp_perm_load and is supposed to eventually
10870 replace it.
10871 ??? As intermediate step do code-gen in the SLP tree representation
10872 somehow? */
10873 std::pair<unsigned, unsigned> first_vec = std::make_pair (-1U, -1U);
10874 std::pair<unsigned, unsigned> second_vec = std::make_pair (-1U, -1U);
10875 unsigned int index = 0;
10876 poly_uint64 mask_element;
10877 vec_perm_builder mask;
10878 mask.new_vector (nunits, npatterns, nelts_per_pattern);
10879 unsigned int count = mask.encoded_nelts ();
10880 mask.quick_grow (count);
10881 vec_perm_indices indices;
10882 unsigned nperms = 0;
10883 /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
10884 vectors to check during analysis, but we need to generate NOUTPUTS
10885 vectors during transformation. */
10886 unsigned total_nelts = olanes;
10887 unsigned process_nelts = olanes;
10888 if (repeating_p)
10890 total_nelts = (total_nelts / unpack_factor) * noutputs;
10891 if (gsi)
10892 process_nelts = total_nelts;
10894 unsigned last_ei = (total_nelts - 1) % process_nelts;
10895 for (unsigned i = 0; i < process_nelts; ++i)
10897 /* VI is the input vector index when generating code for REPEATING_P. */
10898 unsigned vi = i / olanes * (pack_p ? 2 : 1);
10899 unsigned ei = i % olanes;
10900 mask_element = vperm[ei].second;
10901 if (pack_p)
10903 /* In this case, we have N outputs and the single child provides 2N
10904 inputs. Output X permutes inputs 2X and 2X+1.
10906 The mask indices are taken directly from the SLP permutation node.
10907 Index X selects from the first vector if (X / NUNITS) % 2 == 0;
10908 X selects from the second vector otherwise. These conditions
10909 are only known at compile time for constant-length vectors. */
10910 first_vec = std::make_pair (0, 0);
10911 second_vec = std::make_pair (0, 1);
10913 else if (first_vec.first == -1U
10914 || first_vec == vperm[ei].first)
10915 first_vec = vperm[ei].first;
10916 else if (second_vec.first == -1U
10917 || second_vec == vperm[ei].first)
10919 second_vec = vperm[ei].first;
10920 mask_element += nunits;
10922 else
10924 if (dump_p)
10925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10926 "permutation requires at "
10927 "least three vectors\n");
10928 gcc_assert (!gsi);
10929 return -1;
10932 mask[index++] = mask_element;
10934 if (index == count)
10936 indices.new_vector (mask, second_vec.first == -1U ? 1 : 2,
10937 TYPE_VECTOR_SUBPARTS (op_vectype));
10938 bool identity_p = (indices.series_p (0, 1, mask[0], 1)
10939 && constant_multiple_p (mask[0], nunits));
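	  /* A consecutive series starting at a multiple of NUNITS simply
	     forwards an aligned part (or concat) of the input unchanged, so
	     it needs no VEC_PERM_EXPR and is not counted in NPERMS.  */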
10940 machine_mode vmode = TYPE_MODE (vectype);
10941 machine_mode op_vmode = TYPE_MODE (op_vectype);
10942 unsigned HOST_WIDE_INT c;
10943 if ((!identity_p
10944 && !can_vec_perm_const_p (vmode, op_vmode, indices))
10945 || (identity_p
10946 && !known_le (nunits,
10947 TYPE_VECTOR_SUBPARTS (op_vectype))
10948 && (!constant_multiple_p (nunits,
10949 TYPE_VECTOR_SUBPARTS (op_vectype),
10950 &c) || c != 2)))
10952 if (dump_p)
10954 dump_printf_loc (MSG_MISSED_OPTIMIZATION,
10955 vect_location,
10956 "unsupported vect permute { ");
10957 for (i = 0; i < count; ++i)
10959 dump_dec (MSG_MISSED_OPTIMIZATION, mask[i]);
10960 dump_printf (MSG_MISSED_OPTIMIZATION, " ");
10962 dump_printf (MSG_MISSED_OPTIMIZATION, "}\n");
10964 gcc_assert (!gsi);
10965 return -1;
10968 if (!identity_p)
10969 nperms += CEIL (total_nelts, process_nelts) - (ei > last_ei);
10970 if (gsi)
10972 if (second_vec.first == -1U)
10973 second_vec = first_vec;
10975 slp_tree
10976 first_node = children[first_vec.first],
10977 second_node = children[second_vec.first];
10979 tree mask_vec = NULL_TREE;
10980 if (!identity_p)
10981 mask_vec = vect_gen_perm_mask_checked (vectype, indices);
10983 tree first_def
10984 = vect_get_slp_vect_def (first_node, first_vec.second + vi);
10985 tree second_def
10986 = vect_get_slp_vect_def (second_node, second_vec.second + vi);
10987 vect_add_slp_permutation (vinfo, gsi, node, first_def,
10988 second_def, mask_vec, mask[0]);
10991 index = 0;
10992 first_vec = std::make_pair (-1U, -1U);
10993 second_vec = std::make_pair (-1U, -1U);
10997 return nperms;
11000 /* Vectorize the SLP permutations in NODE as specified
11001 in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
11002 child number and lane number.
11003 Interleaving of two two-lane two-child SLP subtrees (not supported):
11004 [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
11005 A blend of two four-lane two-child SLP subtrees:
11006 [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
11007 Highpart of a four-lane one-child SLP subtree (not supported):
11008 [ { 0, 2 }, { 0, 3 } ]
11009    Currently only a subset of these is supported by the code generation below.  */
11011 static bool
11012 vectorizable_slp_permutation (vec_info *vinfo, gimple_stmt_iterator *gsi,
11013 slp_tree node, stmt_vector_for_cost *cost_vec)
11015 tree vectype = SLP_TREE_VECTYPE (node);
11016 lane_permutation_t &perm = SLP_TREE_LANE_PERMUTATION (node);
11017 int nperms = vectorizable_slp_permutation_1 (vinfo, gsi, node, perm,
11018 SLP_TREE_CHILDREN (node),
11019 dump_enabled_p ());
11020 if (nperms < 0)
11021 return false;
11023 if (!gsi)
11024 record_stmt_cost (cost_vec, nperms, vec_perm, node, vectype, 0, vect_body);
11026 return true;
11029 /* Vectorize SLP NODE. */
11031 static void
11032 vect_schedule_slp_node (vec_info *vinfo,
11033 slp_tree node, slp_instance instance)
11035 gimple_stmt_iterator si;
11036 int i;
11037 slp_tree child;
11039 /* Vectorize externals and constants. */
11040 if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
11041 || SLP_TREE_DEF_TYPE (node) == vect_external_def)
11043 /* ??? vectorizable_shift can end up using a scalar operand which is
11044 currently denoted as !SLP_TREE_VECTYPE. No need to vectorize the
11045 node in this case. */
11046 if (!SLP_TREE_VECTYPE (node))
11047 return;
11049 /* There are two reasons vector defs might already exist. The first
11050 is that we are vectorizing an existing vector def. The second is
11051 when performing BB vectorization shared constant/external nodes
11052 are not split apart during partitioning so during the code-gen
11053 DFS walk we can end up visiting them twice. */
11054 if (! SLP_TREE_VEC_DEFS (node).exists ())
11055 vect_create_constant_vectors (vinfo, node);
11056 return;
11059 gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());
11061 stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
11063 gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
11064 SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
11066 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11067 && STMT_VINFO_DATA_REF (stmt_info))
11069 /* Vectorized loads go before the first scalar load to make it
11070 ready early, vectorized stores go before the last scalar
11071 stmt which is where all uses are ready. */
11072 stmt_vec_info last_stmt_info = NULL;
11073 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
11074 last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
11075 else /* DR_IS_WRITE */
11076 last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
11077 si = gsi_for_stmt (last_stmt_info->stmt);
11079 else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11080 && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
11081 || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
11082 || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
11084 /* For PHI node vectorization we do not use the insertion iterator. */
11085 si = gsi_none ();
11087 else
11089       /* Emit other stmts after the children's vectorized defs, which is
11090 	 the earliest possible place.  */
11091 gimple *last_stmt = NULL;
11092 bool seen_vector_def = false;
11093 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11094 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
11096 /* For fold-left reductions we are retaining the scalar
11097 reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
11098 set so the representation isn't perfect. Resort to the
11099 last scalar def here. */
11100 if (SLP_TREE_VEC_DEFS (child).is_empty ())
11102 gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
11103 == cycle_phi_info_type);
11104 gphi *phi = as_a <gphi *>
11105 (vect_find_last_scalar_stmt_in_slp (child)->stmt);
11106 if (!last_stmt
11107 || vect_stmt_dominates_stmt_p (last_stmt, phi))
11108 last_stmt = phi;
11110 /* We are emitting all vectorized stmts in the same place and
11111 the last one is the last.
11112 ??? Unless we have a load permutation applied and that
11113 figures to re-use an earlier generated load. */
11114 unsigned j;
11115 tree vdef;
11116 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11118 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11119 if (!last_stmt
11120 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11121 last_stmt = vstmt;
11124 else if (!SLP_TREE_VECTYPE (child))
11126 	    /* For externals that are used unvectorized, look at all scalar defs.  */
11127 unsigned j;
11128 tree def;
11129 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
11130 if (TREE_CODE (def) == SSA_NAME
11131 && !SSA_NAME_IS_DEFAULT_DEF (def))
11133 gimple *stmt = SSA_NAME_DEF_STMT (def);
11134 if (!last_stmt
11135 || vect_stmt_dominates_stmt_p (last_stmt, stmt))
11136 last_stmt = stmt;
11139 else
11141 /* For externals we have to look at all defs since their
11142 insertion place is decided per vector. But beware
11143 of pre-existing vectors where we need to make sure
11144 we do not insert before the region boundary. */
11145 if (SLP_TREE_SCALAR_OPS (child).is_empty ()
11146 && !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
11147 seen_vector_def = true;
11148 else
11150 unsigned j;
11151 tree vdef;
11152 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
11153 if (TREE_CODE (vdef) == SSA_NAME
11154 && !SSA_NAME_IS_DEFAULT_DEF (vdef))
11156 gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
11157 if (!last_stmt
11158 || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
11159 last_stmt = vstmt;
11163 /* This can happen when all children are pre-existing vectors or
11164 constants. */
11165 if (!last_stmt)
11166 last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
11167 if (!last_stmt)
11169 gcc_assert (seen_vector_def);
11170 si = gsi_after_labels (vinfo->bbs[0]);
11172 else if (is_ctrl_altering_stmt (last_stmt))
11174 /* We split regions to vectorize at control altering stmts
11175 with a definition so this must be an external which
11176 we can insert at the start of the region. */
11177 si = gsi_after_labels (vinfo->bbs[0]);
11179 else if (is_a <bb_vec_info> (vinfo)
11180 && SLP_TREE_CODE (node) != VEC_PERM_EXPR
11181 && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
11182 && gimple_could_trap_p (stmt_info->stmt))
11184 /* We've constrained possibly trapping operations to all come
11185 from the same basic-block, if vectorized defs would allow earlier
11186 scheduling still force vectorized stmts to the original block.
11187 This is only necessary for BB vectorization since for loop vect
11188 all operations are in a single BB and scalar stmt based
11189 placement doesn't play well with epilogue vectorization. */
11190 gcc_assert (dominated_by_p (CDI_DOMINATORS,
11191 gimple_bb (stmt_info->stmt),
11192 gimple_bb (last_stmt)));
11193 si = gsi_after_labels (gimple_bb (stmt_info->stmt));
11195 else if (is_a <gphi *> (last_stmt))
11196 si = gsi_after_labels (gimple_bb (last_stmt));
11197 else
11199 si = gsi_for_stmt (last_stmt);
11200 gsi_next (&si);
11202 /* Avoid scheduling internal defs outside of the loop when
11203 we might have only implicitly tracked loop mask/len defs. */
11204 if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
11205 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
11206 || LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
11208 gimple_stmt_iterator si2
11209 = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
11210 if ((gsi_end_p (si2)
11211 && (LOOP_VINFO_LOOP (loop_vinfo)->header
11212 != gimple_bb (last_stmt))
11213 && dominated_by_p (CDI_DOMINATORS,
11214 LOOP_VINFO_LOOP (loop_vinfo)->header,
11215 gimple_bb (last_stmt)))
11216 || (!gsi_end_p (si2)
11217 && last_stmt != *si2
11218 && vect_stmt_dominates_stmt_p (last_stmt, *si2)))
11219 si = si2;
11224 /* Handle purely internal nodes. */
11225 if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
11227 if (dump_enabled_p ())
11228 dump_printf_loc (MSG_NOTE, vect_location,
11229 "------>vectorizing SLP permutation node\n");
11230       /* ??? The transform kind is stored in STMT_VINFO_TYPE, which might
11231 	 be shared between different SLP nodes (but it is usually the same
11232 	 operation apart from the case where the stmt is only there to denote
11233 	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
11234 	 but open-code it here (partly).  */
11235 bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
11236 gcc_assert (done);
11237 stmt_vec_info slp_stmt_info;
11238 unsigned int i;
11239 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
11240 if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
11242 done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
11243 instance, i, true, NULL);
11244 gcc_assert (done);
11247 else
11249 if (dump_enabled_p ())
11250 dump_printf_loc (MSG_NOTE, vect_location,
11251 "------>vectorizing SLP node starting from: %G",
11252 stmt_info->stmt);
11253 vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
11257 /* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
11258 For loop vectorization this is done in vectorizable_call, but for SLP
11259 it needs to be deferred until end of vect_schedule_slp, because multiple
11260 SLP instances may refer to the same scalar stmt. */
11262 static void
11263 vect_remove_slp_scalar_calls (vec_info *vinfo,
11264 slp_tree node, hash_set<slp_tree> &visited)
11266 gimple *new_stmt;
11267 gimple_stmt_iterator gsi;
11268 int i;
11269 slp_tree child;
11270 tree lhs;
11271 stmt_vec_info stmt_info;
11273 if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11274 return;
11276 if (visited.add (node))
11277 return;
11279 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11280 vect_remove_slp_scalar_calls (vinfo, child, visited);
11282 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
11284 if (!stmt_info)
11285 continue;
11286 gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
11287 if (!stmt || gimple_bb (stmt) == NULL)
11288 continue;
11289 if (is_pattern_stmt_p (stmt_info)
11290 || !PURE_SLP_STMT (stmt_info))
11291 continue;
11292 lhs = gimple_call_lhs (stmt);
11293 if (lhs)
11294 new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
11295 else
11297 new_stmt = gimple_build_nop ();
11298 unlink_stmt_vdef (stmt_info->stmt);
11300 gsi = gsi_for_stmt (stmt);
11301 vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
11302 if (lhs)
11303 SSA_NAME_DEF_STMT (lhs) = new_stmt;
11307 static void
11308 vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
11310 hash_set<slp_tree> visited;
11311 vect_remove_slp_scalar_calls (vinfo, node, visited);
11314 /* Vectorize the instance root. */
11316 void
11317 vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
11319 gassign *rstmt = NULL;
11321 if (instance->kind == slp_inst_kind_ctor)
11323 if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
11325 tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
11326 tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11327 if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
11328 TREE_TYPE (vect_lhs)))
11329 vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
11330 vect_lhs);
11331 rstmt = gimple_build_assign (root_lhs, vect_lhs);
11333 else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
11335 int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
11336 tree child_def;
11337 int j;
11338 vec<constructor_elt, va_gc> *v;
11339 vec_alloc (v, nelts);
11341 /* A CTOR can handle V16HI composition from VNx8HI so we
11342 do not need to convert vector elements if the types
11343 do not match. */
11344 FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
11345 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
11346 tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
11347 tree rtype
11348 = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
11349 tree r_constructor = build_constructor (rtype, v);
11350 rstmt = gimple_build_assign (lhs, r_constructor);
11353 else if (instance->kind == slp_inst_kind_bb_reduc)
11355 /* Largely inspired by reduction chain epilogue handling in
11356 vect_create_epilog_for_reduction. */
11357 vec<tree> vec_defs = vNULL;
11358 vect_get_slp_defs (node, &vec_defs);
11359 enum tree_code reduc_code
11360 = gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
11361 /* ??? We actually have to reflect signs somewhere. */
11362 if (reduc_code == MINUS_EXPR)
11363 reduc_code = PLUS_EXPR;
11364 gimple_seq epilogue = NULL;
11365       /* We may end up with more than one vector result; reduce them
11366 to one vector. */
11367 tree vec_def = vec_defs[0];
11368 tree vectype = TREE_TYPE (vec_def);
11369 tree compute_vectype = vectype;
11370 bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
11371 && TYPE_OVERFLOW_UNDEFINED (vectype)
11372 && operation_can_overflow (reduc_code));
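      /* If the reduction operation could overflow and overflow is undefined
	 for VECTYPE, do the accumulation in the corresponding unsigned type
	 to avoid introducing undefined overflow.  */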
11373 if (pun_for_overflow_p)
11375 compute_vectype = unsigned_type_for (vectype);
11376 vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11377 compute_vectype, vec_def);
11379 for (unsigned i = 1; i < vec_defs.length (); ++i)
11381 tree def = vec_defs[i];
11382 if (pun_for_overflow_p)
11383 def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
11384 compute_vectype, def);
11385 vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
11386 vec_def, def);
11388 vec_defs.release ();
11389 /* ??? Support other schemes than direct internal fn. */
11390 internal_fn reduc_fn;
11391 if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
11392 || reduc_fn == IFN_LAST)
11393 gcc_unreachable ();
11394 tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
11395 TREE_TYPE (compute_vectype), vec_def);
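      /* Defs of the reduction that remain unvectorized
	 (SLP_INSTANCE_REMAIN_DEFS) are accumulated separately below and then
	 folded into the scalar result.  */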
11396 if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
11398 tree rem_def = NULL_TREE;
11399 for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
11401 def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
11402 if (!rem_def)
11403 rem_def = def;
11404 else
11405 rem_def = gimple_build (&epilogue, reduc_code,
11406 TREE_TYPE (scalar_def),
11407 rem_def, def);
11409 scalar_def = gimple_build (&epilogue, reduc_code,
11410 TREE_TYPE (scalar_def),
11411 scalar_def, rem_def);
11413 scalar_def = gimple_convert (&epilogue,
11414 TREE_TYPE (vectype), scalar_def);
11415 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11416 gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
11417 gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
11418 update_stmt (gsi_stmt (rgsi));
11419 return;
11421 else if (instance->kind == slp_inst_kind_gcond)
11423 /* Only support a single root for now as we can't codegen CFG yet and so we
11424 can't support lane > 1 at this time. */
11425 gcc_assert (instance->root_stmts.length () == 1);
11426 auto root_stmt_info = instance->root_stmts[0];
11427 auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
11428 gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
11429 gimple *vec_stmt = NULL;
11430 gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
11431 bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
11432 &vec_stmt, node, NULL);
11433 gcc_assert (res);
11434 return;
11436 else
11437 gcc_unreachable ();
11439 gcc_assert (rstmt);
11441 gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
11442 gsi_replace (&rgsi, rstmt, true);
11445 struct slp_scc_info
11447 bool on_stack;
11448 int dfs;
11449 int lowlink;
11452 /* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs. */
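/* The walk below is essentially Tarjan's SCC algorithm: DFS preorder numbers
   and low-links are recorded in SLP_SCC_INFO and the nodes of the open SCC
   are kept on an explicit STACK.  */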
11454 static void
11455 vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
11456 hash_map<slp_tree, slp_scc_info> &scc_info,
11457 int &maxdfs, vec<slp_tree> &stack)
11459 bool existed_p;
11460 slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
11461 gcc_assert (!existed_p);
11462 info->dfs = maxdfs;
11463 info->lowlink = maxdfs;
11464 maxdfs++;
11466 /* Leaf. */
11467 if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
11469 info->on_stack = false;
11470 vect_schedule_slp_node (vinfo, node, instance);
11471 return;
11474 info->on_stack = true;
11475 stack.safe_push (node);
11477 unsigned i;
11478 slp_tree child;
11479 /* DFS recurse. */
11480 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
11482 if (!child)
11483 continue;
11484 slp_scc_info *child_info = scc_info.get (child);
11485 if (!child_info)
11487 vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
11488 /* Recursion might have re-allocated the node. */
11489 info = scc_info.get (node);
11490 child_info = scc_info.get (child);
11491 info->lowlink = MIN (info->lowlink, child_info->lowlink);
11493 else if (child_info->on_stack)
11494 info->lowlink = MIN (info->lowlink, child_info->dfs);
11496 if (info->lowlink != info->dfs)
11497 return;
11499 auto_vec<slp_tree, 4> phis_to_fixup;
11501 /* Singleton. */
11502 if (stack.last () == node)
11504 stack.pop ();
11505 info->on_stack = false;
11506 vect_schedule_slp_node (vinfo, node, instance);
11507 if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
11508 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
11509 phis_to_fixup.quick_push (node);
11511 else
11513 /* SCC. */
11514 int last_idx = stack.length () - 1;
11515 while (stack[last_idx] != node)
11516 last_idx--;
11517 	 /* We can break the cycle at PHIs that have at least one child
11518 code generated. Then we could re-start the DFS walk until
11519 all nodes in the SCC are covered (we might have new entries
11520 for only back-reachable nodes). But it's simpler to just
11521 iterate and schedule those that are ready. */
11522 unsigned todo = stack.length () - last_idx;
11525 for (int idx = stack.length () - 1; idx >= last_idx; --idx)
11527 slp_tree entry = stack[idx];
11528 if (!entry)
11529 continue;
11530 bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
11531 && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (entry)->stmt));
11532 bool ready = !phi;
11533 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
11534 if (!child)
11536 gcc_assert (phi);
11537 ready = true;
11538 break;
11540 else if (scc_info.get (child)->on_stack)
11542 if (!phi)
11544 ready = false;
11545 break;
11548 else
11550 if (phi)
11552 ready = true;
11553 break;
11556 if (ready)
11558 vect_schedule_slp_node (vinfo, entry, instance);
11559 scc_info.get (entry)->on_stack = false;
11560 stack[idx] = NULL;
11561 todo--;
11562 if (phi)
11563 phis_to_fixup.safe_push (entry);
11567 while (todo != 0);
11569 /* Pop the SCC. */
11570 stack.truncate (last_idx);
11573 /* Now fixup the backedge def of the vectorized PHIs in this SCC. */
11574 slp_tree phi_node;
11575 FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
11577 gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
11578 edge_iterator ei;
11579 edge e;
11580 FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
11582 unsigned dest_idx = e->dest_idx;
11583 child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
11584 if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
11585 continue;
11586 unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
11587 /* Simply fill all args. */
11588 if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
11589 != vect_first_order_recurrence)
11590 for (unsigned i = 0; i < n; ++i)
11592 tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
11593 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
11594 add_phi_arg (phi, vect_get_slp_vect_def (child, i),
11595 e, gimple_phi_arg_location (phi, dest_idx));
11597 else
11599 /* Unless it is a first order recurrence which needs
11600 args filled in for both the PHI node and the permutes. */
11601 gimple *perm
11602 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
11603 gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
11604 add_phi_arg (as_a <gphi *> (rphi),
11605 vect_get_slp_vect_def (child, n - 1),
11606 e, gimple_phi_arg_location (phi, dest_idx));
11607 for (unsigned i = 0; i < n; ++i)
11609 gimple *perm
11610 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
11611 if (i > 0)
11612 gimple_assign_set_rhs1 (perm,
11613 vect_get_slp_vect_def (child, i - 1));
11614 gimple_assign_set_rhs2 (perm,
11615 vect_get_slp_vect_def (child, i));
11616 update_stmt (perm);
11623 /* Generate vector code for SLP_INSTANCES in the loop/basic block. */
11625 void
11626 vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
11628 slp_instance instance;
11629 unsigned int i;
11631 hash_map<slp_tree, slp_scc_info> scc_info;
11632 int maxdfs = 0;
11633 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11635 slp_tree node = SLP_INSTANCE_TREE (instance);
11636 if (dump_enabled_p ())
11638 dump_printf_loc (MSG_NOTE, vect_location,
11639 "Vectorizing SLP tree:\n");
11640 /* ??? Dump all? */
11641 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11642 dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
11643 SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
11644 vect_print_slp_graph (MSG_NOTE, vect_location,
11645 SLP_INSTANCE_TREE (instance));
11647       /* Schedule the tree of INSTANCE, scheduling SCCs in a way that
11648 	 makes a PHI the node breaking the cycle.  */
11649 auto_vec<slp_tree> stack;
11650 if (!scc_info.get (node))
11651 vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);
11653 if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
11654 vectorize_slp_instance_root_stmt (vinfo, node, instance);
11656 if (dump_enabled_p ())
11657 dump_printf_loc (MSG_NOTE, vect_location,
11658 "vectorizing stmts using SLP.\n");
11661 FOR_EACH_VEC_ELT (slp_instances, i, instance)
11663 slp_tree root = SLP_INSTANCE_TREE (instance);
11664 stmt_vec_info store_info;
11665 unsigned int j;
11667 /* Remove scalar call stmts. Do not do this for basic-block
11668 vectorization as not all uses may be vectorized.
11669 ??? Why should this be necessary? DCE should be able to
11670 remove the stmts itself.
11671 ??? For BB vectorization we can as well remove scalar
11672 stmts starting from the SLP tree root if they have no
11673 uses. */
11674 if (is_a <loop_vec_info> (vinfo))
11675 vect_remove_slp_scalar_calls (vinfo, root);
11677       /* Remove the original scalar stmts of vectorized stores.  */
11678 for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
11680 if (!STMT_VINFO_DATA_REF (store_info)
11681 || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
11682 break;
11684 store_info = vect_orig_stmt (store_info);
11685 /* Free the attached stmt_vec_info and remove the stmt. */
11686 vinfo->remove_stmt (store_info);
11688 /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
11689 to not crash in vect_free_slp_tree later. */
11690 if (SLP_TREE_REPRESENTATIVE (root) == store_info)
11691 SLP_TREE_REPRESENTATIVE (root) = NULL;