/* SLP - Basic Block Vectorization
   Copyright (C) 2007-2025 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>
   and Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#define INCLUDE_ALGORITHM
#include "coretypes.h"
#include "tree-pass.h"
#include "optabs-tree.h"
#include "insn-config.h"
#include "recog.h"   /* FIXME: for insn_data */
#include "fold-const.h"
#include "stor-layout.h"
#include "gimple-iterator.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "gimple-walk.h"
#include "tree-vector-builder.h"
#include "vec-perm-indices.h"
#include "gimple-fold.h"
#include "internal-fn.h"
#include "dump-context.h"
#include "alloc-pool.h"
static bool vect_transform_slp_perm_load_1 (vec_info *, slp_tree,
                                            load_permutation_t &,
                                            const vec<tree> &,
                                            gimple_stmt_iterator *,
                                            poly_uint64, bool, bool,
                                            unsigned *,
                                            unsigned * = nullptr,
                                            bool = false);
static int vectorizable_slp_permutation_1 (vec_info *, gimple_stmt_iterator *,
                                           slp_tree, lane_permutation_t &,
                                           vec<slp_tree> &, bool);
static bool vectorizable_slp_permutation (vec_info *, gimple_stmt_iterator *,
                                          slp_tree, stmt_vector_for_cost *);
static void vect_print_slp_tree (dump_flags_t, dump_location_t, slp_tree);
static bool vect_slp_can_convert_to_external (const vec<stmt_vec_info> &);

static object_allocator<_slp_tree> *slp_tree_pool;
static slp_tree slp_first_node;

void
vect_slp_init (void)
{
  slp_tree_pool = new object_allocator<_slp_tree> ("SLP nodes");
}

void
vect_slp_fini (void)
{
  while (slp_first_node)
    delete slp_first_node;
  delete slp_tree_pool;
  slp_tree_pool = NULL;
}

void *
_slp_tree::operator new (size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  return slp_tree_pool->allocate_raw ();
}

void
_slp_tree::operator delete (void *node, size_t n)
{
  gcc_assert (n == sizeof (_slp_tree));
  slp_tree_pool->remove_raw (node);
}
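
/* Note on the allocation scheme above (summary added for illustration, not
   part of the original sources): SLP nodes are carved out of the dedicated
   object_allocator pool and are additionally threaded onto the intrusive
   slp_first_node/prev_node/next_node list by the node constructor below,
   which is what allows vect_slp_fini to reclaim stray nodes simply by
   deleting the list head until the list is empty.  */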
/* Initialize a SLP node.  */

_slp_tree::_slp_tree ()
{
  this->prev_node = NULL;
  if (slp_first_node)
    slp_first_node->prev_node = this;
  this->next_node = slp_first_node;
  slp_first_node = this;
  SLP_TREE_SCALAR_STMTS (this) = vNULL;
  SLP_TREE_SCALAR_OPS (this) = vNULL;
  SLP_TREE_VEC_DEFS (this) = vNULL;
  SLP_TREE_NUMBER_OF_VEC_STMTS (this) = 0;
  SLP_TREE_CHILDREN (this) = vNULL;
  SLP_TREE_LOAD_PERMUTATION (this) = vNULL;
  SLP_TREE_LANE_PERMUTATION (this) = vNULL;
  SLP_TREE_SIMD_CLONE_INFO (this) = vNULL;
  SLP_TREE_DEF_TYPE (this) = vect_uninitialized_def;
  SLP_TREE_CODE (this) = ERROR_MARK;
  this->ldst_lanes = false;
  SLP_TREE_VECTYPE (this) = NULL_TREE;
  SLP_TREE_REPRESENTATIVE (this) = NULL;
  SLP_TREE_MEMORY_ACCESS_TYPE (this) = VMAT_INVARIANT;
  SLP_TREE_REF_COUNT (this) = 1;
  this->failed = NULL;
  this->max_nunits = 1;
  this->lanes = 0;
}
/* Tear down a SLP node.  */

_slp_tree::~_slp_tree ()
{
  if (this->prev_node)
    this->prev_node->next_node = this->next_node;
  else
    slp_first_node = this->next_node;
  if (this->next_node)
    this->next_node->prev_node = this->prev_node;
  SLP_TREE_CHILDREN (this).release ();
  SLP_TREE_SCALAR_STMTS (this).release ();
  SLP_TREE_SCALAR_OPS (this).release ();
  SLP_TREE_VEC_DEFS (this).release ();
  SLP_TREE_LOAD_PERMUTATION (this).release ();
  SLP_TREE_LANE_PERMUTATION (this).release ();
  SLP_TREE_SIMD_CLONE_INFO (this).release ();
  if (this->failed)
    free (failed);
}
/* Push the single SSA definition in DEF to the vector of vector defs.  */

void
_slp_tree::push_vec_def (gimple *def)
{
  if (gphi *phi = dyn_cast <gphi *> (def))
    vec_defs.quick_push (gimple_phi_result (phi));
  else
    {
      def_operand_p defop = single_ssa_def_operand (def, SSA_OP_ALL_DEFS);
      vec_defs.quick_push (get_def_from_ptr (defop));
    }
}
/* Recursively free the memory allocated for the SLP tree rooted at NODE.  */

void
vect_free_slp_tree (slp_tree node)
{
  int i;
  slp_tree child;

  if (--SLP_TREE_REF_COUNT (node) != 0)
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_free_slp_tree (child);

  /* If the node defines any SLP only patterns then those patterns are no
     longer valid and should be removed.  */
  stmt_vec_info rep_stmt_info = SLP_TREE_REPRESENTATIVE (node);
  if (rep_stmt_info && STMT_VINFO_SLP_VECT_ONLY_PATTERN (rep_stmt_info))
    {
      stmt_vec_info stmt_info = vect_orig_stmt (rep_stmt_info);
      STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
      STMT_SLP_TYPE (stmt_info) = STMT_SLP_TYPE (rep_stmt_info);
    }

  delete node;
}
/* Return a location suitable for dumps related to the SLP instance.  */

dump_user_location_t
_slp_instance::location () const
{
  if (!root_stmts.is_empty ())
    return root_stmts[0]->stmt;
  else
    return SLP_TREE_SCALAR_STMTS (root)[0]->stmt;
}
/* Free the memory allocated for the SLP instance.  */

void
vect_free_slp_instance (slp_instance instance)
{
  vect_free_slp_tree (SLP_INSTANCE_TREE (instance));
  SLP_INSTANCE_LOADS (instance).release ();
  SLP_INSTANCE_ROOT_STMTS (instance).release ();
  SLP_INSTANCE_REMAIN_DEFS (instance).release ();
  instance->subgraph_entries.release ();
  instance->cost_vec.release ();
  free (instance);
}
/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (unsigned nops, tree_code code)
{
  slp_tree node = new _slp_tree;
  SLP_TREE_SCALAR_STMTS (node) = vNULL;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_CODE (node) = code;
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node,
                          vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  SLP_TREE_SCALAR_STMTS (node) = scalar_stmts;
  SLP_TREE_CHILDREN (node).create (nops);
  SLP_TREE_DEF_TYPE (node) = vect_internal_def;
  SLP_TREE_REPRESENTATIVE (node) = scalar_stmts[0];
  SLP_TREE_LANES (node) = scalar_stmts.length ();
  return node;
}

/* Create an SLP node for SCALAR_STMTS.  */

static slp_tree
vect_create_new_slp_node (vec<stmt_vec_info> scalar_stmts, unsigned nops)
{
  return vect_create_new_slp_node (new _slp_tree, scalar_stmts, nops);
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (slp_tree node, vec<tree> ops)
{
  SLP_TREE_SCALAR_OPS (node) = ops;
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  SLP_TREE_LANES (node) = ops.length ();
  return node;
}

/* Create an SLP node for OPS.  */

static slp_tree
vect_create_new_slp_node (vec<tree> ops)
{
  return vect_create_new_slp_node (new _slp_tree, ops);
}
/* This structure is used in creation of an SLP tree.  Each instance
   corresponds to the same operand in a group of scalar stmts in an SLP
   node.  */
typedef struct _slp_oprnd_info
{
  /* Def-stmts for the operands.  */
  vec<stmt_vec_info> def_stmts;
  /* Operands.  */
  vec<tree> ops;
  /* Information about the first statement, its vector def-type, type, the
     operand itself in case it's constant, and an indication if it's a pattern
     stmt and gather/scatter info.  */
  tree first_op_type;
  enum vect_def_type first_dt;
  bool any_pattern;
  bool first_gs_p;
  gather_scatter_info first_gs_info;
} *slp_oprnd_info;


/* Allocate operands info for NOPS operands, and GROUP_SIZE def-stmts for each
   operand.  */
static vec<slp_oprnd_info>
vect_create_oprnd_info (int nops, int group_size)
{
  int i;
  slp_oprnd_info oprnd_info;
  vec<slp_oprnd_info> oprnds_info;

  oprnds_info.create (nops);
  for (i = 0; i < nops; i++)
    {
      oprnd_info = XNEW (struct _slp_oprnd_info);
      oprnd_info->def_stmts.create (group_size);
      oprnd_info->ops.create (group_size);
      oprnd_info->first_dt = vect_uninitialized_def;
      oprnd_info->first_op_type = NULL_TREE;
      oprnd_info->any_pattern = false;
      oprnd_info->first_gs_p = false;
      oprnds_info.quick_push (oprnd_info);
    }

  return oprnds_info;
}


/* Free operands info.  */

static void
vect_free_oprnd_info (vec<slp_oprnd_info> &oprnds_info)
{
  int i;
  slp_oprnd_info oprnd_info;

  FOR_EACH_VEC_ELT (oprnds_info, i, oprnd_info)
    {
      oprnd_info->def_stmts.release ();
      oprnd_info->ops.release ();
      XDELETE (oprnd_info);
    }

  oprnds_info.release ();
}
/* Return the execution frequency of NODE (so that a higher value indicates
   a "more important" node when optimizing for speed).  */

static sreal
vect_slp_node_weight (slp_tree node)
{
  stmt_vec_info stmt_info = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (node));
  basic_block bb = gimple_bb (stmt_info->stmt);
  return bb->count.to_sreal_scale (ENTRY_BLOCK_PTR_FOR_FN (cfun)->count);
}
/* Return true if STMTS contains a pattern statement.  */

static bool
vect_contains_pattern_stmt_p (vec<stmt_vec_info> stmts)
{
  stmt_vec_info stmt_info;
  unsigned int i;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    if (stmt_info && is_pattern_stmt_p (stmt_info))
      return true;
  return false;
}
/* Return true when all lanes in the external or constant NODE have
   the same value.  */

static bool
vect_slp_tree_uniform_p (slp_tree node)
{
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_constant_def
              || SLP_TREE_DEF_TYPE (node) == vect_external_def);

  /* Pre-existing vectors.  */
  if (SLP_TREE_SCALAR_OPS (node).is_empty ())
    return false;

  unsigned i;
  tree op, first = NULL_TREE;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
    if (!first)
      first = op;
    else if (!operand_equal_p (first, op, 0))
      return false;

  return true;
}
/* Find the place of the data-ref in STMT_INFO in the interleaving chain
   that starts from FIRST_STMT_INFO.  Return -1 if the data-ref is not a part
   of the chain.  */

int
vect_get_place_in_interleaving_chain (stmt_vec_info stmt_info,
                                      stmt_vec_info first_stmt_info)
{
  stmt_vec_info next_stmt_info = first_stmt_info;
  int result = 0;

  if (first_stmt_info != DR_GROUP_FIRST_ELEMENT (stmt_info))
    return -1;

  do
    {
      if (next_stmt_info == stmt_info)
        return result;
      next_stmt_info = DR_GROUP_NEXT_ELEMENT (next_stmt_info);
      if (next_stmt_info)
        result += DR_GROUP_GAP (next_stmt_info);
    }
  while (next_stmt_info);

  return -1;
}
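
/* Worked example (illustrative, not part of the original sources): for a
   group accessing a[0], a[1] and a[3], where the skipped a[2] is represented
   as a DR_GROUP_GAP of 2 on the a[3] element, the returned places are 0, 1
   and 3 - the gap is accumulated into RESULT before the matching element
   is reached.  */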
/* Check whether it is possible to load COUNT elements of type ELT_TYPE
   using the method implemented by duplicate_and_interleave.  Return true
   if so, returning the number of intermediate vectors in *NVECTORS_OUT
   (if nonnull) and the type of each intermediate vector in *VECTOR_TYPE_OUT
   (if nonnull).  */

bool
can_duplicate_and_interleave_p (vec_info *vinfo, unsigned int count,
                                tree elt_type, unsigned int *nvectors_out,
                                tree *vector_type_out,
                                tree *permutes)
{
  tree base_vector_type = get_vectype_for_scalar_type (vinfo, elt_type, count);
  if (!base_vector_type || !VECTOR_MODE_P (TYPE_MODE (base_vector_type)))
    return false;

  machine_mode base_vector_mode = TYPE_MODE (base_vector_type);
  poly_int64 elt_bytes = count * GET_MODE_UNIT_SIZE (base_vector_mode);
  unsigned int nvectors = 1;
  for (;;)
    {
      scalar_int_mode int_mode;
      poly_int64 elt_bits = elt_bytes * BITS_PER_UNIT;
      if (int_mode_for_size (elt_bits, 1).exists (&int_mode))
        {
          /* Get the natural vector type for this SLP group size.  */
          tree int_type = build_nonstandard_integer_type
            (GET_MODE_BITSIZE (int_mode), 1);
          tree vector_type
            = get_vectype_for_scalar_type (vinfo, int_type, count);
          poly_int64 half_nelts;
          if (vector_type
              && VECTOR_MODE_P (TYPE_MODE (vector_type))
              && known_eq (GET_MODE_SIZE (TYPE_MODE (vector_type)),
                           GET_MODE_SIZE (base_vector_mode))
              && multiple_p (GET_MODE_NUNITS (TYPE_MODE (vector_type)),
                             2, &half_nelts))
            {
              /* Try fusing consecutive sequences of COUNT / NVECTORS elements
                 together into elements of type INT_TYPE and using the result
                 to build NVECTORS vectors.  */
              poly_uint64 nelts = GET_MODE_NUNITS (TYPE_MODE (vector_type));
              vec_perm_builder sel1 (nelts, 2, 3);
              vec_perm_builder sel2 (nelts, 2, 3);

              for (unsigned int i = 0; i < 3; ++i)
                {
                  sel1.quick_push (i);
                  sel1.quick_push (i + nelts);
                  sel2.quick_push (half_nelts + i);
                  sel2.quick_push (half_nelts + i + nelts);
                }
              vec_perm_indices indices1 (sel1, 2, nelts);
              vec_perm_indices indices2 (sel2, 2, nelts);
              machine_mode vmode = TYPE_MODE (vector_type);
              if (can_vec_perm_const_p (vmode, vmode, indices1)
                  && can_vec_perm_const_p (vmode, vmode, indices2))
                {
                  if (nvectors_out)
                    *nvectors_out = nvectors;
                  if (vector_type_out)
                    *vector_type_out = vector_type;
                  if (permutes)
                    {
                      permutes[0] = vect_gen_perm_mask_checked (vector_type,
                                                                indices1);
                      permutes[1] = vect_gen_perm_mask_checked (vector_type,
                                                                indices2);
                    }
                  return true;
                }
            }
        }
      if (!multiple_p (elt_bytes, 2, &elt_bytes))
        return false;
      nvectors *= 2;
      /* We need to be able to fuse COUNT / NVECTORS elements together.  */
      if (!multiple_p (count, nvectors))
        return false;
    }
}
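
/* Illustrative sketch of the search above (hypothetical numbers, not from
   the original sources): for COUNT == 4 one-byte elements on a 16-byte
   vector target the first iteration asks whether four chars can be fused
   into a single 32-bit integer element; if the two interleaving permutes
   on that integer vector type are supported, *NVECTORS_OUT is 1.
   Otherwise ELT_BYTES is halved and NVECTORS doubled, next trying two
   vectors of fused 16-bit elements, until COUNT stops being a multiple
   of NVECTORS.  */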
/* Return true if DTA and DTB match.  */

static bool
vect_def_types_match (enum vect_def_type dta, enum vect_def_type dtb)
{
  return (dta == dtb
          || ((dta == vect_external_def || dta == vect_constant_def)
              && (dtb == vect_external_def || dtb == vect_constant_def)));
}
static const int cond_expr_maps[3][5] = {
  { 4, -1, -2, 1, 2 },
  { 4, -2, -1, 1, 2 },
  { 4, -1, -2, 2, 1 }
};
static const int no_arg_map[] = { 0 };
static const int arg0_map[] = { 1, 0 };
static const int arg1_map[] = { 1, 1 };
static const int arg2_arg3_map[] = { 2, 2, 3 };
static const int arg1_arg3_map[] = { 2, 1, 3 };
static const int arg1_arg4_arg5_map[] = { 3, 1, 4, 5 };
static const int arg1_arg3_arg4_map[] = { 3, 1, 3, 4 };
static const int arg3_arg2_map[] = { 2, 3, 2 };
static const int op1_op0_map[] = { 2, 1, 0 };
static const int off_map[] = { 1, -3 };
static const int off_op0_map[] = { 2, -3, 0 };
static const int off_arg2_arg3_map[] = { 3, -3, 2, 3 };
static const int off_arg3_arg2_map[] = { 3, -3, 3, 2 };
static const int mask_call_maps[6][7] = {
  { 1, 1, },
  { 2, 1, 2, },
  { 3, 1, 2, 3, },
  { 4, 1, 2, 3, 4, },
  { 5, 1, 2, 3, 4, 5, },
  { 6, 1, 2, 3, 4, 5, 6 },
};
/* For most SLP statements, there is a one-to-one mapping between
   gimple arguments and child nodes.  If that is not true for STMT,
   return an array that contains:

   - the number of child nodes, followed by
   - for each child node, the index of the argument associated with that node.
     The special index -1 is the first operand of an embedded comparison and
     the special index -2 is the second operand of an embedded comparison.
     The special index -3 is the offset of a gather as analyzed by
     vect_check_gather_scatter.

   SWAP is as for vect_get_and_check_slp_defs.  */

static const int *
vect_get_operand_map (const gimple *stmt, bool gather_scatter_p = false,
                      unsigned char swap = 0)
{
  if (auto assign = dyn_cast<const gassign *> (stmt))
    {
      if (gimple_assign_rhs_code (assign) == COND_EXPR
          && COMPARISON_CLASS_P (gimple_assign_rhs1 (assign)))
        return cond_expr_maps[swap];
      if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison
          && swap)
        return op1_op0_map;
      if (gather_scatter_p)
        return (TREE_CODE (gimple_assign_lhs (assign)) != SSA_NAME
                ? off_op0_map : off_map);
    }
  gcc_assert (!swap);
  if (auto call = dyn_cast<const gcall *> (stmt))
    {
      if (gimple_call_internal_p (call))
        switch (gimple_call_internal_fn (call))
          {
          case IFN_MASK_LOAD:
            return gather_scatter_p ? off_arg2_arg3_map : arg2_arg3_map;

          case IFN_GATHER_LOAD:
            return arg1_map;

          case IFN_MASK_GATHER_LOAD:
          case IFN_MASK_LEN_GATHER_LOAD:
            return arg1_arg4_arg5_map;

          case IFN_SCATTER_STORE:
            return arg1_arg3_map;

          case IFN_MASK_SCATTER_STORE:
          case IFN_MASK_LEN_SCATTER_STORE:
            return arg1_arg3_arg4_map;

          case IFN_MASK_STORE:
            return gather_scatter_p ? off_arg3_arg2_map : arg3_arg2_map;

          case IFN_MASK_CALL:
            {
              unsigned nargs = gimple_call_num_args (call);
              if (nargs >= 2 && nargs <= 7)
                return mask_call_maps[nargs-2];
              else
                return nullptr;
            }

          case IFN_GOMP_SIMD_LANE:
            return no_arg_map;

          default:
            break;
          }
    }
  return nullptr;
}
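
/* Illustrative reading of the operand-map tables (added example, not part
   of the original sources): arg3_arg2_map == { 2, 3, 2 } describes two
   child nodes where child 0 corresponds to gimple call argument 3 and
   child 1 to argument 2, while off_map == { 1, -3 } describes a single
   child that is the gather/scatter offset (special index -3).  */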
/* Return the SLP node child index for operand OP of STMT.  */

int
vect_slp_child_index_for_operand (const gimple *stmt, int op,
                                  bool gather_scatter_p)
{
  const int *opmap = vect_get_operand_map (stmt, gather_scatter_p);
  if (!opmap)
    return op;
  for (int i = 1; i < 1 + opmap[0]; ++i)
    if (opmap[i] == op)
      return i - 1;
  gcc_unreachable ();
}
/* Get the defs for the rhs of STMT (collect them in OPRNDS_INFO), check that
   they are of a valid type and that they match the defs of the first stmt of
   the SLP group (stored in OPRNDS_INFO).  This function tries to match stmts
   by swapping operands of STMTS[STMT_NUM] when possible.  Non-zero SWAP
   indicates swap is required for cond_expr stmts.  Specifically, SWAP
   is 1 if STMT is cond and operands of comparison need to be swapped;
   SWAP is 2 if STMT is cond and code of comparison needs to be inverted.

   If there was a fatal error return -1; if the error could be corrected by
   swapping operands of father node of this one, return 1; if everything is
   ok return 0.  */

static int
vect_get_and_check_slp_defs (vec_info *vinfo, unsigned char swap,
                             bool *skip_args,
                             vec<stmt_vec_info> stmts, unsigned stmt_num,
                             vec<slp_oprnd_info> *oprnds_info)
{
  stmt_vec_info stmt_info = stmts[stmt_num];
  tree oprnd;
  unsigned int i, number_of_oprnds;
  enum vect_def_type dt = vect_uninitialized_def;
  slp_oprnd_info oprnd_info;
  gather_scatter_info gs_info;
  unsigned int gs_op = -1u;
  unsigned int commutative_op = -1U;
  bool first = stmt_num == 0;

  if (!stmt_info)
    {
      for (auto oi : *oprnds_info)
        {
          oi->def_stmts.quick_push (NULL);
          oi->ops.quick_push (NULL_TREE);
        }
      return 0;
    }

  if (!is_a<gcall *> (stmt_info->stmt)
      && !is_a<gassign *> (stmt_info->stmt)
      && !is_a<gphi *> (stmt_info->stmt))
    return -1;

  number_of_oprnds = gimple_num_args (stmt_info->stmt);
  const int *map
    = vect_get_operand_map (stmt_info->stmt,
                            STMT_VINFO_GATHER_SCATTER_P (stmt_info), swap);
  if (map)
    number_of_oprnds = *map++;
  if (gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt))
    {
      if (gimple_call_internal_p (stmt))
        {
          internal_fn ifn = gimple_call_internal_fn (stmt);
          commutative_op = first_commutative_argument (ifn);
        }
    }
  else if (gassign *stmt = dyn_cast <gassign *> (stmt_info->stmt))
    {
      if (commutative_tree_code (gimple_assign_rhs_code (stmt)))
        commutative_op = 0;
    }

  bool swapped = (swap != 0);
  bool backedge = false;
  enum vect_def_type *dts = XALLOCAVEC (enum vect_def_type, number_of_oprnds);
  for (i = 0; i < number_of_oprnds; i++)
    {
      oprnd_info = (*oprnds_info)[i];
      int opno = map ? map[i] : int (i);
      if (opno == -3)
        {
          gcc_assert (STMT_VINFO_GATHER_SCATTER_P (stmt_info));
          if (!is_a <loop_vec_info> (vinfo)
              || !vect_check_gather_scatter (stmt_info,
                                             as_a <loop_vec_info> (vinfo),
                                             first ? &oprnd_info->first_gs_info
                                             : &gs_info))
            return -1;

          if (first)
            {
              oprnd_info->first_gs_p = true;
              oprnd = oprnd_info->first_gs_info.offset;
            }
          else
            {
              gs_op = i;
              oprnd = gs_info.offset;
            }
        }
      else if (opno < 0)
        oprnd = TREE_OPERAND (gimple_arg (stmt_info->stmt, 0), -1 - opno);
      else
        {
          oprnd = gimple_arg (stmt_info->stmt, opno);
          if (gphi *stmt = dyn_cast <gphi *> (stmt_info->stmt))
            {
              edge e = gimple_phi_arg_edge (stmt, opno);
              backedge = (is_a <bb_vec_info> (vinfo)
                          ? e->flags & EDGE_DFS_BACK
                          : dominated_by_p (CDI_DOMINATORS, e->src,
                                            gimple_bb (stmt_info->stmt)));
            }
        }
      if (TREE_CODE (oprnd) == VIEW_CONVERT_EXPR)
        oprnd = TREE_OPERAND (oprnd, 0);

      stmt_vec_info def_stmt_info;
      if (!vect_is_simple_use (oprnd, vinfo, &dts[i], &def_stmt_info))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: can't analyze def for %T\n",
                             oprnd);
          return -1;
        }

      dt = dts[i];
      if (skip_args[i])
        {
          oprnd_info->def_stmts.quick_push (NULL);
          oprnd_info->ops.quick_push (NULL_TREE);
          oprnd_info->first_dt = vect_uninitialized_def;
          continue;
        }

      oprnd_info->def_stmts.quick_push (def_stmt_info);
      oprnd_info->ops.quick_push (oprnd);

      if (def_stmt_info
          && is_pattern_stmt_p (def_stmt_info))
        {
          if (STMT_VINFO_RELATED_STMT (vect_orig_stmt (def_stmt_info))
              != def_stmt_info)
            oprnd_info->any_pattern = true;
          else
            /* If we promote this to external use the original stmt def.  */
            oprnd_info->ops.last ()
              = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
        }

      /* If there's a extern def on a backedge make sure we can
         code-generate at the region start.
         ??? This is another case that could be fixed by adjusting
         how we split the function but at the moment we'd have conflicting
         goals there.  */
      if (backedge
          && dts[i] == vect_external_def
          && is_a <bb_vec_info> (vinfo)
          && TREE_CODE (oprnd) == SSA_NAME
          && !SSA_NAME_IS_DEFAULT_DEF (oprnd)
          && !dominated_by_p (CDI_DOMINATORS, vinfo->bbs[0],
                              gimple_bb (SSA_NAME_DEF_STMT (oprnd))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: extern def %T only defined "
                             "on backedge\n", oprnd);
          return -1;
        }

      if (first)
        {
          tree type = TREE_TYPE (oprnd);

          /* For the swapping logic below force vect_reduction_def
             for the reduction op in a SLP reduction group.  */
          if (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && (int)i == STMT_VINFO_REDUC_IDX (stmt_info)
              && def_stmt_info)
            dts[i] = dt = vect_reduction_def;

          /* Check the types of the definition.  */
          switch (dt)
            {
            case vect_external_def:
            case vect_constant_def:
            case vect_internal_def:
            case vect_reduction_def:
            case vect_double_reduction_def:
            case vect_induction_def:
            case vect_nested_cycle:
            case vect_first_order_recurrence:
              break;

            default:
              /* FORNOW: Not supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: illegal type of def %T\n",
                                 oprnd);
              return -1;
            }

          oprnd_info->first_dt = dt;
          oprnd_info->first_op_type = type;
        }
    }

  /* Now match the operand definition types to that of the first stmt.  */
  for (i = 0; i < number_of_oprnds;)
    {
      if (skip_args[i])
        {
          ++i;
          continue;
        }

      oprnd_info = (*oprnds_info)[i];
      dt = dts[i];
      stmt_vec_info def_stmt_info = oprnd_info->def_stmts[stmt_num];
      oprnd = oprnd_info->ops[stmt_num];
      tree type = TREE_TYPE (oprnd);

      if (!types_compatible_p (oprnd_info->first_op_type, type))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: different operand types\n");
          return 1;
        }

      if ((gs_op == i) != oprnd_info->first_gs_p)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: mixed gather and non-gather\n");
          return 1;
        }
      else if (gs_op == i)
        {
          if (!operand_equal_p (oprnd_info->first_gs_info.base,
                                gs_info.base))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different gather base\n");
              return 1;
            }
          if (oprnd_info->first_gs_info.scale != gs_info.scale)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different gather scale\n");
              return 1;
            }
        }

      /* Not first stmt of the group, check that the def-stmt/s match
         the def-stmt/s of the first stmt.  Allow different definition
         types for reduction chains: the first stmt must be a
         vect_reduction_def (a phi node), and the rest
         end in the reduction chain.  */
      if ((!vect_def_types_match (oprnd_info->first_dt, dt)
           && !(oprnd_info->first_dt == vect_reduction_def
                && !STMT_VINFO_DATA_REF (stmt_info)
                && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
                && def_stmt_info
                && !STMT_VINFO_DATA_REF (def_stmt_info)
                && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                    == REDUC_GROUP_FIRST_ELEMENT (stmt_info))))
          || (!STMT_VINFO_DATA_REF (stmt_info)
              && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
              && ((!def_stmt_info
                   || STMT_VINFO_DATA_REF (def_stmt_info)
                   || (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                       != REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
                  != (oprnd_info->first_dt != vect_reduction_def))))
        {
          /* Try swapping operands if we got a mismatch.  For BB
             vectorization only in case it will clearly improve things.  */
          if (i == commutative_op && !swapped
              && (!is_a <bb_vec_info> (vinfo)
                  || (!vect_def_types_match ((*oprnds_info)[i+1]->first_dt,
                                             dts[i+1])
                      && (vect_def_types_match (oprnd_info->first_dt, dts[i+1])
                          || vect_def_types_match
                               ((*oprnds_info)[i+1]->first_dt, dts[i])))))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "trying swapped operands\n");
              std::swap (dts[i], dts[i+1]);
              std::swap ((*oprnds_info)[i]->def_stmts[stmt_num],
                         (*oprnds_info)[i+1]->def_stmts[stmt_num]);
              std::swap ((*oprnds_info)[i]->ops[stmt_num],
                         (*oprnds_info)[i+1]->ops[stmt_num]);
              /* After swapping some operands we lost track whether an
                 operand has any pattern defs so be conservative here.  */
              if ((*oprnds_info)[i]->any_pattern
                  || (*oprnds_info)[i+1]->any_pattern)
                (*oprnds_info)[i]->any_pattern
                  = (*oprnds_info)[i+1]->any_pattern = true;
              swapped = true;
              continue;
            }

          if (is_a <bb_vec_info> (vinfo)
              && !oprnd_info->any_pattern
              && number_of_oprnds > 1)
            {
              /* Now for commutative ops we should see whether we can
                 make the other operand matching.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "treating operand as external\n");
              oprnd_info->first_dt = dt = vect_external_def;
            }
          else
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different types\n");
              return 1;
            }
        }

      /* Make sure to demote the overall operand to external.  */
      if (dt == vect_external_def)
        oprnd_info->first_dt = vect_external_def;
      /* For a SLP reduction chain we want to duplicate the reduction to
         each of the chain members.  That gets us a sane SLP graph (still
         the stmts are not 100% correct wrt the initial values).  */
      else if ((dt == vect_internal_def
                || dt == vect_reduction_def)
               && oprnd_info->first_dt == vect_reduction_def
               && !STMT_VINFO_DATA_REF (stmt_info)
               && REDUC_GROUP_FIRST_ELEMENT (stmt_info)
               && !STMT_VINFO_DATA_REF (def_stmt_info)
               && (REDUC_GROUP_FIRST_ELEMENT (def_stmt_info)
                   == REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
        {
          oprnd_info->def_stmts[stmt_num] = oprnd_info->def_stmts[0];
          oprnd_info->ops[stmt_num] = oprnd_info->ops[0];
        }

      ++i;
    }

  if (swapped)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "swapped operands to match def types in %G",
                         stmt_info->stmt);
    }

  return 0;
}
/* Return true if call statements CALL1 and CALL2 are similar enough
   to be combined into the same SLP group.  */

bool
compatible_calls_p (gcall *call1, gcall *call2)
{
  unsigned int nargs = gimple_call_num_args (call1);
  if (nargs != gimple_call_num_args (call2))
    return false;

  if (gimple_call_combined_fn (call1) != gimple_call_combined_fn (call2))
    return false;

  if (gimple_call_internal_p (call1))
    {
      if (!types_compatible_p (TREE_TYPE (gimple_call_lhs (call1)),
                               TREE_TYPE (gimple_call_lhs (call2))))
        return false;
      for (unsigned int i = 0; i < nargs; ++i)
        if (!types_compatible_p (TREE_TYPE (gimple_call_arg (call1, i)),
                                 TREE_TYPE (gimple_call_arg (call2, i))))
          return false;
    }
  else
    {
      if (!operand_equal_p (gimple_call_fn (call1),
                            gimple_call_fn (call2), 0))
        return false;

      if (gimple_call_fntype (call1) != gimple_call_fntype (call2))
        return false;
    }

  /* Check that any unvectorized arguments are equal.  */
  if (const int *map = vect_get_operand_map (call1))
    {
      unsigned int nkept = *map++;
      unsigned int mapi = 0;
      for (unsigned int i = 0; i < nargs; ++i)
        if (mapi < nkept && map[mapi] == int (i))
          ++mapi;
        else if (!operand_equal_p (gimple_call_arg (call1, i),
                                   gimple_call_arg (call2, i)))
          return false;
    }

  return true;
}
/* A subroutine of vect_build_slp_tree for checking VECTYPE, which is the
   caller's attempt to find the vector type in STMT_INFO with the narrowest
   element type.  Return true if VECTYPE is nonnull and if it is valid
   for STMT_INFO.  When returning true, update MAX_NUNITS to reflect the
   number of units in VECTYPE.  GROUP_SIZE and MAX_NUNITS are as for
   vect_build_slp_tree.  */

static bool
vect_record_max_nunits (vec_info *vinfo, stmt_vec_info stmt_info,
                        unsigned int group_size,
                        tree vectype, poly_uint64 *max_nunits)
{
  if (!vectype)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unsupported data-type in %G\n",
                         stmt_info->stmt);
      /* Fatal mismatch.  */
      return false;
    }

  /* If populating the vector type requires unrolling then fail
     before adjusting *max_nunits for basic-block vectorization.  */
  if (is_a <bb_vec_info> (vinfo)
      && !multiple_p (group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "Build SLP failed: unrolling required "
                         "in basic block SLP\n");
      /* Fatal mismatch.  */
      return false;
    }

  /* In case of multiple types we need to detect the smallest type.  */
  vect_update_max_nunits (max_nunits, vectype);
  return true;
}
/* Verify if the scalar stmts STMTS are isomorphic, require data
   permutation or are of unsupported types of operation.  Return
   true if they are, otherwise return false and indicate in *MATCHES
   which stmts are not isomorphic to the first one.  If MATCHES[0]
   is false then this indicates the comparison could not be
   carried out or the stmts will never be vectorized by SLP.

   Note COND_EXPR is possibly isomorphic to another one after swapping its
   operands.  Set SWAP[i] to 1 if stmt I is COND_EXPR and isomorphic to
   the first stmt by swapping the two operands of comparison; set SWAP[i]
   to 2 if stmt I is isomorphic to the first stmt by inverting the code
   of comparison.  Take A1 >= B1 ? X1 : Y1 as an example, it can be swapped
   to (B1 <= A1 ? X1 : Y1); or be inverted to (A1 < B1) ? Y1 : X1.  */

static bool
vect_build_slp_tree_1 (vec_info *vinfo, unsigned char *swap,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits, bool *matches,
                       bool *two_operators, tree *node_vectype)
{
  unsigned int i;
  stmt_vec_info first_stmt_info = stmts[0];
  code_helper first_stmt_code = ERROR_MARK;
  code_helper alt_stmt_code = ERROR_MARK;
  code_helper first_cond_code = ERROR_MARK;
  bool need_same_oprnds = false;
  tree first_lhs = NULL_TREE;
  tree first_op1 = NULL_TREE;
  stmt_vec_info first_load = NULL, prev_first_load = NULL;
  bool first_stmt_ldst_p = false;
  bool first_stmt_phi_p = false;
  int first_reduc_idx = -1;
  bool maybe_soft_fail = false;
  tree soft_fail_nunits_vectype = NULL_TREE;

  tree vectype, nunits_vectype;
  if (!vect_get_vector_types_for_stmt (vinfo, first_stmt_info, &vectype,
                                       &nunits_vectype, group_size))
    {
      /* Fatal mismatch.  */
      matches[0] = false;
      return false;
    }
  /* Record nunits required but continue analysis, producing matches[]
     as if nunits was not an issue.  This allows splitting of groups
     to happen.  */
  if (nunits_vectype
      && !vect_record_max_nunits (vinfo, first_stmt_info, group_size,
                                  nunits_vectype, max_nunits))
    {
      gcc_assert (is_a <bb_vec_info> (vinfo));
      maybe_soft_fail = true;
      soft_fail_nunits_vectype = nunits_vectype;
    }

  gcc_assert (vectype);
  *node_vectype = vectype;

  /* For every stmt in NODE find its def stmt/s.  */
  stmt_vec_info stmt_info;
  FOR_EACH_VEC_ELT (stmts, i, stmt_info)
    {
      bool ldst_p = false;
      bool phi_p = false;
      code_helper rhs_code = ERROR_MARK;

      swap[i] = 0;
      matches[i] = false;

      gimple *stmt = stmt_info->stmt;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Build SLP for %G", stmt);

      /* Fail to vectorize statements marked as unvectorizable, throw
         or are volatile.  */
      if (!STMT_VINFO_VECTORIZABLE (stmt_info)
          || stmt_can_throw_internal (cfun, stmt)
          || gimple_has_volatile_ops (stmt))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: unvectorizable statement %G",
                             stmt);
          /* ??? For BB vectorization we want to commutate operands in a way
             to shuffle all unvectorizable defs into one operand and have
             the other still vectorized.  The following doesn't reliably
             work for this though but it's the easiest we can do here.  */
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      gcall *call_stmt = dyn_cast <gcall *> (stmt);
      tree lhs = gimple_get_lhs (stmt);
      if (lhs == NULL_TREE
          && (!call_stmt
              || !gimple_call_internal_p (stmt)
              || !internal_store_fn_p (gimple_call_internal_fn (stmt))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "Build SLP failed: not GIMPLE_ASSIGN nor "
                             "GIMPLE_CALL %G", stmt);
          if (is_a <bb_vec_info> (vinfo) && i != 0)
            continue;
          /* Fatal mismatch.  */
          matches[0] = false;
          return false;
        }

      if (call_stmt)
        {
          combined_fn cfn = gimple_call_combined_fn (call_stmt);
          if (cfn != CFN_LAST && cfn != CFN_MASK_CALL)
            rhs_code = cfn;
          else
            rhs_code = CALL_EXPR;

          if (cfn == CFN_MASK_LOAD
              || cfn == CFN_GATHER_LOAD
              || cfn == CFN_MASK_GATHER_LOAD
              || cfn == CFN_MASK_LEN_GATHER_LOAD
              || cfn == CFN_SCATTER_STORE
              || cfn == CFN_MASK_SCATTER_STORE
              || cfn == CFN_MASK_LEN_SCATTER_STORE)
            ldst_p = true;
          else if (cfn == CFN_MASK_STORE)
            {
              ldst_p = true;
              rhs_code = CFN_MASK_STORE;
            }
          else if (cfn == CFN_GOMP_SIMD_LANE)
            ;
          else if ((cfn != CFN_LAST
                    && cfn != CFN_MASK_CALL
                    && internal_fn_p (cfn)
                    && !vectorizable_internal_fn_p (as_internal_fn (cfn)))
                   || gimple_call_tail_p (call_stmt)
                   || gimple_call_noreturn_p (call_stmt)
                   || gimple_call_chain (call_stmt))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: unsupported call type %G",
                                 (gimple *) call_stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      else if (gimple_code (stmt) == GIMPLE_PHI)
        {
          rhs_code = ERROR_MARK;
          phi_p = true;
        }
      else
        {
          rhs_code = gimple_assign_rhs_code (stmt);
          ldst_p = STMT_VINFO_DATA_REF (stmt_info) != nullptr;
        }

      /* Check the operation.  */
      if (i == 0)
        {
          first_lhs = lhs;
          first_stmt_code = rhs_code;
          first_stmt_ldst_p = ldst_p;
          first_stmt_phi_p = phi_p;
          first_reduc_idx = STMT_VINFO_REDUC_IDX (stmt_info);

          /* Shift arguments should be equal in all the packed stmts for a
             vector shift with scalar shift operand.  */
          if (rhs_code == LSHIFT_EXPR || rhs_code == RSHIFT_EXPR
              || rhs_code == LROTATE_EXPR
              || rhs_code == RROTATE_EXPR)
            {
              /* First see if we have a vector/vector shift.  */
              if (!directly_supported_p (rhs_code, vectype, optab_vector))
                {
                  /* No vector/vector shift, try for a vector/scalar shift.  */
                  if (!directly_supported_p (rhs_code, vectype, optab_scalar))
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                         "Build SLP failed: "
                                         "op not supported by target.\n");
                      if (is_a <bb_vec_info> (vinfo) && i != 0)
                        continue;
                      /* Fatal mismatch.  */
                      matches[0] = false;
                      return false;
                    }
                  need_same_oprnds = true;
                  first_op1 = gimple_assign_rhs2 (stmt);
                }
            }
          else if (rhs_code == WIDEN_LSHIFT_EXPR)
            {
              need_same_oprnds = true;
              first_op1 = gimple_assign_rhs2 (stmt);
            }
          else if (!ldst_p
                   && rhs_code == BIT_FIELD_REF)
            {
              tree vec = TREE_OPERAND (gimple_assign_rhs1 (stmt), 0);
              if (!is_a <bb_vec_info> (vinfo)
                  || TREE_CODE (vec) != SSA_NAME
                  /* When the element types are not compatible we pun the
                     source to the target vectype which requires equal size.  */
                  || ((!VECTOR_TYPE_P (TREE_TYPE (vec))
                       || !types_compatible_p (TREE_TYPE (vectype),
                                               TREE_TYPE (TREE_TYPE (vec))))
                      && !operand_equal_p (TYPE_SIZE (vectype),
                                           TYPE_SIZE (TREE_TYPE (vec)))))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: "
                                     "BIT_FIELD_REF not supported\n");
                  /* Fatal mismatch.  */
                  matches[0] = false;
                  return false;
                }
            }
          else if (rhs_code == CFN_DIV_POW2)
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
          else if (rhs_code == CFN_GOMP_SIMD_LANE)
            {
              need_same_oprnds = true;
              first_op1 = gimple_call_arg (call_stmt, 1);
            }
        }
      else
        {
          if (first_reduc_idx != STMT_VINFO_REDUC_IDX (stmt_info)
              /* For SLP reduction groups the index isn't necessarily
                 uniform but only that of the first stmt matters.  */
              && !(first_reduc_idx != -1
                   && STMT_VINFO_REDUC_IDX (stmt_info) != -1
                   && REDUC_GROUP_FIRST_ELEMENT (stmt_info)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different reduc_idx "
                                 "%d instead of %d in %G",
                                 STMT_VINFO_REDUC_IDX (stmt_info),
                                 first_reduc_idx, stmt);
              /* Mismatch.  */
              continue;
            }

          if (!ldst_p
              && first_stmt_code != rhs_code
              && alt_stmt_code == ERROR_MARK)
            alt_stmt_code = rhs_code;
          if ((!ldst_p
               && first_stmt_code != rhs_code
               && (first_stmt_code != IMAGPART_EXPR
                   || rhs_code != REALPART_EXPR)
               && (first_stmt_code != REALPART_EXPR
                   || rhs_code != IMAGPART_EXPR)
               /* Handle mismatches in plus/minus by computing both
                  and merging the results.  */
               && !((first_stmt_code == PLUS_EXPR
                     || first_stmt_code == MINUS_EXPR)
                    && (alt_stmt_code == PLUS_EXPR
                        || alt_stmt_code == MINUS_EXPR)
                    && rhs_code == alt_stmt_code)
               && !(first_stmt_code.is_tree_code ()
                    && rhs_code.is_tree_code ()
                    && (TREE_CODE_CLASS (tree_code (first_stmt_code))
                        == tcc_comparison)
                    && (swap_tree_comparison (tree_code (first_stmt_code))
                        == tree_code (rhs_code))))
              || (ldst_p
                  && (STMT_VINFO_GROUPED_ACCESS (stmt_info)
                      != STMT_VINFO_GROUPED_ACCESS (first_stmt_info)))
              || (ldst_p
                  && (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
                      != STMT_VINFO_GATHER_SCATTER_P (first_stmt_info)))
              || first_stmt_ldst_p != ldst_p
              || first_stmt_phi_p != phi_p)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "Build SLP failed: different operation "
                                   "in stmt %G", stmt);
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "original stmt %G", first_stmt_info->stmt);
                }
              /* Mismatch.  */
              continue;
            }

          if (!ldst_p
              && first_stmt_code == BIT_FIELD_REF
              && (TREE_OPERAND (gimple_assign_rhs1 (first_stmt_info->stmt), 0)
                  != TREE_OPERAND (gimple_assign_rhs1 (stmt_info->stmt), 0)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BIT_FIELD_REF "
                                 "arguments in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (call_stmt
              && first_stmt_code != CFN_MASK_LOAD
              && first_stmt_code != CFN_MASK_STORE)
            {
              if (!is_a <gcall *> (stmts[0]->stmt)
                  || !compatible_calls_p (as_a <gcall *> (stmts[0]->stmt),
                                          call_stmt))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different calls in %G",
                                     stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if ((phi_p || gimple_could_trap_p (stmt_info->stmt))
              && (gimple_bb (first_stmt_info->stmt)
                  != gimple_bb (stmt_info->stmt)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different BB for PHI "
                                 "or possibly trapping operation in %G", stmt);
              /* Mismatch.  */
              continue;
            }

          if (need_same_oprnds)
            {
              tree other_op1 = gimple_arg (stmt, 1);
              if (!operand_equal_p (first_op1, other_op1, 0))
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different shift "
                                     "arguments in %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (first_lhs
              && lhs
              && !types_compatible_p (TREE_TYPE (lhs), TREE_TYPE (first_lhs)))
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: different vector type "
                                 "in %G", stmt);
              /* Mismatch.  */
              continue;
            }
        }

      /* Grouped store or load.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
        {
          gcc_assert (ldst_p);
          if (DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
            {
              /* Store.  */
              gcc_assert (rhs_code == CFN_MASK_STORE
                          || REFERENCE_CLASS_P (lhs)
                          || DECL_P (lhs));
            }
          else
            {
              /* Load.  */
              first_load = DR_GROUP_FIRST_ELEMENT (stmt_info);
              if (prev_first_load)
                {
                  /* Check that there are no loads from different interleaving
                     chains in the same node.  */
                  if (prev_first_load != first_load)
                    {
                      if (dump_enabled_p ())
                        dump_printf_loc (MSG_MISSED_OPTIMIZATION,
                                         vect_location,
                                         "Build SLP failed: different "
                                         "interleaving chains in one node %G",
                                         stmt);
                      /* Mismatch.  */
                      continue;
                    }
                }
              else
                prev_first_load = first_load;
            }
        }
      /* Non-grouped store or load.  */
      else if (ldst_p)
        {
          if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
              && rhs_code != CFN_GATHER_LOAD
              && rhs_code != CFN_MASK_GATHER_LOAD
              && rhs_code != CFN_MASK_LEN_GATHER_LOAD
              && rhs_code != CFN_SCATTER_STORE
              && rhs_code != CFN_MASK_SCATTER_STORE
              && rhs_code != CFN_MASK_LEN_SCATTER_STORE
              && !STMT_VINFO_GATHER_SCATTER_P (stmt_info)
              /* Not grouped loads are handled as externals for BB
                 vectorization.  For loop vectorization we can handle
                 splats the same we handle single element interleaving.  */
              && (is_a <bb_vec_info> (vinfo)
                  || stmt_info != first_stmt_info))
            {
              /* Not grouped load.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: not grouped load %G", stmt);

              if (i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }
        }
      /* Not memory operation.  */
      else
        {
          if (!phi_p
              && rhs_code.is_tree_code ()
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_binary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_unary
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_expression
              && TREE_CODE_CLASS (tree_code (rhs_code)) != tcc_comparison
              && rhs_code != VIEW_CONVERT_EXPR
              && rhs_code != CALL_EXPR
              && rhs_code != BIT_FIELD_REF
              && rhs_code != SSA_NAME)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "Build SLP failed: operation unsupported %G",
                                 stmt);
              if (is_a <bb_vec_info> (vinfo) && i != 0)
                continue;
              /* Fatal mismatch.  */
              matches[0] = false;
              return false;
            }

          if (rhs_code == COND_EXPR)
            {
              tree cond_expr = gimple_assign_rhs1 (stmt);
              enum tree_code cond_code = TREE_CODE (cond_expr);
              enum tree_code swap_code = ERROR_MARK;
              enum tree_code invert_code = ERROR_MARK;

              if (i == 0)
                first_cond_code = TREE_CODE (cond_expr);
              else if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
                {
                  bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
                  swap_code = swap_tree_comparison (cond_code);
                  invert_code = invert_tree_comparison (cond_code, honor_nans);
                }

              if (first_cond_code == cond_code)
                ;
              /* Isomorphic can be achieved by swapping.  */
              else if (first_cond_code == swap_code)
                swap[i] = 1;
              /* Isomorphic can be achieved by inverting.  */
              else if (first_cond_code == invert_code)
                swap[i] = 2;
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Build SLP failed: different"
                                     " operation %G", stmt);
                  /* Mismatch.  */
                  continue;
                }
            }

          if (rhs_code.is_tree_code ()
              && TREE_CODE_CLASS ((tree_code)rhs_code) == tcc_comparison
              && (swap_tree_comparison ((tree_code)first_stmt_code)
                  == (tree_code)rhs_code))
            swap[i] = 1;
        }

      matches[i] = true;
    }

  for (i = 0; i < group_size; ++i)
    if (!matches[i])
      return false;

  /* If we allowed a two-operation SLP node verify the target can cope
     with the permute we are going to use.  */
  if (alt_stmt_code != ERROR_MARK
      && (!alt_stmt_code.is_tree_code ()
          || (TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_reference
              && TREE_CODE_CLASS (tree_code (alt_stmt_code)) != tcc_comparison)))
    *two_operators = true;

  if (maybe_soft_fail)
    {
      unsigned HOST_WIDE_INT const_nunits;
      if (!TYPE_VECTOR_SUBPARTS
            (soft_fail_nunits_vectype).is_constant (&const_nunits)
          || const_nunits > group_size)
        matches[0] = false;
      else
        {
          /* With constant vector elements simulate a mismatch at the
             point we need to split.  */
          unsigned tail = group_size & (const_nunits - 1);
          memset (&matches[group_size - tail], 0, sizeof (bool) * tail);
        }
      return false;
    }

  return true;
}
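
/* Numeric illustration of the soft-fail handling above (hypothetical
   values, not from the original sources): with GROUP_SIZE == 6 and
   CONST_NUNITS == 4 the tail is 6 & 3 == 2, so matches[4] and matches[5]
   are cleared and the caller can split the group after the first four
   lanes, which fill one vector exactly.  */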
/* Traits for the hash_set to record failed SLP builds for a stmt set.
   Note we never remove apart from at destruction time so we do not
   need a special value for deleted that differs from empty.  */
struct bst_traits
{
  typedef vec <stmt_vec_info> value_type;
  typedef vec <stmt_vec_info> compare_type;
  static inline hashval_t hash (value_type);
  static inline bool equal (value_type existing, value_type candidate);
  static inline bool is_empty (value_type x) { return !x.exists (); }
  static inline bool is_deleted (value_type x) { return !x.exists (); }
  static const bool empty_zero_p = true;
  static inline void mark_empty (value_type &x) { x.release (); }
  static inline void mark_deleted (value_type &x) { x.release (); }
  static inline void remove (value_type &x) { x.release (); }
};
inline hashval_t
bst_traits::hash (value_type x)
{
  inchash::hash h;
  for (unsigned i = 0; i < x.length (); ++i)
    h.add_int (x[i] ? gimple_uid (x[i]->stmt) : -1);
  return h.end ();
}
inline bool
bst_traits::equal (value_type existing, value_type candidate)
{
  if (existing.length () != candidate.length ())
    return false;
  for (unsigned i = 0; i < existing.length (); ++i)
    if (existing[i] != candidate[i])
      return false;
  return true;
}

typedef hash_map <vec <stmt_vec_info>, slp_tree,
                  simple_hashmap_traits <bst_traits, slp_tree> >
  scalar_stmts_to_slp_tree_map_t;
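
/* Usage sketch (illustrative, not part of the original sources): the map
   is keyed by the vector of scalar stmts forming a node, so a second
   discovery attempt for the same stmt set either re-uses the recorded
   slp_tree (bumping its reference count) or, when the recorded node
   carries a non-NULL 'failed' array, immediately reports the remembered
   per-lane matches without re-running discovery.  */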
/* Release BST_MAP.  */

static void
release_scalar_stmts_to_slp_tree_map (scalar_stmts_to_slp_tree_map_t *bst_map)
{
  /* The map keeps a reference on SLP nodes built, release that.  */
  for (scalar_stmts_to_slp_tree_map_t::iterator it = bst_map->begin ();
       it != bst_map->end (); ++it)
    if ((*it).second)
      vect_free_slp_tree ((*it).second);
  delete bst_map;
}
/* ??? This was std::pair<std::pair<tree_code, vect_def_type>, tree>
   but then vec::insert does memmove and that's not compatible with
   std::pair.  */
struct chain_op_t
{
  chain_op_t (tree_code code_, vect_def_type dt_, tree op_)
      : code (code_), dt (dt_), op (op_) {}
  tree_code code;
  vect_def_type dt;
  tree op;
};

/* Comparator for sorting associatable chains.  */

static int
dt_sort_cmp (const void *op1_, const void *op2_, void *)
{
  auto *op1 = (const chain_op_t *) op1_;
  auto *op2 = (const chain_op_t *) op2_;
  if (op1->dt != op2->dt)
    return (int)op1->dt - (int)op2->dt;
  return (int)op1->code - (int)op2->code;
}
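
/* Example ordering (illustrative, not part of the original sources):
   sorting the linearized chain of  x + 1.0 - y + 2.0  groups operands
   with the same definition type next to each other (both constants
   adjacent, both SSA defs adjacent), with ties broken by the
   PLUS_EXPR/MINUS_EXPR code, so discovery can build one child per
   uniform (dt, code) run.  */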
/* Linearize the associatable expression chain at START with the
   associatable operation CODE (where PLUS_EXPR also allows MINUS_EXPR),
   filling CHAIN with the result and using WORKLIST as intermediate storage.
   CODE_STMT and ALT_CODE_STMT are filled with the first stmt using CODE
   or MINUS_EXPR.  *CHAIN_STMTS if not NULL is filled with all computation
   stmts, starting with START.  */

static void
vect_slp_linearize_chain (vec_info *vinfo,
                          vec<std::pair<tree_code, gimple *> > &worklist,
                          vec<chain_op_t> &chain,
                          enum tree_code code, gimple *start,
                          gimple *&code_stmt, gimple *&alt_code_stmt,
                          vec<gimple *> *chain_stmts)
{
  /* For each lane linearize the addition/subtraction (or other
     uniform associatable operation) expression tree.  */
  worklist.safe_push (std::make_pair (code, start));
  while (!worklist.is_empty ())
    {
      auto entry = worklist.pop ();
      gassign *stmt = as_a <gassign *> (entry.second);
      enum tree_code in_code = entry.first;
      enum tree_code this_code = gimple_assign_rhs_code (stmt);
      /* Pick some stmts suitable for SLP_TREE_REPRESENTATIVE.  */
      if (!code_stmt
          && gimple_assign_rhs_code (stmt) == code)
        code_stmt = stmt;
      else if (!alt_code_stmt
               && gimple_assign_rhs_code (stmt) == MINUS_EXPR)
        alt_code_stmt = stmt;
      if (chain_stmts)
        chain_stmts->safe_push (stmt);
      for (unsigned opnum = 1; opnum <= 2; ++opnum)
        {
          tree op = gimple_op (stmt, opnum);
          vect_def_type dt;
          stmt_vec_info def_stmt_info;
          bool res = vect_is_simple_use (op, vinfo, &dt, &def_stmt_info);
          gcc_assert (res);
          if (dt == vect_internal_def
              && is_pattern_stmt_p (def_stmt_info))
            op = gimple_get_lhs (def_stmt_info->stmt);
          gimple *use_stmt;
          use_operand_p use_p;
          if (dt == vect_internal_def
              && single_imm_use (op, &use_p, &use_stmt)
              && is_gimple_assign (def_stmt_info->stmt)
              && (gimple_assign_rhs_code (def_stmt_info->stmt) == code
                  || (code == PLUS_EXPR
                      && (gimple_assign_rhs_code (def_stmt_info->stmt)
                          == MINUS_EXPR))))
            {
              tree_code op_def_code = this_code;
              if (op_def_code == MINUS_EXPR && opnum == 1)
                op_def_code = PLUS_EXPR;
              if (in_code == MINUS_EXPR)
                op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
              worklist.safe_push (std::make_pair (op_def_code,
                                                  def_stmt_info->stmt));
            }
          else
            {
              tree_code op_def_code = this_code;
              if (op_def_code == MINUS_EXPR && opnum == 1)
                op_def_code = PLUS_EXPR;
              if (in_code == MINUS_EXPR)
                op_def_code = op_def_code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR;
              chain.safe_push (chain_op_t (op_def_code, dt, op));
            }
        }
    }
}
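
/* Worked example (illustrative, not part of the original sources): for
   t1 = a - b;  t2 = t1 + c;  linearizing from START == t2 with CODE ==
   PLUS_EXPR records a and c with PLUS_EXPR and b with MINUS_EXPR in
   CHAIN (assuming a, b and c are not themselves part of the chain),
   while ALT_CODE_STMT ends up pointing at the first MINUS_EXPR
   statement seen.  */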
static slp_tree
vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
                       vec<stmt_vec_info> stmts, unsigned int group_size,
                       poly_uint64 *max_nunits,
                       bool *matches, unsigned *limit, unsigned *tree_size,
                       scalar_stmts_to_slp_tree_map_t *bst_map);
static slp_tree
vect_build_slp_tree (vec_info *vinfo,
                     vec<stmt_vec_info> stmts, unsigned int group_size,
                     poly_uint64 *max_nunits,
                     bool *matches, unsigned *limit, unsigned *tree_size,
                     scalar_stmts_to_slp_tree_map_t *bst_map)
{
  if (slp_tree *leader = bst_map->get (stmts))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "re-using %sSLP tree %p\n",
                         !(*leader)->failed ? "" : "failed ",
                         (void *) *leader);
      if (!(*leader)->failed)
        {
          SLP_TREE_REF_COUNT (*leader)++;
          vect_update_max_nunits (max_nunits, (*leader)->max_nunits);
          return *leader;
        }
      memcpy (matches, (*leader)->failed, sizeof (bool) * group_size);
      return NULL;
    }

  /* Single-lane SLP doesn't have the chance of run-away, do not account
     it to the limit.  */
  if (stmts.length () > 1)
    {
      if (*limit == 0)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "SLP discovery limit exceeded\n");
          memset (matches, 0, sizeof (bool) * group_size);
          return NULL;
        }
      --*limit;
    }

  /* Seed the bst_map with a stub node to be filled by vect_build_slp_tree_2
     so we can pick up backedge destinations during discovery.  */
  slp_tree res = new _slp_tree;
  SLP_TREE_DEF_TYPE (res) = vect_internal_def;
  SLP_TREE_SCALAR_STMTS (res) = stmts;
  bst_map->put (stmts.copy (), res);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "starting SLP discovery for node %p\n", (void *) res);

  poly_uint64 this_max_nunits = 1;
  slp_tree res_ = vect_build_slp_tree_2 (vinfo, res, stmts, group_size,
                                         &this_max_nunits,
                                         matches, limit, tree_size, bst_map);
  if (!res_)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery for node %p failed\n", (void *) res);
      /* Mark the node invalid so we can detect those when still in use
         as backedge destinations.  */
      SLP_TREE_SCALAR_STMTS (res) = vNULL;
      SLP_TREE_DEF_TYPE (res) = vect_uninitialized_def;
      res->failed = XNEWVEC (bool, group_size);
      if (flag_checking)
        {
          unsigned i;
          for (i = 0; i < group_size; ++i)
            if (!matches[i])
              break;
          gcc_assert (i < group_size);
        }
      memcpy (res->failed, matches, sizeof (bool) * group_size);
    }
  else
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "SLP discovery for node %p succeeded\n",
                         (void *) res);
      gcc_assert (res_ == res);
      res->max_nunits = this_max_nunits;
      vect_update_max_nunits (max_nunits, this_max_nunits);
      /* Keep a reference for the bst_map use.  */
      SLP_TREE_REF_COUNT (res)++;
    }
  return res_;
}
/* Helper for building an associated SLP node chain.  */

static void
vect_slp_build_two_operator_nodes (slp_tree perm, tree vectype,
                                   slp_tree op0, slp_tree op1,
                                   stmt_vec_info oper1, stmt_vec_info oper2,
                                   vec<std::pair<unsigned, unsigned> > lperm)
{
  unsigned group_size = SLP_TREE_LANES (op1);

  slp_tree child1 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child1) = vect_internal_def;
  SLP_TREE_VECTYPE (child1) = vectype;
  SLP_TREE_LANES (child1) = group_size;
  SLP_TREE_CHILDREN (child1).create (2);
  SLP_TREE_CHILDREN (child1).quick_push (op0);
  SLP_TREE_CHILDREN (child1).quick_push (op1);
  SLP_TREE_REPRESENTATIVE (child1) = oper1;

  slp_tree child2 = new _slp_tree;
  SLP_TREE_DEF_TYPE (child2) = vect_internal_def;
  SLP_TREE_VECTYPE (child2) = vectype;
  SLP_TREE_LANES (child2) = group_size;
  SLP_TREE_CHILDREN (child2).create (2);
  SLP_TREE_CHILDREN (child2).quick_push (op0);
  SLP_TREE_REF_COUNT (op0)++;
  SLP_TREE_CHILDREN (child2).quick_push (op1);
  SLP_TREE_REF_COUNT (op1)++;
  SLP_TREE_REPRESENTATIVE (child2) = oper2;

  SLP_TREE_DEF_TYPE (perm) = vect_internal_def;
  SLP_TREE_CODE (perm) = VEC_PERM_EXPR;
  SLP_TREE_VECTYPE (perm) = vectype;
  SLP_TREE_LANES (perm) = group_size;
  /* ??? We should set this NULL but that's not expected.  */
  SLP_TREE_REPRESENTATIVE (perm) = oper1;
  SLP_TREE_LANE_PERMUTATION (perm) = lperm;
  SLP_TREE_CHILDREN (perm).quick_push (child1);
  SLP_TREE_CHILDREN (perm).quick_push (child2);
}
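
/* Illustrative picture (not part of the original sources): for the lanes
   { a0 + b0, a1 - b1 } the caller passes OPER1/OPER2 as a plus and a
   minus statement; the helper then builds one child computing the plus
   for all lanes, another child computing the minus for all lanes, and a
   VEC_PERM_EXPR parent whose lane permutation selects lane 0 from the
   first child and lane 1 from the second.  */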
1905 /* Recursively build an SLP tree starting from NODE.
1906 Fail (and return a value not equal to zero) if def-stmts are not
1907 isomorphic, require data permutation or are of unsupported types of
1908 operation. Otherwise, return 0.
1909 The value returned is the depth in the SLP tree where a mismatch
1913 vect_build_slp_tree_2 (vec_info
*vinfo
, slp_tree node
,
1914 vec
<stmt_vec_info
> stmts
, unsigned int group_size
,
1915 poly_uint64
*max_nunits
,
1916 bool *matches
, unsigned *limit
, unsigned *tree_size
,
1917 scalar_stmts_to_slp_tree_map_t
*bst_map
)
1919 unsigned nops
, i
, this_tree_size
= 0;
1920 poly_uint64 this_max_nunits
= *max_nunits
;
1924 stmt_vec_info stmt_info
= stmts
[0];
1925 if (!is_a
<gcall
*> (stmt_info
->stmt
)
1926 && !is_a
<gassign
*> (stmt_info
->stmt
)
1927 && !is_a
<gphi
*> (stmt_info
->stmt
))
1930 nops
= gimple_num_args (stmt_info
->stmt
);
1931 if (const int *map
= vect_get_operand_map (stmt_info
->stmt
,
1932 STMT_VINFO_GATHER_SCATTER_P
1936 /* If the SLP node is a PHI (induction or reduction), terminate
1938 bool *skip_args
= XALLOCAVEC (bool, nops
);
1939 memset (skip_args
, 0, sizeof (bool) * nops
);
1940 if (loop_vec_info loop_vinfo
= dyn_cast
<loop_vec_info
> (vinfo
))
1941 if (gphi
*stmt
= dyn_cast
<gphi
*> (stmt_info
->stmt
))
1943 tree scalar_type
= TREE_TYPE (PHI_RESULT (stmt
));
1944 tree vectype
= get_vectype_for_scalar_type (vinfo
, scalar_type
,
1946 if (!vect_record_max_nunits (vinfo
, stmt_info
, group_size
, vectype
,
1950 vect_def_type def_type
= STMT_VINFO_DEF_TYPE (stmt_info
);
1951 if (def_type
== vect_induction_def
)
1953 /* Induction PHIs are not cycles but walk the initial
1954 value. Only for inner loops through, for outer loops
1955 we need to pick up the value from the actual PHIs
1956 to more easily support peeling and epilogue vectorization. */
1957 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1958 if (!nested_in_vect_loop_p (loop
, stmt_info
))
1959 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1962 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1964 else if (def_type
== vect_reduction_def
1965 || def_type
== vect_double_reduction_def
1966 || def_type
== vect_nested_cycle
1967 || def_type
== vect_first_order_recurrence
)
1969 /* Else def types have to match. */
1970 stmt_vec_info other_info
;
1971 bool all_same
= true;
1972 FOR_EACH_VEC_ELT (stmts
, i
, other_info
)
1974 if (STMT_VINFO_DEF_TYPE (other_info
) != def_type
)
1976 if (other_info
!= stmt_info
)
1979 class loop
*loop
= LOOP_VINFO_LOOP (loop_vinfo
);
1980 /* Reduction initial values are not explicitely represented. */
1981 if (def_type
!= vect_first_order_recurrence
1982 && gimple_bb (stmt_info
->stmt
) == loop
->header
)
1983 skip_args
[loop_preheader_edge (loop
)->dest_idx
] = true;
1984 /* Reduction chain backedge defs are filled manually.
1985 ??? Need a better way to identify a SLP reduction chain PHI.
1986 Or a better overall way to SLP match those. */
1987 if (stmts
.length () > 1
1988 && all_same
&& def_type
== vect_reduction_def
)
1989 skip_args
[loop_latch_edge (loop
)->dest_idx
] = true;
1991 else if (def_type
!= vect_internal_def
)
1996 bool two_operators
= false;
1997 unsigned char *swap
= XALLOCAVEC (unsigned char, group_size
);
1998 tree vectype
= NULL_TREE
;
1999 if (!vect_build_slp_tree_1 (vinfo
, swap
, stmts
, group_size
,
2000 &this_max_nunits
, matches
, &two_operators
,
2004 /* If the SLP node is a load, terminate the recursion unless masked. */
2005 if (STMT_VINFO_DATA_REF (stmt_info
)
2006 && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)))
2008 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info
))
2009 gcc_assert (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info
)));
2012 *max_nunits
= this_max_nunits
;
2014 node
= vect_create_new_slp_node (node
, stmts
, 0);
2015 SLP_TREE_VECTYPE (node
) = vectype
;
2016 /* And compute the load permutation. Whether it is actually
2017 a permutation depends on the unrolling factor which is
2019 vec
<unsigned> load_permutation
;
2021 stmt_vec_info load_info
;
2022 load_permutation
.create (group_size
);
2023 stmt_vec_info first_stmt_info
2024 = STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2025 ? DR_GROUP_FIRST_ELEMENT (stmt_info
) : stmt_info
;
2026 bool any_permute
= false;
2027 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node
), j
, load_info
)
2032 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2037 else if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2038 load_place
= vect_get_place_in_interleaving_chain
2039 (load_info
, first_stmt_info
);
2042 gcc_assert (load_place
!= -1);
2043 any_permute
|= load_place
!= j
;
2044 load_permutation
.quick_push (load_place
);
2047 if (gcall
*stmt
= dyn_cast
<gcall
*> (stmt_info
->stmt
))
2049 gcc_assert (gimple_call_internal_p (stmt
, IFN_MASK_LOAD
));
2050 bool has_gaps
= false;
2051 if (STMT_VINFO_GROUPED_ACCESS (stmt_info
))
2052 for (stmt_vec_info si
= DR_GROUP_NEXT_ELEMENT (first_stmt_info
);
2053 si
; si
= DR_GROUP_NEXT_ELEMENT (si
))
2054 if (DR_GROUP_GAP (si
) != 1)
2056 /* We cannot handle permuted masked loads directly, see
2057 PR114375. We cannot handle strided masked loads or masked
2058 loads with gaps unless the mask is uniform. */
2059 if ((STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2060 && (DR_GROUP_GAP (first_stmt_info
) != 0
2062 && STMT_VINFO_SLP_VECT_ONLY (first_stmt_info
))))
2063 || STMT_VINFO_STRIDED_P (stmt_info
))
2065 load_permutation
.release ();
2070 /* For permuted masked loads do an unpermuted masked load of
2071 the whole group followed by a SLP permute node. */
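	 /* For instance, with made-up lanes, a masked load group of four
	    lanes with load permutation { 2, 3, 0, 1 } becomes an unpermuted
	    masked load child covering the whole group plus a VEC_PERM_EXPR
	    node on top whose lane permutation is
	    { (0,2), (0,3), (0,0), (0,1) }, i.e. all lanes selected from the
	    single unpermuted input.  */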
2073 || (STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2074 && DR_GROUP_SIZE (first_stmt_info
) != group_size
))
2076 /* Discover the whole unpermuted load. */
2077 vec
<stmt_vec_info
> stmts2
;
2078 unsigned dr_group_size
= STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2079 ? DR_GROUP_SIZE (first_stmt_info
) : 1;
2080 stmts2
.create (dr_group_size
);
2081 stmts2
.quick_grow_cleared (dr_group_size
);
2083 for (stmt_vec_info si
= first_stmt_info
;
2084 si
; si
= DR_GROUP_NEXT_ELEMENT (si
))
2086 if (si
!= first_stmt_info
)
2087 for (unsigned k
= 1; k
< DR_GROUP_GAP (si
); ++k
)
2091 bool *matches2
= XALLOCAVEC (bool, dr_group_size
);
2092 slp_tree unperm_load
2093 = vect_build_slp_tree (vinfo
, stmts2
, dr_group_size
,
2094 &this_max_nunits
, matches2
, limit
,
2095 &this_tree_size
, bst_map
);
2096 /* When we are able to do the full masked load emit that
2097 followed by 'node' being the desired final permutation. */
2101 (!SLP_TREE_LOAD_PERMUTATION (unperm_load
).exists ());
2102 lane_permutation_t lperm
;
2103 lperm
.create (group_size
);
2104 for (unsigned j
= 0; j
< load_permutation
.length (); ++j
)
2106 (std::make_pair (0, load_permutation
[j
]));
2107 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
2108 SLP_TREE_CHILDREN (node
).safe_push (unperm_load
);
2109 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
2110 load_permutation
.release ();
2114 load_permutation
.release ();
2118 load_permutation
.release ();
2123 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2124 && group_size
== DR_GROUP_SIZE (first_stmt_info
))
2125 load_permutation
.release ();
2126 SLP_TREE_LOAD_PERMUTATION (node
) = load_permutation
;
2131 else if (gimple_assign_single_p (stmt_info
->stmt
)
2132 && !gimple_vuse (stmt_info
->stmt
)
2133 && gimple_assign_rhs_code (stmt_info
->stmt
) == BIT_FIELD_REF
)
2135 /* vect_build_slp_tree_2 determined all BIT_FIELD_REFs reference
2136 the same SSA name vector of a compatible type to vectype. */
2137 vec
<std::pair
<unsigned, unsigned> > lperm
= vNULL
;
2138 tree vec
= TREE_OPERAND (gimple_assign_rhs1 (stmt_info
->stmt
), 0);
2139 stmt_vec_info estmt_info
;
2140 FOR_EACH_VEC_ELT (stmts
, i
, estmt_info
)
2142 gassign
*estmt
= as_a
<gassign
*> (estmt_info
->stmt
);
2143 tree bfref
= gimple_assign_rhs1 (estmt
);
2145 if (!known_eq (bit_field_size (bfref
),
2146 tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (vectype
))))
2147 || !constant_multiple_p (bit_field_offset (bfref
),
2148 bit_field_size (bfref
), &lane
))
2154 lperm
.safe_push (std::make_pair (0, (unsigned)lane
));
2156 slp_tree vnode
= vect_create_new_slp_node (vNULL
);
2157 if (operand_equal_p (TYPE_SIZE (vectype
), TYPE_SIZE (TREE_TYPE (vec
))))
2158 /* ??? We record vectype here but we hide eventually necessary
2159 punning and instead rely on code generation to materialize
2160 VIEW_CONVERT_EXPRs as necessary. We instead should make
2161 this explicit somehow. */
2162 SLP_TREE_VECTYPE (vnode
) = vectype
;
2165 /* For different size but compatible elements we can still
2166 use VEC_PERM_EXPR without punning. */
2167 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (vec
))
2168 && types_compatible_p (TREE_TYPE (vectype
),
2169 TREE_TYPE (TREE_TYPE (vec
))));
2170 SLP_TREE_VECTYPE (vnode
) = TREE_TYPE (vec
);
2172 auto nunits
= TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode
));
2173 unsigned HOST_WIDE_INT const_nunits
;
2174 if (nunits
.is_constant (&const_nunits
))
2175 SLP_TREE_LANES (vnode
) = const_nunits
;
2176 SLP_TREE_VEC_DEFS (vnode
).safe_push (vec
);
2177 /* We are always building a permutation node even if it is an identity
2178 permute to shield the rest of the vectorizer from the odd node
2179 representing an actual vector without any scalar ops.
2180 ??? We could hide it completely with making the permute node
2182 node
= vect_create_new_slp_node (node
, stmts
, 1);
2183 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
2184 SLP_TREE_LANE_PERMUTATION (node
) = lperm
;
2185 SLP_TREE_VECTYPE (node
) = vectype
;
2186 SLP_TREE_CHILDREN (node
).quick_push (vnode
);
2189 /* When discovery reaches an associatable operation see whether we can
2190 improve that to match up lanes in a way superior to the operand
2191 swapping code which at most looks at two defs.
2192 ??? For BB vectorization we cannot do the brute-force search
2193 for matching as we can succeed by means of builds from scalars
2194 and have no good way to "cost" one build against another. */
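  /* As a sketch with made-up lanes for code == PLUS_EXPR:
	 lane 0:  a0 + b0 + c0
	 lane 1:  b1 + c1 + a1
     each lane is linearized into a three-operand chain and the matching
     below may swap chain elements within a lane (here rotating lane 1 into
     a1 + b1 + c1) so that operand N of every lane can be built as one
     SLP child node.  */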
2195 else if (is_a
<loop_vec_info
> (vinfo
)
2196 /* Do not bother for single-lane SLP. */
2198 /* ??? We don't handle !vect_internal_def defs below. */
2199 && STMT_VINFO_DEF_TYPE (stmt_info
) == vect_internal_def
2200 /* ??? Do not associate a reduction, this will wreck REDUC_IDX
2201 mapping as long as that exists on the stmt_info level. */
2202 && STMT_VINFO_REDUC_IDX (stmt_info
) == -1
2203 && is_gimple_assign (stmt_info
->stmt
)
2204 && (associative_tree_code (gimple_assign_rhs_code (stmt_info
->stmt
))
2205 || gimple_assign_rhs_code (stmt_info
->stmt
) == MINUS_EXPR
)
2206 && ((FLOAT_TYPE_P (vectype
) && flag_associative_math
)
2207 || (INTEGRAL_TYPE_P (TREE_TYPE (vectype
))
2208 && TYPE_OVERFLOW_WRAPS (TREE_TYPE (vectype
)))))
2210 /* See if we have a chain of (mixed) adds or subtracts or other
2211 associatable ops. */
2212 enum tree_code code
= gimple_assign_rhs_code (stmt_info
->stmt
);
2213 if (code
== MINUS_EXPR
)
2215 stmt_vec_info other_op_stmt_info
= NULL
;
2216 stmt_vec_info op_stmt_info
= NULL
;
2217 unsigned chain_len
= 0;
2218 auto_vec
<chain_op_t
> chain
;
2219 auto_vec
<std::pair
<tree_code
, gimple
*> > worklist
;
2220 auto_vec
<vec
<chain_op_t
> > chains (group_size
);
2221 auto_vec
<slp_tree
, 4> children
;
2222 bool hard_fail
= true;
2223 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2227 /* ??? Below we require lane zero is present. */
2233 chains
.quick_push (vNULL
);
2236 /* For each lane linearize the addition/subtraction (or other
2237 uniform associatable operation) expression tree. */
2238 gimple
*op_stmt
= NULL
, *other_op_stmt
= NULL
;
2239 vect_slp_linearize_chain (vinfo
, worklist
, chain
, code
,
2240 stmts
[lane
]->stmt
, op_stmt
, other_op_stmt
,
2242 if (!op_stmt_info
&& op_stmt
)
2243 op_stmt_info
= vinfo
->lookup_stmt (op_stmt
);
2244 if (!other_op_stmt_info
&& other_op_stmt
)
2245 other_op_stmt_info
= vinfo
->lookup_stmt (other_op_stmt
);
2246 if (chain
.length () == 2)
	      /* In a chain of just two elements resort to the regular
		 operand swapping scheme.  Likewise if we run into a
		 length mismatch, process regularly as well; since we did
		 not process the other lanes we cannot report a good hint
		 what lanes to try swapping in the parent.  */
2256 else if (chain_len
== 0)
2257 chain_len
= chain
.length ();
2258 else if (chain
.length () != chain_len
)
2260 /* ??? Here we could slip in magic to compensate with
2261 neutral operands. */
2262 matches
[lane
] = false;
2263 if (lane
!= group_size
- 1)
2267 chains
.quick_push (chain
.copy ());
2270 if (chains
.length () == group_size
)
2272 /* We cannot yet use SLP_TREE_CODE to communicate the operation. */
2278 /* Now we have a set of chains with the same length. */
2279 /* 1. pre-sort according to def_type and operation. */
2280 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2281 chains
[lane
].stablesort (dt_sort_cmp
, vinfo
);
2282 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_NOTE
, vect_location
,
2285 "pre-sorted chains of %s\n",
2286 get_tree_code_name (code
));
2287 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2290 dump_printf (MSG_NOTE
, "--");
2292 for (unsigned opnum
= 0; opnum
< chain_len
; ++opnum
)
2293 dump_printf (MSG_NOTE
, "%s %T ",
2294 get_tree_code_name (chains
[lane
][opnum
].code
),
2295 chains
[lane
][opnum
].op
);
2296 dump_printf (MSG_NOTE
, "\n");
2299 /* 2. try to build children nodes, associating as necessary. */
2300 /* 2a. prepare and perform early checks to avoid eating into
2301 discovery limit unnecessarily. */
2302 vect_def_type
*dts
= XALLOCAVEC (vect_def_type
, chain_len
);
2303 for (unsigned n
= 0; n
< chain_len
; ++n
)
2305 vect_def_type dt
= chains
[0][n
].dt
;
2307 for (lane
= 0; lane
< group_size
; ++lane
)
2308 if (stmts
[lane
] && chains
[lane
][n
].dt
!= dt
)
2310 if (dt
== vect_constant_def
2311 && chains
[lane
][n
].dt
== vect_external_def
)
2312 dt
= vect_external_def
;
2313 else if (dt
== vect_external_def
2314 && chains
[lane
][n
].dt
== vect_constant_def
)
2319 if (lane
!= group_size
)
2321 if (dump_enabled_p ())
2322 dump_printf_loc (MSG_NOTE
, vect_location
,
2323 "giving up on chain due to mismatched "
2325 matches
[lane
] = false;
2326 if (lane
!= group_size
- 1)
2331 if (dt
== vect_constant_def
2332 || dt
== vect_external_def
)
2334 /* Check whether we can build the invariant. If we can't
2335 we never will be able to. */
2336 tree type
= TREE_TYPE (chains
[0][n
].op
);
2337 if (!GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ()
2338 && (TREE_CODE (type
) == BOOLEAN_TYPE
2339 || !can_duplicate_and_interleave_p (vinfo
, group_size
,
2346 else if (dt
!= vect_internal_def
)
2348 /* Not sure, we might need sth special.
2349 gcc.dg/vect/pr96854.c,
2350 gfortran.dg/vect/fast-math-pr37021.f90
2351 and gfortran.dg/vect/pr61171.f trigger. */
2352 /* Soft-fail for now. */
2357 /* 2b. do the actual build. */
2358 for (unsigned n
= 0; n
< chain_len
; ++n
)
2360 vect_def_type dt
= dts
[n
];
2362 if (dt
== vect_constant_def
2363 || dt
== vect_external_def
)
2366 ops
.create (group_size
);
2367 for (lane
= 0; lane
< group_size
; ++lane
)
2369 ops
.quick_push (chains
[lane
][n
].op
);
2371 ops
.quick_push (NULL_TREE
);
2372 slp_tree child
= vect_create_new_slp_node (ops
);
2373 SLP_TREE_DEF_TYPE (child
) = dt
;
2374 children
.safe_push (child
);
2378 vec
<stmt_vec_info
> op_stmts
;
2379 op_stmts
.create (group_size
);
2380 slp_tree child
= NULL
;
2381 /* Brute-force our way. We have to consider a lane
2382 failing after fixing an earlier fail up in the
2383 SLP discovery recursion. So track the current
2384 permute per lane. */
2385 unsigned *perms
= XALLOCAVEC (unsigned, group_size
);
2386 memset (perms
, 0, sizeof (unsigned) * group_size
);
2389 op_stmts
.truncate (0);
2390 for (lane
= 0; lane
< group_size
; ++lane
)
2393 (vinfo
->lookup_def (chains
[lane
][n
].op
));
2395 op_stmts
.quick_push (NULL
);
2396 child
= vect_build_slp_tree (vinfo
, op_stmts
,
2397 group_size
, &this_max_nunits
,
2399 &this_tree_size
, bst_map
);
2400 /* ??? We're likely getting too many fatal mismatches
2401 here so maybe we want to ignore them (but then we
2402 have no idea which lanes fatally mismatched). */
2403 if (child
|| !matches
[0])
2405 /* Swap another lane we have not yet matched up into
2406 lanes that did not match. If we run out of
2407 permute possibilities for a lane terminate the
2410 for (lane
= 1; lane
< group_size
; ++lane
)
2413 if (n
+ perms
[lane
] + 1 == chain_len
)
2418 if (dump_enabled_p ())
2419 dump_printf_loc (MSG_NOTE
, vect_location
,
2420 "swapping operand %d and %d "
2422 n
, n
+ perms
[lane
] + 1, lane
);
2423 std::swap (chains
[lane
][n
],
2424 chains
[lane
][n
+ perms
[lane
] + 1]);
2433 if (dump_enabled_p ())
2434 dump_printf_loc (MSG_NOTE
, vect_location
,
2435 "failed to match up op %d\n", n
);
2436 op_stmts
.release ();
2437 if (lane
!= group_size
- 1)
2440 matches
[lane
] = false;
2443 if (dump_enabled_p ())
2445 dump_printf_loc (MSG_NOTE
, vect_location
,
2446 "matched up op %d to\n", n
);
2447 vect_print_slp_tree (MSG_NOTE
, vect_location
, child
);
2449 children
.safe_push (child
);
2452 /* 3. build SLP nodes to combine the chain. */
2453 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2454 if (stmts
[lane
] && chains
[lane
][0].code
!= code
)
2456 /* See if there's any alternate all-PLUS entry. */
2458 for (n
= 1; n
< chain_len
; ++n
)
2460 for (lane
= 0; lane
< group_size
; ++lane
)
2461 if (stmts
[lane
] && chains
[lane
][n
].code
!= code
)
2463 if (lane
== group_size
)
2468 /* Swap that in at first position. */
2469 std::swap (children
[0], children
[n
]);
2470 for (lane
= 0; lane
< group_size
; ++lane
)
2472 std::swap (chains
[lane
][0], chains
[lane
][n
]);
2476 /* ??? When this triggers and we end up with two
2477 vect_constant/external_def up-front things break (ICE)
2478 spectacularly finding an insertion place for the
2479 all-constant op. We should have a fully
2480 vect_internal_def operand though(?) so we can swap
2481 that into first place and then prepend the all-zero
2483 if (dump_enabled_p ())
2484 dump_printf_loc (MSG_NOTE
, vect_location
,
2485 "inserting constant zero to compensate "
2486 "for (partially) negated first "
2489 for (lane
= 0; lane
< group_size
; ++lane
)
2491 chains
[lane
].safe_insert
2492 (0, chain_op_t (code
, vect_constant_def
, NULL_TREE
));
2494 zero_ops
.create (group_size
);
2495 zero_ops
.quick_push (build_zero_cst (TREE_TYPE (vectype
)));
2496 for (lane
= 1; lane
< group_size
; ++lane
)
2498 zero_ops
.quick_push (zero_ops
[0]);
2500 zero_ops
.quick_push (NULL_TREE
);
2501 slp_tree zero
= vect_create_new_slp_node (zero_ops
);
2502 SLP_TREE_DEF_TYPE (zero
) = vect_constant_def
;
2503 children
.safe_insert (0, zero
);
2507 for (unsigned i
= 1; i
< children
.length (); ++i
)
2509 slp_tree op0
= children
[i
- 1];
2510 slp_tree op1
= children
[i
];
2511 bool this_two_op
= false;
2512 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2513 if (stmts
[lane
] && chains
[lane
][i
].code
!= chains
[0][i
].code
)
2519 if (i
== children
.length () - 1)
2520 child
= vect_create_new_slp_node (node
, stmts
, 2);
2522 child
= vect_create_new_slp_node (2, ERROR_MARK
);
2525 vec
<std::pair
<unsigned, unsigned> > lperm
;
2526 lperm
.create (group_size
);
2527 for (unsigned lane
= 0; lane
< group_size
; ++lane
)
2528 lperm
.quick_push (std::make_pair
2529 (chains
[lane
][i
].code
!= chains
[0][i
].code
, lane
));
2530 vect_slp_build_two_operator_nodes (child
, vectype
, op0
, op1
,
2531 (chains
[0][i
].code
== code
2533 : other_op_stmt_info
),
2534 (chains
[0][i
].code
== code
2535 ? other_op_stmt_info
2541 SLP_TREE_DEF_TYPE (child
) = vect_internal_def
;
2542 SLP_TREE_VECTYPE (child
) = vectype
;
2543 SLP_TREE_LANES (child
) = group_size
;
2544 SLP_TREE_CHILDREN (child
).quick_push (op0
);
2545 SLP_TREE_CHILDREN (child
).quick_push (op1
);
2546 SLP_TREE_REPRESENTATIVE (child
)
2547 = (chains
[0][i
].code
== code
2548 ? op_stmt_info
: other_op_stmt_info
);
2550 children
[i
] = child
;
2552 *tree_size
+= this_tree_size
+ 1;
2553 *max_nunits
= this_max_nunits
;
2554 while (!chains
.is_empty ())
2555 chains
.pop ().release ();
2559 if (dump_enabled_p ())
2560 dump_printf_loc (MSG_NOTE
, vect_location
,
2561 "failed to line up SLP graph by re-associating "
2562 "operations in lanes%s\n",
2563 !hard_fail
? " trying regular discovery" : "");
2564 while (!children
.is_empty ())
2565 vect_free_slp_tree (children
.pop ());
2566 while (!chains
.is_empty ())
2567 chains
.pop ().release ();
2568 /* Hard-fail, otherwise we might run into quadratic processing of the
2569 chains starting one stmt into the chain again. */
2572 /* Fall thru to normal processing. */
2575 /* Get at the operands, verifying they are compatible. */
2576 vec
<slp_oprnd_info
> oprnds_info
= vect_create_oprnd_info (nops
, group_size
);
2577 slp_oprnd_info oprnd_info
;
2578 FOR_EACH_VEC_ELT (stmts
, i
, stmt_info
)
2580 int res
= vect_get_and_check_slp_defs (vinfo
, swap
[i
], skip_args
,
2581 stmts
, i
, &oprnds_info
);
2583 matches
[(res
== -1) ? 0 : i
] = false;
2587 for (i
= 0; i
< group_size
; ++i
)
2590 vect_free_oprnd_info (oprnds_info
);
2595 bool has_two_operators_perm
= false;
2596 auto_vec
<unsigned> two_op_perm_indices
[2];
2597 vec
<stmt_vec_info
> two_op_scalar_stmts
[2] = {vNULL
, vNULL
};
2599 if (two_operators
&& oprnds_info
.length () == 2 && group_size
> 2)
2602 hash_map
<gimple
*, unsigned> seen
;
2603 vec
<slp_oprnd_info
> new_oprnds_info
2604 = vect_create_oprnd_info (1, group_size
);
2605 bool success
= true;
2607 enum tree_code code
= ERROR_MARK
;
2608 if (oprnds_info
[0]->def_stmts
[0]
2609 && is_a
<gassign
*> (oprnds_info
[0]->def_stmts
[0]->stmt
))
2610 code
= gimple_assign_rhs_code (oprnds_info
[0]->def_stmts
[0]->stmt
);
2612 for (unsigned j
= 0; j
< group_size
; ++j
)
2614 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
2616 stmt_vec_info stmt_info
= oprnd_info
->def_stmts
[j
];
2617 if (!stmt_info
|| !stmt_info
->stmt
2618 || !is_a
<gassign
*> (stmt_info
->stmt
)
2619 || gimple_assign_rhs_code (stmt_info
->stmt
) != code
2628 = seen
.get_or_insert (stmt_info
->stmt
, &exists
);
2632 new_oprnds_info
[0]->def_stmts
.safe_push (stmt_info
);
2633 new_oprnds_info
[0]->ops
.safe_push (oprnd_info
->ops
[j
]);
2638 two_op_perm_indices
[i
].safe_push (stmt_idx
);
2645 if (success
&& idx
== group_size
)
2647 if (dump_enabled_p ())
2649 dump_printf_loc (MSG_NOTE
, vect_location
,
2650 "Replace two_operators operands:\n");
2652 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
2654 dump_printf_loc (MSG_NOTE
, vect_location
,
2655 "Operand %u:\n", i
);
2656 for (unsigned j
= 0; j
< group_size
; j
++)
2657 dump_printf_loc (MSG_NOTE
, vect_location
, "\tstmt %u %G",
2658 j
, oprnd_info
->def_stmts
[j
]->stmt
);
2661 dump_printf_loc (MSG_NOTE
, vect_location
,
2662 "With a single operand:\n");
2663 for (unsigned j
= 0; j
< group_size
; j
++)
2664 dump_printf_loc (MSG_NOTE
, vect_location
, "\tstmt %u %G",
2665 j
, new_oprnds_info
[0]->def_stmts
[j
]->stmt
);
2668 two_op_scalar_stmts
[0].safe_splice (oprnds_info
[0]->def_stmts
);
2669 two_op_scalar_stmts
[1].safe_splice (oprnds_info
[1]->def_stmts
);
2671 new_oprnds_info
[0]->first_op_type
= oprnds_info
[0]->first_op_type
;
2672 new_oprnds_info
[0]->first_dt
= oprnds_info
[0]->first_dt
;
2673 new_oprnds_info
[0]->any_pattern
= oprnds_info
[0]->any_pattern
;
2674 new_oprnds_info
[0]->first_gs_p
= oprnds_info
[0]->first_gs_p
;
2675 new_oprnds_info
[0]->first_gs_info
= oprnds_info
[0]->first_gs_info
;
2677 vect_free_oprnd_info (oprnds_info
);
2678 oprnds_info
= new_oprnds_info
;
2680 has_two_operators_perm
= true;
2683 vect_free_oprnd_info (new_oprnds_info
);
2686 auto_vec
<slp_tree
, 4> children
;
2688 stmt_info
= stmts
[0];
2690 /* Create SLP_TREE nodes for the definition node/s. */
2691 FOR_EACH_VEC_ELT (oprnds_info
, i
, oprnd_info
)
2693 slp_tree child
= nullptr;
2696 /* We're skipping certain operands from processing, for example
2697 outer loop reduction initial defs. */
2700 children
.safe_push (NULL
);
2704 if (oprnd_info
->first_dt
== vect_uninitialized_def
)
2706 /* COND_EXPR have one too many eventually if the condition
2708 gcc_assert (i
== 3 && nops
== 4);
2712 if (is_a
<bb_vec_info
> (vinfo
)
2713 && oprnd_info
->first_dt
== vect_internal_def
2714 && !oprnd_info
->any_pattern
)
2716 /* For BB vectorization, if all defs are the same do not
2717 bother to continue the build along the single-lane
2718 graph but use a splat of the scalar value. */
2719 stmt_vec_info first_def
= oprnd_info
->def_stmts
[0];
2720 for (j
= 1; j
< group_size
; ++j
)
2721 if (oprnd_info
->def_stmts
[j
] != first_def
)
2724 /* But avoid doing this for loads where we may be
2725 able to CSE things, unless the stmt is not
2727 && (!STMT_VINFO_VECTORIZABLE (first_def
)
2728 || !gimple_vuse (first_def
->stmt
)))
2730 if (dump_enabled_p ())
2731 dump_printf_loc (MSG_NOTE
, vect_location
,
2732 "Using a splat of the uniform operand %G",
2734 oprnd_info
->first_dt
= vect_external_def
;
2738 if (oprnd_info
->first_dt
== vect_external_def
2739 || oprnd_info
->first_dt
== vect_constant_def
)
2741 if (!GET_MODE_SIZE (vinfo
->vector_mode
).is_constant ())
2744 tree uniform_val
= op0
= oprnd_info
->ops
[0];
2745 for (j
= 1; j
< oprnd_info
->ops
.length (); ++j
)
2746 if (oprnd_info
->ops
[j
]
2747 && !operand_equal_p (uniform_val
, oprnd_info
->ops
[j
]))
2749 uniform_val
= NULL_TREE
;
2753 && !can_duplicate_and_interleave_p (vinfo
,
2754 oprnd_info
->ops
.length (),
2758 if (dump_enabled_p ())
2759 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
2760 "Build SLP failed: invalid type of def "
2761 "for variable-length SLP %T\n", op0
);
2765 slp_tree invnode
= vect_create_new_slp_node (oprnd_info
->ops
);
2766 SLP_TREE_DEF_TYPE (invnode
) = oprnd_info
->first_dt
;
2767 oprnd_info
->ops
= vNULL
;
2768 children
.safe_push (invnode
);
2772 /* When we have a masked load with uniform mask discover this
2773 as a single-lane mask with a splat permute. This way we can
2774 recognize this as a masked load-lane by stripping the splat. */
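	  /* For instance, with a made-up group of four .MASK_LOAD lanes
	     that all use the same mask M, this builds a single-lane SLP
	     node for M and wraps it in a VEC_PERM_EXPR node whose lane
	     permutation is { (0,0), (0,0), (0,0), (0,0) } - a splat of
	     lane zero that can be stripped again when forming a masked
	     load-lanes.  */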
2775 if (is_a
<gcall
*> (STMT_VINFO_STMT (stmt_info
))
2776 && gimple_call_internal_p (STMT_VINFO_STMT (stmt_info
),
2778 && STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2779 && ! STMT_VINFO_SLP_VECT_ONLY (DR_GROUP_FIRST_ELEMENT (stmt_info
)))
2781 vec
<stmt_vec_info
> def_stmts2
;
2782 def_stmts2
.create (1);
2783 def_stmts2
.quick_push (oprnd_info
->def_stmts
[0]);
2784 child
= vect_build_slp_tree (vinfo
, def_stmts2
, 1,
2787 &this_tree_size
, bst_map
);
2790 slp_tree pnode
= vect_create_new_slp_node (1, VEC_PERM_EXPR
);
2791 SLP_TREE_VECTYPE (pnode
) = SLP_TREE_VECTYPE (child
);
2792 SLP_TREE_LANES (pnode
) = group_size
;
2793 SLP_TREE_SCALAR_STMTS (pnode
).create (group_size
);
2794 SLP_TREE_LANE_PERMUTATION (pnode
).create (group_size
);
2795 for (unsigned k
= 0; k
< group_size
; ++k
)
2797 SLP_TREE_SCALAR_STMTS (pnode
)
2798 .quick_push (oprnd_info
->def_stmts
[0]);
2799 SLP_TREE_LANE_PERMUTATION (pnode
)
2800 .quick_push (std::make_pair (0u, 0u));
2802 SLP_TREE_CHILDREN (pnode
).quick_push (child
);
2803 pnode
->max_nunits
= child
->max_nunits
;
2804 children
.safe_push (pnode
);
2805 oprnd_info
->def_stmts
= vNULL
;
2809 def_stmts2
.release ();
2812 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
2813 group_size
, &this_max_nunits
,
2815 &this_tree_size
, bst_map
)) != NULL
)
2817 oprnd_info
->def_stmts
= vNULL
;
2818 children
.safe_push (child
);
      /* If the SLP build for operand zero failed and operands zero
	 and one can be commuted try that for the scalar stmts
	 that failed the match.  */
2826 /* A first scalar stmt mismatch signals a fatal mismatch. */
2828 /* ??? For COND_EXPRs we can swap the comparison operands
2829 as well as the arms under some constraints. */
2831 && oprnds_info
[1]->first_dt
== vect_internal_def
2832 && is_gimple_assign (stmt_info
->stmt
)
2833 /* Swapping operands for reductions breaks assumptions later on. */
2834 && STMT_VINFO_REDUC_IDX (stmt_info
) == -1)
2836 /* See whether we can swap the matching or the non-matching
2838 bool swap_not_matching
= true;
2841 for (j
= 0; j
< group_size
; ++j
)
2843 if (matches
[j
] != !swap_not_matching
)
2845 stmt_vec_info stmt_info
= stmts
[j
];
2846 /* Verify if we can swap operands of this stmt. */
2847 gassign
*stmt
= dyn_cast
<gassign
*> (stmt_info
->stmt
);
2849 || !commutative_tree_code (gimple_assign_rhs_code (stmt
)))
2851 if (!swap_not_matching
)
2853 swap_not_matching
= false;
2858 while (j
!= group_size
);
2860 /* Swap mismatched definition stmts. */
2861 if (dump_enabled_p ())
2862 dump_printf_loc (MSG_NOTE
, vect_location
,
2863 "Re-trying with swapped operands of stmts ");
2864 for (j
= 0; j
< group_size
; ++j
)
2865 if (matches
[j
] == !swap_not_matching
)
2867 std::swap (oprnds_info
[0]->def_stmts
[j
],
2868 oprnds_info
[1]->def_stmts
[j
]);
2869 std::swap (oprnds_info
[0]->ops
[j
],
2870 oprnds_info
[1]->ops
[j
]);
2871 if (dump_enabled_p ())
2872 dump_printf (MSG_NOTE
, "%d ", j
);
2874 if (dump_enabled_p ())
2875 dump_printf (MSG_NOTE
, "\n");
2876 /* After swapping some operands we lost track whether an
2877 operand has any pattern defs so be conservative here. */
2878 if (oprnds_info
[0]->any_pattern
|| oprnds_info
[1]->any_pattern
)
2879 oprnds_info
[0]->any_pattern
= oprnds_info
[1]->any_pattern
= true;
2880 /* And try again with scratch 'matches' ... */
2881 bool *tem
= XALLOCAVEC (bool, group_size
);
2882 if ((child
= vect_build_slp_tree (vinfo
, oprnd_info
->def_stmts
,
2883 group_size
, &this_max_nunits
,
2885 &this_tree_size
, bst_map
)) != NULL
)
2887 oprnd_info
->def_stmts
= vNULL
;
2888 children
.safe_push (child
);
      /* If the SLP build failed and we analyze a basic-block
	 simply treat nodes we fail to build as externally defined
	 (and thus build vectors from the scalar defs).
	 The cost model will reject outright expensive cases.
	 ??? This doesn't treat cases where permutation ultimately
	 fails (or we don't try permutation below).  Ideally we'd
	 even compute a permutation that will end up with the maximum
	 SLP tree size...  */
2902 if (is_a
<bb_vec_info
> (vinfo
)
2903 /* ??? Rejecting patterns this way doesn't work. We'd have to
2904 do extra work to cancel the pattern so the uses see the
2906 && !is_pattern_stmt_p (stmt_info
)
2907 && !oprnd_info
->any_pattern
)
2909 /* But if there's a leading vector sized set of matching stmts
2910 fail here so we can split the group. This matches the condition
2911 vect_analyze_slp_instance uses. */
2912 /* ??? We might want to split here and combine the results to support
2913 multiple vector sizes better. */
2914 for (j
= 0; j
< group_size
; ++j
)
2917 if (!known_ge (j
, TYPE_VECTOR_SUBPARTS (vectype
))
2918 && vect_slp_can_convert_to_external (oprnd_info
->def_stmts
))
2920 if (dump_enabled_p ())
2921 dump_printf_loc (MSG_NOTE
, vect_location
,
2922 "Building vector operands from scalars\n");
2924 child
= vect_create_new_slp_node (oprnd_info
->ops
);
2925 children
.safe_push (child
);
2926 oprnd_info
->ops
= vNULL
;
2931 gcc_assert (child
== NULL
);
2932 FOR_EACH_VEC_ELT (children
, j
, child
)
2934 vect_free_slp_tree (child
);
2935 vect_free_oprnd_info (oprnds_info
);
2939 vect_free_oprnd_info (oprnds_info
);
  /* If all of a node's children are built up from uniform scalars, or if
     building them requires more than one possibly expensive vector
     construction, then just throw that away and have the node built up
     from scalars instead.  The exception is the SLP node for the vector
     store.  */
2945 if (is_a
<bb_vec_info
> (vinfo
)
2946 && !STMT_VINFO_GROUPED_ACCESS (stmt_info
)
2947 /* ??? Rejecting patterns this way doesn't work. We'd have to
2948 do extra work to cancel the pattern so the uses see the
2950 && !is_pattern_stmt_p (stmt_info
))
2954 bool all_uniform_p
= true;
2955 unsigned n_vector_builds
= 0;
2956 FOR_EACH_VEC_ELT (children
, j
, child
)
2960 else if (SLP_TREE_DEF_TYPE (child
) == vect_internal_def
)
2961 all_uniform_p
= false;
2962 else if (!vect_slp_tree_uniform_p (child
))
2964 all_uniform_p
= false;
2965 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
)
2970 || n_vector_builds
> 1
2971 || (n_vector_builds
== children
.length ()
2972 && is_a
<gphi
*> (stmt_info
->stmt
)))
2976 FOR_EACH_VEC_ELT (children
, j
, child
)
2978 vect_free_slp_tree (child
);
2980 if (dump_enabled_p ())
2981 dump_printf_loc (MSG_NOTE
, vect_location
,
2982 "Building parent vector operands from "
2983 "scalars instead\n");
2988 *tree_size
+= this_tree_size
+ 1;
2989 *max_nunits
= this_max_nunits
;
2993 /* ??? We'd likely want to either cache in bst_map sth like
2994 { a+b, NULL, a+b, NULL } and { NULL, a-b, NULL, a-b } or
2995 the true { a+b, a+b, a+b, a+b } ... but there we don't have
2996 explicit stmts to put in so the keying on 'stmts' doesn't
2997 work (but we have the same issue with nodes that use 'ops'). */
2999 if (has_two_operators_perm
)
3001 slp_tree child
= children
[0];
3002 children
.truncate (0);
3003 for (i
= 0; i
< 2; i
++)
3006 = vect_create_new_slp_node (two_op_scalar_stmts
[i
], 2);
3007 SLP_TREE_CODE (pnode
) = VEC_PERM_EXPR
;
3008 SLP_TREE_VECTYPE (pnode
) = vectype
;
3009 SLP_TREE_CHILDREN (pnode
).quick_push (child
);
3010 SLP_TREE_CHILDREN (pnode
).quick_push (child
);
3011 lane_permutation_t
& perm
= SLP_TREE_LANE_PERMUTATION (pnode
);
3012 children
.safe_push (pnode
);
3014 for (unsigned j
= 0; j
< stmts
.length (); j
++)
3015 perm
.safe_push (std::make_pair (0, two_op_perm_indices
[i
][j
]));
3018 SLP_TREE_REF_COUNT (child
) += 4;
3021 slp_tree one
= new _slp_tree
;
3022 slp_tree two
= new _slp_tree
;
3023 SLP_TREE_DEF_TYPE (one
) = vect_internal_def
;
3024 SLP_TREE_DEF_TYPE (two
) = vect_internal_def
;
3025 SLP_TREE_VECTYPE (one
) = vectype
;
3026 SLP_TREE_VECTYPE (two
) = vectype
;
3027 SLP_TREE_CHILDREN (one
).safe_splice (children
);
3028 SLP_TREE_CHILDREN (two
).safe_splice (children
);
3030 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (two
), i
, child
)
3031 SLP_TREE_REF_COUNT (child
)++;
3033 /* Here we record the original defs since this
3034 node represents the final lane configuration. */
3035 node
= vect_create_new_slp_node (node
, stmts
, 2);
3036 SLP_TREE_VECTYPE (node
) = vectype
;
3037 SLP_TREE_CODE (node
) = VEC_PERM_EXPR
;
3038 SLP_TREE_CHILDREN (node
).quick_push (one
);
3039 SLP_TREE_CHILDREN (node
).quick_push (two
);
3040 gassign
*stmt
= as_a
<gassign
*> (stmts
[0]->stmt
);
3041 enum tree_code code0
= gimple_assign_rhs_code (stmt
);
3042 enum tree_code ocode
= ERROR_MARK
;
3043 stmt_vec_info ostmt_info
;
3045 FOR_EACH_VEC_ELT (stmts
, i
, ostmt_info
)
3047 gassign
*ostmt
= as_a
<gassign
*> (ostmt_info
->stmt
);
3048 if (gimple_assign_rhs_code (ostmt
) != code0
)
3050 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (1, i
));
3051 ocode
= gimple_assign_rhs_code (ostmt
);
3055 SLP_TREE_LANE_PERMUTATION (node
).safe_push (std::make_pair (0, i
));
3058 SLP_TREE_CODE (one
) = code0
;
3059 SLP_TREE_CODE (two
) = ocode
;
3060 SLP_TREE_LANES (one
) = stmts
.length ();
3061 SLP_TREE_LANES (two
) = stmts
.length ();
3062 SLP_TREE_REPRESENTATIVE (one
) = stmts
[0];
3063 SLP_TREE_REPRESENTATIVE (two
) = stmts
[j
];
3068 node
= vect_create_new_slp_node (node
, stmts
, nops
);
3069 SLP_TREE_VECTYPE (node
) = vectype
;
3070 SLP_TREE_CHILDREN (node
).splice (children
);
/* Dump a single SLP tree NODE.  */

static void
vect_print_slp_tree (dump_flags_t dump_kind, dump_location_t loc,
		     slp_tree node)
{
  unsigned i, j;
  slp_tree child;
  stmt_vec_info stmt_info;
  tree op;

  dump_metadata_t metadata (dump_kind, loc.get_impl_location ());
  dump_user_location_t user_loc = loc.get_user_location ();
  dump_printf_loc (metadata, user_loc,
		   "node%s %p (max_nunits=" HOST_WIDE_INT_PRINT_UNSIGNED
		   ", refcnt=%u)",
		   SLP_TREE_DEF_TYPE (node) == vect_external_def
		   ? " (external)"
		   : (SLP_TREE_DEF_TYPE (node) == vect_constant_def
		      ? " (constant)"
		      : ""), (void *) node,
		   estimated_poly_value (node->max_nunits),
		   SLP_TREE_REF_COUNT (node));
  if (SLP_TREE_VECTYPE (node))
    dump_printf (metadata, " %T", SLP_TREE_VECTYPE (node));
  dump_printf (metadata, "\n");
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	dump_printf_loc (metadata, user_loc, "op: VEC_PERM_EXPR\n");
      else
	dump_printf_loc (metadata, user_loc, "op template: %G",
			 SLP_TREE_REPRESENTATIVE (node)->stmt);
    }
  if (SLP_TREE_SCALAR_STMTS (node).exists ())
    {
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	if (stmt_info)
	  dump_printf_loc (metadata, user_loc, "\t%sstmt %u %G",
			   STMT_VINFO_LIVE_P (stmt_info) ? "[l] " : "",
			   i, stmt_info->stmt);
	else
	  dump_printf_loc (metadata, user_loc, "\tstmt %u ---\n", i);
    }
  else
    {
      dump_printf_loc (metadata, user_loc, "\t{ ");
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (node), i, op)
	dump_printf (metadata, "%T%s ", op,
		     i < SLP_TREE_SCALAR_OPS (node).length () - 1 ? "," : "");
      dump_printf (metadata, "}\n");
    }
  if (SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tload permutation {");
      FOR_EACH_VEC_ELT (SLP_TREE_LOAD_PERMUTATION (node), i, j)
	dump_printf (dump_kind, " %u", j);
      dump_printf (dump_kind, " }\n");
    }
  if (SLP_TREE_LANE_PERMUTATION (node).exists ())
    {
      dump_printf_loc (metadata, user_loc, "\tlane permutation {");
      for (i = 0; i < SLP_TREE_LANE_PERMUTATION (node).length (); ++i)
	dump_printf (dump_kind, " %u[%u]",
		     SLP_TREE_LANE_PERMUTATION (node)[i].first,
		     SLP_TREE_LANE_PERMUTATION (node)[i].second);
      dump_printf (dump_kind, " }%s\n",
		   node->ldst_lanes ? " (load-lanes)" : "");
    }
  if (SLP_TREE_CHILDREN (node).is_empty ())
    return;
  dump_printf_loc (metadata, user_loc, "\tchildren");
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    dump_printf (dump_kind, " %p", (void *)child);
  dump_printf (dump_kind, "%s\n",
	       node->ldst_lanes && !SLP_TREE_LANE_PERMUTATION (node).exists ()
	       ? " (store-lanes)" : "");
}
DEBUG_FUNCTION void
debug (slp_tree node)
{
  debug_dump_context ctx;
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
}

/* Recursive helper for the dot producer below.  */

static void
dot_slp_tree (FILE *f, slp_tree node, hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  fprintf (f, "\"%p\" [label=\"", (void *)node);
  vect_print_slp_tree (MSG_NOTE,
		       dump_location_t::from_location_t (UNKNOWN_LOCATION),
		       node);
  fprintf (f, "\"];\n");

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    fprintf (f, "\"%p\" -> \"%p\";", (void *)node, (void *)child);

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    if (child)
      dot_slp_tree (f, child, visited);
}

DEBUG_FUNCTION void
dot_slp_tree (const char *fname, slp_tree node)
{
  FILE *f = fopen (fname, "w");
  fprintf (f, "digraph {\n");
  fflush (f);
    {
      debug_dump_context ctx (f);
      hash_set<slp_tree> visited;
      dot_slp_tree (f, node, visited);
    }
  fflush (f);
  fprintf (f, "}\n");
  fclose (f);
}

DEBUG_FUNCTION void
dot_slp_tree (const char *fname, const vec<slp_instance> &slp_instances)
{
  FILE *f = fopen (fname, "w");
  fprintf (f, "digraph {\n");
  fflush (f);
    {
      debug_dump_context ctx (f);
      hash_set<slp_tree> visited;
      for (auto inst : slp_instances)
	dot_slp_tree (f, SLP_INSTANCE_TREE (inst), visited);
    }
  fflush (f);
  fprintf (f, "}\n");
  fclose (f);
}
/* Dump a slp tree NODE using flags specified in DUMP_KIND.  */

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree node, hash_set<slp_tree> &visited)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  vect_print_slp_tree (dump_kind, loc, node);

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_print_slp_graph (dump_kind, loc, child, visited);
}

static void
vect_print_slp_graph (dump_flags_t dump_kind, dump_location_t loc,
		      slp_tree entry)
{
  hash_set<slp_tree> visited;
  vect_print_slp_graph (dump_kind, loc, entry, visited);
}

DEBUG_FUNCTION void
debug (slp_instance instance)
{
  debug_dump_context ctx;
  vect_print_slp_graph (MSG_NOTE,
			dump_location_t::from_location_t (UNKNOWN_LOCATION),
			SLP_INSTANCE_TREE (instance));
}
/* Mark the tree rooted at NODE with PURE_SLP.  */

static void
vect_mark_slp_stmts (vec_info *vinfo, slp_tree node,
		     hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      {
	STMT_SLP_TYPE (stmt_info) = pure_slp;
	/* ??? For .MASK_LOAD and .MASK_STORE detected as load/store-lanes
	   when there is the mask_conversion pattern applied we have lost the
	   alternate lanes of the uniform mask which nevertheless
	   have separate pattern defs.  To not confuse hybrid
	   analysis we mark those as covered as well here.  */
	if (node->ldst_lanes)
	  if (gcall *call = dyn_cast <gcall *> (stmt_info->stmt))
	    if (gimple_call_internal_p (call, IFN_MASK_LOAD)
		|| gimple_call_internal_p (call, IFN_MASK_STORE))
	      {
		tree mask = gimple_call_arg (call,
					     internal_fn_mask_index
					       (gimple_call_internal_fn (call)));
		if (TREE_CODE (mask) == SSA_NAME)
		  if (stmt_vec_info mask_info = vinfo->lookup_def (mask))
		    {
		      mask_info = vect_stmt_to_vectorize (mask_info);
		      STMT_SLP_TYPE (mask_info) = pure_slp;
		    }
	      }
      }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts (vinfo, child, visited);
}

static void
vect_mark_slp_stmts (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts (vinfo, node, visited);
}
/* Mark the statements of the tree rooted at NODE as relevant (vect_used).  */

static void
vect_mark_slp_stmts_relevant (slp_tree node, hash_set<slp_tree> &visited)
{
  int i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      {
	gcc_assert (!STMT_VINFO_RELEVANT (stmt_info)
		    || STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope);
	STMT_VINFO_RELEVANT (stmt_info) = vect_used_in_scope;
      }

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_mark_slp_stmts_relevant (child, visited);
}

static void
vect_mark_slp_stmts_relevant (slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_mark_slp_stmts_relevant (node, visited);
}
/* Gather loads in the SLP graph NODE and populate the INST loads array.  */

static void
vect_gather_slp_loads (vec<slp_tree> &loads, slp_tree node,
		       hash_set<slp_tree> &visited)
{
  if (!node || visited.add (node))
    return;

  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR)
    {
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (STMT_VINFO_DATA_REF (stmt_info)
	  && DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	loads.safe_push (node);
    }

  unsigned i;
  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_gather_slp_loads (loads, child, visited);
}
/* Find the last store in SLP INSTANCE.  */

stmt_vec_info
vect_find_last_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info last = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    if (stmt_vinfo)
      {
	stmt_vinfo = vect_orig_stmt (stmt_vinfo);
	last = last ? get_later_stmt (stmt_vinfo, last) : stmt_vinfo;
      }

  return last;
}

/* Find the first stmt in NODE.  */

stmt_vec_info
vect_find_first_scalar_stmt_in_slp (slp_tree node)
{
  stmt_vec_info first = NULL;
  stmt_vec_info stmt_vinfo;

  for (int i = 0; SLP_TREE_SCALAR_STMTS (node).iterate (i, &stmt_vinfo); i++)
    if (stmt_vinfo)
      {
	stmt_vinfo = vect_orig_stmt (stmt_vinfo);
	if (!first
	    || get_later_stmt (stmt_vinfo, first) == first)
	  first = stmt_vinfo;
      }

  return first;
}
/* Splits a group of stores, currently beginning at FIRST_VINFO, into
   two groups: one (still beginning at FIRST_VINFO) of size GROUP1_SIZE
   (also containing the first GROUP1_SIZE stmts, since stores are
   consecutive), the second containing the remainder.
   Return the first stmt in the second group.  */

static stmt_vec_info
vect_split_slp_store_group (stmt_vec_info first_vinfo, unsigned group1_size)
{
  gcc_assert (DR_GROUP_FIRST_ELEMENT (first_vinfo) == first_vinfo);
  gcc_assert (group1_size > 0);
  int group2_size = DR_GROUP_SIZE (first_vinfo) - group1_size;
  gcc_assert (group2_size > 0);
  DR_GROUP_SIZE (first_vinfo) = group1_size;

  stmt_vec_info stmt_info = first_vinfo;
  for (unsigned i = group1_size; i > 1; i--)
    {
      stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info);
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }
  /* STMT is now the last element of the first group.  */
  stmt_vec_info group2 = DR_GROUP_NEXT_ELEMENT (stmt_info);
  DR_GROUP_NEXT_ELEMENT (stmt_info) = 0;

  DR_GROUP_SIZE (group2) = group2_size;
  for (stmt_info = group2; stmt_info;
       stmt_info = DR_GROUP_NEXT_ELEMENT (stmt_info))
    {
      DR_GROUP_FIRST_ELEMENT (stmt_info) = group2;
      gcc_assert (DR_GROUP_GAP (stmt_info) == 1);
    }

  /* For the second group, the DR_GROUP_GAP is that before the original group,
     plus skipping over the first vector.  */
  DR_GROUP_GAP (group2) = DR_GROUP_GAP (first_vinfo) + group1_size;

  /* DR_GROUP_GAP of the first group now has to skip over the second group too.  */
  DR_GROUP_GAP (first_vinfo) += group2_size;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Split group into %d and %d\n",
		     group1_size, group2_size);

  return group2;
}
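/* As an illustration of the gap bookkeeping above, with made-up numbers:
   splitting a store group of size 8 with GROUP1_SIZE == 3 leaves the first
   group with DR_GROUP_SIZE 3 and its gap increased by 5, so advancing the
   first group steps over the elements now owned by the second group, while
   the second group of size 5 starts with the original group's gap plus 3,
   skipping the elements kept in the first group.  */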
/* Calculate the unrolling factor for an SLP instance with GROUP_SIZE
   statements and a vector of NUNITS elements.  */

static poly_uint64
calculate_unrolling_factor (poly_uint64 nunits, unsigned int group_size)
{
  return exact_div (common_multiple (nunits, group_size), group_size);
}
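/* For example, with vectors of 4 elements and a group of 6 scalar stmts the
   common multiple is 12, giving an unrolling factor of 12 / 6 = 2, i.e. two
   copies of the group are needed to fill whole vectors.  */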
/* Helper that checks to see if a node is a load node.  */

static inline bool
vect_is_slp_load_node (slp_tree root)
{
  return (SLP_TREE_CODE (root) != VEC_PERM_EXPR
	  && SLP_TREE_DEF_TYPE (root) == vect_internal_def
	  && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_REPRESENTATIVE (root))
	  && DR_IS_READ (STMT_VINFO_DATA_REF (SLP_TREE_REPRESENTATIVE (root))));
}
/* Helper function of optimize_load_redistribution that performs the operation
   recursively.  */
3479 optimize_load_redistribution_1 (scalar_stmts_to_slp_tree_map_t
*bst_map
,
3480 vec_info
*vinfo
, unsigned int group_size
,
3481 hash_map
<slp_tree
, slp_tree
> *load_map
,
3484 if (slp_tree
*leader
= load_map
->get (root
))
3490 /* For now, we don't know anything about externals so do not do anything. */
3491 if (!root
|| SLP_TREE_DEF_TYPE (root
) != vect_internal_def
)
3493 else if (SLP_TREE_CODE (root
) == VEC_PERM_EXPR
)
3495 /* First convert this node into a load node and add it to the leaves
3496 list and flatten the permute from a lane to a load one. If it's
3497 unneeded it will be elided later. */
3498 vec
<stmt_vec_info
> stmts
;
3499 stmts
.create (SLP_TREE_LANES (root
));
3500 lane_permutation_t lane_perm
= SLP_TREE_LANE_PERMUTATION (root
);
3501 for (unsigned j
= 0; j
< lane_perm
.length (); j
++)
3503 std::pair
<unsigned, unsigned> perm
= lane_perm
[j
];
3504 node
= SLP_TREE_CHILDREN (root
)[perm
.first
];
3506 if (!vect_is_slp_load_node (node
)
3507 || SLP_TREE_CHILDREN (node
).exists ())
3513 stmts
.quick_push (SLP_TREE_SCALAR_STMTS (node
)[perm
.second
]);
3516 if (dump_enabled_p ())
3517 dump_printf_loc (MSG_NOTE
, vect_location
,
3518 "converting stmts on permute node %p\n",
3521 bool *matches
= XALLOCAVEC (bool, group_size
);
3522 poly_uint64 max_nunits
= 1;
3523 unsigned tree_size
= 0, limit
= 1;
3524 node
= vect_build_slp_tree (vinfo
, stmts
, group_size
, &max_nunits
,
3525 matches
, &limit
, &tree_size
, bst_map
);
3529 load_map
->put (root
, node
);
3534 load_map
->put (root
, NULL
);
3536 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
3539 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
3543 SLP_TREE_REF_COUNT (value
)++;
3544 SLP_TREE_CHILDREN (root
)[i
] = value
;
	  /* ??? We know the original leaves of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
3548 if (SLP_TREE_REF_COUNT (node
) == 1)
3549 load_map
->remove (node
);
3550 vect_free_slp_tree (node
);
/* Temporary workaround for loads not being CSEd during SLP build.  This
   function will traverse the SLP tree rooted in ROOT for INSTANCE and find
   VEC_PERM nodes that blend vectors from multiple nodes that all read from the
   same DR such that the final operation is equal to a permuted load.  Such
   NODES are then directly converted into LOADS themselves.  The nodes are
   CSEd using BST_MAP.  */
3565 optimize_load_redistribution (scalar_stmts_to_slp_tree_map_t
*bst_map
,
3566 vec_info
*vinfo
, unsigned int group_size
,
3567 hash_map
<slp_tree
, slp_tree
> *load_map
,
3573 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (root
), i
, node
)
3576 = optimize_load_redistribution_1 (bst_map
, vinfo
, group_size
, load_map
,
3580 SLP_TREE_REF_COUNT (value
)++;
3581 SLP_TREE_CHILDREN (root
)[i
] = value
;
	  /* ??? We know the original leaves of the replaced nodes will
	     be referenced by bst_map, only the permutes created by
	     pattern matching are not.  */
3585 if (SLP_TREE_REF_COUNT (node
) == 1)
3586 load_map
->remove (node
);
3587 vect_free_slp_tree (node
);
/* Helper function of vect_match_slp_patterns.

   Attempts to match patterns against the slp tree rooted in REF_NODE using
   VINFO.  Patterns are matched in post-order traversal.

   If matching is successful the value in REF_NODE is updated and returned, if
   not then it is returned unchanged.  */

static bool
vect_match_slp_patterns_2 (slp_tree *ref_node, vec_info *vinfo,
			   slp_tree_to_load_perm_map_t *perm_cache,
			   slp_compat_nodes_map_t *compat_cache,
			   hash_set<slp_tree> *visited)
{
  unsigned i;
  slp_tree node = *ref_node;
  bool found_p = false;
  if (!node || visited->add (node))
    return false;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    found_p |= vect_match_slp_patterns_2 (&SLP_TREE_CHILDREN (node)[i],
					  vinfo, perm_cache, compat_cache,
					  visited);

  for (unsigned x = 0; x < num__slp_patterns; x++)
    {
      vect_pattern *pattern
	= slp_patterns[x] (perm_cache, compat_cache, ref_node);
      if (pattern)
	{
	  pattern->build (vinfo);
	  delete pattern;
	  found_p = true;
	}
    }

  return found_p;
}
/* Applies pattern matching to the given SLP tree rooted in REF_NODE using
   vec_info VINFO.

   The modified tree is returned.  Patterns are tried in order and multiple
   patterns may match.  */

static bool
vect_match_slp_patterns (slp_instance instance, vec_info *vinfo,
			 hash_set<slp_tree> *visited,
			 slp_tree_to_load_perm_map_t *perm_cache,
			 slp_compat_nodes_map_t *compat_cache)
{
  DUMP_VECT_SCOPE ("vect_match_slp_patterns");
  slp_tree *ref_node = &SLP_INSTANCE_TREE (instance);

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Analyzing SLP tree %p for patterns\n",
		     (void *) SLP_INSTANCE_TREE (instance));

  return vect_match_slp_patterns_2 (ref_node, vinfo, perm_cache, compat_cache,
				    visited);
}
/* STMT_INFO is a store group of size GROUP_SIZE that we are considering
   vectorizing with VECTYPE that might be NULL.  MASKED_P indicates whether
   the stores are masked.
   Return true if we could use IFN_STORE_LANES instead and if that appears
   to be the better approach.  */

static bool
vect_slp_prefer_store_lanes_p (vec_info *vinfo, stmt_vec_info stmt_info,
			       tree vectype, bool masked_p,
			       unsigned int group_size,
			       unsigned int new_group_size)
{
  if (!vectype)
    {
      tree scalar_type = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
      vectype = get_vectype_for_scalar_type (vinfo, scalar_type);
    }
  if (!vectype)
    return false;
  /* Allow the split if one of the two new groups would operate on full
     vectors *within* rather than across one scalar loop iteration.
     This is purely a heuristic, but it should work well for group
     sizes of 3 and 4, where the possible splits are:

       3->2+1:  OK if the vector has exactly two elements
       4->2+2:  Likewise.
       4->3+1:  Less clear-cut.  */
  if (multiple_p (group_size - new_group_size, TYPE_VECTOR_SUBPARTS (vectype))
      || multiple_p (new_group_size, TYPE_VECTOR_SUBPARTS (vectype)))
    return false;
  return vect_store_lanes_supported (vectype, group_size, masked_p) != IFN_LAST;
}
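/* As a worked example of the heuristic above: with a two-element vector type,
   splitting a group of 3 into 2 + 1 gives NEW_GROUP_SIZE == 2, a multiple of
   the vector subparts, so the split is allowed and store-lanes is not
   preferred.  Splitting 4 into 3 + 1 leaves neither 3 nor 1 such a multiple,
   so we prefer IFN_STORE_LANES whenever the target supports it for the whole
   group.  */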
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info, slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit,
			   bool force_single_lane);
3701 /* Build an interleaving scheme for the store sources RHS_NODES from
3705 vect_build_slp_store_interleaving (vec
<slp_tree
> &rhs_nodes
,
3706 vec
<stmt_vec_info
> &scalar_stmts
,
3707 poly_uint64 max_nunits
)
3709 unsigned int group_size
= scalar_stmts
.length ();
3710 slp_tree node
= vect_create_new_slp_node (scalar_stmts
,
3712 (rhs_nodes
[0]).length ());
3713 SLP_TREE_VECTYPE (node
) = SLP_TREE_VECTYPE (rhs_nodes
[0]);
3714 node
->max_nunits
= max_nunits
;
3715 for (unsigned l
= 0;
3716 l
< SLP_TREE_CHILDREN (rhs_nodes
[0]).length (); ++l
)
3718 /* And a permute merging all RHS SLP trees. */
3719 slp_tree perm
= vect_create_new_slp_node (rhs_nodes
.length (),
3721 SLP_TREE_CHILDREN (node
).quick_push (perm
);
3722 SLP_TREE_LANE_PERMUTATION (perm
).create (group_size
);
3723 SLP_TREE_VECTYPE (perm
) = SLP_TREE_VECTYPE (node
);
3724 perm
->max_nunits
= max_nunits
;
3725 SLP_TREE_LANES (perm
) = group_size
;
3726 /* ??? We should set this NULL but that's not expected. */
3727 SLP_TREE_REPRESENTATIVE (perm
)
3728 = SLP_TREE_REPRESENTATIVE (SLP_TREE_CHILDREN (rhs_nodes
[0])[l
]);
3729 for (unsigned j
= 0; j
< rhs_nodes
.length (); ++j
)
3731 SLP_TREE_CHILDREN (perm
)
3732 .quick_push (SLP_TREE_CHILDREN (rhs_nodes
[j
])[l
]);
3733 SLP_TREE_CHILDREN (rhs_nodes
[j
])[l
]->refcnt
++;
3734 for (unsigned k
= 0;
3735 k
< SLP_TREE_SCALAR_STMTS (rhs_nodes
[j
]).length (); ++k
)
3737 /* ??? We should populate SLP_TREE_SCALAR_STMTS
3738 or SLP_TREE_SCALAR_OPS but then we might have
3739 a mix of both in our children. */
3740 SLP_TREE_LANE_PERMUTATION (perm
)
3741 .quick_push (std::make_pair (j
, k
));
3745 /* Now we have a single permute node but we cannot code-generate
3746 the case with more than two inputs.
3747 Perform pairwise reduction, reducing the two inputs
3748 with the least number of lanes to one and then repeat until
3749 we end up with two inputs. That scheme makes sure we end
3750 up with permutes satisfying the restriction of requiring at
3751 most two vector inputs to produce a single vector output
3752 when the number of lanes is even. */
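  /* As a sketch with made-up lane counts: for inputs with { 4, 2, 2 } lanes
     the two smallest inputs (2 and 2) are merged first, giving { 4, 4 },
     at which point the permute has only two vector inputs left and can be
     code generated directly.  */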
3753 while (SLP_TREE_CHILDREN (perm
).length () > 2)
3755 /* When we have three equal sized groups left the pairwise
3756 reduction does not result in a scheme that avoids using
3757 three vectors. Instead merge the first two groups
3758 to the final size with do-not-care elements (chosen
3759 from the first group) and then merge with the third.
3760 { A0, B0, x, A1, B1, x, ... }
3761 -> { A0, B0, C0, A1, B1, C1, ... }
3762 This handles group size of three (and at least
3763 power-of-two multiples of that). */
3764 if (SLP_TREE_CHILDREN (perm
).length () == 3
3765 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[0])
3766 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[1]))
3767 && (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[0])
3768 == SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[2])))
3772 slp_tree a
= SLP_TREE_CHILDREN (perm
)[ai
];
3773 slp_tree b
= SLP_TREE_CHILDREN (perm
)[bi
];
3774 unsigned n
= SLP_TREE_LANES (perm
);
3776 slp_tree permab
= vect_create_new_slp_node (2, VEC_PERM_EXPR
);
3777 SLP_TREE_LANES (permab
) = n
;
3778 SLP_TREE_LANE_PERMUTATION (permab
).create (n
);
3779 SLP_TREE_VECTYPE (permab
) = SLP_TREE_VECTYPE (perm
);
3780 permab
->max_nunits
= max_nunits
;
3781 /* ??? Should be NULL but that's not expected. */
3782 SLP_TREE_REPRESENTATIVE (permab
) = SLP_TREE_REPRESENTATIVE (perm
);
3783 SLP_TREE_CHILDREN (permab
).quick_push (a
);
3784 for (unsigned k
= 0; k
< SLP_TREE_LANES (a
); ++k
)
3785 SLP_TREE_LANE_PERMUTATION (permab
)
3786 .quick_push (std::make_pair (0, k
));
3787 SLP_TREE_CHILDREN (permab
).quick_push (b
);
3788 for (unsigned k
= 0; k
< SLP_TREE_LANES (b
); ++k
)
3789 SLP_TREE_LANE_PERMUTATION (permab
)
3790 .quick_push (std::make_pair (1, k
));
3791 /* Push the do-not-care lanes. */
3792 for (unsigned k
= 0; k
< SLP_TREE_LANES (a
); ++k
)
3793 SLP_TREE_LANE_PERMUTATION (permab
)
3794 .quick_push (std::make_pair (0, k
));
3796 /* Put the merged node into 'perm', in place of a. */
3797 SLP_TREE_CHILDREN (perm
)[ai
] = permab
;
3798 /* Adjust the references to b in the permutation
3799 of perm and to the later children which we'll
3801 for (unsigned k
= 0; k
< SLP_TREE_LANES (perm
); ++k
)
3803 std::pair
<unsigned, unsigned> &p
3804 = SLP_TREE_LANE_PERMUTATION (perm
)[k
];
3805 if (p
.first
== (unsigned) bi
)
3808 p
.second
+= SLP_TREE_LANES (a
);
3810 else if (p
.first
> (unsigned) bi
)
3813 SLP_TREE_CHILDREN (perm
).ordered_remove (bi
);
3817 /* Pick the two nodes with the least number of lanes,
3818 prefer the earliest candidate and maintain ai < bi. */
3821 for (unsigned ci
= 0; ci
< SLP_TREE_CHILDREN (perm
).length (); ++ci
)
3827 else if ((SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[ci
])
3828 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[ai
]))
3829 || (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[ci
])
3830 < SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[bi
])))
3832 if (SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[ai
])
3833 <= SLP_TREE_LANES (SLP_TREE_CHILDREN (perm
)[bi
]))
3843 /* Produce a merge of nodes ai and bi. */
3844 slp_tree a
= SLP_TREE_CHILDREN (perm
)[ai
];
3845 slp_tree b
= SLP_TREE_CHILDREN (perm
)[bi
];
3846 unsigned n
= SLP_TREE_LANES (a
) + SLP_TREE_LANES (b
);
3847 slp_tree permab
= vect_create_new_slp_node (2, VEC_PERM_EXPR
);
3848 SLP_TREE_LANES (permab
) = n
;
3849 SLP_TREE_LANE_PERMUTATION (permab
).create (n
);
3850 SLP_TREE_VECTYPE (permab
) = SLP_TREE_VECTYPE (perm
);
3851 permab
->max_nunits
= max_nunits
;
3852 /* ??? Should be NULL but that's not expected. */
3853 SLP_TREE_REPRESENTATIVE (permab
) = SLP_TREE_REPRESENTATIVE (perm
);
3854 SLP_TREE_CHILDREN (permab
).quick_push (a
);
3855 for (unsigned k
= 0; k
< SLP_TREE_LANES (a
); ++k
)
3856 SLP_TREE_LANE_PERMUTATION (permab
)
3857 .quick_push (std::make_pair (0, k
));
3858 SLP_TREE_CHILDREN (permab
).quick_push (b
);
3859 for (unsigned k
= 0; k
< SLP_TREE_LANES (b
); ++k
)
3860 SLP_TREE_LANE_PERMUTATION (permab
)
3861 .quick_push (std::make_pair (1, k
));
3863 /* Put the merged node into 'perm', in place of a. */
3864 SLP_TREE_CHILDREN (perm
)[ai
] = permab
;
3865 /* Adjust the references to b in the permutation
3866 of perm and to the later children which we'll
3868 for (unsigned k
= 0; k
< SLP_TREE_LANES (perm
); ++k
)
3870 std::pair
<unsigned, unsigned> &p
3871 = SLP_TREE_LANE_PERMUTATION (perm
)[k
];
3872 if (p
.first
== (unsigned) bi
)
3875 p
.second
+= SLP_TREE_LANES (a
);
3877 else if (p
.first
> (unsigned) bi
)
3880 SLP_TREE_CHILDREN (perm
).ordered_remove (bi
);
/* Analyze an SLP instance starting from SCALAR_STMTS which are a group
   of KIND.  Return true if successful.  */

static bool
vect_build_slp_instance (vec_info *vinfo,
			 slp_instance_kind kind,
			 vec<stmt_vec_info> &scalar_stmts,
			 vec<stmt_vec_info> &root_stmt_infos,
			 vec<tree> &remain,
			 unsigned max_tree_size, unsigned *limit,
			 scalar_stmts_to_slp_tree_map_t *bst_map,
			 /* ???  We need stmt_info for group splitting.  */
			 stmt_vec_info stmt_info_,
			 bool force_single_lane)
{
  /* If there's no budget left bail out early.  */
  if (*limit == 0)
    return false;

  if (kind == slp_inst_kind_ctor)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable constructor: %G\n",
			 root_stmt_infos[0]->stmt);
    }
  else if (kind == slp_inst_kind_gcond)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Analyzing vectorizable control flow: %G",
			 root_stmt_infos[0]->stmt);
    }

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Starting SLP discovery for\n");
      for (unsigned i = 0; i < scalar_stmts.length (); ++i)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "  %G", scalar_stmts[i]->stmt);
    }

  /* Build the tree for the SLP instance.  */
  unsigned int group_size = scalar_stmts.length ();
  bool *matches = XALLOCAVEC (bool, group_size);
  poly_uint64 max_nunits = 1;
  unsigned tree_size = 0;
  unsigned i;

  slp_tree node = NULL;
  if (group_size > 1 && force_single_lane)
    {
      matches[0] = true;
      matches[1] = false;
    }
  else
    node = vect_build_slp_tree (vinfo, scalar_stmts, group_size,
				&max_nunits, matches, limit,
				&tree_size, bst_map);

  if (node != NULL)
    {
      /* Calculate the unrolling factor based on the smallest type.  */
      poly_uint64 unrolling_factor
	= calculate_unrolling_factor (max_nunits, group_size);

      if (maybe_ne (unrolling_factor, 1U)
	  && is_a<bb_vec_info> (vinfo))
	{
	  unsigned HOST_WIDE_INT const_max_nunits;
	  if (!max_nunits.is_constant (&const_max_nunits)
	      || const_max_nunits > group_size)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "Build SLP failed: store group "
				 "size not a multiple of the vector size "
				 "in basic block SLP\n");
	      vect_free_slp_tree (node);
	      return false;
	    }
	  /* Fatal mismatch.  */
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP discovery succeeded but node needs "
			     "splitting\n");
	  memset (matches, true, group_size);
	  matches[group_size / const_max_nunits * const_max_nunits] = false;
	  vect_free_slp_tree (node);
	}
      else
	{
	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  /* Fixup SLP reduction chains.  */
	  if (kind == slp_inst_kind_reduc_chain)
	    {
	      /* If this is a reduction chain with a conversion in front
		 amend the SLP tree with a node for that.  */
	      gimple *scalar_def
		= vect_orig_stmt (scalar_stmts[group_size - 1])->stmt;
	      if (STMT_VINFO_DEF_TYPE (scalar_stmts[0]) != vect_reduction_def)
		{
		  /* Get at the conversion stmt - we know it's the single use
		     of the last stmt of the reduction chain.  */
		  use_operand_p use_p;
		  bool r = single_imm_use (gimple_assign_lhs (scalar_def),
					   &use_p, &scalar_def);
		  gcc_assert (r);
		  stmt_vec_info next_info = vinfo->lookup_stmt (scalar_def);
		  next_info = vect_stmt_to_vectorize (next_info);
		  scalar_stmts = vNULL;
		  scalar_stmts.create (group_size);
		  for (unsigned i = 0; i < group_size; ++i)
		    scalar_stmts.quick_push (next_info);
		  slp_tree conv = vect_create_new_slp_node (scalar_stmts, 1);
		  SLP_TREE_VECTYPE (conv) = STMT_VINFO_VECTYPE (next_info);
		  SLP_TREE_CHILDREN (conv).quick_push (node);
		  SLP_INSTANCE_TREE (new_instance) = conv;
		  /* We also have to fake this conversion stmt as SLP reduction
		     group so we don't have to mess with too much code
		     elsewhere.  */
		  REDUC_GROUP_FIRST_ELEMENT (next_info) = next_info;
		  REDUC_GROUP_NEXT_ELEMENT (next_info) = NULL;
		}
	      /* Fill the backedge child of the PHI SLP node.  The
		 general matching code cannot find it because the
		 scalar code does not reflect how we vectorize the
		 reduction.  */
	      use_operand_p use_p;
	      imm_use_iterator imm_iter;
	      class loop *loop = LOOP_VINFO_LOOP (as_a<loop_vec_info> (vinfo));
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter,
				     gimple_get_lhs (scalar_def))
		/* There are exactly two non-debug uses, the reduction
		   PHI and the loop-closed PHI node.  */
		if (!is_gimple_debug (USE_STMT (use_p))
		    && gimple_bb (USE_STMT (use_p)) == loop->header)
		  {
		    auto_vec<stmt_vec_info, 64> phis (group_size);
		    stmt_vec_info phi_info
		      = vinfo->lookup_stmt (USE_STMT (use_p));
		    for (unsigned i = 0; i < group_size; ++i)
		      phis.quick_push (phi_info);
		    slp_tree *phi_node = bst_map->get (phis);
		    unsigned dest_idx = loop_latch_edge (loop)->dest_idx;
		    SLP_TREE_CHILDREN (*phi_node)[dest_idx]
		      = SLP_INSTANCE_TREE (new_instance);
		    SLP_INSTANCE_TREE (new_instance)->refcnt++;
		  }
	    }

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
			.length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       (void *) new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
    }
  /* Failed to SLP.  */

  stmt_vec_info stmt_info = stmt_info_;
  /* Try to break the group up into pieces.  */
  if (*limit > 0 && kind == slp_inst_kind_store)
    {
      /* ???  We could delay all the actual splitting of store-groups
	 until after SLP discovery of the original group completed.
	 Then we can recurse to vect_build_slp_instance directly.  */
      for (i = 0; i < group_size; i++)
	if (!matches[i])
	  break;

      /* For basic block SLP, try to break the group up into multiples of
	 a vector size.  */
      if (is_a<bb_vec_info> (vinfo)
	  && (i > 1 && i < group_size))
	{
	  /* Free the allocated memory.  */
	  scalar_stmts.release ();

	  tree scalar_type
	    = TREE_TYPE (DR_REF (STMT_VINFO_DATA_REF (stmt_info)));
	  tree vectype = get_vectype_for_scalar_type (vinfo, scalar_type,
						      1 << floor_log2 (i));
	  unsigned HOST_WIDE_INT const_nunits;
	  if (vectype
	      && TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits))
	    {
	      /* Split into two groups at the first vector boundary.  */
	      gcc_assert ((const_nunits & (const_nunits - 1)) == 0);
	      unsigned group1_size = i & ~(const_nunits - 1);

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Splitting SLP group at stmt %u\n", i);
	      stmt_vec_info rest = vect_split_slp_store_group (stmt_info,
							       group1_size);
	      bool res = vect_analyze_slp_instance (vinfo, bst_map, stmt_info,
						    kind, max_tree_size,
						    limit, false);
	      /* Split the rest at the failure point and possibly
		 re-analyze the remaining matching part if it has
		 at least two lanes.  */
	      if (group1_size < i
		  && (i + 1 < group_size
		      || i - group1_size > 1))
		{
		  stmt_vec_info rest2 = rest;
		  rest = vect_split_slp_store_group (rest, i - group1_size);
		  if (i - group1_size > 1)
		    res |= vect_analyze_slp_instance (vinfo, bst_map, rest2,
						      kind, max_tree_size,
						      limit, false);
		}
	      /* Re-analyze the non-matching tail if it has at least
		 two lanes.  */
	      if (i + 1 < group_size)
		res |= vect_analyze_slp_instance (vinfo, bst_map,
						  rest, kind, max_tree_size,
						  limit, false);
	      return res;
	    }
	}
      /* For loop vectorization split the RHS into arbitrary pieces of
	 size >= 1.  */
      else if (is_a<loop_vec_info> (vinfo)
	       && (group_size != 1 && i < group_size))
	{
	  gcall *call = dyn_cast<gcall *> (stmt_info->stmt);
	  bool masked_p = call
	    && gimple_call_internal_p (call)
	    && internal_fn_mask_index (gimple_call_internal_fn (call)) != -1;
	  /* There are targets that cannot do even/odd interleaving schemes
	     so they absolutely need to use load/store-lanes.  For now
	     force single-lane SLP for them - they would be happy with
	     uniform power-of-two lanes (but depending on element size),
	     but even if we can use 'i' as indicator we would need to
	     backtrack when later lanes fail to discover with the same
	     granularity.  We cannot turn any of strided or scatter store
	     into store-lanes.  */
	  /* ???  If this is not in sync with what get_load_store_type
	     later decides the SLP representation is not good for other
	     store vectorization methods.  */
	  bool want_store_lanes
	    = (! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	       && ! STMT_VINFO_STRIDED_P (stmt_info)
	       && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
	       && compare_step_with_zero (vinfo, stmt_info) > 0
	       && vect_slp_prefer_store_lanes_p (vinfo, stmt_info, NULL_TREE,
						 masked_p, group_size, 1));
	  if (want_store_lanes || force_single_lane)
	    i = 1;

	  /* A fatal discovery fail doesn't always mean single-lane SLP
	     isn't a possibility, so try.  */
	  if (i == 0)
	    i = 1;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Splitting SLP group at stmt %u\n", i);

	  /* Analyze the stored values and pinch them together with
	     a permute node so we can preserve the whole store group.  */
	  auto_vec<slp_tree> rhs_nodes;
	  poly_uint64 max_nunits = 1;

	  unsigned int rhs_common_nlanes = 0;
	  unsigned int start = 0, end = i;
	  while (start < group_size)
	    {
	      gcc_assert (end - start >= 1);
	      vec<stmt_vec_info> substmts;
	      substmts.create (end - start);
	      for (unsigned j = start; j < end; ++j)
		substmts.quick_push (scalar_stmts[j]);

	      node = vect_build_slp_tree (vinfo, substmts, end - start,
					  &max_nunits,
					  matches, limit, &tree_size, bst_map);
	      if (node)
		{
		  rhs_nodes.safe_push (node);
		  vect_update_max_nunits (&max_nunits, node->max_nunits);
		  if (start == 0)
		    rhs_common_nlanes = SLP_TREE_LANES (node);
		  else if (rhs_common_nlanes != SLP_TREE_LANES (node))
		    rhs_common_nlanes = 0;
		  start = end;
		  if (want_store_lanes || force_single_lane)
		    end = start + 1;
		  else
		    end = group_size;
		}
	      else
		{
		  substmts.release ();
		  if (end - start == 1)
		    {
		      /* Single-lane discovery failed.  Free ressources.  */
		      for (auto node : rhs_nodes)
			vect_free_slp_tree (node);
		      scalar_stmts.release ();
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_NOTE, vect_location,
					 "SLP discovery failed\n");
		      return false;
		    }

		  /* ???  It really happens that we soft-fail SLP
		     build at a mismatch but the matching part hard-fails
		     later.  As we know we arrived here with a group
		     larger than one try a group of size one!  */
		  if (!matches[0])
		    end = start + 1;
		  else
		    for (unsigned j = start; j < end; j++)
		      if (!matches[j - start])
			{
			  end = j;
			  break;
			}
		}
	    }

	  /* Now re-assess whether we want store lanes in case the
	     discovery ended up producing all single-lane RHSs.  */
	  if (! want_store_lanes
	      && rhs_common_nlanes == 1
	      && ! STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	      && ! STMT_VINFO_STRIDED_P (stmt_info)
	      && ! STMT_VINFO_SLP_VECT_ONLY (stmt_info)
	      && compare_step_with_zero (vinfo, stmt_info) > 0
	      && (vect_store_lanes_supported (SLP_TREE_VECTYPE (rhs_nodes[0]),
					      group_size, masked_p)
		  != IFN_LAST))
	    want_store_lanes = true;

	  /* Now we assume we can build the root SLP node from all stores.  */
	  if (want_store_lanes)
	    {
	      /* For store-lanes feed the store node with all RHS nodes
		 in order.  */
	      node = vect_create_new_slp_node (scalar_stmts,
					       SLP_TREE_CHILDREN
						 (rhs_nodes[0]).length ());
	      SLP_TREE_VECTYPE (node) = SLP_TREE_VECTYPE (rhs_nodes[0]);
	      node->max_nunits = max_nunits;
	      node->ldst_lanes = true;
	      SLP_TREE_CHILDREN (node)
		.reserve_exact (SLP_TREE_CHILDREN (rhs_nodes[0]).length ()
				+ rhs_nodes.length () - 1);
	      /* First store value and possibly mask.  */
	      SLP_TREE_CHILDREN (node)
		.splice (SLP_TREE_CHILDREN (rhs_nodes[0]));
	      /* Rest of the store values.  All mask nodes are the same,
		 this should be guaranteed by dataref group discovery.  */
	      for (unsigned j = 1; j < rhs_nodes.length (); ++j)
		SLP_TREE_CHILDREN (node)
		  .quick_push (SLP_TREE_CHILDREN (rhs_nodes[j])[0]);
	      for (slp_tree child : SLP_TREE_CHILDREN (node))
		child->refcnt++;
	    }
	  else
	    node = vect_build_slp_store_interleaving (rhs_nodes, scalar_stmts,
						      max_nunits);

	  while (!rhs_nodes.is_empty ())
	    vect_free_slp_tree (rhs_nodes.pop ());

	  /* Create a new SLP instance.  */
	  slp_instance new_instance = XNEW (class _slp_instance);
	  SLP_INSTANCE_TREE (new_instance) = node;
	  SLP_INSTANCE_LOADS (new_instance) = vNULL;
	  SLP_INSTANCE_ROOT_STMTS (new_instance) = root_stmt_infos;
	  SLP_INSTANCE_REMAIN_DEFS (new_instance) = remain;
	  SLP_INSTANCE_KIND (new_instance) = kind;
	  new_instance->reduc_phis = NULL;
	  new_instance->cost_vec = vNULL;
	  new_instance->subgraph_entries = vNULL;

	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "SLP size %u vs. limit %u.\n",
			     tree_size, max_tree_size);

	  vinfo->slp_instances.safe_push (new_instance);

	  /* ???  We've replaced the old SLP_INSTANCE_GROUP_SIZE with
	     the number of scalar stmts in the root in a few places.
	     Verify that assumption holds.  */
	  gcc_assert (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (new_instance))
			.length () == group_size);

	  if (dump_enabled_p ())
	    {
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "Final SLP tree for instance %p:\n",
			       (void *) new_instance);
	      vect_print_slp_graph (MSG_NOTE, vect_location,
				    SLP_INSTANCE_TREE (new_instance));
	    }

	  return true;
	}
      else
	/* Free the allocated memory.  */
	scalar_stmts.release ();

      /* Even though the first vector did not all match, we might be able to SLP
	 (some) of the remainder.  FORNOW ignore this possibility.  */
    }
  else
    /* Free the allocated memory.  */
    scalar_stmts.release ();

  /* Failed to SLP.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "SLP discovery failed\n");

  return false;
}
/* Analyze an SLP instance starting from a group of grouped stores.  Call
   vect_build_slp_tree to build a tree of packed stmts if possible.
   Return FALSE if it's impossible to SLP any stmt in the loop.  */

static bool
vect_analyze_slp_instance (vec_info *vinfo,
			   scalar_stmts_to_slp_tree_map_t *bst_map,
			   stmt_vec_info stmt_info,
			   slp_instance_kind kind,
			   unsigned max_tree_size, unsigned *limit,
			   bool force_single_lane)
{
  vec<stmt_vec_info> scalar_stmts;

  if (is_a<bb_vec_info> (vinfo))
    vect_location = stmt_info->stmt;

  stmt_vec_info next_info = stmt_info;
  if (kind == slp_inst_kind_store)
    {
      /* Collect the stores and store them in scalar_stmts.  */
      scalar_stmts.create (DR_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = DR_GROUP_NEXT_ELEMENT (next_info);
	}
    }
  else if (kind == slp_inst_kind_reduc_chain)
    {
      /* Collect the reduction stmts and store them in scalar_stmts.  */
      scalar_stmts.create (REDUC_GROUP_SIZE (stmt_info));
      while (next_info)
	{
	  scalar_stmts.quick_push (vect_stmt_to_vectorize (next_info));
	  next_info = REDUC_GROUP_NEXT_ELEMENT (next_info);
	}
      /* Mark the first element of the reduction chain as reduction to properly
	 transform the node.  In the reduction analysis phase only the last
	 element of the chain is marked as reduction.  */
      STMT_VINFO_DEF_TYPE (stmt_info)
	= STMT_VINFO_DEF_TYPE (scalar_stmts.last ());
      STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))
	= STMT_VINFO_REDUC_DEF (vect_orig_stmt (scalar_stmts.last ()));
    }
  else
    gcc_unreachable ();

  vec<stmt_vec_info> roots = vNULL;
  vec<tree> remain = vNULL;
  /* Build the tree for the SLP instance.  */
  bool res = vect_build_slp_instance (vinfo, kind, scalar_stmts,
				      roots, remain,
				      max_tree_size, limit, bst_map,
				      kind == slp_inst_kind_store
				      ? stmt_info : NULL, force_single_lane);

  /* ???  If this is slp_inst_kind_store and the above succeeded here's
     where we should do store group splitting.  */

  return res;
}
/* qsort comparator ordering SLP load nodes.  */

static int
vllp_cmp (const void *a_, const void *b_)
{
  const slp_tree a = *(const slp_tree *)a_;
  const slp_tree b = *(const slp_tree *)b_;
  stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (a)[0];
  stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (b)[0];
  if (STMT_VINFO_GROUPED_ACCESS (a0)
      && STMT_VINFO_GROUPED_ACCESS (b0)
      && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
    {
      /* Same group, order after lanes used.  */
      if (SLP_TREE_LANES (a) < SLP_TREE_LANES (b))
	return 1;
      else if (SLP_TREE_LANES (a) > SLP_TREE_LANES (b))
	return -1;

      /* Try to order loads using the same lanes together, breaking
	 the tie with the lane number that first differs.  */
      if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
	  && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
	return 0;
      else if (SLP_TREE_LOAD_PERMUTATION (a).exists ()
	       && !SLP_TREE_LOAD_PERMUTATION (b).exists ())
	return 1;
      else if (!SLP_TREE_LOAD_PERMUTATION (a).exists ()
	       && SLP_TREE_LOAD_PERMUTATION (b).exists ())
	return -1;

      for (unsigned i = 0; i < SLP_TREE_LANES (a); ++i)
	if (SLP_TREE_LOAD_PERMUTATION (a)[i]
	    != SLP_TREE_LOAD_PERMUTATION (b)[i])
	  {
	    /* In-order lane first, that's what the above case for
	       no permutation does.  */
	    if (SLP_TREE_LOAD_PERMUTATION (a)[i] == i)
	      return -1;
	    else if (SLP_TREE_LOAD_PERMUTATION (b)[i] == i)
	      return 1;
	    else if (SLP_TREE_LOAD_PERMUTATION (a)[i]
		     < SLP_TREE_LOAD_PERMUTATION (b)[i])
	      return -1;
	    else
	      return 1;
	  }
      return 0;
    }
  else /* Different groups or non-groups.  */
    {
      /* Order groups as their first element to keep them together.  */
      if (STMT_VINFO_GROUPED_ACCESS (a0))
	a0 = DR_GROUP_FIRST_ELEMENT (a0);
      if (STMT_VINFO_GROUPED_ACCESS (b0))
	b0 = DR_GROUP_FIRST_ELEMENT (b0);
      if (a0 == b0)
	return 0;
      /* Tie using UID.  */
      else if (gimple_uid (STMT_VINFO_STMT (a0))
	       < gimple_uid (STMT_VINFO_STMT (b0)))
	return -1;
      else
	{
	  gcc_assert (gimple_uid (STMT_VINFO_STMT (a0))
		      != gimple_uid (STMT_VINFO_STMT (b0)));
	  return 1;
	}
    }
}
/* Process the set of LOADS that are all from the same dataref group.  */

static void
vect_lower_load_permutations (loop_vec_info loop_vinfo,
			      scalar_stmts_to_slp_tree_map_t *bst_map,
			      const array_slice<slp_tree> &loads,
			      bool force_single_lane)
{
  /* We at this point want to lower without a fixed VF or vector
     size in mind which means we cannot actually compute whether we
     need three or more vectors for a load permutation yet.  So always
     lower.  */
  stmt_vec_info first
    = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (loads[0])[0]);
  unsigned group_lanes = DR_GROUP_SIZE (first);

  /* Verify if all load permutations can be implemented with a suitably
     large element load-lanes operation.  */
  unsigned ld_lanes_lanes = SLP_TREE_LANES (loads[0]);
  if (STMT_VINFO_STRIDED_P (first)
      || compare_step_with_zero (loop_vinfo, first) <= 0
      || exact_log2 (ld_lanes_lanes) == -1
      /* ???  For now only support the single-lane case as there is
	 missing support on the store-lane side and code generation
	 isn't up to the task yet.  */
      || ld_lanes_lanes != 1
      || vect_load_lanes_supported (SLP_TREE_VECTYPE (loads[0]),
				    group_lanes / ld_lanes_lanes,
				    false) == IFN_LAST)
    ld_lanes_lanes = 0;
  else
    /* Verify the loads access the same number of lanes aligned to
       ld_lanes_lanes.  */
    for (slp_tree load : loads)
      {
	if (SLP_TREE_LANES (load) != ld_lanes_lanes)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	unsigned first = SLP_TREE_LOAD_PERMUTATION (load)[0];
	if (first % ld_lanes_lanes != 0)
	  {
	    ld_lanes_lanes = 0;
	    break;
	  }
	for (unsigned i = 1; i < SLP_TREE_LANES (load); ++i)
	  if (SLP_TREE_LOAD_PERMUTATION (load)[i] != first + i)
	    {
	      ld_lanes_lanes = 0;
	      break;
	    }
      }

  /* Only a power-of-two number of lanes matches interleaving with N levels.
     ???  An even number of lanes could be reduced to 1<<ceil_log2(N)-1 lanes
     at each step.  */
  if (ld_lanes_lanes == 0 && exact_log2 (group_lanes) == -1 && group_lanes != 3)
    return;

  for (slp_tree load : loads)
    {
      /* Leave masked or gather loads alone for now.  */
      if (!SLP_TREE_CHILDREN (load).is_empty ())
	continue;

      /* We want to pattern-match special cases here and keep those
	 alone.  Candidates are splats and load-lane.  */

      /* We need to lower only loads of less than half of the groups
	 lanes, including duplicate lanes.  Note this leaves nodes
	 with a non-1:1 load permutation around instead of canonicalizing
	 those into a load and a permute node.  Removing this early
	 check would do such canonicalization.  */
      if (SLP_TREE_LANES (load) >= (group_lanes + 1) / 2
	  && ld_lanes_lanes == 0)
	continue;

      /* Build the permute to get the original load permutation order.  */
      bool contiguous = true;
      lane_permutation_t final_perm;
      final_perm.create (SLP_TREE_LANES (load));
      for (unsigned i = 0; i < SLP_TREE_LANES (load); ++i)
	{
	  final_perm.quick_push
	    (std::make_pair (0, SLP_TREE_LOAD_PERMUTATION (load)[i]));
	  if (i != 0
	      && (SLP_TREE_LOAD_PERMUTATION (load)[i]
		  != SLP_TREE_LOAD_PERMUTATION (load)[i-1] + 1))
	    contiguous = false;
	}

      /* When the load permutation accesses a contiguous unpermuted,
	 power-of-two aligned and sized chunk leave the load alone.
	 We can likely (re-)load it more efficiently rather than
	 extracting it from the larger load.
	 ???  Long-term some of the lowering should move to where
	 the vector types involved are fixed.  */
      if (!force_single_lane
	  && ld_lanes_lanes == 0
	  && contiguous
	  && (SLP_TREE_LANES (load) > 1 || loads.size () == 1)
	  && pow2p_hwi (SLP_TREE_LANES (load))
	  && pow2p_hwi (group_lanes)
	  && SLP_TREE_LOAD_PERMUTATION (load)[0] % SLP_TREE_LANES (load) == 0
	  && group_lanes % SLP_TREE_LANES (load) == 0)
	{
	  final_perm.release ();
	  continue;
	}

      /* First build (and possibly re-use) a load node for the
	 unpermuted group.  Gaps in the middle and on the end are
	 represented with NULL stmts.  */
      vec<stmt_vec_info> stmts;
      stmts.create (group_lanes);
      for (stmt_vec_info s = first; s; s = DR_GROUP_NEXT_ELEMENT (s))
	{
	  if (s != first)
	    for (unsigned i = 1; i < DR_GROUP_GAP (s); ++i)
	      stmts.quick_push (NULL);
	  stmts.quick_push (s);
	}
      for (unsigned i = 0; i < DR_GROUP_GAP (first); ++i)
	stmts.quick_push (NULL);
      poly_uint64 max_nunits = 1;
      bool *matches = XALLOCAVEC (bool, group_lanes);
      unsigned limit = 1;
      unsigned tree_size = 0;
      slp_tree l0 = vect_build_slp_tree (loop_vinfo, stmts,
					 group_lanes,
					 &max_nunits, matches, &limit,
					 &tree_size, bst_map);
      gcc_assert (!SLP_TREE_LOAD_PERMUTATION (l0).exists ());

      if (ld_lanes_lanes != 0)
	{
	  /* ???  If this is not in sync with what get_load_store_type
	     later decides the SLP representation is not good for other
	     store vectorization methods.  */
	  l0->ldst_lanes = true;
	  load->ldst_lanes = true;
	}

      while (1)
	{
	  unsigned group_lanes = SLP_TREE_LANES (l0);
	  if (ld_lanes_lanes != 0
	      || SLP_TREE_LANES (load) >= (group_lanes + 1) / 2)
	    break;

	  /* Try to lower by reducing the group to half its size using an
	     interleaving scheme.  For this try to compute whether all
	     elements needed for this load are in even or odd elements of
	     an even/odd decomposition with N consecutive elements.
	     Thus { e, e, o, o, e, e, o, o } would be an even/odd decomposition
	     with N == 2.  */
	  /* ???  Only an even number of lanes can be handed this way, but the
	     fallback below could work for any number.  We have to make sure
	     to round up in that case.  */
	  gcc_assert ((group_lanes & 1) == 0 || group_lanes == 3);
	  unsigned even = 0, odd = 0;
	  if ((group_lanes & 1) == 0)
	    {
	      even = (1 << ceil_log2 (group_lanes)) - 1;
	      odd = even;
	      for (auto l : final_perm)
		{
		  even &= ~l.second;
		  odd &= l.second;
		}
	    }

	  /* Now build an even or odd extraction from the unpermuted load.  */
	  lane_permutation_t perm;
	  perm.create ((group_lanes + 1) / 2);
	  unsigned even_level = even ? 1 << ctz_hwi (even) : 0;
	  unsigned odd_level = odd ? 1 << ctz_hwi (odd) : 0;
	  if (even_level
	      && group_lanes % (2 * even_level) == 0
	      /* ???  When code generating permutes we do not try to pun
		 to larger component modes so level != 1 isn't a natural
		 even/odd extract.  Prefer one if possible.  */
	      && (even_level == 1 || !odd_level || odd_level != 1))
	    {
	      /* { 0, 1, ... 4, 5 ..., } */
	      for (unsigned i = 0; i < group_lanes / 2 / even_level; ++i)
		for (unsigned j = 0; j < even_level; ++j)
		  perm.quick_push (std::make_pair (0, 2 * i * even_level + j));
	    }
	  else if (odd_level)
	    {
	      /* { ..., 2, 3, ... 6, 7 } */
	      gcc_assert (group_lanes % (2 * odd_level) == 0);
	      for (unsigned i = 0; i < group_lanes / 2 / odd_level; ++i)
		for (unsigned j = 0; j < odd_level; ++j)
		  perm.quick_push
		    (std::make_pair (0, (2 * i + 1) * odd_level + j));
	    }
	  else
	    {
	      /* As fallback extract all used lanes and fill to half the
		 group size by repeating the last element.
		 ???  This is quite a bad strategy for re-use - we could
		 brute force our way to find more optimal filling lanes to
		 maximize re-use when looking at all loads from the group.  */
	      auto_bitmap l;
	      for (auto p : final_perm)
		bitmap_set_bit (l, p.second);
	      unsigned i = 0;
	      bitmap_iterator bi;
	      EXECUTE_IF_SET_IN_BITMAP (l, 0, i, bi)
		perm.quick_push (std::make_pair (0, i));
	      while (perm.length () < (group_lanes + 1) / 2)
		perm.quick_push (perm.last ());
	    }

	  /* Update final_perm with the intermediate permute.  */
	  for (unsigned i = 0; i < final_perm.length (); ++i)
	    {
	      unsigned l = final_perm[i].second;
	      unsigned j;
	      for (j = 0; j < perm.length (); ++j)
		if (perm[j].second == l)
		  {
		    final_perm[i].second = j;
		    break;
		  }
	      gcc_assert (j < perm.length ());
	    }

	  /* And create scalar stmts.  */
	  vec<stmt_vec_info> perm_stmts;
	  perm_stmts.create (perm.length ());
	  for (unsigned i = 0; i < perm.length (); ++i)
	    perm_stmts.quick_push (SLP_TREE_SCALAR_STMTS (l0)[perm[i].second]);

	  slp_tree p = vect_create_new_slp_node (1, VEC_PERM_EXPR);
	  SLP_TREE_CHILDREN (p).quick_push (l0);
	  SLP_TREE_LANE_PERMUTATION (p) = perm;
	  SLP_TREE_VECTYPE (p) = SLP_TREE_VECTYPE (load);
	  SLP_TREE_LANES (p) = perm.length ();
	  SLP_TREE_REPRESENTATIVE (p) = SLP_TREE_REPRESENTATIVE (load);
	  /* ???  As we have scalar stmts for this intermediate permute we
	     could CSE it via bst_map but we do not want to pick up
	     another SLP node with a load permutation.  We instead should
	     have a "local" CSE map here.  */
	  SLP_TREE_SCALAR_STMTS (p) = perm_stmts;

	  /* We now have a node for (group_lanes + 1) / 2 lanes.  */
	  l0 = p;
	}

      /* And finally from the ordered reduction node create the
	 permute to shuffle the lanes into the original load-permutation
	 order.  We replace the original load node with this.  */
      SLP_TREE_CODE (load) = VEC_PERM_EXPR;
      SLP_TREE_LOAD_PERMUTATION (load).release ();
      SLP_TREE_LANE_PERMUTATION (load) = final_perm;
      SLP_TREE_CHILDREN (load).create (1);
      SLP_TREE_CHILDREN (load).quick_push (l0);
    }
}
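
/* Made-up examples of the lowering above (not taken from the sources): for a
   load group of 8 lanes, a load using lanes { 1, 3 } first gets the odd
   extraction { 1, 3, 5, 7 } from the unpermuted load, after which the wanted
   lanes are { 0, 1 } of the 4-lane intermediate node and the final lane
   permutation is the identity.  A load using lanes { 2, 0 } instead goes
   through the even extraction { 0, 2, 4, 6 } and ends with the final
   permutation { 1, 0 }.  */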
/* Transform SLP loads in the SLP graph created by SLP discovery to
   group loads from the same group and lower load permutations that
   are unlikely to be supported into a series of permutes.
   In the degenerate case of having only single-lane SLP instances
   this should result in a series of permute nodes emulating an
   interleaving scheme.  */

static void
vect_lower_load_permutations (loop_vec_info loop_vinfo,
			      scalar_stmts_to_slp_tree_map_t *bst_map,
			      bool force_single_lane)
{
  /* Gather and sort loads across all instances.  */
  hash_set<slp_tree> visited;
  auto_vec<slp_tree> loads;
  for (auto inst : loop_vinfo->slp_instances)
    vect_gather_slp_loads (loads, SLP_INSTANCE_TREE (inst), visited);

  if (loads.is_empty ())
    return;

  loads.qsort (vllp_cmp);

  /* Now process each dataref group separately.  */
  unsigned firsti = 0;
  for (unsigned i = 1; i < loads.length (); ++i)
    {
      slp_tree first = loads[firsti];
      slp_tree next = loads[i];
      stmt_vec_info a0 = SLP_TREE_SCALAR_STMTS (first)[0];
      stmt_vec_info b0 = SLP_TREE_SCALAR_STMTS (next)[0];
      if (STMT_VINFO_GROUPED_ACCESS (a0)
	  && STMT_VINFO_GROUPED_ACCESS (b0)
	  && DR_GROUP_FIRST_ELEMENT (a0) == DR_GROUP_FIRST_ELEMENT (b0))
	continue;

      /* Now we have one or multiple SLP loads of the same group from
	 firsti to i - 1.  */
      if (STMT_VINFO_GROUPED_ACCESS (a0))
	vect_lower_load_permutations (loop_vinfo, bst_map,
				      make_array_slice (&loads[firsti],
							i - firsti),
				      force_single_lane);
      firsti = i;
    }
  if (firsti < loads.length ()
      && STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (loads[firsti])[0]))
    vect_lower_load_permutations (loop_vinfo, bst_map,
				  make_array_slice (&loads[firsti],
						    loads.length () - firsti),
				  force_single_lane);
}
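
/* Purely as an illustration: after sorting, the gathered loads might look
   like { A:{0,1}, A:{2,3}, B:{1,0}, C } where A and B are dataref groups and
   C is a non-grouped access; the loop above then lowers the slice [0, 2) for
   group A and [2, 3) for group B and leaves C alone.  */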
/* Check if there are stmts in the loop that can be vectorized using SLP.
   Build SLP trees of packed scalar stmts if SLP is possible.  */

opt_result
vect_analyze_slp (vec_info *vinfo, unsigned max_tree_size,
		  bool force_single_lane)
{
  loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
  unsigned int i;
  stmt_vec_info first_element;
  slp_instance instance;

  DUMP_VECT_SCOPE ("vect_analyze_slp");

  unsigned limit = max_tree_size;

  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  /* Find SLP sequences starting from groups of grouped stores.  */
  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    vect_analyze_slp_instance (vinfo, bst_map, first_element,
			       slp_inst_kind_store, max_tree_size, &limit,
			       force_single_lane);

  /* For loops also start SLP discovery from non-grouped stores.  */
  if (loop_vinfo)
    {
      data_reference_p dr;
      FOR_EACH_VEC_ELT (vinfo->shared->datarefs, i, dr)
	if (DR_IS_WRITE (dr))
	  {
	    stmt_vec_info stmt_info = vinfo->lookup_dr (dr)->stmt;
	    /* Grouped stores are already handled above.  */
	    if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	      continue;
	    vec<stmt_vec_info> stmts;
	    vec<stmt_vec_info> roots = vNULL;
	    vec<tree> remain = vNULL;
	    stmts.create (1);
	    stmts.quick_push (stmt_info);
	    vect_build_slp_instance (vinfo, slp_inst_kind_store,
				     stmts, roots, remain, max_tree_size,
				     &limit, bst_map, NULL, force_single_lane);
	  }
    }
  if (bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (vinfo))
    {
      for (unsigned i = 0; i < bb_vinfo->roots.length (); ++i)
	{
	  vect_location = bb_vinfo->roots[i].roots[0]->stmt;
	  /* Apply patterns.  */
	  for (unsigned j = 0; j < bb_vinfo->roots[i].stmts.length (); ++j)
	    bb_vinfo->roots[i].stmts[j]
	      = vect_stmt_to_vectorize (bb_vinfo->roots[i].stmts[j]);
	  if (vect_build_slp_instance (bb_vinfo, bb_vinfo->roots[i].kind,
				       bb_vinfo->roots[i].stmts,
				       bb_vinfo->roots[i].roots,
				       bb_vinfo->roots[i].remain,
				       max_tree_size, &limit, bst_map, NULL,
				       false))
	    {
	      bb_vinfo->roots[i].stmts = vNULL;
	      bb_vinfo->roots[i].roots = vNULL;
	      bb_vinfo->roots[i].remain = vNULL;
	    }
	}
    }
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      /* Find SLP sequences starting from reduction chains.  */
      FOR_EACH_VEC_ELT (loop_vinfo->reduction_chains, i, first_element)
	if (! STMT_VINFO_RELEVANT_P (first_element)
	    && ! STMT_VINFO_LIVE_P (first_element))
	  ;
	else if (force_single_lane
		 || ! vect_analyze_slp_instance (vinfo, bst_map, first_element,
						 slp_inst_kind_reduc_chain,
						 max_tree_size, &limit,
						 force_single_lane))
	  {
	    /* Dissolve reduction chain group.  */
	    stmt_vec_info vinfo = first_element;
	    stmt_vec_info last = NULL;
	    while (vinfo)
	      {
		stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
		REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
		REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		last = vinfo;
		vinfo = next;
	      }
	    STMT_VINFO_DEF_TYPE (first_element) = vect_internal_def;
	    /* It can be still vectorized as part of an SLP reduction.  */
	    loop_vinfo->reductions.safe_push (last);
	  }
      /* Find SLP sequences starting from groups of reductions.  */
      if (loop_vinfo->reductions.length () > 0)
	{
	  /* Collect reduction statements we can combine into
	     an SLP reduction.  */
	  vec<stmt_vec_info> scalar_stmts;
	  scalar_stmts.create (loop_vinfo->reductions.length ());
	  for (auto next_info : loop_vinfo->reductions)
	    {
	      next_info = vect_stmt_to_vectorize (next_info);
	      if ((STMT_VINFO_RELEVANT_P (next_info)
		   || STMT_VINFO_LIVE_P (next_info))
		  /* ???  Make sure we didn't skip a conversion around a
		     reduction path.  In that case we'd have to reverse
		     engineer that conversion stmt following the chain using
		     reduc_idx and from the PHI using reduc_def.  */
		  && (STMT_VINFO_DEF_TYPE (next_info) == vect_reduction_def
		      || (STMT_VINFO_DEF_TYPE (next_info)
			  == vect_double_reduction_def)))
		{
		  /* Do not discover SLP reductions combining lane-reducing
		     ops, that will fail later.  */
		  if (!force_single_lane
		      && !lane_reducing_stmt_p (STMT_VINFO_STMT (next_info)))
		    scalar_stmts.quick_push (next_info);
		  else
		    {
		      /* Do SLP discovery for single-lane reductions.  */
		      vec<stmt_vec_info> stmts;
		      vec<stmt_vec_info> roots = vNULL;
		      vec<tree> remain = vNULL;
		      stmts.create (1);
		      stmts.quick_push (next_info);
		      vect_build_slp_instance (vinfo,
					       slp_inst_kind_reduc_group,
					       stmts, roots, remain,
					       max_tree_size, &limit,
					       bst_map, NULL,
					       force_single_lane);
		    }
		}
	    }

	  /* Save for re-processing on failure.  */
	  vec<stmt_vec_info> saved_stmts = scalar_stmts.copy ();
	  vec<stmt_vec_info> roots = vNULL;
	  vec<tree> remain = vNULL;
	  if (scalar_stmts.length () <= 1
	      || !vect_build_slp_instance (loop_vinfo,
					   slp_inst_kind_reduc_group,
					   scalar_stmts, roots, remain,
					   max_tree_size, &limit, bst_map,
					   NULL, force_single_lane))
	    {
	      if (scalar_stmts.length () <= 1)
		scalar_stmts.release ();
	      /* Do SLP discovery for single-lane reductions.  */
	      for (auto stmt_info : saved_stmts)
		{
		  vec<stmt_vec_info> stmts;
		  vec<stmt_vec_info> roots = vNULL;
		  vec<tree> remain = vNULL;
		  stmts.create (1);
		  stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
		  vect_build_slp_instance (vinfo,
					   slp_inst_kind_reduc_group,
					   stmts, roots, remain,
					   max_tree_size, &limit,
					   bst_map, NULL, force_single_lane);
		}
	    }
	  saved_stmts.release ();
	}
      /* Make sure to vectorize only-live stmts, usually inductions.  */
      for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
	for (auto gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi);
	     gsi_next (&gsi))
	  {
	    gphi *lc_phi = *gsi;
	    tree def = gimple_phi_arg_def_from_edge (lc_phi, e);
	    stmt_vec_info stmt_info;
	    if (TREE_CODE (def) == SSA_NAME
		&& !virtual_operand_p (def)
		&& (stmt_info = loop_vinfo->lookup_def (def))
		&& ((stmt_info = vect_stmt_to_vectorize (stmt_info)), true)
		&& STMT_VINFO_RELEVANT (stmt_info) == vect_used_only_live
		&& STMT_VINFO_LIVE_P (stmt_info)
		&& (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		    || (STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
			&& STMT_VINFO_REDUC_IDX (stmt_info) == -1)))
	      {
		vec<stmt_vec_info> stmts;
		vec<stmt_vec_info> roots = vNULL;
		vec<tree> remain = vNULL;
		stmts.create (1);
		stmts.quick_push (vect_stmt_to_vectorize (stmt_info));
		vect_build_slp_instance (vinfo,
					 slp_inst_kind_reduc_group,
					 stmts, roots, remain,
					 max_tree_size, &limit,
					 bst_map, NULL, force_single_lane);
	      }
	  }
      /* Find SLP sequences starting from gconds.  */
      for (auto cond : LOOP_VINFO_LOOP_CONDS (loop_vinfo))
	{
	  auto cond_info = loop_vinfo->lookup_stmt (cond);

	  cond_info = vect_stmt_to_vectorize (cond_info);
	  vec<stmt_vec_info> roots = vNULL;
	  roots.safe_push (cond_info);
	  gimple *stmt = STMT_VINFO_STMT (cond_info);
	  tree args0 = gimple_cond_lhs (stmt);
	  tree args1 = gimple_cond_rhs (stmt);

	  /* These should be enforced by cond lowering.  */
	  gcc_assert (gimple_cond_code (stmt) == NE_EXPR);
	  gcc_assert (zerop (args1));

	  /* An argument without a loop def will be codegened from vectorizing
	     the root gcond itself.  As such we don't need to try to build an
	     SLP tree from them.  It's highly likely that the resulting SLP
	     tree here if both arguments have a def will be incompatible, but
	     we rely on it being split later on.  */
	  auto varg = loop_vinfo->lookup_def (args0);
	  vec<stmt_vec_info> stmts;
	  vec<tree> remain = vNULL;
	  stmts.create (1);
	  stmts.quick_push (vect_stmt_to_vectorize (varg));

	  if (! vect_build_slp_instance (vinfo, slp_inst_kind_gcond,
					 stmts, roots, remain,
					 max_tree_size, &limit,
					 bst_map, NULL, force_single_lane))
	    roots.release ();
	}
      /* Find and create slp instances for inductions that have been forced
	 live due to early break.  */
      edge latch_e = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
      for (auto stmt_info : LOOP_VINFO_EARLY_BREAKS_LIVE_IVS (loop_vinfo))
	{
	  vec<stmt_vec_info> stmts;
	  vec<stmt_vec_info> roots = vNULL;
	  vec<tree> remain = vNULL;
	  gphi *lc_phi = as_a<gphi *> (STMT_VINFO_STMT (stmt_info));
	  tree def = gimple_phi_arg_def_from_edge (lc_phi, latch_e);
	  stmt_vec_info lc_info = loop_vinfo->lookup_def (def);
	  stmts.create (1);
	  stmts.quick_push (vect_stmt_to_vectorize (lc_info));
	  vect_build_slp_instance (vinfo, slp_inst_kind_reduc_group,
				   stmts, roots, remain,
				   max_tree_size, &limit,
				   bst_map, NULL, force_single_lane);
	}
    }
  hash_set<slp_tree> visited_patterns;
  slp_tree_to_load_perm_map_t perm_cache;
  slp_compat_nodes_map_t compat_cache;

  /* See if any patterns can be found in the SLP tree.  */
  bool pattern_found = false;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
    pattern_found |= vect_match_slp_patterns (instance, vinfo,
					      &visited_patterns, &perm_cache,
					      &compat_cache);

  /* If any were found optimize permutations of loads.  */
  if (pattern_found)
    {
      hash_map<slp_tree, slp_tree> load_map;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	{
	  slp_tree root = SLP_INSTANCE_TREE (instance);
	  optimize_load_redistribution (bst_map, vinfo, SLP_TREE_LANES (root),
					&load_map, root);
	}
    }
  /* Check whether we should force some SLP instances to use load/store-lanes
     and do so by forcing SLP re-discovery with single lanes.  We used
     to cancel SLP when this applied to all instances in a loop but now
     we decide this per SLP instance.  It's important to do this only
     after SLP pattern recognition.  */
  if (is_a<loop_vec_info> (vinfo))
    FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
	  && !SLP_INSTANCE_TREE (instance)->ldst_lanes)
	{
	  slp_tree slp_root = SLP_INSTANCE_TREE (instance);
	  int group_size = SLP_TREE_LANES (slp_root);
	  tree vectype = SLP_TREE_VECTYPE (slp_root);

	  stmt_vec_info rep_info = SLP_TREE_REPRESENTATIVE (slp_root);
	  gimple *rep = STMT_VINFO_STMT (rep_info);
	  bool masked = (is_gimple_call (rep)
			 && gimple_call_internal_p (rep)
			 && internal_fn_mask_index
			      (gimple_call_internal_fn (rep)) != -1);
	  if (!STMT_VINFO_GROUPED_ACCESS (rep_info)
	      || slp_root->ldst_lanes
	      || (vect_store_lanes_supported (vectype, group_size, masked)
		  == IFN_LAST))
	    continue;

	  auto_vec<slp_tree> loads;
	  hash_set<slp_tree> visited;
	  vect_gather_slp_loads (loads, slp_root, visited);

	  /* Check whether any load in the SLP instance is possibly
	     permuted.  */
	  bool loads_permuted = false;
	  slp_tree load_node;
	  unsigned j;
	  FOR_EACH_VEC_ELT (loads, j, load_node)
	    {
	      if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
		continue;
	      unsigned k;
	      stmt_vec_info load_info;
	      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), k, load_info)
		if (SLP_TREE_LOAD_PERMUTATION (load_node)[k] != k)
		  {
		    loads_permuted = true;
		    break;
		  }
	    }

	  /* If the loads and stores can use load/store-lanes force
	     re-discovery with single lanes.  */
	  if (loads_permuted)
	    {
	      bool can_use_lanes = true;
	      FOR_EACH_VEC_ELT (loads, j, load_node)
		if (STMT_VINFO_GROUPED_ACCESS
		      (SLP_TREE_REPRESENTATIVE (load_node)))
		  {
		    stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
		      (SLP_TREE_REPRESENTATIVE (load_node));
		    rep = STMT_VINFO_STMT (stmt_vinfo);
		    masked = (is_gimple_call (rep)
			      && gimple_call_internal_p (rep)
			      && internal_fn_mask_index
				   (gimple_call_internal_fn (rep)));
		    /* Use SLP for strided accesses (or if we can't
		       use load/store-lanes).  */
		    if (STMT_VINFO_STRIDED_P (stmt_vinfo)
			|| compare_step_with_zero (vinfo, stmt_vinfo) <= 0
			|| vect_load_lanes_supported
			     (STMT_VINFO_VECTYPE (stmt_vinfo),
			      DR_GROUP_SIZE (stmt_vinfo), masked) == IFN_LAST
			/* ???  During SLP re-discovery with a single lane
			   a masked grouped load will appear permuted and
			   discovery will fail.  We have to rework this
			   on the discovery side - for now avoid ICEing.  */
			|| masked)
		      {
			can_use_lanes = false;
			break;
		      }
		  }

	      if (can_use_lanes)
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "SLP instance %p can use load/store-lanes,"
				     " re-discovering with single-lanes\n",
				     (void *) instance);

		  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (slp_root);

		  vect_free_slp_instance (instance);
		  limit = max_tree_size;
		  bool res = vect_analyze_slp_instance (vinfo, bst_map,
							stmt_info,
							slp_inst_kind_store,
							max_tree_size, &limit,
							true);
		  gcc_assert (res);
		  auto new_inst = LOOP_VINFO_SLP_INSTANCES (vinfo).pop ();
		  LOOP_VINFO_SLP_INSTANCES (vinfo)[i] = new_inst;
		}
	    }
	}
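
  /* As an illustrative scenario for the re-discovery above (not a statement
     about any particular target): on a target providing st2/ld2-style
     instructions, a two-lane store group whose loads appear permuted is
     freed and re-analyzed with single-lane SLP forced, so that the resulting
     instance can be code generated with load/store-lanes.  */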
  /* When we end up with load permutations that we cannot possibly handle,
     like those requiring three vector inputs, lower them using interleaving
     like schemes.  */
  if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
    {
      vect_lower_load_permutations (loop_vinfo, bst_map, force_single_lane);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "SLP graph after lowering permutations:\n");
	  hash_set<slp_tree> visited;
	  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	    vect_print_slp_graph (MSG_NOTE, vect_location,
				  SLP_INSTANCE_TREE (instance), visited);
	}
    }

  release_scalar_stmts_to_slp_tree_map (bst_map);

  if (pattern_found && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
		       "Pattern matched SLP tree\n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (vinfo), i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  return opt_result::success ();
}
/* Estimates the cost of inserting layout changes into the SLP graph.
   It can also say that the insertion is impossible.  */

struct slpg_layout_cost
{
  slpg_layout_cost () = default;
  slpg_layout_cost (sreal, bool);

  static slpg_layout_cost impossible () { return { sreal::max (), 0 }; }
  bool is_possible () const { return depth != sreal::max (); }

  bool operator== (const slpg_layout_cost &) const;
  bool operator!= (const slpg_layout_cost &) const;

  bool is_better_than (const slpg_layout_cost &, bool) const;

  void add_parallel_cost (const slpg_layout_cost &);
  void add_serial_cost (const slpg_layout_cost &);
  void split (unsigned int);

  /* The longest sequence of layout changes needed during any traversal
     of the partition dag, weighted by execution frequency.

     This is the most important metric when optimizing for speed, since
     it helps to ensure that we keep the number of operations on
     critical paths to a minimum.  */
  sreal depth = 0;

  /* An estimate of the total number of operations needed.  It is weighted by
     execution frequency when optimizing for speed but not when optimizing for
     size.  In order to avoid double-counting, a node with a fanout of N will
     distribute 1/N of its total cost to each successor.

     This is the most important metric when optimizing for size, since
     it helps to keep the total number of operations to a minimum.  */
  sreal total = 0;
};

/* Construct costs for a node with weight WEIGHT.  A higher weight
   indicates more frequent execution.  IS_FOR_SIZE is true if we are
   optimizing for size rather than speed.  */

slpg_layout_cost::slpg_layout_cost (sreal weight, bool is_for_size)
  : depth (weight), total (is_for_size && weight > 0 ? 1 : weight)
{
}

/* Return true if this cost is equal to OTHER.  */

bool
slpg_layout_cost::operator== (const slpg_layout_cost &other) const
{
  return depth == other.depth && total == other.total;
}

bool
slpg_layout_cost::operator!= (const slpg_layout_cost &other) const
{
  return !operator== (other);
}

/* Return true if these costs are better than OTHER.  IS_FOR_SIZE is
   true if we are optimizing for size rather than speed.  */

bool
slpg_layout_cost::is_better_than (const slpg_layout_cost &other,
				  bool is_for_size) const
{
  if (is_for_size)
    {
      if (total != other.total)
	return total < other.total;
      return depth < other.depth;
    }
  else
    {
      if (depth != other.depth)
	return depth < other.depth;
      return total < other.total;
    }
}

/* Increase the costs to account for something with cost INPUT_COST
   happening in parallel with the current costs.  */

void
slpg_layout_cost::add_parallel_cost (const slpg_layout_cost &input_cost)
{
  depth = std::max (depth, input_cost.depth);
  total += input_cost.total;
}

/* Increase the costs to account for something with cost OTHER
   happening in series with the current costs.  */

void
slpg_layout_cost::add_serial_cost (const slpg_layout_cost &other)
{
  depth += other.depth;
  total += other.total;
}

/* Split the total cost among TIMES successors or predecessors.  */

void
slpg_layout_cost::split (unsigned int times)
{
  total /= times;
}
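
/* A small worked example (illustrative only): combining two independent
   inputs with costs {depth 2, total 2} and {depth 1, total 1} via
   add_parallel_cost yields {depth 2, total 3}; adding a serial layout change
   of {depth 1, total 1} via add_serial_cost then gives {depth 3, total 4}.
   is_better_than compares 'total' first when optimizing for size and 'depth'
   first when optimizing for speed.  */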
/* Information about one node in the SLP graph, for use during
   vect_optimize_slp_pass.  */

struct slpg_vertex
{
  slpg_vertex (slp_tree node_) : node (node_) {}

  /* The node itself.  */
  slp_tree node;

  /* Which partition the node belongs to, or -1 if none.  Nodes outside of
     partitions are flexible; they can have whichever layout consumers
     want them to have.  */
  int partition = -1;

  /* The number of nodes that directly use the result of this one
     (i.e. the number of nodes that count this one as a child).  */
  unsigned int out_degree = 0;

  /* The execution frequency of the node.  */
  sreal weight = 0;

  /* The total execution frequency of all nodes that directly use the
     result of this one.  */
  sreal out_weight = 0;
};

/* Information about one partition of the SLP graph, for use during
   vect_optimize_slp_pass.  */

struct slpg_partition_info
{
  /* The nodes in the partition occupy indices [NODE_BEGIN, NODE_END)
     of m_partitioned_nodes.  */
  unsigned int node_begin = 0;
  unsigned int node_end = 0;

  /* Which layout we've chosen to use for this partition, or -1 if
     we haven't picked one yet.  */
  int layout = -1;

  /* The number of predecessors and successors in the partition dag.
     The predecessors always have lower partition numbers and the
     successors always have higher partition numbers.

     Note that the directions of these edges are not necessarily the
     same as in the data flow graph.  For example, if an SCC has separate
     partitions for an inner loop and an outer loop, the inner loop's
     partition will have at least two incoming edges from the outer loop's
     partition: one for a live-in value and one for a live-out value.
     In data flow terms, one of these edges would also be from the outer loop
     to the inner loop, but the other would be in the opposite direction.  */
  unsigned int in_degree = 0;
  unsigned int out_degree = 0;
};

/* Information about the costs of using a particular layout for a
   particular partition.  It can also say that the combination is
   impossible.  */

struct slpg_partition_layout_costs
{
  bool is_possible () const { return internal_cost.is_possible (); }
  void mark_impossible () { internal_cost = slpg_layout_cost::impossible (); }

  /* The costs inherited from predecessor partitions.  */
  slpg_layout_cost in_cost;

  /* The inherent cost of the layout within the node itself.  For example,
     this is nonzero for a load if choosing a particular layout would require
     the load to permute the loaded elements.  It is nonzero for a
     VEC_PERM_EXPR if the permutation cannot be eliminated or converted
     to full-vector moves.  */
  slpg_layout_cost internal_cost;

  /* The costs inherited from successor partitions.  */
  slpg_layout_cost out_cost;
};
/* This class tries to optimize the layout of vectors in order to avoid
   unnecessary shuffling.  At the moment, the set of possible layouts are
   restricted to bijective permutations.

   The goal of the pass depends on whether we're optimizing for size or
   for speed.  When optimizing for size, the goal is to reduce the overall
   number of layout changes (including layout changes implied by things
   like load permutations).  When optimizing for speed, the goal is to
   reduce the maximum latency attributable to layout changes on any
   non-cyclical path through the data flow graph.

   For example, when optimizing a loop nest for speed, we will prefer
   to make layout changes outside of a loop rather than inside of a loop,
   and will prefer to make layout changes in parallel rather than serially,
   even if that increases the overall number of layout changes.

   The high-level procedure is:

   (1) Build a graph in which edges go from uses (parents) to definitions
       (children).

   (2) Divide the graph into a dag of strongly-connected components (SCCs).

   (3) When optimizing for speed, partition the nodes in each SCC based
       on their containing cfg loop.  When optimizing for size, treat
       each SCC as a single partition.

       This gives us a dag of partitions.  The goal is now to assign a
       layout to each partition.

   (4) Construct a set of vector layouts that are worth considering.
       Record which nodes must keep their current layout.

   (5) Perform a forward walk over the partition dag (from loads to stores)
       accumulating the "forward" cost of using each layout.  When visiting
       each partition, assign a tentative choice of layout to the partition
       and use that choice when calculating the cost of using a different
       layout in successor partitions.

   (6) Perform a backward walk over the partition dag (from stores to loads),
       accumulating the "backward" cost of using each layout.  When visiting
       each partition, make a final choice of layout for that partition based
       on the accumulated forward costs (from (5)) and backward costs
       (from (6)).

   (7) Apply the chosen layouts to the SLP graph.

   For example, consider the SLP statements:

   S2:      a_2 = PHI<a_1, a_3>

   S2 and S4 form an SCC and are part of the same loop.  Every other
   statement is in a singleton SCC.  In this example there is a one-to-one
   mapping between SCCs and partitions and the partition dag looks like this;

   S2, S3 and S4 will have a higher execution frequency than the other
   statements, so when optimizing for speed, the goal is to avoid any
   layout changes:

   - on the S3->S2+S4 edge

   For example, if S3 was originally a reversing load, the goal of the
   pass is to make it an unreversed load and change the layout on the
   S1->S2+S4 and S2+S4->S5 edges to compensate.  (Changing the layout
   on S1->S2+S4 and S5->S6 would also be acceptable.)

   The difference between SCCs and partitions becomes important if we
   add an outer loop:

   S2:      a_2 = PHI<a_1, a_6>

   S5:      a_4 = PHI<a_3, a_5>

   Here, S2, S4, S5, S7 and S8 form a single SCC.  However, when optimizing
   for speed, we usually do not want restrictions in the outer loop to "infect"
   the decision for the inner loop.  For example, if an outer-loop node
   in the SCC contains a statement with a fixed layout, that should not
   prevent the inner loop from using a different layout.  Conversely,
   the inner loop should not dictate a layout to the outer loop: if the
   outer loop does a lot of computation, then it may not be efficient to
   do all of that computation in the inner loop's preferred layout.

   So when optimizing for speed, we partition the SCC into S2+S4+S8 (outer)
   and S5+S7 (inner).  We also try to arrange partitions so that:

   - the partition for an outer loop comes before the partition for
     an inner loop

   - if a sibling loop A dominates a sibling loop B, A's partition
     comes before B's

   This gives the following partition dag for the example above:

   There are two edges from S2+S4+S8 to S5+S7: one for the edge S4->S5 and
   one for a reversal of the edge S7->S8.

   The backward walk picks a layout for S5+S7 before S2+S4+S8.  The choice
   for S2+S4+S8 therefore has to balance the cost of using the outer loop's
   preferred layout against the cost of changing the layout on entry to the
   inner loop (S4->S5) and on exit from the inner loop (S7->S8 reversed).

   Although this works well when optimizing for speed, it has the downside
   when optimizing for size that the choice of layout for S5+S7 is completely
   independent of S9, which lessens the chance of reducing the overall number
   of permutations.  We therefore do not partition SCCs when optimizing
   for size.

   To give a concrete example of the difference between optimizing
   for size and speed, consider:

   a[0] = (b[1] << c[3]) - d[1];
   a[1] = (b[0] << c[2]) - d[0];
   a[2] = (b[3] << c[1]) - d[3];
   a[3] = (b[2] << c[0]) - d[2];

   There are three different layouts here: one for a, one for b and d,
   and one for c.  When optimizing for speed it is better to permute each
   of b, c and d into the order required by a, since those permutations
   happen in parallel.  But when optimizing for size, it is better to:

   - permute c into the same order as b

   - do the arithmetic

   - permute the result into the order required by a

   This gives 2 permutations rather than 3.  */
class vect_optimize_slp_pass
{
public:
  vect_optimize_slp_pass (vec_info *vinfo) : m_vinfo (vinfo) {}
  void run ();

private:
  /* Graph building.  */
  struct loop *containing_loop (slp_tree);
  bool is_cfg_latch_edge (graph_edge *);
  void build_vertices (hash_set<slp_tree> &, slp_tree);
  void build_vertices ();
  void build_graph ();

  /* Partitioning.  */
  void create_partitions ();
  template<typename T> void for_each_partition_edge (unsigned int, T);

  /* Layout selection.  */
  bool is_compatible_layout (slp_tree, unsigned int);
  int change_layout_cost (slp_tree, unsigned int, unsigned int);
  slpg_partition_layout_costs &partition_layout_costs (unsigned int,
						       unsigned int);
  void change_vec_perm_layout (slp_tree, lane_permutation_t &,
			       int, unsigned int);
  int internal_node_cost (slp_tree, int, unsigned int);
  void start_choosing_layouts ();

  /* Cost propagation.  */
  slpg_layout_cost edge_layout_cost (graph_edge *, unsigned int,
				     unsigned int, unsigned int);
  slpg_layout_cost total_in_cost (unsigned int);
  slpg_layout_cost forward_cost (graph_edge *, unsigned int, unsigned int);
  slpg_layout_cost backward_cost (graph_edge *, unsigned int, unsigned int);
  void forward_pass ();
  void backward_pass ();

  /* Rematerialization.  */
  slp_tree get_result_with_layout (slp_tree, unsigned int);
  void materialize ();

  /* Clean-up.  */
  void remove_redundant_permutations ();

  /* Masked load lanes discovery.  */
  void decide_masked_load_lanes ();

  void dump ();

  vec_info *m_vinfo;

  /* True if we should optimize the graph for size, false if we should
     optimize it for speed.  (It wouldn't be easy to make this decision
     more locally.)  */
  bool m_optimize_size;

  /* A graph of all SLP nodes, with edges leading from uses to definitions.
     In other words, a node's predecessors are its slp_tree parents and
     a node's successors are its slp_tree children.  */
  graph *m_slpg = nullptr;

  /* The vertices of M_SLPG, indexed by slp_tree::vertex.  */
  auto_vec<slpg_vertex> m_vertices;

  /* The list of all leaves of M_SLPG, such as external definitions, constants,
     and loads.  */
  auto_vec<int> m_leafs;

  /* This array has one entry for every vector layout that we're considering.
     Element 0 is null and indicates "no change".  Other entries describe
     permutations that are inherent in the current graph and that we would
     like to reverse if possible.

     For example, a permutation { 1, 2, 3, 0 } means that something has
     effectively been permuted in that way, such as a load group
     { a[1], a[2], a[3], a[0] } (viewed as a permutation of a[0:3]).
     We'd then like to apply the reverse permutation { 3, 0, 1, 2 }
     in order to put things "back" in order.  */
  auto_vec<vec<unsigned> > m_perms;

  /* A partitioning of the nodes for which a layout must be chosen.
     Each partition represents an <SCC, cfg loop> pair; that is,
     nodes in different SCCs belong to different partitions, and nodes
     within an SCC can be further partitioned according to a containing
     cfg loop.  Partition <SCC1, L1> comes before <SCC2, L2> if:

     - SCC1 != SCC2 and SCC1 is a predecessor of SCC2 in a forward walk
       from leaves (such as loads) to roots (such as stores).

     - SCC1 == SCC2 and L1's header strictly dominates L2's header.  */
  auto_vec<slpg_partition_info> m_partitions;

  /* The list of all nodes for which a layout must be chosen.  Nodes for
     partition P come before the nodes for partition P+1.  Nodes within a
     partition are in reverse postorder.  */
  auto_vec<unsigned int> m_partitioned_nodes;

  /* Index P * num-layouts + L contains the cost of using layout L
     for partition P.  */
  auto_vec<slpg_partition_layout_costs> m_partition_layout_costs;

  /* Index N * num-layouts + L, if nonnull, is a node that provides the
     original output of node N adjusted to have layout L.  */
  auto_vec<slp_tree> m_node_layouts;
};
/* Fill the vertices and leafs vector with all nodes in the SLP graph.
   Also record whether we should optimize anything for speed rather
   than size.  */

void
vect_optimize_slp_pass::build_vertices (hash_set<slp_tree> &visited,
					slp_tree node)
{
  unsigned i;
  slp_tree child;

  if (visited.add (node))
    return;

  if (stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node))
    {
      basic_block bb = gimple_bb (vect_orig_stmt (rep)->stmt);
      if (optimize_bb_for_speed_p (bb))
	m_optimize_size = false;
    }

  node->vertex = m_vertices.length ();
  m_vertices.safe_push (slpg_vertex (node));

  bool leaf = true;
  bool force_leaf = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      {
	leaf = false;
	build_vertices (visited, child);
      }
    else
      force_leaf = true;
  /* Since SLP discovery works along use-def edges all cycles have an
     entry - but there's the exception of cycles where we do not handle
     the entry explicitely (but with a NULL SLP node), like some reductions
     and inductions.  Force those SLP PHIs to act as leafs to make them
     backwards reachable.  */
  if (leaf || force_leaf)
    m_leafs.safe_push (node->vertex);
}
/* Fill the vertices and leafs vector with all nodes in the SLP graph.  */

void
vect_optimize_slp_pass::build_vertices ()
{
  hash_set<slp_tree> visited;
  unsigned i;
  slp_instance instance;
  m_vertices.truncate (0);
  m_leafs.truncate (0);
  FOR_EACH_VEC_ELT (m_vinfo->slp_instances, i, instance)
    build_vertices (visited, SLP_INSTANCE_TREE (instance));
}
/* Apply (reverse) bijective PERM to VEC.  */

template <class T>
static void
vect_slp_permute (vec<unsigned> perm,
                  vec<T> &vec, bool reverse)
{
  auto_vec<T, 64> saved;
  saved.create (vec.length ());
  for (unsigned i = 0; i < vec.length (); ++i)
    saved.quick_push (vec[i]);

  if (reverse)
    {
      for (unsigned i = 0; i < vec.length (); ++i)
        vec[perm[i]] = saved[i];
      for (unsigned i = 0; i < vec.length (); ++i)
        gcc_assert (vec[perm[i]] == saved[i]);
    }
  else
    {
      for (unsigned i = 0; i < vec.length (); ++i)
        vec[i] = saved[perm[i]];
      for (unsigned i = 0; i < vec.length (); ++i)
        gcc_assert (vec[i] == saved[perm[i]]);
    }
}
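/* A worked example for vect_slp_permute, following directly from the loops
   above: with PERM = { 1, 2, 3, 0 } and VEC = { A, B, C, D },

     vect_slp_permute (perm, vec, false) gives { B, C, D, A }
       (element I of the result is old element PERM[I]), while
     vect_slp_permute (perm, vec, true) gives { D, A, B, C }
       (old element I ends up at position PERM[I]).

   The two calls are inverses of each other, which is what lets the pass
   undo a layout recorded in m_perms.  */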
/* Return the cfg loop that contains NODE.  */

struct loop *
vect_optimize_slp_pass::containing_loop (slp_tree node)
{
  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
  if (!rep)
    return ENTRY_BLOCK_PTR_FOR_FN (cfun)->loop_father;
  return gimple_bb (vect_orig_stmt (rep)->stmt)->loop_father;
}
/* Return true if UD (an edge from a use to a definition) is associated
   with a loop latch edge in the cfg.  */

bool
vect_optimize_slp_pass::is_cfg_latch_edge (graph_edge *ud)
{
  slp_tree use = m_vertices[ud->src].node;
  slp_tree def = m_vertices[ud->dest].node;
  if ((SLP_TREE_DEF_TYPE (use) != vect_internal_def
       || SLP_TREE_CODE (use) == VEC_PERM_EXPR)
      || SLP_TREE_DEF_TYPE (def) != vect_internal_def)
    return false;

  stmt_vec_info use_rep = vect_orig_stmt (SLP_TREE_REPRESENTATIVE (use));
  return (is_a<gphi *> (use_rep->stmt)
          && bb_loop_header_p (gimple_bb (use_rep->stmt))
          && containing_loop (def) == containing_loop (use));
}
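/* For example, given a loop-header PHI such as

     x_1 = PHI <x_0 (preheader), x_2 (latch)>

   the SLP use-def edge from the PHI's node to the node defining x_2 is the
   one associated with the cfg latch edge, provided both nodes belong to the
   same cfg loop.  These are the edges skipped when computing the RPO in
   create_partitions below.  */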
/* Build the graph.  Mark edges that correspond to cfg loop latch edges with
   a nonnull data field.  */

void
vect_optimize_slp_pass::build_graph ()
{
  m_optimize_size = true;
  build_vertices ();

  m_slpg = new_graph (m_vertices.length ());
  for (slpg_vertex &v : m_vertices)
    for (slp_tree child : SLP_TREE_CHILDREN (v.node))
      if (child)
        {
          graph_edge *ud = add_edge (m_slpg, v.node->vertex, child->vertex);
          if (is_cfg_latch_edge (ud))
            ud->data = this;
        }
}
/* Return true if E corresponds to a loop latch edge in the cfg.  */

static bool
skip_cfg_latch_edges (graph_edge *e)
{
  return e->data;
}
/* Create the node partitions.  */

void
vect_optimize_slp_pass::create_partitions ()
{
  /* Calculate a postorder of the graph, ignoring edges that correspond
     to natural latch edges in the cfg.  Reading the vector from the end
     to the beginning gives the reverse postorder.  */
  auto_vec<int> initial_rpo;
  graphds_dfs (m_slpg, &m_leafs[0], m_leafs.length (), &initial_rpo,
               false, NULL, skip_cfg_latch_edges);
  gcc_assert (initial_rpo.length () == m_vertices.length ());

  /* Calculate the strongly connected components of the graph.  */
  auto_vec<int> scc_grouping;
  unsigned int num_sccs = graphds_scc (m_slpg, NULL, NULL, &scc_grouping);

  /* Create a new index order in which all nodes from the same SCC are
     consecutive.  Use scc_pos to record the index of the first node in
     each SCC.  */
  auto_vec<unsigned int> scc_pos (num_sccs);
  int last_component = -1;
  unsigned int node_count = 0;
  for (unsigned int node_i : scc_grouping)
    {
      if (last_component != m_slpg->vertices[node_i].component)
        {
          last_component = m_slpg->vertices[node_i].component;
          gcc_assert (last_component == int (scc_pos.length ()));
          scc_pos.quick_push (node_count);
        }
      node_count += 1;
    }
  gcc_assert (node_count == initial_rpo.length ()
              && last_component + 1 == int (num_sccs));

  /* Use m_partitioned_nodes to group nodes into SCC order, with the nodes
     inside each SCC following the RPO we calculated above.  The fact that
     we ignored natural latch edges when calculating the RPO should ensure
     that, for natural loop nests:

     - the first node that we encounter in a cfg loop is the loop header phi
     - the loop header phis are in dominance order

     Arranging for this is an optimization (see below) rather than a
     correctness issue.  Unnatural loops with a tangled mess of backedges
     will still work correctly, but might give poorer results.

     Also update scc_pos so that it gives 1 + the index of the last node
     in the SCC.  */
  m_partitioned_nodes.safe_grow (node_count);
  for (unsigned int old_i = initial_rpo.length (); old_i-- > 0;)
    {
      unsigned int node_i = initial_rpo[old_i];
      unsigned int new_i = scc_pos[m_slpg->vertices[node_i].component]++;
      m_partitioned_nodes[new_i] = node_i;
    }

  /* When optimizing for speed, partition each SCC based on the containing
     cfg loop.  The order we constructed above should ensure that, for natural
     cfg loops, we'll create sub-SCC partitions for outer loops before
     the corresponding sub-SCC partitions for inner loops.  Similarly,
     when one sibling loop A dominates another sibling loop B, we should
     create a sub-SCC partition for A before a sub-SCC partition for B.

     As above, nothing depends for correctness on whether this achieves
     a natural nesting, but we should get better results when it does.  */
  m_partitions.reserve (m_vertices.length ());
  unsigned int next_partition_i = 0;
  hash_map<struct loop *, int> loop_partitions;
  unsigned int rpo_begin = 0;
  unsigned int num_partitioned_nodes = 0;
  for (unsigned int rpo_end : scc_pos)
    {
      loop_partitions.empty ();
      unsigned int partition_i = next_partition_i;
      for (unsigned int rpo_i = rpo_begin; rpo_i < rpo_end; ++rpo_i)
        {
          /* Handle externals and constants optimistically throughout.
             But treat existing vectors as fixed since we do not handle
             permuting them.  */
          unsigned int node_i = m_partitioned_nodes[rpo_i];
          auto &vertex = m_vertices[node_i];
          if ((SLP_TREE_DEF_TYPE (vertex.node) == vect_external_def
               && !SLP_TREE_VEC_DEFS (vertex.node).exists ())
              || SLP_TREE_DEF_TYPE (vertex.node) == vect_constant_def)
            vertex.partition = -1;
          else
            {
              bool existed;
              if (m_optimize_size)
                existed = next_partition_i > partition_i;
              else
                {
                  struct loop *loop = containing_loop (vertex.node);
                  auto &entry = loop_partitions.get_or_insert (loop, &existed);
                  if (!existed)
                    entry = next_partition_i;
                  partition_i = entry;
                }
              if (!existed)
                {
                  m_partitions.quick_push (slpg_partition_info ());
                  next_partition_i += 1;
                }
              vertex.partition = partition_i;
              num_partitioned_nodes += 1;
              m_partitions[partition_i].node_end += 1;
            }
        }
      rpo_begin = rpo_end;
    }

  /* Assign ranges of consecutive node indices to each partition,
     in partition order.  Start with node_end being the same as
     node_begin so that the next loop can use it as a counter.  */
  unsigned int node_begin = 0;
  for (auto &partition : m_partitions)
    {
      partition.node_begin = node_begin;
      node_begin += partition.node_end;
      partition.node_end = partition.node_begin;
    }
  gcc_assert (node_begin == num_partitioned_nodes);

  /* Finally build the list of nodes in partition order.  */
  m_partitioned_nodes.truncate (num_partitioned_nodes);
  for (unsigned int node_i = 0; node_i < m_vertices.length (); ++node_i)
    {
      int partition_i = m_vertices[node_i].partition;
      if (partition_i >= 0)
        {
          unsigned int order_i = m_partitions[partition_i].node_end++;
          m_partitioned_nodes[order_i] = node_i;
        }
    }
}
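/* As a concrete example of the bookkeeping in create_partitions: if three
   partitions receive 2, 3 and 1 nodes respectively, the penultimate loop
   leaves them with { node_begin 0, node_end 0 }, { 2, 2 } and { 5, 5 },
   and the final loop then bumps node_end as it fills m_partitioned_nodes,
   ending with { 0, 2 }, { 2, 5 } and { 5, 6 }.  */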
/* Look for edges from earlier partitions into node NODE_I and edges from
   node NODE_I into later partitions.  Call:

   FN (ud, other_node_i)

   for each such use-to-def edge ud, where other_node_i is the node at the
   other end of the edge.  */

template<typename T>
void
vect_optimize_slp_pass::for_each_partition_edge (unsigned int node_i, T fn)
{
  int partition_i = m_vertices[node_i].partition;
  for (graph_edge *pred = m_slpg->vertices[node_i].pred;
       pred; pred = pred->pred_next)
    {
      int src_partition_i = m_vertices[pred->src].partition;
      if (src_partition_i >= 0 && src_partition_i != partition_i)
        fn (pred, pred->src);
    }
  for (graph_edge *succ = m_slpg->vertices[node_i].succ;
       succ; succ = succ->succ_next)
    {
      int dest_partition_i = m_vertices[succ->dest].partition;
      if (dest_partition_i >= 0 && dest_partition_i != partition_i)
        fn (succ, succ->dest);
    }
}
/* Return true if layout LAYOUT_I is compatible with the number of SLP lanes
   that NODE would operate on.  This test is independent of NODE's actual
   operation.  */

bool
vect_optimize_slp_pass::is_compatible_layout (slp_tree node,
                                              unsigned int layout_i)
{
  if (layout_i == 0)
    return true;

  if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
    return false;

  return true;
}
/* Return the cost (in arbitrary units) of going from layout FROM_LAYOUT_I
   to layout TO_LAYOUT_I for a node like NODE.  Return -1 if either of the
   layouts is incompatible with NODE or if the change is not possible for
   some other reason.

   The properties taken from NODE include the number of lanes and the
   vector type.  The actual operation doesn't matter.  */

int
vect_optimize_slp_pass::change_layout_cost (slp_tree node,
                                            unsigned int from_layout_i,
                                            unsigned int to_layout_i)
{
  if (!is_compatible_layout (node, from_layout_i)
      || !is_compatible_layout (node, to_layout_i))
    return -1;

  if (from_layout_i == to_layout_i)
    return 0;

  auto_vec<slp_tree, 1> children (1);
  children.quick_push (node);
  auto_lane_permutation_t perm (SLP_TREE_LANES (node));
  if (from_layout_i > 0)
    for (unsigned int i : m_perms[from_layout_i])
      perm.quick_push ({ 0, i });
  else
    for (unsigned int i = 0; i < SLP_TREE_LANES (node); ++i)
      perm.quick_push ({ 0, i });
  if (to_layout_i > 0)
    vect_slp_permute (m_perms[to_layout_i], perm, true);
  auto count = vectorizable_slp_permutation_1 (m_vinfo, nullptr, node, perm,
                                               children, false);
  if (count >= 0)
    return MAX (count, 1);

  /* ??? In principle we could try changing via layout 0, giving two
     layout changes rather than 1.  Doing that would require
     corresponding support in get_result_with_layout.  */
  return -1;
}
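/* For example, changing a two-lane node from layout { 1, 0 } back to
   layout 0 builds the lane permutation { { 0, 1 }, { 0, 0 } }: output lane 0
   takes input lane 1 and vice versa, undoing the swap.  The target is then
   asked, via vectorizable_slp_permutation_1, whether it can do this and how
   many permute instructions it would need.  */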
/* Return the costs of assigning layout LAYOUT_I to partition PARTITION_I.  */

inline slpg_partition_layout_costs &
vect_optimize_slp_pass::partition_layout_costs (unsigned int partition_i,
                                                unsigned int layout_i)
{
  return m_partition_layout_costs[partition_i * m_perms.length () + layout_i];
}
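/* The costs form a flat array indexed by partition and layout; with three
   layouts, for example, partition 2 occupies indices 6, 7 and 8.  */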
/* Change PERM in one of two ways:

   - if IN_LAYOUT_I < 0, accept input operand I in the layout that has been
     chosen for child I of NODE.

   - if IN_LAYOUT_I >= 0, accept all input operands with that layout.

   In both cases, arrange for the output to have layout OUT_LAYOUT_I.  */

void
vect_optimize_slp_pass::
change_vec_perm_layout (slp_tree node, lane_permutation_t &perm,
                        int in_layout_i, unsigned int out_layout_i)
{
  for (auto &entry : perm)
    {
      int this_in_layout_i = in_layout_i;
      if (this_in_layout_i < 0)
        {
          slp_tree in_node = SLP_TREE_CHILDREN (node)[entry.first];
          unsigned int in_partition_i = m_vertices[in_node->vertex].partition;
          if (in_partition_i == -1u)
            continue;
          this_in_layout_i = m_partitions[in_partition_i].layout;
        }
      if (this_in_layout_i > 0)
        entry.second = m_perms[this_in_layout_i][entry.second];
    }
  if (out_layout_i > 0)
    vect_slp_permute (m_perms[out_layout_i], perm, true);
}
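/* For example, if NODE's only child has (provisionally) chosen layout
   { 1, 0 } and OUT_LAYOUT_I is 0, a permutation entry { 0, 1 } (operand 0,
   lane 1) becomes { 0, 0 }, since what used to be lane 1 of the child now
   sits in lane 0 of the child's new layout; no output-side adjustment is
   needed because layout 0 means "no change".  */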
/* Check whether the target allows NODE to be rearranged so that the node's
   output has layout OUT_LAYOUT_I.  Return the cost of the change if so,
   in the same arbitrary units as for change_layout_cost.  Return -1 otherwise.

   If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I < 0, also check whether
   NODE can adapt to the layout changes that have (perhaps provisionally)
   been chosen for NODE's children, so that no extra permutations are
   needed on either the input or the output of NODE.

   If NODE is a VEC_PERM_EXPR and IN_LAYOUT_I >= 0, instead assume
   that all inputs will be forced into layout IN_LAYOUT_I beforehand.

   IN_LAYOUT_I has no meaning for other types of node.

   Keeping the node as-is is always valid.  If the target doesn't appear
   to support the node as-is, but might realistically support other layouts,
   then layout 0 instead has the cost of a worst-case permutation.  On the
   one hand, this ensures that every node has at least one valid layout,
   avoiding what would otherwise be an awkward special case.  On the other,
   it still encourages the pass to change an invalid pre-existing layout
   choice into a valid one.  */

int
vect_optimize_slp_pass::internal_node_cost (slp_tree node, int in_layout_i,
                                            unsigned int out_layout_i)
{
  const int fallback_cost = 1;

  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      auto_lane_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LANE_PERMUTATION (node));

      /* Check that the child nodes support the chosen layout.  Checking
         the first child is enough, since any second child would have the
         same shape.  */
      auto first_child = SLP_TREE_CHILDREN (node)[0];
      if (in_layout_i > 0
          && !is_compatible_layout (first_child, in_layout_i))
        return -1;

      change_vec_perm_layout (node, tmp_perm, in_layout_i, out_layout_i);
      int count = vectorizable_slp_permutation_1 (m_vinfo, nullptr,
                                                  node, tmp_perm,
                                                  SLP_TREE_CHILDREN (node),
                                                  false);
      if (count < 0)
        {
          if (in_layout_i == 0 && out_layout_i == 0)
            {
              /* Use the fallback cost if the node could in principle support
                 some nonzero layout for both the inputs and the outputs.
                 Otherwise assume that the node will be rejected later
                 and rebuilt from scalars.  */
              if (SLP_TREE_LANES (node) == SLP_TREE_LANES (first_child))
                return fallback_cost;
            }
          return -1;
        }

      /* We currently have no way of telling whether the new layout is cheaper
         or more expensive than the old one.  But at least in principle,
         it should be worth making zero permutations (whole-vector shuffles)
         cheaper than real permutations, in case the pass is able to remove
         the latter.  */
      return count == 0 ? 0 : 1;
    }

  stmt_vec_info rep = SLP_TREE_REPRESENTATIVE (node);
  if (rep
      && STMT_VINFO_DATA_REF (rep)
      && DR_IS_READ (STMT_VINFO_DATA_REF (rep))
      && SLP_TREE_LOAD_PERMUTATION (node).exists ())
    {
      auto_load_permutation_t tmp_perm;
      tmp_perm.safe_splice (SLP_TREE_LOAD_PERMUTATION (node));
      if (out_layout_i > 0)
        vect_slp_permute (m_perms[out_layout_i], tmp_perm, true);

      poly_uint64 vf = 1;
      if (auto loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo))
        vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      unsigned int n_perms;
      if (!vect_transform_slp_perm_load_1 (m_vinfo, node, tmp_perm, vNULL,
                                           nullptr, vf, true, false, &n_perms))
        {
          auto rep = SLP_TREE_REPRESENTATIVE (node);
          if (out_layout_i == 0)
            {
              /* Use the fallback cost if the load is an N-to-N permutation.
                 Otherwise assume that the node will be rejected later
                 and rebuilt from scalars.  */
              if (STMT_VINFO_GROUPED_ACCESS (rep)
                  && (DR_GROUP_SIZE (DR_GROUP_FIRST_ELEMENT (rep))
                      == SLP_TREE_LANES (node)))
                return fallback_cost;
            }
          return -1;
        }

      /* See the comment above the corresponding VEC_PERM_EXPR handling.  */
      return n_perms == 0 ? 0 : 1;
    }

  return 0;
}
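/* To summarize the return values of internal_node_cost:

     -1  NODE cannot produce output layout OUT_LAYOUT_I at all;
      0  it can, and every required permutation is a no-op;
      1  it can, but at least one real permutation is needed
         (fallback_cost is also 1, used for the keep-as-is case when the
         node is likely to be rejected and rebuilt from scalars).  */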
6182 /* Decide which element layouts we should consider using. Calculate the
6183 weights associated with inserting layout changes on partition edges.
6184 Also mark partitions that cannot change layout, by setting their
6188 vect_optimize_slp_pass::start_choosing_layouts ()
6190 /* Used to assign unique permutation indices. */
6191 using perm_hash
= unbounded_hashmap_traits
<
6192 vec_free_hash_base
<int_hash_base
<unsigned>>,
6193 int_hash
<int, -1, -2>
6195 hash_map
<vec
<unsigned>, int, perm_hash
> layout_ids
;
6197 /* Layout 0 is "no change". */
6198 m_perms
.safe_push (vNULL
);
6200 /* Create layouts from existing permutations. */
6201 auto_load_permutation_t tmp_perm
;
6202 for (unsigned int node_i
: m_partitioned_nodes
)
6204 /* Leafs also double as entries to the reverse graph. Allow the
6205 layout of those to be changed. */
6206 auto &vertex
= m_vertices
[node_i
];
6207 auto &partition
= m_partitions
[vertex
.partition
];
6208 if (!m_slpg
->vertices
[node_i
].succ
)
6209 partition
.layout
= 0;
6211 /* Loads and VEC_PERM_EXPRs are the only things generating permutes. */
6212 slp_tree node
= vertex
.node
;
6213 stmt_vec_info dr_stmt
= SLP_TREE_REPRESENTATIVE (node
);
6215 unsigned HOST_WIDE_INT imin
, imax
= 0;
6216 bool any_permute
= false;
6217 tmp_perm
.truncate (0);
6218 if (SLP_TREE_LOAD_PERMUTATION (node
).exists ())
6220 /* If splitting out a SLP_TREE_LANE_PERMUTATION can make the node
6221 unpermuted, record a layout that reverses this permutation.
6223 We would need more work to cope with loads that are internally
6224 permuted and also have inputs (such as masks for
6226 gcc_assert (partition
.layout
== 0 && !m_slpg
->vertices
[node_i
].succ
);
6227 if (!STMT_VINFO_GROUPED_ACCESS (dr_stmt
))
6229 partition
.layout
= -1;
6232 dr_stmt
= DR_GROUP_FIRST_ELEMENT (dr_stmt
);
6233 imin
= DR_GROUP_SIZE (dr_stmt
) + 1;
6234 tmp_perm
.safe_splice (SLP_TREE_LOAD_PERMUTATION (node
));
6236 else if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
6237 && SLP_TREE_CHILDREN (node
).length () == 1
6238 && (child
= SLP_TREE_CHILDREN (node
)[0])
6239 && (TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (child
))
6240 .is_constant (&imin
)))
6242 /* If the child has the same vector size as this node,
6243 reversing the permutation can make the permutation a no-op.
6244 In other cases it can change a true permutation into a
6245 full-vector extract. */
6246 tmp_perm
.reserve (SLP_TREE_LANES (node
));
6247 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
6248 tmp_perm
.quick_push (SLP_TREE_LANE_PERMUTATION (node
)[j
].second
);
6253 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
6255 unsigned idx
= tmp_perm
[j
];
6256 imin
= MIN (imin
, idx
);
6257 imax
= MAX (imax
, idx
);
6258 if (idx
- tmp_perm
[0] != j
)
6261 /* If the span doesn't match we'd disrupt VF computation, avoid
6263 if (imax
- imin
+ 1 != SLP_TREE_LANES (node
))
6265 /* If there's no permute no need to split one out. In this case
6266 we can consider turning a load into a permuted load, if that
6267 turns out to be cheaper than alternatives. */
6270 partition
.layout
= -1;
6274 /* For now only handle true permutes, like
6275 vect_attempt_slp_rearrange_stmts did. This allows us to be lazy
6276 when permuting constants and invariants keeping the permute
6278 auto_sbitmap
load_index (SLP_TREE_LANES (node
));
6279 bitmap_clear (load_index
);
6280 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
6281 bitmap_set_bit (load_index
, tmp_perm
[j
] - imin
);
6283 for (j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
6284 if (!bitmap_bit_p (load_index
, j
))
6286 if (j
!= SLP_TREE_LANES (node
))
6289 vec
<unsigned> perm
= vNULL
;
6290 perm
.safe_grow (SLP_TREE_LANES (node
), true);
6291 for (unsigned j
= 0; j
< SLP_TREE_LANES (node
); ++j
)
6292 perm
[j
] = tmp_perm
[j
] - imin
;
6294 if (int (m_perms
.length ()) >= param_vect_max_layout_candidates
)
6296 /* Continue to use existing layouts, but don't add any more. */
6297 int *entry
= layout_ids
.get (perm
);
6298 partition
.layout
= entry
? *entry
: 0;
6304 int &layout_i
= layout_ids
.get_or_insert (perm
, &existed
);
6309 layout_i
= m_perms
.length ();
6310 m_perms
.safe_push (perm
);
6312 partition
.layout
= layout_i
;
6316 /* Initially assume that every layout is possible and has zero cost
6317 in every partition. */
6318 m_partition_layout_costs
.safe_grow_cleared (m_partitions
.length ()
6319 * m_perms
.length ());
6321 /* We have to mark outgoing permutations facing non-associating-reduction
6322 graph entries that are not represented as to be materialized.
6323 slp_inst_kind_bb_reduc currently only covers associatable reductions. */
6324 for (slp_instance instance
: m_vinfo
->slp_instances
)
6325 if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_ctor
)
6327 unsigned int node_i
= SLP_INSTANCE_TREE (instance
)->vertex
;
6328 m_partitions
[m_vertices
[node_i
].partition
].layout
= 0;
6330 else if (SLP_INSTANCE_KIND (instance
) == slp_inst_kind_reduc_chain
)
6332 stmt_vec_info stmt_info
6333 = SLP_TREE_REPRESENTATIVE (SLP_INSTANCE_TREE (instance
));
6334 stmt_vec_info reduc_info
= info_for_reduction (m_vinfo
, stmt_info
);
6335 if (needs_fold_left_reduction_p (TREE_TYPE
6336 (gimple_get_lhs (stmt_info
->stmt
)),
6337 STMT_VINFO_REDUC_CODE (reduc_info
)))
6339 unsigned int node_i
= SLP_INSTANCE_TREE (instance
)->vertex
;
6340 m_partitions
[m_vertices
[node_i
].partition
].layout
= 0;
6344 /* Check which layouts each node and partition can handle. Calculate the
6345 weights associated with inserting layout changes on edges. */
6346 for (unsigned int node_i
: m_partitioned_nodes
)
6348 auto &vertex
= m_vertices
[node_i
];
6349 auto &partition
= m_partitions
[vertex
.partition
];
6350 slp_tree node
= vertex
.node
;
6352 if (stmt_vec_info rep
= SLP_TREE_REPRESENTATIVE (node
))
6354 vertex
.weight
= vect_slp_node_weight (node
);
6356 /* We do not handle stores with a permutation, so all
6357 incoming permutations must have been materialized.
6359 We also don't handle masked grouped loads, which lack a
6360 permutation vector. In this case the memory locations
6361 form an implicit second input to the loads, on top of the
6362 explicit mask input, and the memory input's layout cannot
6365 On the other hand, we do support permuting gather loads and
6366 masked gather loads, where each scalar load is independent
6367 of the others. This can be useful if the address/index input
6368 benefits from permutation. */
6369 if (STMT_VINFO_DATA_REF (rep
)
6370 && STMT_VINFO_GROUPED_ACCESS (rep
)
6371 && !SLP_TREE_LOAD_PERMUTATION (node
).exists ())
6372 partition
.layout
= 0;
6374 /* We cannot change the layout of an operation that is
6375 not independent on lanes. Note this is an explicit
6376 negative list since that's much shorter than the respective
6377 positive one but it's critical to keep maintaining it. */
6378 if (is_gimple_call (STMT_VINFO_STMT (rep
)))
6379 switch (gimple_call_combined_fn (STMT_VINFO_STMT (rep
)))
6381 case CFN_COMPLEX_ADD_ROT90
:
6382 case CFN_COMPLEX_ADD_ROT270
:
6383 case CFN_COMPLEX_MUL
:
6384 case CFN_COMPLEX_MUL_CONJ
:
6385 case CFN_VEC_ADDSUB
:
6386 case CFN_VEC_FMADDSUB
:
6387 case CFN_VEC_FMSUBADD
:
6388 partition
.layout
= 0;
6393 auto process_edge
= [&](graph_edge
*ud
, unsigned int other_node_i
)
6395 auto &other_vertex
= m_vertices
[other_node_i
];
6397 /* Count the number of edges from earlier partitions and the number
6398 of edges to later partitions. */
6399 if (other_vertex
.partition
< vertex
.partition
)
6400 partition
.in_degree
+= 1;
6402 partition
.out_degree
+= 1;
6404 /* If the current node uses the result of OTHER_NODE_I, accumulate
6405 the effects of that. */
6406 if (ud
->src
== int (node_i
))
6408 other_vertex
.out_weight
+= vertex
.weight
;
6409 other_vertex
.out_degree
+= 1;
6412 for_each_partition_edge (node_i
, process_edge
);
6416 /* Return the incoming costs for node NODE_I, assuming that each input keeps
6417 its current (provisional) choice of layout. The inputs do not necessarily
6418 have the same layout as each other. */
6421 vect_optimize_slp_pass::total_in_cost (unsigned int node_i
)
6423 auto &vertex
= m_vertices
[node_i
];
6424 slpg_layout_cost cost
;
6425 auto add_cost
= [&](graph_edge
*, unsigned int other_node_i
)
6427 auto &other_vertex
= m_vertices
[other_node_i
];
6428 if (other_vertex
.partition
< vertex
.partition
)
6430 auto &other_partition
= m_partitions
[other_vertex
.partition
];
6431 auto &other_costs
= partition_layout_costs (other_vertex
.partition
,
6432 other_partition
.layout
);
6433 slpg_layout_cost this_cost
= other_costs
.in_cost
;
6434 this_cost
.add_serial_cost (other_costs
.internal_cost
);
6435 this_cost
.split (other_partition
.out_degree
);
6436 cost
.add_parallel_cost (this_cost
);
6439 for_each_partition_edge (node_i
, add_cost
);
6443 /* Return the cost of switching between layout LAYOUT1_I (at node NODE1_I)
6444 and layout LAYOUT2_I on cross-partition use-to-def edge UD. Return
6445 slpg_layout_cost::impossible () if the change isn't possible. */
6448 vect_optimize_slp_pass::
6449 edge_layout_cost (graph_edge
*ud
, unsigned int node1_i
, unsigned int layout1_i
,
6450 unsigned int layout2_i
)
6452 auto &def_vertex
= m_vertices
[ud
->dest
];
6453 auto &use_vertex
= m_vertices
[ud
->src
];
6454 auto def_layout_i
= ud
->dest
== int (node1_i
) ? layout1_i
: layout2_i
;
6455 auto use_layout_i
= ud
->dest
== int (node1_i
) ? layout2_i
: layout1_i
;
6456 auto factor
= change_layout_cost (def_vertex
.node
, def_layout_i
,
6459 return slpg_layout_cost::impossible ();
6461 /* We have a choice of putting the layout change at the site of the
6462 definition or at the site of the use. Prefer the former when
6463 optimizing for size or when the execution frequency of the
6464 definition is no greater than the combined execution frequencies of
6465 the uses. When putting the layout change at the site of the definition,
6466 divvy up the cost among all consumers. */
6467 if (m_optimize_size
|| def_vertex
.weight
<= def_vertex
.out_weight
)
6469 slpg_layout_cost cost
= { def_vertex
.weight
* factor
, m_optimize_size
};
6470 cost
.split (def_vertex
.out_degree
);
6473 return { use_vertex
.weight
* factor
, m_optimize_size
};
6476 /* UD represents a use-def link between FROM_NODE_I and a node in a later
6477 partition; FROM_NODE_I could be the definition node or the use node.
6478 The node at the other end of the link wants to use layout TO_LAYOUT_I.
6479 Return the cost of any necessary fix-ups on edge UD, or return
6480 slpg_layout_cost::impossible () if the change isn't possible.
6482 At this point, FROM_NODE_I's partition has chosen the cheapest
6483 layout based on the information available so far, but this choice
6484 is only provisional. */
6487 vect_optimize_slp_pass::forward_cost (graph_edge
*ud
, unsigned int from_node_i
,
6488 unsigned int to_layout_i
)
6490 auto &from_vertex
= m_vertices
[from_node_i
];
6491 unsigned int from_partition_i
= from_vertex
.partition
;
6492 slpg_partition_info
&from_partition
= m_partitions
[from_partition_i
];
6493 gcc_assert (from_partition
.layout
>= 0);
6495 /* First calculate the cost on the assumption that FROM_PARTITION sticks
6496 with its current layout preference. */
6497 slpg_layout_cost cost
= slpg_layout_cost::impossible ();
6498 auto edge_cost
= edge_layout_cost (ud
, from_node_i
,
6499 from_partition
.layout
, to_layout_i
);
6500 if (edge_cost
.is_possible ())
6502 auto &from_costs
= partition_layout_costs (from_partition_i
,
6503 from_partition
.layout
);
6504 cost
= from_costs
.in_cost
;
6505 cost
.add_serial_cost (from_costs
.internal_cost
);
6506 cost
.split (from_partition
.out_degree
);
6507 cost
.add_serial_cost (edge_cost
);
6509 else if (from_partition
.layout
== 0)
6510 /* We must allow the source partition to have layout 0 as a fallback,
6511 in case all other options turn out to be impossible. */
6514 /* Take the minimum of that cost and the cost that applies if
6515 FROM_PARTITION instead switches to TO_LAYOUT_I. */
6516 auto &direct_layout_costs
= partition_layout_costs (from_partition_i
,
6518 if (direct_layout_costs
.is_possible ())
6520 slpg_layout_cost direct_cost
= direct_layout_costs
.in_cost
;
6521 direct_cost
.add_serial_cost (direct_layout_costs
.internal_cost
);
6522 direct_cost
.split (from_partition
.out_degree
);
6523 if (!cost
.is_possible ()
6524 || direct_cost
.is_better_than (cost
, m_optimize_size
))
6531 /* UD represents a use-def link between TO_NODE_I and a node in an earlier
6532 partition; TO_NODE_I could be the definition node or the use node.
6533 The node at the other end of the link wants to use layout FROM_LAYOUT_I;
6534 return the cost of any necessary fix-ups on edge UD, or
6535 slpg_layout_cost::impossible () if the choice cannot be made.
6537 At this point, TO_NODE_I's partition has a fixed choice of layout. */
6540 vect_optimize_slp_pass::backward_cost (graph_edge
*ud
, unsigned int to_node_i
,
6541 unsigned int from_layout_i
)
6543 auto &to_vertex
= m_vertices
[to_node_i
];
6544 unsigned int to_partition_i
= to_vertex
.partition
;
6545 slpg_partition_info
&to_partition
= m_partitions
[to_partition_i
];
6546 gcc_assert (to_partition
.layout
>= 0);
6548 /* If TO_NODE_I is a VEC_PERM_EXPR consumer, see whether it can be
6549 adjusted for this input having layout FROM_LAYOUT_I. Assume that
6550 any other inputs keep their current choice of layout. */
6551 auto &to_costs
= partition_layout_costs (to_partition_i
,
6552 to_partition
.layout
);
6553 if (ud
->src
== int (to_node_i
)
6554 && SLP_TREE_CODE (to_vertex
.node
) == VEC_PERM_EXPR
)
6556 auto &from_partition
= m_partitions
[m_vertices
[ud
->dest
].partition
];
6557 auto old_layout
= from_partition
.layout
;
6558 from_partition
.layout
= from_layout_i
;
6559 int factor
= internal_node_cost (to_vertex
.node
, -1,
6560 to_partition
.layout
);
6561 from_partition
.layout
= old_layout
;
6564 slpg_layout_cost cost
= to_costs
.out_cost
;
6565 cost
.add_serial_cost ({ to_vertex
.weight
* factor
,
6567 cost
.split (to_partition
.in_degree
);
6572 /* Compute the cost if we insert any necessary layout change on edge UD. */
6573 auto edge_cost
= edge_layout_cost (ud
, to_node_i
,
6574 to_partition
.layout
, from_layout_i
);
6575 if (edge_cost
.is_possible ())
6577 slpg_layout_cost cost
= to_costs
.out_cost
;
6578 cost
.add_serial_cost (to_costs
.internal_cost
);
6579 cost
.split (to_partition
.in_degree
);
6580 cost
.add_serial_cost (edge_cost
);
6584 return slpg_layout_cost::impossible ();
6587 /* Make a forward pass through the partitions, accumulating input costs.
6588 Make a tentative (provisional) choice of layout for each partition,
6589 ensuring that this choice still allows later partitions to keep
6590 their original layout. */
6593 vect_optimize_slp_pass::forward_pass ()
6595 for (unsigned int partition_i
= 0; partition_i
< m_partitions
.length ();
6598 auto &partition
= m_partitions
[partition_i
];
6600 /* If the partition consists of a single VEC_PERM_EXPR, precompute
6601 the incoming cost that would apply if every predecessor partition
6602 keeps its current layout. This is used within the loop below. */
6603 slpg_layout_cost in_cost
;
6604 slp_tree single_node
= nullptr;
6605 if (partition
.node_end
== partition
.node_begin
+ 1)
6607 unsigned int node_i
= m_partitioned_nodes
[partition
.node_begin
];
6608 single_node
= m_vertices
[node_i
].node
;
6609 if (SLP_TREE_CODE (single_node
) == VEC_PERM_EXPR
)
6610 in_cost
= total_in_cost (node_i
);
6613 /* Go through the possible layouts. Decide which ones are valid
6614 for this partition and record which of the valid layouts has
6616 unsigned int min_layout_i
= 0;
6617 slpg_layout_cost min_layout_cost
= slpg_layout_cost::impossible ();
6618 for (unsigned int layout_i
= 0; layout_i
< m_perms
.length (); ++layout_i
)
6620 auto &layout_costs
= partition_layout_costs (partition_i
, layout_i
);
6621 if (!layout_costs
.is_possible ())
6624 /* If the recorded layout is already 0 then the layout cannot
6626 if (partition
.layout
== 0 && layout_i
!= 0)
6628 layout_costs
.mark_impossible ();
6632 bool is_possible
= true;
6633 for (unsigned int order_i
= partition
.node_begin
;
6634 order_i
< partition
.node_end
; ++order_i
)
6636 unsigned int node_i
= m_partitioned_nodes
[order_i
];
6637 auto &vertex
= m_vertices
[node_i
];
6639 /* Reject the layout if it is individually incompatible
6640 with any node in the partition. */
6641 if (!is_compatible_layout (vertex
.node
, layout_i
))
6643 is_possible
= false;
6647 auto add_cost
= [&](graph_edge
*ud
, unsigned int other_node_i
)
6649 auto &other_vertex
= m_vertices
[other_node_i
];
6650 if (other_vertex
.partition
< vertex
.partition
)
6652 /* Accumulate the incoming costs from earlier
6653 partitions, plus the cost of any layout changes
6655 auto cost
= forward_cost (ud
, other_node_i
, layout_i
);
6656 if (!cost
.is_possible ())
6657 is_possible
= false;
6659 layout_costs
.in_cost
.add_parallel_cost (cost
);
6662 /* Reject the layout if it would make layout 0 impossible
6663 for later partitions. This amounts to testing that the
6664 target supports reversing the layout change on edges
6665 to later partitions.
6667 In principle, it might be possible to push a layout
6668 change all the way down a graph, so that it never
6669 needs to be reversed and so that the target doesn't
6670 need to support the reverse operation. But it would
6671 be awkward to bail out if we hit a partition that
6672 does not support the new layout, especially since
6673 we are not dealing with a lattice. */
6674 is_possible
&= edge_layout_cost (ud
, other_node_i
, 0,
6675 layout_i
).is_possible ();
6677 for_each_partition_edge (node_i
, add_cost
);
6679 /* Accumulate the cost of using LAYOUT_I within NODE,
6680 both for the inputs and the outputs. */
6681 int factor
= internal_node_cost (vertex
.node
, layout_i
,
6685 is_possible
= false;
6689 layout_costs
.internal_cost
.add_serial_cost
6690 ({ vertex
.weight
* factor
, m_optimize_size
});
6694 layout_costs
.mark_impossible ();
6698 /* Combine the incoming and partition-internal costs. */
6699 slpg_layout_cost combined_cost
= layout_costs
.in_cost
;
6700 combined_cost
.add_serial_cost (layout_costs
.internal_cost
);
6702 /* If this partition consists of a single VEC_PERM_EXPR, see
6703 if the VEC_PERM_EXPR can be changed to support output layout
6704 LAYOUT_I while keeping all the provisional choices of input
6707 && SLP_TREE_CODE (single_node
) == VEC_PERM_EXPR
)
6709 int factor
= internal_node_cost (single_node
, -1, layout_i
);
6712 auto weight
= m_vertices
[single_node
->vertex
].weight
;
6713 slpg_layout_cost internal_cost
6714 = { weight
* factor
, m_optimize_size
};
6716 slpg_layout_cost alt_cost
= in_cost
;
6717 alt_cost
.add_serial_cost (internal_cost
);
6718 if (alt_cost
.is_better_than (combined_cost
, m_optimize_size
))
6720 combined_cost
= alt_cost
;
6721 layout_costs
.in_cost
= in_cost
;
6722 layout_costs
.internal_cost
= internal_cost
;
6727 /* Record the layout with the lowest cost. Prefer layout 0 in
6728 the event of a tie between it and another layout. */
6729 if (!min_layout_cost
.is_possible ()
6730 || combined_cost
.is_better_than (min_layout_cost
,
6733 min_layout_i
= layout_i
;
6734 min_layout_cost
= combined_cost
;
6738 /* This loop's handling of earlier partitions should ensure that
6739 choosing the original layout for the current partition is no
6740 less valid than it was in the original graph, even with the
6741 provisional layout choices for those earlier partitions. */
6742 gcc_assert (min_layout_cost
.is_possible ());
6743 partition
.layout
= min_layout_i
;
6747 /* Make a backward pass through the partitions, accumulating output costs.
6748 Make a final choice of layout for each partition. */
6751 vect_optimize_slp_pass::backward_pass ()
6753 for (unsigned int partition_i
= m_partitions
.length (); partition_i
-- > 0;)
6755 auto &partition
= m_partitions
[partition_i
];
6757 unsigned int min_layout_i
= 0;
6758 slpg_layout_cost min_layout_cost
= slpg_layout_cost::impossible ();
6759 for (unsigned int layout_i
= 0; layout_i
< m_perms
.length (); ++layout_i
)
6761 auto &layout_costs
= partition_layout_costs (partition_i
, layout_i
);
6762 if (!layout_costs
.is_possible ())
6765 /* Accumulate the costs from successor partitions. */
6766 bool is_possible
= true;
6767 for (unsigned int order_i
= partition
.node_begin
;
6768 order_i
< partition
.node_end
; ++order_i
)
6770 unsigned int node_i
= m_partitioned_nodes
[order_i
];
6771 auto &vertex
= m_vertices
[node_i
];
6772 auto add_cost
= [&](graph_edge
*ud
, unsigned int other_node_i
)
6774 auto &other_vertex
= m_vertices
[other_node_i
];
6775 auto &other_partition
= m_partitions
[other_vertex
.partition
];
6776 if (other_vertex
.partition
> vertex
.partition
)
6778 /* Accumulate the incoming costs from later
6779 partitions, plus the cost of any layout changes
6781 auto cost
= backward_cost (ud
, other_node_i
, layout_i
);
6782 if (!cost
.is_possible ())
6783 is_possible
= false;
6785 layout_costs
.out_cost
.add_parallel_cost (cost
);
6788 /* Make sure that earlier partitions can (if necessary
6789 or beneficial) keep the layout that they chose in
6790 the forward pass. This ensures that there is at
6791 least one valid choice of layout. */
6792 is_possible
&= edge_layout_cost (ud
, other_node_i
,
6793 other_partition
.layout
,
6794 layout_i
).is_possible ();
6796 for_each_partition_edge (node_i
, add_cost
);
6800 layout_costs
.mark_impossible ();
6804 /* Locally combine the costs from the forward and backward passes.
6805 (This combined cost is not passed on, since that would lead
6806 to double counting.) */
6807 slpg_layout_cost combined_cost
= layout_costs
.in_cost
;
6808 combined_cost
.add_serial_cost (layout_costs
.internal_cost
);
6809 combined_cost
.add_serial_cost (layout_costs
.out_cost
);
6811 /* Record the layout with the lowest cost. Prefer layout 0 in
6812 the event of a tie between it and another layout. */
6813 if (!min_layout_cost
.is_possible ()
6814 || combined_cost
.is_better_than (min_layout_cost
,
6817 min_layout_i
= layout_i
;
6818 min_layout_cost
= combined_cost
;
6822 gcc_assert (min_layout_cost
.is_possible ());
6823 partition
.layout
= min_layout_i
;
6827 /* Return a node that applies layout TO_LAYOUT_I to the original form of NODE.
6828 NODE already has the layout that was selected for its partition. */
6831 vect_optimize_slp_pass::get_result_with_layout (slp_tree node
,
6832 unsigned int to_layout_i
)
6834 unsigned int result_i
= node
->vertex
* m_perms
.length () + to_layout_i
;
6835 slp_tree result
= m_node_layouts
[result_i
];
6839 if (SLP_TREE_DEF_TYPE (node
) == vect_constant_def
6840 || (SLP_TREE_DEF_TYPE (node
) == vect_external_def
6841 /* We can't permute vector defs in place. */
6842 && SLP_TREE_VEC_DEFS (node
).is_empty ()))
6844 /* If the vector is uniform or unchanged, there's nothing to do. */
6845 if (to_layout_i
== 0 || vect_slp_tree_uniform_p (node
))
6849 auto scalar_ops
= SLP_TREE_SCALAR_OPS (node
).copy ();
6850 result
= vect_create_new_slp_node (scalar_ops
);
6851 vect_slp_permute (m_perms
[to_layout_i
], scalar_ops
, true);
6856 unsigned int partition_i
= m_vertices
[node
->vertex
].partition
;
6857 unsigned int from_layout_i
= m_partitions
[partition_i
].layout
;
6858 if (from_layout_i
== to_layout_i
)
6861 /* If NODE is itself a VEC_PERM_EXPR, try to create a parallel
6862 permutation instead of a serial one. Leave the new permutation
6863 in TMP_PERM on success. */
6864 auto_lane_permutation_t tmp_perm
;
6865 unsigned int num_inputs
= 1;
6866 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6868 tmp_perm
.safe_splice (SLP_TREE_LANE_PERMUTATION (node
));
6869 if (from_layout_i
!= 0)
6870 vect_slp_permute (m_perms
[from_layout_i
], tmp_perm
, false);
6871 if (to_layout_i
!= 0)
6872 vect_slp_permute (m_perms
[to_layout_i
], tmp_perm
, true);
6873 if (vectorizable_slp_permutation_1 (m_vinfo
, nullptr, node
,
6875 SLP_TREE_CHILDREN (node
),
6877 num_inputs
= SLP_TREE_CHILDREN (node
).length ();
6879 tmp_perm
.truncate (0);
6882 if (dump_enabled_p ())
6884 if (tmp_perm
.length () > 0)
6885 dump_printf_loc (MSG_NOTE
, vect_location
,
6886 "duplicating permutation node %p with"
6888 (void *) node
, to_layout_i
);
6890 dump_printf_loc (MSG_NOTE
, vect_location
,
6891 "inserting permutation node in place of %p\n",
6895 unsigned int num_lanes
= SLP_TREE_LANES (node
);
6896 result
= vect_create_new_slp_node (num_inputs
, VEC_PERM_EXPR
);
6897 if (SLP_TREE_SCALAR_STMTS (node
).length ())
6899 auto &stmts
= SLP_TREE_SCALAR_STMTS (result
);
6900 stmts
.safe_splice (SLP_TREE_SCALAR_STMTS (node
));
6901 if (from_layout_i
!= 0)
6902 vect_slp_permute (m_perms
[from_layout_i
], stmts
, false);
6903 if (to_layout_i
!= 0)
6904 vect_slp_permute (m_perms
[to_layout_i
], stmts
, true);
6906 SLP_TREE_REPRESENTATIVE (result
) = SLP_TREE_REPRESENTATIVE (node
);
6907 SLP_TREE_LANES (result
) = num_lanes
;
6908 SLP_TREE_VECTYPE (result
) = SLP_TREE_VECTYPE (node
);
6909 result
->vertex
= -1;
6911 auto &lane_perm
= SLP_TREE_LANE_PERMUTATION (result
);
6912 if (tmp_perm
.length ())
6914 lane_perm
.safe_splice (tmp_perm
);
6915 SLP_TREE_CHILDREN (result
).safe_splice (SLP_TREE_CHILDREN (node
));
6919 lane_perm
.create (num_lanes
);
6920 for (unsigned j
= 0; j
< num_lanes
; ++j
)
6921 lane_perm
.quick_push ({ 0, j
});
6922 if (from_layout_i
!= 0)
6923 vect_slp_permute (m_perms
[from_layout_i
], lane_perm
, false);
6924 if (to_layout_i
!= 0)
6925 vect_slp_permute (m_perms
[to_layout_i
], lane_perm
, true);
6926 SLP_TREE_CHILDREN (result
).safe_push (node
);
6928 for (slp_tree child
: SLP_TREE_CHILDREN (result
))
6931 m_node_layouts
[result_i
] = result
;
6935 /* Apply the chosen vector layouts to the SLP graph. */
6938 vect_optimize_slp_pass::materialize ()
6940 /* We no longer need the costs, so avoid having two O(N * P) arrays
6941 live at the same time. */
6942 m_partition_layout_costs
.release ();
6943 m_node_layouts
.safe_grow_cleared (m_vertices
.length () * m_perms
.length ());
6945 auto_sbitmap
fully_folded (m_vertices
.length ());
6946 bitmap_clear (fully_folded
);
6947 for (unsigned int node_i
: m_partitioned_nodes
)
6949 auto &vertex
= m_vertices
[node_i
];
6950 slp_tree node
= vertex
.node
;
6951 int layout_i
= m_partitions
[vertex
.partition
].layout
;
6952 gcc_assert (layout_i
>= 0);
6954 /* Rearrange the scalar statements to match the chosen layout. */
6956 vect_slp_permute (m_perms
[layout_i
],
6957 SLP_TREE_SCALAR_STMTS (node
), true);
6959 /* Update load and lane permutations. */
6960 if (SLP_TREE_CODE (node
) == VEC_PERM_EXPR
)
6962 /* First try to absorb the input vector layouts. If that fails,
6963 force the inputs to have layout LAYOUT_I too. We checked that
6964 that was possible before deciding to use nonzero output layouts.
6965 (Note that at this stage we don't really have any guarantee that
6966 the target supports the original VEC_PERM_EXPR.) */
6967 auto &perm
= SLP_TREE_LANE_PERMUTATION (node
);
6968 auto_lane_permutation_t tmp_perm
;
6969 tmp_perm
.safe_splice (perm
);
6970 change_vec_perm_layout (node
, tmp_perm
, -1, layout_i
);
6971 if (vectorizable_slp_permutation_1 (m_vinfo
, nullptr, node
,
6973 SLP_TREE_CHILDREN (node
),
6976 if (dump_enabled_p ()
6977 && !std::equal (tmp_perm
.begin (), tmp_perm
.end (),
6979 dump_printf_loc (MSG_NOTE
, vect_location
,
6980 "absorbing input layouts into %p\n",
6982 std::copy (tmp_perm
.begin (), tmp_perm
.end (), perm
.begin ());
6983 bitmap_set_bit (fully_folded
, node_i
);
6987 /* Not MSG_MISSED because it would make no sense to users. */
6988 if (dump_enabled_p ())
6989 dump_printf_loc (MSG_NOTE
, vect_location
,
6990 "failed to absorb input layouts into %p\n",
6992 change_vec_perm_layout (nullptr, perm
, layout_i
, layout_i
);
6997 gcc_assert (!SLP_TREE_LANE_PERMUTATION (node
).exists ());
6998 auto &load_perm
= SLP_TREE_LOAD_PERMUTATION (node
);
7000 /* ??? When we handle non-bijective permutes the idea
7001 is that we can force the load-permutation to be
7002 { min, min + 1, min + 2, ... max }. But then the
7003 scalar defs might no longer match the lane content
7004 which means wrong-code with live lane vectorization.
7005 So we possibly have to have NULL entries for those. */
7006 vect_slp_permute (m_perms
[layout_i
], load_perm
, true);
7010 /* Do this before any nodes disappear, since it involves a walk
7012 remove_redundant_permutations ();
7014 /* Replace each child with a correctly laid-out version. */
7015 for (unsigned int node_i
: m_partitioned_nodes
)
7017 /* Skip nodes that have already been handled above. */
7018 if (bitmap_bit_p (fully_folded
, node_i
))
7021 auto &vertex
= m_vertices
[node_i
];
7022 int in_layout_i
= m_partitions
[vertex
.partition
].layout
;
7023 gcc_assert (in_layout_i
>= 0);
7027 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (vertex
.node
), j
, child
)
7032 slp_tree new_child
= get_result_with_layout (child
, in_layout_i
);
7033 if (new_child
!= child
)
7035 vect_free_slp_tree (child
);
7036 SLP_TREE_CHILDREN (vertex
.node
)[j
] = new_child
;
7037 new_child
->refcnt
+= 1;
/* Elide load permutations that are not necessary.  Such permutations might
   be pre-existing, rather than created by the layout optimizations.  */

void
vect_optimize_slp_pass::remove_redundant_permutations ()
{
  for (unsigned int node_i : m_leafs)
    {
      slp_tree node = m_vertices[node_i].node;
      if (!SLP_TREE_LOAD_PERMUTATION (node).exists ())
        continue;

      /* In basic block vectorization we allow any subchain of an interleaving
         chain.
         FORNOW: not in loop SLP because of realignment complications.  */
      if (is_a<bb_vec_info> (m_vinfo))
        {
          bool subchain_p = true;
          stmt_vec_info next_load_info = NULL;
          stmt_vec_info load_info;
          unsigned j;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            {
              if (j != 0
                  && (next_load_info != load_info
                      || ! load_info
                      || DR_GROUP_GAP (load_info) != 1))
                {
                  subchain_p = false;
                  break;
                }
              next_load_info = DR_GROUP_NEXT_ELEMENT (load_info);
            }
          if (subchain_p)
            {
              SLP_TREE_LOAD_PERMUTATION (node).release ();
              continue;
            }
        }
      else
        {
          loop_vec_info loop_vinfo = as_a<loop_vec_info> (m_vinfo);
          stmt_vec_info load_info;
          bool this_load_permuted = false;
          unsigned j;
          FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), j, load_info)
            if (SLP_TREE_LOAD_PERMUTATION (node)[j] != j)
              {
                this_load_permuted = true;
                break;
              }
          /* When this isn't a grouped access we know it's single element
             interleaving.  */
          if (!STMT_VINFO_GROUPED_ACCESS (SLP_TREE_SCALAR_STMTS (node)[0]))
            {
              if (!this_load_permuted
                  && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
                      || SLP_TREE_LANES (node) == 1))
                SLP_TREE_LOAD_PERMUTATION (node).release ();
              continue;
            }
          stmt_vec_info first_stmt_info
            = DR_GROUP_FIRST_ELEMENT (SLP_TREE_SCALAR_STMTS (node)[0]);
          if (!this_load_permuted
              /* The load requires permutation when unrolling exposes
                 a gap either because the group is larger than the SLP
                 group-size or because there is a gap between the groups.  */
              && (known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1U)
                  || ((SLP_TREE_LANES (node) == DR_GROUP_SIZE (first_stmt_info))
                      && DR_GROUP_GAP (first_stmt_info) == 0)))
            {
              SLP_TREE_LOAD_PERMUTATION (node).release ();
              continue;
            }
        }
    }
}
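/* For example, in basic-block vectorization an SLP node whose scalar
   accesses are { a[2], a[3] } forms a contiguous subchain of the
   interleaving group starting at a[0], so its recorded load permutation
   { 2, 3 } can simply be dropped and the node vectorized as an ordinary
   vector load at the appropriate offset.  */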
7121 /* Print the partition graph and layout information to the dump file. */
7124 vect_optimize_slp_pass::dump ()
7126 dump_printf_loc (MSG_NOTE
, vect_location
,
7127 "SLP optimize permutations:\n");
7128 for (unsigned int layout_i
= 1; layout_i
< m_perms
.length (); ++layout_i
)
7130 dump_printf_loc (MSG_NOTE
, vect_location
, " %d: { ", layout_i
);
7131 const char *sep
= "";
7132 for (unsigned int idx
: m_perms
[layout_i
])
7134 dump_printf (MSG_NOTE
, "%s%d", sep
, idx
);
7137 dump_printf (MSG_NOTE
, " }\n");
7139 dump_printf_loc (MSG_NOTE
, vect_location
,
7140 "SLP optimize partitions:\n");
7141 for (unsigned int partition_i
= 0; partition_i
< m_partitions
.length ();
7144 auto &partition
= m_partitions
[partition_i
];
7145 dump_printf_loc (MSG_NOTE
, vect_location
, " -------------\n");
7146 dump_printf_loc (MSG_NOTE
, vect_location
,
7147 " partition %d (layout %d):\n",
7148 partition_i
, partition
.layout
);
7149 dump_printf_loc (MSG_NOTE
, vect_location
, " nodes:\n");
7150 for (unsigned int order_i
= partition
.node_begin
;
7151 order_i
< partition
.node_end
; ++order_i
)
7153 auto &vertex
= m_vertices
[m_partitioned_nodes
[order_i
]];
7154 dump_printf_loc (MSG_NOTE
, vect_location
, " - %p:\n",
7155 (void *) vertex
.node
);
7156 dump_printf_loc (MSG_NOTE
, vect_location
,
7158 vertex
.weight
.to_double ());
7159 if (vertex
.out_degree
)
7160 dump_printf_loc (MSG_NOTE
, vect_location
,
7161 " out weight: %f (degree %d)\n",
7162 vertex
.out_weight
.to_double (),
7164 if (SLP_TREE_CODE (vertex
.node
) == VEC_PERM_EXPR
)
7165 dump_printf_loc (MSG_NOTE
, vect_location
,
7166 " op: VEC_PERM_EXPR\n");
7167 else if (auto rep
= SLP_TREE_REPRESENTATIVE (vertex
.node
))
7168 dump_printf_loc (MSG_NOTE
, vect_location
,
7169 " op template: %G", rep
->stmt
);
7171 dump_printf_loc (MSG_NOTE
, vect_location
, " edges:\n");
7172 for (unsigned int order_i
= partition
.node_begin
;
7173 order_i
< partition
.node_end
; ++order_i
)
7175 unsigned int node_i
= m_partitioned_nodes
[order_i
];
7176 auto &vertex
= m_vertices
[node_i
];
7177 auto print_edge
= [&](graph_edge
*, unsigned int other_node_i
)
7179 auto &other_vertex
= m_vertices
[other_node_i
];
7180 if (other_vertex
.partition
< vertex
.partition
)
7181 dump_printf_loc (MSG_NOTE
, vect_location
,
7182 " - %p [%d] --> %p\n",
7183 (void *) other_vertex
.node
,
7184 other_vertex
.partition
,
7185 (void *) vertex
.node
);
7187 dump_printf_loc (MSG_NOTE
, vect_location
,
7188 " - %p --> [%d] %p\n",
7189 (void *) vertex
.node
,
7190 other_vertex
.partition
,
7191 (void *) other_vertex
.node
);
7193 for_each_partition_edge (node_i
, print_edge
);
7196 for (unsigned int layout_i
= 0; layout_i
< m_perms
.length (); ++layout_i
)
7198 auto &layout_costs
= partition_layout_costs (partition_i
, layout_i
);
7199 if (layout_costs
.is_possible ())
7201 dump_printf_loc (MSG_NOTE
, vect_location
,
7202 " layout %d:%s\n", layout_i
,
7203 partition
.layout
== int (layout_i
)
7205 slpg_layout_cost combined_cost
= layout_costs
.in_cost
;
7206 combined_cost
.add_serial_cost (layout_costs
.internal_cost
);
7207 combined_cost
.add_serial_cost (layout_costs
.out_cost
);
7208 #define TEMPLATE "{depth: %f, total: %f}"
7209 dump_printf_loc (MSG_NOTE
, vect_location
,
7211 layout_costs
.in_cost
.depth
.to_double (),
7212 layout_costs
.in_cost
.total
.to_double ());
7213 dump_printf_loc (MSG_NOTE
, vect_location
,
7214 " + " TEMPLATE
"\n",
7215 layout_costs
.internal_cost
.depth
.to_double (),
7216 layout_costs
.internal_cost
.total
.to_double ());
7217 dump_printf_loc (MSG_NOTE
, vect_location
,
7218 " + " TEMPLATE
"\n",
7219 layout_costs
.out_cost
.depth
.to_double (),
7220 layout_costs
.out_cost
.total
.to_double ());
7221 dump_printf_loc (MSG_NOTE
, vect_location
,
7222 " = " TEMPLATE
"\n",
7223 combined_cost
.depth
.to_double (),
7224 combined_cost
.total
.to_double ());
7228 dump_printf_loc (MSG_NOTE
, vect_location
,
7229 " layout %d: rejected\n", layout_i
);
/* Masked load lanes discovery.  */

void
vect_optimize_slp_pass::decide_masked_load_lanes ()
{
  for (auto v : m_vertices)
    {
      slp_tree node = v.node;
      if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
          || SLP_TREE_CODE (node) == VEC_PERM_EXPR)
        continue;
      stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);
      if (! STMT_VINFO_GROUPED_ACCESS (stmt_info)
          /* The mask has to be uniform.  */
          || STMT_VINFO_SLP_VECT_ONLY (stmt_info)
          || ! is_a<gcall *> (STMT_VINFO_STMT (stmt_info))
          || ! gimple_call_internal_p (STMT_VINFO_STMT (stmt_info),
                                       IFN_MASK_LOAD))
        continue;
      stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
      if (STMT_VINFO_STRIDED_P (stmt_info)
          || compare_step_with_zero (m_vinfo, stmt_info) <= 0
          || vect_load_lanes_supported (SLP_TREE_VECTYPE (node),
                                        DR_GROUP_SIZE (stmt_info),
                                        true) == IFN_LAST)
        continue;

      /* Uniform masks need to be suitably represented.  */
      slp_tree mask = SLP_TREE_CHILDREN (node)[0];
      if (SLP_TREE_CODE (mask) != VEC_PERM_EXPR
          || SLP_TREE_CHILDREN (mask).length () != 1)
        continue;
      bool match = true;
      for (auto perm : SLP_TREE_LANE_PERMUTATION (mask))
        if (perm.first != 0 || perm.second != 0)
          {
            match = false;
            break;
          }
      if (!match)
        continue;

      /* Now see if the consumer side matches.  */
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
           pred; pred = pred->pred_next)
        {
          slp_tree pred_node = m_vertices[pred->src].node;
          /* All consumers should be a permute with a single outgoing lane.  */
          if (SLP_TREE_CODE (pred_node) != VEC_PERM_EXPR
              || SLP_TREE_LANES (pred_node) != 1)
            {
              match = false;
              break;
            }
          gcc_assert (SLP_TREE_CHILDREN (pred_node).length () == 1);
        }
      if (!match)
        continue;
      /* Now we can mark the nodes as to use load lanes.  */
      node->ldst_lanes = true;
      for (graph_edge *pred = m_slpg->vertices[node->vertex].pred;
           pred; pred = pred->pred_next)
        m_vertices[pred->src].node->ldst_lanes = true;
      /* The catch is we have to massage the mask.  We have arranged
         analyzed uniform masks to be represented by a splat VEC_PERM
         which we can now simply elide as we cannot easily re-do SLP
         discovery here.  */
      slp_tree new_mask = SLP_TREE_CHILDREN (mask)[0];
      SLP_TREE_REF_COUNT (new_mask)++;
      SLP_TREE_CHILDREN (node)[0] = new_mask;
      vect_free_slp_tree (mask);
    }
}
/* Main entry point for the SLP graph optimization pass.  */

void
vect_optimize_slp_pass::run ()
{
  build_graph ();
  create_partitions ();
  start_choosing_layouts ();
  if (m_perms.length () > 1)
    {
      forward_pass ();
      backward_pass ();
      if (dump_enabled_p ())
        dump ();
      materialize ();
      while (!m_perms.is_empty ())
        m_perms.pop ().release ();
    }
  else
    remove_redundant_permutations ();
  free_graph (m_slpg);
  build_graph ();
  decide_masked_load_lanes ();
  free_graph (m_slpg);
}
/* Apply CSE to NODE and its children using BST_MAP.  */

static void
vect_cse_slp_nodes (scalar_stmts_to_slp_tree_map_t *bst_map, slp_tree &node)
{
  bool put_p = false;
  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def
      /* Besides some VEC_PERM_EXPR, two-operator nodes also
         lack scalar stmts and thus CSE doesn't work via bst_map.  Ideally
         we'd have sth that works for all internal and external nodes.  */
      && !SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      slp_tree *leader = bst_map->get (SLP_TREE_SCALAR_STMTS (node));
      if (leader)
        {
          /* We've visited this node already.  */
          if (!*leader || *leader == node)
            return;

          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "re-using SLP tree %p for %p\n",
                             (void *)*leader, (void *)node);
          vect_free_slp_tree (node);
          (*leader)->refcnt += 1;
          node = *leader;
          return;
        }

      /* Avoid creating a cycle by populating the map only after recursion.  */
      bst_map->put (SLP_TREE_SCALAR_STMTS (node).copy (), nullptr);
      put_p = true;
    }

  for (slp_tree &child : SLP_TREE_CHILDREN (node))
    if (child)
      vect_cse_slp_nodes (bst_map, child);

  /* Now record the node for CSE in other siblings.  */
  if (put_p)
    *bst_map->get (SLP_TREE_SCALAR_STMTS (node)) = node;
}
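/* For example, if permute optimization leaves two sibling subtrees whose
   SLP_TREE_SCALAR_STMTS are identical, the walk above keeps the first one
   as the leader, releases the second and redirects its parent's child
   pointer to the leader, bumping the leader's reference count.  */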
/* Optimize the SLP graph of VINFO.  */

void
vect_optimize_slp (vec_info *vinfo)
{
  if (vinfo->slp_instances.is_empty ())
    return;
  vect_optimize_slp_pass (vinfo).run ();

  /* Apply CSE again to nodes after permute optimization.  */
  scalar_stmts_to_slp_tree_map_t *bst_map
    = new scalar_stmts_to_slp_tree_map_t ();

  for (auto inst : vinfo->slp_instances)
    vect_cse_slp_nodes (bst_map, SLP_INSTANCE_TREE (inst));

  release_scalar_stmts_to_slp_tree_map (bst_map);
}
/* Gather loads reachable from the individual SLP graph entries.  */

void
vect_gather_slp_loads (vec_info *vinfo)
{
  unsigned i;
  slp_instance instance;
  FOR_EACH_VEC_ELT (vinfo->slp_instances, i, instance)
    {
      hash_set<slp_tree> visited;
      vect_gather_slp_loads (SLP_INSTANCE_LOADS (instance),
                             SLP_INSTANCE_TREE (instance), visited);
    }
}
/* For NODE update VF based on the number of lanes and the vector types
   used.  */

static void
vect_update_slp_vf_for_node (slp_tree node, poly_uint64 &vf,
                             hash_set<slp_tree> &visited)
{
  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;
  if (visited.add (node))
    return;

  for (slp_tree child : SLP_TREE_CHILDREN (node))
    vect_update_slp_vf_for_node (child, vf, visited);

  /* We do not visit SLP nodes for constants or externals - those neither
     have a vector type set yet (vectorizable_* does this) nor do they
     have max_nunits set.  Instead we rely on internal nodes max_nunit
     to cover constant/external operands.
     Note that when we stop using fixed size vectors externs and constants
     shouldn't influence the (minimum) vectorization factor, instead
     vectorizable_* should honor the vectorization factor when trying to
     assign vector types to constants and externals and cause iteration
     to a higher vectorization factor when required.  */
  poly_uint64 node_vf
    = calculate_unrolling_factor (node->max_nunits, SLP_TREE_LANES (node));
  vf = force_common_multiple (vf, node_vf);

  /* For permute nodes that are fed from externs or constants we have to
     consider their number of lanes as well.  Likewise for store-lanes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR
      || node->ldst_lanes)
    for (slp_tree child : SLP_TREE_CHILDREN (node))
      if (SLP_TREE_DEF_TYPE (child) != vect_internal_def)
        {
          poly_uint64 child_vf
            = calculate_unrolling_factor (node->max_nunits,
                                          SLP_TREE_LANES (child));
          vf = force_common_multiple (vf, child_vf);
        }
}
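/* For example, a node with max_nunits 4 (say V4SI) and 2 lanes contributes
   an unrolling factor of 2, while a 4-lane node of the same type contributes
   1; force_common_multiple of the two gives an overall factor of 2, i.e.
   two scalar iterations are needed to fill each vector of the first node.  */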
/* For each possible SLP instance decide whether to SLP it and calculate overall
   unrolling factor needed to SLP the loop.  Return TRUE if decided to SLP at
   least one instance.  */

bool
vect_make_slp_decision (loop_vec_info loop_vinfo)
{
  unsigned int i;
  poly_uint64 unrolling_factor = 1;
  const vec<slp_instance> &slp_instances
    = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  slp_instance instance;
  int decided_to_slp = 0;

  DUMP_VECT_SCOPE ("vect_make_slp_decision");

  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      /* FORNOW: SLP if you can.  */
      /* All unroll factors have the form:

           GET_MODE_SIZE (vinfo->vector_mode) * X

         for some rational X, so they must have a common multiple.  */
      vect_update_slp_vf_for_node (SLP_INSTANCE_TREE (instance),
                                   unrolling_factor, visited);

      /* Mark all the stmts that belong to INSTANCE as PURE_SLP stmts.  Later we
         call vect_detect_hybrid_slp () to find stmts that need hybrid SLP and
         loop-based vectorization.  Such stmts will be marked as HYBRID.  */
      vect_mark_slp_stmts (loop_vinfo, SLP_INSTANCE_TREE (instance));
      decided_to_slp++;
    }

  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo) = unrolling_factor;

  if (decided_to_slp && dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location,
                       "Decided to SLP %d instances. Unrolling factor ",
                       decided_to_slp);
      dump_dec (MSG_NOTE, unrolling_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  return (decided_to_slp > 0);
}
/* Private data for vect_detect_hybrid_slp.  */
struct vdhs_data
{
  loop_vec_info loop_vinfo;
  vec<stmt_vec_info> *worklist;
};

/* Walker for walk_gimple_op.  */

static tree
vect_detect_hybrid_slp (tree *tp, int *, void *data)
{
  walk_stmt_info *wi = (walk_stmt_info *)data;
  vdhs_data *dat = (vdhs_data *)wi->info;

  if (wi->is_lhs)
    return NULL_TREE;

  stmt_vec_info def_stmt_info = dat->loop_vinfo->lookup_def (*tp);
  if (!def_stmt_info)
    return NULL_TREE;
  def_stmt_info = vect_stmt_to_vectorize (def_stmt_info);
  if (PURE_SLP_STMT (def_stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "marking hybrid: %G",
			 def_stmt_info->stmt);
      STMT_SLP_TYPE (def_stmt_info) = hybrid;
      dat->worklist->safe_push (def_stmt_info);
    }

  return NULL_TREE;
}
/* Look if STMT_INFO is consumed by SLP indirectly and mark it pure_slp
   if so, otherwise push it to WORKLIST.  */

static void
maybe_push_to_hybrid_worklist (vec_info *vinfo,
			       vec<stmt_vec_info> &worklist,
			       stmt_vec_info stmt_info)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Processing hybrid candidate : %G", stmt_info->stmt);
  stmt_vec_info orig_info = vect_orig_stmt (stmt_info);
  imm_use_iterator iter2;
  ssa_op_iter iter1;
  use_operand_p use_p;
  def_operand_p def_p;
  bool any_def = false;
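  /* Look at all uses of all defs of the original stmt.  A use outside of
     the vectorized region or by a statement that is not pure SLP means
     STMT_INFO has to stay loop_vect, so queue it on the worklist.  */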
  FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_info->stmt, iter1, SSA_OP_DEF)
    {
      any_def = true;
      FOR_EACH_IMM_USE_FAST (use_p, iter2, DEF_FROM_PTR (def_p))
	{
	  if (is_gimple_debug (USE_STMT (use_p)))
	    continue;
	  stmt_vec_info use_info = vinfo->lookup_stmt (USE_STMT (use_p));
	  /* An out-of-loop use means this is a loop_vect sink.  */
	  if (!use_info)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect sink: %G", stmt_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	  else if (!STMT_SLP_TYPE (vect_stmt_to_vectorize (use_info)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Found loop_vect use: %G", use_info->stmt);
	      worklist.safe_push (stmt_info);
	      return;
	    }
	}
    }
  /* No def means this is a loop_vect sink.  Gimple conditionals also don't
     have a def but shouldn't be considered sinks.  */
  if (!any_def && STMT_VINFO_DEF_TYPE (stmt_info) != vect_condition_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Found loop_vect sink: %G", stmt_info->stmt);
      worklist.safe_push (stmt_info);
      return;
    }
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Marked SLP consumed stmt pure: %G", stmt_info->stmt);
  STMT_SLP_TYPE (stmt_info) = pure_slp;
}
/* Find stmts that must be both vectorized and SLPed.  */

void
vect_detect_hybrid_slp (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("vect_detect_hybrid_slp");

  /* All stmts participating in SLP are marked pure_slp, all other
     stmts are loop_vect.
     First collect all loop_vect stmts into a worklist.
     SLP patterns cause not all original scalar stmts to appear in
     SLP_TREE_SCALAR_STMTS and thus not all of them are marked pure_slp.
     Rectify this here and do a backward walk over the IL only considering
     stmts as loop_vect when they are used by a loop_vect stmt and otherwise
     mark them as pure_slp.  */
  auto_vec<stmt_vec_info> worklist;
  for (int i = LOOP_VINFO_LOOP (loop_vinfo)->num_nodes - 1; i >= 0; --i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  gphi *phi = gsi.phi ();
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
      for (gimple_stmt_iterator gsi = gsi_last_bb (bb); !gsi_end_p (gsi);
	   gsi_prev (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  if (is_gimple_debug (stmt))
	    continue;
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
	  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
	    {
	      for (gimple_stmt_iterator gsi2
		     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
		   !gsi_end_p (gsi2); gsi_next (&gsi2))
		{
		  stmt_vec_info patt_info
		    = loop_vinfo->lookup_stmt (gsi_stmt (gsi2));
		  if (!STMT_SLP_TYPE (patt_info)
		      && STMT_VINFO_RELEVANT (patt_info))
		    maybe_push_to_hybrid_worklist (loop_vinfo,
						   worklist, patt_info);
		}
	      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
	    }
	  if (!STMT_SLP_TYPE (stmt_info) && STMT_VINFO_RELEVANT (stmt_info))
	    maybe_push_to_hybrid_worklist (loop_vinfo,
					   worklist, stmt_info);
	}
    }

  /* Now we have a worklist of non-SLP stmts, follow use->def chains and
     mark any SLP vectorized stmt as hybrid.
     ???  We're visiting def stmts N times (once for each non-SLP and
     once for each hybrid-SLP use).  */
  walk_stmt_info wi;
  vdhs_data dat;
  dat.worklist = &worklist;
  dat.loop_vinfo = loop_vinfo;
  memset (&wi, 0, sizeof (wi));
  wi.info = (void *)&dat;
  while (!worklist.is_empty ())
    {
      stmt_vec_info stmt_info = worklist.pop ();
      /* Since SSA operands are not set up for pattern stmts we need
	 to use walk_gimple_op.  */
      wi.is_lhs = 0;
      walk_gimple_op (stmt_info->stmt, vect_detect_hybrid_slp, &wi);
      /* For gather/scatter make sure to walk the offset operand, that
	 can be a scaling and conversion away.  */
      gather_scatter_info gs_info;
      if (STMT_VINFO_GATHER_SCATTER_P (stmt_info)
	  && vect_check_gather_scatter (stmt_info, loop_vinfo, &gs_info))
	{
	  int dummy;
	  vect_detect_hybrid_slp (&gs_info.offset, &dummy, &wi);
	}
    }
}
/* Initialize a bb_vec_info struct for the statements in BBS basic blocks.  */

_bb_vec_info::_bb_vec_info (vec<basic_block> _bbs, vec_info_shared *shared)
  : vec_info (vec_info::bb, shared),
    roots (vNULL)
{
  /* The region we are operating on.  bbs[0] is the entry, excluding
     its PHI nodes.  In the future we might want to track an explicit
     entry edge to cover bbs[0] PHI nodes and have a region entry
     insert location.  */
  bbs = _bbs.address ();
  nbbs = _bbs.length ();
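  /* Clear the uids of all statements in the region.  The vectorizer later
     uses positive uids to associate a stmt_vec_info with each statement it
     analyzes, so a zero uid marks a region statement without one yet.  */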
  for (unsigned i = 0; i < nbbs; ++i)
    {
      for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  gimple_set_uid (phi, 0);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, 0);
	  if (is_gimple_debug (stmt))
	    continue;
	}
    }
}
/* Free BB_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the basic block.  */

_bb_vec_info::~_bb_vec_info ()
{
  /* Reset region marker.  */
  for (unsigned i = 0; i < nbbs; ++i)
    {
      for (gphi_iterator si = gsi_start_phis (bbs[i]); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  gimple_set_uid (phi, -1);
	}
      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gimple *stmt = gsi_stmt (gsi);
	  gimple_set_uid (stmt, -1);
	}
    }

  for (unsigned i = 0; i < roots.length (); ++i)
    {
      roots[i].stmts.release ();
      roots[i].roots.release ();
      roots[i].remain.release ();
    }
  roots.release ();
}
/* Subroutine of vect_slp_analyze_node_operations.  Handle the root of NODE,
   given that child nodes have already been processed, and that
   their def types currently match their SLP node's def type.  */

static bool
vect_slp_analyze_node_operations_1 (vec_info *vinfo, slp_tree node,
				    slp_instance node_instance,
				    stmt_vector_for_cost *cost_vec)
{
  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  /* Calculate the number of vector statements to be created for the scalar
     stmts in this node.  It is the number of scalar elements in one scalar
     iteration (DR_GROUP_SIZE) multiplied by VF divided by the number of
     elements in a vector.  For single-defuse-cycle, lane-reducing op, and
     PHI statement that starts reduction comprised of only lane-reducing ops,
     the number is more than effective vector statements actually required.  */
  SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vect_get_num_copies (vinfo, node);

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      if (!vectorizable_slp_permutation (vinfo, NULL, node, cost_vec))
	return false;

      stmt_vec_info slp_stmt_info;
      unsigned i;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	{
	  if (slp_stmt_info
	      && STMT_VINFO_LIVE_P (slp_stmt_info)
	      && !vectorizable_live_operation (vinfo, slp_stmt_info, node,
					       node_instance, i,
					       false, cost_vec))
	    return false;
	}
      return true;
    }

  bool dummy;
  return vect_analyze_stmt (vinfo, stmt_info, &dummy,
			    node, node_instance, cost_vec);
}
/* Verify if we can externalize a set of internal defs.  */

static bool
vect_slp_can_convert_to_external (const vec<stmt_vec_info> &stmts)
{
  basic_block bb = NULL;
  for (stmt_vec_info stmt : stmts)
    {
      /* Constant generation uses get_later_stmt which can only handle
	 defs from the same BB.  */
      if (!bb)
	bb = gimple_bb (stmt->stmt);
      else if (gimple_bb (stmt->stmt) != bb)
	return false;
    }
  return true;
}
/* Try to build NODE from scalars, returning true on success.
   NODE_INSTANCE is the SLP instance that contains NODE.  */

static bool
vect_slp_convert_to_external (vec_info *vinfo, slp_tree node,
			      slp_instance node_instance)
{
  stmt_vec_info stmt_info;
  unsigned int i;

  if (!is_a<bb_vec_info> (vinfo)
      || node == SLP_INSTANCE_TREE (node_instance)
      || !SLP_TREE_SCALAR_STMTS (node).exists ()
      || vect_contains_pattern_stmt_p (SLP_TREE_SCALAR_STMTS (node))
      /* Force the mask use to be built from scalars instead.  */
      || VECTOR_BOOLEAN_TYPE_P (SLP_TREE_VECTYPE (node))
      || !vect_slp_can_convert_to_external (SLP_TREE_SCALAR_STMTS (node)))
    return false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "Building vector operands of %p from scalars instead\n",
		     (void *) node);

  /* Don't remove and free the child nodes here, since they could be
     referenced by other structures.  The analysis and scheduling phases
     (need to) ignore child nodes of anything that isn't vect_internal_def.  */
  unsigned int group_size = SLP_TREE_LANES (node);
  SLP_TREE_DEF_TYPE (node) = vect_external_def;
  /* Invariants get their vector type from the uses.  */
  SLP_TREE_VECTYPE (node) = NULL_TREE;
  SLP_TREE_SCALAR_OPS (node).safe_grow (group_size, true);
  SLP_TREE_LOAD_PERMUTATION (node).release ();
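  /* Each lane's scalar operand is simply the SSA def computed by the
     corresponding scalar stmt; code generation will build the vector from
     these like for any other external def.  */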
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      tree lhs = gimple_get_lhs (vect_orig_stmt (stmt_info)->stmt);
      SLP_TREE_SCALAR_OPS (node)[i] = lhs;
    }
  return true;
}
/* Return true if all elements of the slice are the same.  */

bool
vect_scalar_ops_slice::all_same_p () const
{
  for (unsigned int i = 1; i < length; ++i)
    if (!operand_equal_p (op (0), op (i)))
      return false;
  return true;
}

hashval_t
vect_scalar_ops_slice_hash::hash (const value_type &s)
{
  hashval_t hash = 0;
  for (unsigned i = 0; i < s.length; ++i)
    hash = iterative_hash_expr (s.op (i), hash);
  return hash;
}

bool
vect_scalar_ops_slice_hash::equal (const value_type &s1,
				   const compare_type &s2)
{
  if (s1.length != s2.length)
    return false;
  for (unsigned i = 0; i < s1.length; ++i)
    if (!operand_equal_p (s1.op (i), s2.op (i)))
      return false;
  return true;
}
/* Compute the prologue cost for invariant or constant operands represented
   by NODE.  */

static void
vect_prologue_cost_for_slp (slp_tree node,
			    stmt_vector_for_cost *cost_vec)
{
  /* There's a special case of an existing vector, that costs nothing.  */
  if (SLP_TREE_SCALAR_OPS (node).length () == 0
      && !SLP_TREE_VEC_DEFS (node).is_empty ())
    return;
  /* Without looking at the actual initializer a vector of
     constants can be implemented as load from the constant pool.
     When all elements are the same we can use a splat.  */
  tree vectype = SLP_TREE_VECTYPE (node);
  unsigned group_size = SLP_TREE_SCALAR_OPS (node).length ();
  unsigned HOST_WIDE_INT const_nunits;
  unsigned nelt_limit;
  auto ops = &SLP_TREE_SCALAR_OPS (node);
  auto_vec<unsigned int> starts (SLP_TREE_NUMBER_OF_VEC_STMTS (node));
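  /* When the number of vector elements is constant and a vector is not
     composed of a whole number of groups, the individual vectors of the
     node can differ; hash each per-vector slice of scalar ops and only
     cost the distinct ones.  */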
  if (TYPE_VECTOR_SUBPARTS (vectype).is_constant (&const_nunits)
      && ! multiple_p (const_nunits, group_size))
    {
      nelt_limit = const_nunits;
      hash_set<vect_scalar_ops_slice_hash> vector_ops;
      for (unsigned int i = 0; i < SLP_TREE_NUMBER_OF_VEC_STMTS (node); ++i)
	if (!vector_ops.add ({ ops, i * nelt_limit, nelt_limit }))
	  starts.quick_push (i * nelt_limit);
    }
  else
    {
      /* If either the vector has variable length or the vectors
	 are composed of repeated whole groups we only need to
	 cost construction once.  All vectors will be the same.  */
      nelt_limit = group_size;
      starts.quick_push (0);
    }
  /* ???  We're just tracking whether vectors in a single node are the same.
     Ideally we'd do something more global.  */
  bool passed = false;
  for (unsigned int start : starts)
    {
      vect_cost_for_stmt kind;
      if (SLP_TREE_DEF_TYPE (node) == vect_constant_def)
	kind = vector_load;
      else if (vect_scalar_ops_slice { ops, start, nelt_limit }.all_same_p ())
	kind = scalar_to_vec;
      else
	kind = vec_construct;
      /* The target cost hook has no idea which part of the SLP node
	 we are costing so avoid passing it down more than once.  Pass
	 it to the first vec_construct or scalar_to_vec part since for those
	 the x86 backend tries to account for GPR to XMM register moves.  */
      record_stmt_cost (cost_vec, 1, kind,
			(kind != vector_load && !passed) ? node : nullptr,
			vectype, 0, vect_prologue);
      if (kind != vector_load)
	passed = true;
    }
}
/* Analyze statements contained in SLP tree NODE after recursively analyzing
   the subtree.  NODE_INSTANCE contains NODE and VINFO contains INSTANCE.

   Return true if the operations are supported.  */

static bool
vect_slp_analyze_node_operations (vec_info *vinfo, slp_tree node,
				  slp_instance node_instance,
				  hash_set<slp_tree> &visited_set,
				  vec<slp_tree> &visited_vec,
				  stmt_vector_for_cost *cost_vec)
{
  int i, j;
  slp_tree child;

  /* Assume we can code-generate all invariants.  */
  if (!node
      || SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    return true;

  if (SLP_TREE_DEF_TYPE (node) == vect_uninitialized_def)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Failed cyclic SLP reference in %p\n", (void *) node);
      return false;
    }
  gcc_assert (SLP_TREE_DEF_TYPE (node) == vect_internal_def);

  /* If we already analyzed the exact same set of scalar stmts we're done.
     We share the generated vector stmts for those.  */
  if (visited_set.add (node))
    return true;
  visited_vec.safe_push (node);

  bool res = true;
  unsigned visited_rec_start = visited_vec.length ();
  unsigned cost_vec_rec_start = cost_vec->length ();
  bool seen_non_constant_child = false;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      res = vect_slp_analyze_node_operations (vinfo, child, node_instance,
					      visited_set, visited_vec,
					      cost_vec);
      if (!res)
	break;
      if (child && SLP_TREE_DEF_TYPE (child) != vect_constant_def)
	seen_non_constant_child = true;
    }
  /* We're having difficulties scheduling nodes with just constant
     operands and no scalar stmts since we then cannot compute a stmt
     insertion place.  */
  if (res
      && !seen_non_constant_child
      && SLP_TREE_SCALAR_STMTS (node).is_empty ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Cannot vectorize all-constant op node %p\n",
			 (void *) node);
      res = false;
    }

  if (res)
    res = vect_slp_analyze_node_operations_1 (vinfo, node, node_instance,
					      cost_vec);
  /* If analysis failed we have to pop all recursive visited nodes
     plus ourselves.  */
  if (!res)
    {
      while (visited_vec.length () >= visited_rec_start)
	visited_set.remove (visited_vec.pop ());
      cost_vec->truncate (cost_vec_rec_start);
    }

  /* When the node can be vectorized cost invariant nodes it references.
     This is not done in DFS order to allow the refering node
     vectorizable_* calls to nail down the invariant nodes vector type
     and possibly unshare it if it needs a different vector type than
     other referrers.  */
  if (res)
    FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), j, child)
      if (child
	  && (SLP_TREE_DEF_TYPE (child) == vect_constant_def
	      || SLP_TREE_DEF_TYPE (child) == vect_external_def)
	  /* Perform usual caching, note code-generation still
	     code-gens these nodes multiple times but we expect
	     to CSE them later.  */
	  && !visited_set.add (child))
	{
	  visited_vec.safe_push (child);
	  /* ???  After auditing more code paths make a "default"
	     and push the vector type from NODE to all children
	     if it is not already set.  */
	  /* Compute the number of vectors to be generated.  */
	  tree vector_type = SLP_TREE_VECTYPE (child);
	  if (!vector_type)
	    {
	      /* Masked loads can have an undefined (default SSA definition)
		 else operand.  We do not need to cost it.  */
	      vec<tree> ops = SLP_TREE_SCALAR_OPS (child);
	      if ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
		   == load_vec_info_type)
		  && ((ops.length ()
		       && TREE_CODE (ops[0]) == SSA_NAME
		       && SSA_NAME_IS_DEFAULT_DEF (ops[0])
		       && VAR_P (SSA_NAME_VAR (ops[0])))
		      || SLP_TREE_DEF_TYPE (child) == vect_constant_def))
		continue;

	      /* For shifts with a scalar argument we don't need
		 to cost or code-generate anything.
		 ???  Represent this more explicitely.  */
	      gcc_assert ((STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (node))
			   == shift_vec_info_type)
			  && j == 1);
	      continue;
	    }

	  SLP_TREE_NUMBER_OF_VEC_STMTS (child)
	    = vect_get_num_copies (vinfo, child);
	  /* And cost them.  */
	  vect_prologue_cost_for_slp (child, cost_vec);
	}

  /* If this node or any of its children can't be vectorized, try pruning
     the tree here rather than felling the whole thing.  */
  if (!res && vect_slp_convert_to_external (vinfo, node, node_instance))
    {
      /* We'll need to revisit this for invariant costing and number
	 of vectorized stmt setting.  */
      res = true;
    }

  return res;
}
/* Given a definition DEF, analyze if it will have any live scalar use after
   performing SLP vectorization whose information is represented by BB_VINFO,
   and record result into hash map SCALAR_USE_MAP as cache for later fast
   check.  If recursion DEPTH exceeds a limit, stop analysis and make a
   conservative assumption.  Return 0 if no scalar use, 1 if there is, -1
   means recursion is limited.  */

static int
vec_slp_has_scalar_use (bb_vec_info bb_vinfo, tree def,
			hash_map<tree, int> &scalar_use_map,
			unsigned depth = 0)
{
  const int depth_limit = 2;
  imm_use_iterator use_iter;
  gimple *use_stmt;

  if (int *res = scalar_use_map.get (def))
    return *res;

  int scalar_use = 1;

  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, def)
    {
      if (is_gimple_debug (use_stmt))
	continue;

      stmt_vec_info use_stmt_info = bb_vinfo->lookup_stmt (use_stmt);

      if (!use_stmt_info)
	break;

      if (PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
	continue;

      /* Do not step forward when encounter PHI statement, since it may
	 involve cyclic reference and cause infinite recursive invocation.  */
      if (gimple_code (use_stmt) == GIMPLE_PHI)
	break;

      /* When pattern recognition is involved, a statement whose definition is
	 consumed in some pattern, may not be included in the final replacement
	 pattern statements, so would be skipped when building SLP graph.

	 * Original
	  char a_c = *(char *) a;
	  char b_c = *(char *) b;
	  unsigned short a_s = (unsigned short) a_c;
	  int a_i = (int) a_s;
	  int b_i = (int) b_c;
	  int r_i = a_i - b_i;

	 * After pattern replacement
	  a_s = (unsigned short) a_c;

	  patt_b_s = (unsigned short) b_c;    // b_i = (int) b_c
	  patt_b_i = (int) patt_b_s;          // b_i = (int) b_c

	  patt_r_s = widen_minus (a_c, b_c);  // r_i = a_i - b_i
	  patt_r_i = (int) patt_r_s;          // r_i = a_i - b_i

	 The definitions of a_i (original statement) and b_i (pattern
	 statement) are related to, but actually not part of the widen_minus
	 pattern.  Vectorizing the pattern does not cause these definition
	 statements to be marked as PURE_SLP.  For this case, we need to
	 recursively check whether their uses are all absorbed into
	 vectorized code.  But there is an exception that some use may
	 participate in a vectorized operation via an external SLP node
	 containing that use as an element.  The parameter "scalar_use_map"
	 tags such kind of SSA as having scalar use in advance.  */
      tree lhs = gimple_get_lhs (use_stmt);

      if (!lhs || TREE_CODE (lhs) != SSA_NAME)
	break;

      if (depth_limit && depth >= depth_limit)
	return -1;

      if ((scalar_use = vec_slp_has_scalar_use (bb_vinfo, lhs, scalar_use_map,
						depth + 1)))
	break;
    }

  if (end_imm_use_stmt_p (&use_iter))
    scalar_use = 0;

  /* If recursion is limited, do not cache result for non-root defs.  */
  if (!depth || scalar_use >= 0)
    {
      bool added = scalar_use_map.put (def, scalar_use);
      gcc_assert (!added);
    }

  return scalar_use;
}
/* Mark lanes of NODE that are live outside of the basic-block vectorized
   region and that can be vectorized using vectorizable_live_operation
   with STMT_VINFO_LIVE_P.  Not handled live operations will cause the
   scalar code computing it to be retained.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo, slp_tree node,
			     slp_instance instance,
			     stmt_vector_for_cost *cost_vec,
			     hash_map<tree, int> &scalar_use_map,
			     hash_set<stmt_vec_info> &svisited,
			     hash_set<slp_tree> &visited)
{
  if (visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt_info;
  stmt_vec_info last_stmt = vect_find_last_scalar_stmt_in_slp (node);
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info || svisited.contains (stmt_info))
	continue;
      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      if (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)
	  && STMT_VINFO_RELATED_STMT (orig_stmt_info) != stmt_info)
	/* Only the pattern root stmt computes the original scalar value.  */
	continue;
      bool mark_visited = true;
      gimple *orig_stmt = orig_stmt_info->stmt;
      ssa_op_iter op_iter;
      def_operand_p def_p;
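      /* A lane is live if one of the defs of its scalar stmt still has a
	 scalar use that is not covered by the SLP graph; such lanes need a
	 lane extract emitted by vectorizable_live_operation.  */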
      FOR_EACH_PHI_OR_STMT_DEF (def_p, orig_stmt, op_iter, SSA_OP_DEF)
	{
	  if (vec_slp_has_scalar_use (bb_vinfo, DEF_FROM_PTR (def_p),
				      scalar_use_map))
	    {
	      STMT_VINFO_LIVE_P (stmt_info) = true;
	      if (vectorizable_live_operation (bb_vinfo, stmt_info, node,
					       instance, i, false, cost_vec))
		/* ???  So we know we can vectorize the live stmt from one SLP
		   node.  If we cannot do so from all or none consistently
		   we'd have to record which SLP node (and lane) we want to
		   use for the live operation.  So make sure we can
		   code-generate from all nodes.  */
		mark_visited = false;
	      else
		STMT_VINFO_LIVE_P (stmt_info) = false;

	      /* We have to verify whether we can insert the lane extract
		 before all uses.  The following is a conservative
		 approximation.  We cannot put this into
		 vectorizable_live_operation because iterating over all use
		 stmts from inside a FOR_EACH_IMM_USE_STMT doesn't work.
		 Note that while the fact that we emit code for loads at the
		 first load should make this a non-problem, leafs we construct
		 from scalars are vectorized after the last scalar def.
		 ???  If we'd actually compute the insert location during
		 analysis we could use sth less conservative than the last
		 scalar stmt in the node for the dominance check.  */
	      /* ???  What remains is "live" uses in vector CTORs in the same
		 SLP graph which is where those uses can end up code-generated
		 right after their definition instead of close to their
		 original use.  But that would restrict us to code-generate
		 lane-extracts from the latest stmt in a node.  So we
		 compensate for this during code-generation, simply not
		 replacing uses for those hopefully rare cases.  */
	      imm_use_iterator use_iter;
	      gimple *use_stmt;
	      stmt_vec_info use_stmt_info;

	      if (STMT_VINFO_LIVE_P (stmt_info))
		FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def_p))
		  if (!is_gimple_debug (use_stmt)
		      && (!(use_stmt_info = bb_vinfo->lookup_stmt (use_stmt))
			  || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info)))
		      && !vect_stmt_dominates_stmt_p (last_stmt->stmt, use_stmt))
		    {
		      if (dump_enabled_p ())
			dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
					 "Cannot determine insertion place for "
					 "lane extract\n");
		      STMT_VINFO_LIVE_P (stmt_info) = false;
		      mark_visited = true;
		    }
	    }
	}
      if (mark_visited)
	svisited.add (stmt_info);
    }

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_slp_mark_live_stmts (bb_vinfo, child, instance, cost_vec,
				   scalar_use_map, svisited, visited);
}
/* Traverse all slp instances of BB_VINFO, and mark lanes of every node that
   are live outside of the basic-block vectorized region and that can be
   vectorized using vectorizable_live_operation with STMT_VINFO_LIVE_P.  */

static void
vect_bb_slp_mark_live_stmts (bb_vec_info bb_vinfo)
{
  if (bb_vinfo->slp_instances.is_empty ())
    return;

  hash_set<stmt_vec_info> svisited;
  hash_set<slp_tree> visited;
  hash_map<tree, int> scalar_use_map;
  auto_vec<slp_tree> worklist;

  for (slp_instance instance : bb_vinfo->slp_instances)
    {
      if (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc)
	for (tree op : SLP_INSTANCE_REMAIN_DEFS (instance))
	  if (TREE_CODE (op) == SSA_NAME)
	    scalar_use_map.put (op, 1);
      if (!visited.add (SLP_INSTANCE_TREE (instance)))
	worklist.safe_push (SLP_INSTANCE_TREE (instance));
    }

  do
    {
      slp_tree node = worklist.pop ();

      if (SLP_TREE_DEF_TYPE (node) == vect_external_def)
	{
	  for (tree op : SLP_TREE_SCALAR_OPS (node))
	    if (TREE_CODE (op) == SSA_NAME)
	      scalar_use_map.put (op, 1);
	}
      else
	{
	  for (slp_tree child : SLP_TREE_CHILDREN (node))
	    if (child && !visited.add (child))
	      worklist.safe_push (child);
	}
    }
  while (!worklist.is_empty ());

  visited.empty ();

  for (slp_instance instance : bb_vinfo->slp_instances)
    {
      vect_location = instance->location ();
      vect_bb_slp_mark_live_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance),
				   instance, &instance->cost_vec,
				   scalar_use_map, svisited, visited);
    }
}
/* Determine whether we can vectorize the reduction epilogue for INSTANCE.  */

static bool
vectorizable_bb_reduc_epilogue (slp_instance instance,
				stmt_vector_for_cost *cost_vec)
{
  gassign *stmt = as_a<gassign *> (instance->root_stmts[0]->stmt);
  enum tree_code reduc_code = gimple_assign_rhs_code (stmt);
  if (reduc_code == MINUS_EXPR)
    reduc_code = PLUS_EXPR;
  internal_fn reduc_fn;
  tree vectype = SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (instance));
  if (!vectype
      || !reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
      || reduc_fn == IFN_LAST
      || !direct_internal_fn_supported_p (reduc_fn, vectype, OPTIMIZE_FOR_BOTH)
      || !useless_type_conversion_p (TREE_TYPE (gimple_assign_lhs (stmt)),
				     TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: basic block reduction epilogue "
			 "operation unsupported.\n");
      return false;
    }

  /* There's no way to cost a horizontal vector reduction via REDUC_FN so
     cost log2 vector operations plus shuffles and one extraction.  */
  unsigned steps = floor_log2 (vect_nunits_for_cost (vectype));
  record_stmt_cost (cost_vec, steps, vector_stmt, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, steps, vec_perm, instance->root_stmts[0],
		    vectype, 0, vect_body);
  record_stmt_cost (cost_vec, 1, vec_to_scalar, instance->root_stmts[0],
		    vectype, 0, vect_body);

  /* Since we replace all stmts of a possibly longer scalar reduction
     chain account for the extra scalar stmts for that.  */
  record_stmt_cost (cost_vec, instance->remain_defs.length (), scalar_stmt,
		    instance->root_stmts[0], 0, vect_body);
  return true;
}
/* Prune from ROOTS all stmts that are computed as part of lanes of NODE
   and recurse to children.  */

static void
vect_slp_prune_covered_roots (slp_tree node, hash_set<stmt_vec_info> &roots,
			      hash_set<slp_tree> &visited)
{
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def
      || visited.add (node))
    return;

  unsigned i;
  stmt_vec_info stmt;
  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt)
    if (stmt)
      roots.remove (vect_orig_stmt (stmt));

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child)
      vect_slp_prune_covered_roots (child, roots, visited);
}
/* Analyze statements in SLP instances of VINFO.  Return true if the
   operations are supported.  */

bool
vect_slp_analyze_operations (vec_info *vinfo)
{
  slp_instance instance;
  int i;

  DUMP_VECT_SCOPE ("vect_slp_analyze_operations");

  hash_set<slp_tree> visited;
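  /* Walk all instances and analyze the operations of each SLP tree.  For
     BB vectorization instances that fail are removed and analysis
     continues; for loop vectorization a single unsupported instance fails
     the whole analysis.  */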
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
    {
      auto_vec<slp_tree> visited_vec;
      stmt_vector_for_cost cost_vec;
      cost_vec.create (2);
      if (is_a<bb_vec_info> (vinfo))
	vect_location = instance->location ();
      if (!vect_slp_analyze_node_operations (vinfo,
					     SLP_INSTANCE_TREE (instance),
					     instance, visited, visited_vec,
					     &cost_vec)
	  /* CTOR instances require vectorized defs for the SLP tree root.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_ctor
	      && (SLP_TREE_DEF_TYPE (SLP_INSTANCE_TREE (instance))
		  != vect_internal_def
		  /* Make sure we vectorized with the expected type.  */
		  || !useless_type_conversion_p
			(TREE_TYPE (TREE_TYPE (gimple_assign_rhs1
					       (instance->root_stmts[0]->stmt))),
			 TREE_TYPE (SLP_TREE_VECTYPE
				      (SLP_INSTANCE_TREE (instance))))))
	  /* Check we can vectorize the reduction.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_bb_reduc
	      && !vectorizable_bb_reduc_epilogue (instance, &cost_vec))
	  /* Check we can vectorize the gcond.  */
	  || (SLP_INSTANCE_KIND (instance) == slp_inst_kind_gcond
	      && !vectorizable_early_exit (vinfo,
					   SLP_INSTANCE_ROOT_STMTS (instance)[0],
					   NULL, NULL,
					   SLP_INSTANCE_TREE (instance),
					   &cost_vec)))
	{
	  cost_vec.release ();
	  slp_tree node = SLP_INSTANCE_TREE (instance);
	  stmt_vec_info stmt_info;
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    stmt_info = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	  else
	    stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
	  if (is_a<loop_vec_info> (vinfo))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "unsupported SLP instance starting from: %G",
				 stmt_info->stmt);
	      return false;
	    }
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "removing SLP instance operations starting from: %G",
			     stmt_info->stmt);
	  vect_free_slp_instance (instance);
	  vinfo->slp_instances.ordered_remove (i);
	  while (!visited_vec.is_empty ())
	    visited.remove (visited_vec.pop ());
	  continue;
	}

      ++i;
      if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
	{
	  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
	  cost_vec.release ();
	}
      else
	/* For BB vectorization remember the SLP graph entry
	   cost for later.  */
	instance->cost_vec = cost_vec;
    }

  /* Now look for SLP instances with a root that are covered by other
     instances and remove them.  */
  hash_set<stmt_vec_info> roots;
  for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
    if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
      roots.add (SLP_INSTANCE_ROOT_STMTS (instance)[0]);
  if (!roots.is_empty ())
    {
      visited.empty ();
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); ++i)
	vect_slp_prune_covered_roots (SLP_INSTANCE_TREE (instance), roots,
				      visited);
      for (i = 0; vinfo->slp_instances.iterate (i, &instance); )
	if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ()
	    && !roots.contains (SLP_INSTANCE_ROOT_STMTS (instance)[0]))
	  {
	    stmt_vec_info root = SLP_INSTANCE_ROOT_STMTS (instance)[0];
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_NOTE, vect_location,
			       "removing SLP instance operations starting "
			       "from: %G", root->stmt);
	    vect_free_slp_instance (instance);
	    vinfo->slp_instances.ordered_remove (i);
	  }
	else
	  ++i;
    }

  /* Compute vectorizable live stmts.  */
  if (bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (vinfo))
    vect_bb_slp_mark_live_stmts (bb_vinfo);

  return !vinfo->slp_instances.is_empty ();
}
/* Get the SLP instance leader from INSTANCE_LEADER thereby transitively
   closing the eventual chain.  */

static slp_instance
get_ultimate_leader (slp_instance instance,
		     hash_map<slp_instance, slp_instance> &instance_leader)
{
  auto_vec<slp_instance *, 8> chain;
  slp_instance *tem;
  while (*(tem = instance_leader.get (instance)) != instance)
    {
      chain.safe_push (tem);
      instance = *tem;
    }
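  /* Path compression: point every entry on the walked chain directly at the
     ultimate leader so subsequent queries resolve in one step.  */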
  while (!chain.is_empty ())
    *chain.pop () = instance;
  return instance;
}
/* Subroutine of vect_bb_partition_graph_r.  Map KEY to INSTANCE in
   KEY_TO_INSTANCE, making INSTANCE the leader of any previous mapping
   for KEY.  Return true if KEY was already in KEY_TO_INSTANCE.

   INSTANCE_LEADER is as for get_ultimate_leader.  */

template<typename T>
static bool
vect_map_to_instance (slp_instance instance, T key,
		      hash_map<T, slp_instance> &key_to_instance,
		      hash_map<slp_instance, slp_instance> &instance_leader)
{
  bool existed_p;
  slp_instance &key_instance = key_to_instance.get_or_insert (key, &existed_p);
  if (!existed_p)
    ;
  else if (key_instance != instance)
    {
      /* If we're running into a previously marked key make us the
	 leader of the current ultimate leader.  This keeps the
	 leader chain acyclic and works even when the current instance
	 connects two previously independent graph parts.  */
      slp_instance key_leader
	= get_ultimate_leader (key_instance, instance_leader);
      if (key_leader != instance)
	instance_leader.put (key_leader, instance);
    }
  key_instance = instance;
  return existed_p;
}
/* Worker of vect_bb_partition_graph, recurse on NODE.  */

static void
vect_bb_partition_graph_r (bb_vec_info bb_vinfo,
			   slp_instance instance, slp_tree node,
			   hash_map<stmt_vec_info, slp_instance> &stmt_to_instance,
			   hash_map<slp_tree, slp_instance> &node_to_instance,
			   hash_map<slp_instance, slp_instance> &instance_leader)
{
  stmt_vec_info stmt_info;
  unsigned i;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    if (stmt_info)
      vect_map_to_instance (instance, stmt_info, stmt_to_instance,
			    instance_leader);

  if (vect_map_to_instance (instance, node, node_to_instance,
			    instance_leader))
    return;

  slp_tree child;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
      vect_bb_partition_graph_r (bb_vinfo, instance, child, stmt_to_instance,
				 node_to_instance, instance_leader);
}
/* Partition the SLP graph into pieces that can be costed independently.  */

static void
vect_bb_partition_graph (bb_vec_info bb_vinfo)
{
  DUMP_VECT_SCOPE ("vect_bb_partition_graph");

  /* First walk the SLP graph assigning each involved scalar stmt a
     corresponding SLP graph entry and upon visiting a previously
     marked stmt, make the stmts leader the current SLP graph entry.  */
  hash_map<stmt_vec_info, slp_instance> stmt_to_instance;
  hash_map<slp_tree, slp_instance> node_to_instance;
  hash_map<slp_instance, slp_instance> instance_leader;
  slp_instance instance;
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      instance_leader.put (instance, instance);
      vect_bb_partition_graph_r (bb_vinfo,
				 instance, SLP_INSTANCE_TREE (instance),
				 stmt_to_instance, node_to_instance,
				 instance_leader);
    }

  /* Then collect entries to each independent subgraph.  */
  for (unsigned i = 0; bb_vinfo->slp_instances.iterate (i, &instance); ++i)
    {
      slp_instance leader = get_ultimate_leader (instance, instance_leader);
      leader->subgraph_entries.safe_push (instance);
      if (dump_enabled_p ()
	  && leader != instance)
	dump_printf_loc (MSG_NOTE, vect_location,
			 "instance %p is leader of %p\n",
			 (void *) leader, (void *) instance);
    }
}
/* Compute the set of scalar stmts participating in internal and external
   nodes.  */

static void
vect_slp_gather_vectorized_scalar_stmts (vec_info *vinfo, slp_tree node,
					 hash_set<slp_tree> &visited,
					 hash_set<stmt_vec_info> &vstmts,
					 hash_set<stmt_vec_info> &estmts)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  if (SLP_TREE_DEF_TYPE (node) == vect_internal_def)
    {
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
	if (stmt_info)
	  vstmts.add (stmt_info);

      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (child)
	  vect_slp_gather_vectorized_scalar_stmts (vinfo, child, visited,
						   vstmts, estmts);
    }
  else
    for (tree def : SLP_TREE_SCALAR_OPS (node))
      {
	stmt_vec_info def_stmt = vinfo->lookup_def (def);
	if (def_stmt)
	  estmts.add (def_stmt);
      }
}
/* Compute the scalar cost of the SLP node NODE and its children
   and return it.  Do not account defs that are marked in LIFE and
   update LIFE according to uses of NODE.  */

static void
vect_bb_slp_scalar_cost (vec_info *vinfo,
			 slp_tree node, vec<bool, va_heap> *life,
			 stmt_vector_for_cost *cost_vec,
			 hash_set<stmt_vec_info> &vectorized_scalar_stmts,
			 hash_set<stmt_vec_info> &scalar_stmts_in_externs,
			 hash_set<slp_tree> &visited)
{
  unsigned i;
  stmt_vec_info stmt_info;
  slp_tree child;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      ssa_op_iter op_iter;
      def_operand_p def_p;

      if (!stmt_info
	  || (*life)[i]
	  /* Defs also used in external nodes are not in the
	     vectorized_scalar_stmts set as they need to be preserved.
	     Honor that.  */
	  || scalar_stmts_in_externs.contains (stmt_info))
	continue;

      stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
      gimple *orig_stmt = orig_stmt_info->stmt;

      /* If there is a non-vectorized use of the defs then the scalar
	 stmt is kept live in which case we do not account it or any
	 required defs in the SLP children in the scalar cost.  This
	 way we make the vectorization more costly when compared to
	 the scalar cost.  */
      if (!STMT_VINFO_LIVE_P (stmt_info))
	{
	  auto_vec<gimple *, 8> worklist;
	  hash_set<gimple *> *worklist_visited = NULL;
	  worklist.quick_push (orig_stmt);
	  do
	    {
	      gimple *work_stmt = worklist.pop ();
	      FOR_EACH_PHI_OR_STMT_DEF (def_p, work_stmt, op_iter, SSA_OP_DEF)
		{
		  imm_use_iterator use_iter;
		  gimple *use_stmt;
		  FOR_EACH_IMM_USE_STMT (use_stmt, use_iter,
					 DEF_FROM_PTR (def_p))
		    if (!is_gimple_debug (use_stmt))
		      {
			stmt_vec_info use_stmt_info
			  = vinfo->lookup_stmt (use_stmt);
			if (!use_stmt_info
			    || !vectorized_scalar_stmts.contains (use_stmt_info))
			  {
			    if (use_stmt_info
				&& STMT_VINFO_IN_PATTERN_P (use_stmt_info))
			      {
				/* For stmts participating in patterns we have
				   to check its uses recursively.  */
				if (!worklist_visited)
				  worklist_visited = new hash_set<gimple *> ();
				if (!worklist_visited->add (use_stmt))
				  worklist.safe_push (use_stmt);
				continue;
			      }
			    (*life)[i] = true;
			    break;
			  }
		      }
		}
	    }
	  while (!worklist.is_empty ());
	  if (worklist_visited)
	    delete worklist_visited;
	  if ((*life)[i])
	    continue;
	}

      /* Count scalar stmts only once.  */
      if (gimple_visited_p (orig_stmt))
	continue;
      gimple_set_visited (orig_stmt, true);

      vect_cost_for_stmt kind;
      if (STMT_VINFO_DATA_REF (orig_stmt_info))
	{
	  data_reference_p dr = STMT_VINFO_DATA_REF (orig_stmt_info);
	  tree base = get_base_address (DR_REF (dr));
	  /* When the scalar access is to a non-global not address-taken
	     decl that is not BLKmode assume we can access it with a single
	     non-load/store instruction.  */
	  if (base && DECL_P (base)
	      && !is_global_var (base)
	      && !TREE_ADDRESSABLE (base)
	      && DECL_MODE (base) != BLKmode)
	    kind = scalar_stmt;
	  else if (DR_IS_READ (STMT_VINFO_DATA_REF (orig_stmt_info)))
	    kind = scalar_load;
	  else
	    kind = scalar_store;
	}
      else if (vect_nop_conversion_p (orig_stmt_info))
	continue;
      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      else if (is_a<gphi *> (orig_stmt_info->stmt)
	       && gimple_phi_num_args
		    (as_a<gphi *> (orig_stmt_info->stmt)) == 1)
	continue;
      else
	kind = scalar_stmt;
      record_stmt_cost (cost_vec, 1, kind, orig_stmt_info,
			SLP_TREE_VECTYPE (node), 0, vect_body);
    }

  auto_vec<bool, 20> subtree_life;
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (child && SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	{
	  /* Do not directly pass LIFE to the recursive call, copy it to
	     confine changes in the callee to the current child/subtree.  */
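	  /* For a VEC_PERM node translate the live lanes of NODE into the
	     lanes of the child they are permuted from; other nodes share
	     the lane numbering with their children.  */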
	  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
	    {
	      subtree_life.safe_grow_cleared (SLP_TREE_LANES (child), true);
	      for (unsigned j = 0;
		   j < SLP_TREE_LANE_PERMUTATION (node).length (); ++j)
		{
		  auto perm = SLP_TREE_LANE_PERMUTATION (node)[j];
		  if (perm.first == i)
		    subtree_life[perm.second] = (*life)[j];
		}
	    }
	  else
	    {
	      gcc_assert (SLP_TREE_LANES (node) == SLP_TREE_LANES (child));
	      subtree_life.safe_splice (*life);
	    }
	  vect_bb_slp_scalar_cost (vinfo, child, &subtree_life, cost_vec,
				   vectorized_scalar_stmts,
				   scalar_stmts_in_externs, visited);
	  subtree_life.truncate (0);
	}
    }
}
/* Comparator for the loop-index sorted cost vectors.  */

static int
li_cost_vec_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, stmt_info_for_cost *> *)a_;
  auto *b = (const std::pair<unsigned, stmt_info_for_cost *> *)b_;
  if (a->first < b->first)
    return -1;
  else if (a->first == b->first)
    return 0;
  return 1;
}
/* Check if vectorization of the basic block is profitable for the
   subgraph denoted by SLP_INSTANCES.  */

static bool
vect_bb_vectorization_profitable_p (bb_vec_info bb_vinfo,
				    vec<slp_instance> slp_instances,
				    loop_p orig_loop)
{
  slp_instance instance;
  int i;
  unsigned int vec_inside_cost = 0, vec_outside_cost = 0, scalar_cost = 0;
  unsigned int vec_prologue_cost = 0, vec_epilogue_cost = 0;

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Costing subgraph: \n");
      hash_set<slp_tree> visited;
      FOR_EACH_VEC_ELT (slp_instances, i, instance)
	vect_print_slp_graph (MSG_NOTE, vect_location,
			      SLP_INSTANCE_TREE (instance), visited);
    }

  /* Compute the set of scalar stmts we know will go away 'locally' when
     vectorizing.  This used to be tracked with just PURE_SLP_STMT but that's
     not accurate for nodes promoted extern late or for scalar stmts that
     are used both in extern defs and in vectorized defs.  */
  hash_set<stmt_vec_info> vectorized_scalar_stmts;
  hash_set<stmt_vec_info> scalar_stmts_in_externs;
  hash_set<slp_tree> visited;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      vect_slp_gather_vectorized_scalar_stmts (bb_vinfo,
					       SLP_INSTANCE_TREE (instance),
					       visited,
					       vectorized_scalar_stmts,
					       scalar_stmts_in_externs);
      for (stmt_vec_info rstmt : SLP_INSTANCE_ROOT_STMTS (instance))
	vectorized_scalar_stmts.add (rstmt);
    }
  /* Scalar stmts used as defs in external nodes need to be preserved, so
     remove them from vectorized_scalar_stmts.  */
  for (stmt_vec_info stmt : scalar_stmts_in_externs)
    vectorized_scalar_stmts.remove (stmt);

  /* Calculate scalar cost and sum the cost for the vector stmts
     previously collected.  */
  stmt_vector_for_cost scalar_costs = vNULL;
  stmt_vector_for_cost vector_costs = vNULL;
  visited.empty ();
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      auto_vec<bool, 20> life;
      life.safe_grow_cleared (SLP_TREE_LANES (SLP_INSTANCE_TREE (instance)),
			      true);
      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	record_stmt_cost (&scalar_costs,
			  SLP_INSTANCE_ROOT_STMTS (instance).length (),
			  scalar_stmt,
			  SLP_INSTANCE_ROOT_STMTS (instance)[0], 0, vect_body);
      vect_bb_slp_scalar_cost (bb_vinfo,
			       SLP_INSTANCE_TREE (instance),
			       &life, &scalar_costs, vectorized_scalar_stmts,
			       scalar_stmts_in_externs, visited);
      vector_costs.safe_splice (instance->cost_vec);
      instance->cost_vec.release ();
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");

  /* When costing non-loop vectorization we need to consider each covered
     loop independently and make sure vectorization is profitable.  For
     now we assume a loop may be not entered or executed an arbitrary
     number of iterations (???  static information can provide more
     precise info here) which means we can simply cost each containing
     loops stmts separately.  */

  /* First produce cost vectors sorted by loop index.  */
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_scalar_costs (scalar_costs.length ());
  auto_vec<std::pair<unsigned, stmt_info_for_cost *> >
    li_vector_costs (vector_costs.length ());
  stmt_info_for_cost *cost;
  FOR_EACH_VEC_ELT (scalar_costs, i, cost)
    {
      unsigned l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_scalar_costs.quick_push (std::make_pair (l, cost));
    }
  /* Use a random used loop as fallback in case the first vector_costs
     entry does not have a stmt_info associated with it.  */
  unsigned l = li_scalar_costs[0].first;
  FOR_EACH_VEC_ELT (vector_costs, i, cost)
    {
      /* We inherit from the previous COST, invariants, externals and
	 extracts immediately follow the cost for the related stmt.  */
      if (cost->stmt_info)
	l = gimple_bb (cost->stmt_info->stmt)->loop_father->num;
      li_vector_costs.quick_push (std::make_pair (l, cost));
    }
  li_scalar_costs.qsort (li_cost_vec_cmp);
  li_vector_costs.qsort (li_cost_vec_cmp);

  /* Now cost the portions individually.  */
  unsigned vi = 0;
  unsigned si = 0;
  bool profitable = true;
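  /* Walk the two loop-index sorted cost vectors in lockstep, finishing the
     target cost model for each loop's portion separately and comparing the
     vector against the scalar cost per portion.  */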
  while (si < li_scalar_costs.length ()
	 && vi < li_vector_costs.length ())
    {
      unsigned sl = li_scalar_costs[si].first;
      unsigned vl = li_vector_costs[vi].first;
      if (sl != vl)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Scalar %d and vector %d loop part do not "
			     "match up, skipping scalar part\n", sl, vl);
	  /* Skip the scalar part, assuming zero cost on the vector side.  */
	  do
	    {
	      si++;
	    }
	  while (si < li_scalar_costs.length ()
		 && li_scalar_costs[si].first == sl);
	  continue;
	}

      class vector_costs *scalar_target_cost_data = init_cost (bb_vinfo, true);
      do
	{
	  add_stmt_cost (scalar_target_cost_data, li_scalar_costs[si].second);
	  si++;
	}
      while (si < li_scalar_costs.length ()
	     && li_scalar_costs[si].first == sl);
      scalar_target_cost_data->finish_cost (nullptr);
      scalar_cost = scalar_target_cost_data->body_cost ();

      /* Complete the target-specific vector cost calculation.  */
      class vector_costs *vect_target_cost_data = init_cost (bb_vinfo, false);
      do
	{
	  add_stmt_cost (vect_target_cost_data, li_vector_costs[vi].second);
	  vi++;
	}
      while (vi < li_vector_costs.length ()
	     && li_vector_costs[vi].first == vl);
      vect_target_cost_data->finish_cost (scalar_target_cost_data);
      vec_prologue_cost = vect_target_cost_data->prologue_cost ();
      vec_inside_cost = vect_target_cost_data->body_cost ();
      vec_epilogue_cost = vect_target_cost_data->epilogue_cost ();
      delete scalar_target_cost_data;
      delete vect_target_cost_data;

      vec_outside_cost = vec_prologue_cost + vec_epilogue_cost;

      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Cost model analysis for part in loop %d:\n", sl);
	  dump_printf (MSG_NOTE, " Vector cost: %d\n",
		       vec_inside_cost + vec_outside_cost);
	  dump_printf (MSG_NOTE, " Scalar cost: %d\n", scalar_cost);
	}

      /* Vectorization is profitable if its cost is less than the cost of the
	 scalar version.  Note that we err on the vector side for equal cost
	 because the cost estimate is otherwise quite pessimistic (constant
	 uses are free on the scalar side but cost a load on the vector side
	 for example).  */
      if (vec_outside_cost + vec_inside_cost > scalar_cost)
	{
	  profitable = false;
	  break;
	}
    }
  if (profitable && vi < li_vector_costs.length ())
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Excess vector cost for part in loop %d:\n",
			 li_vector_costs[vi].first);
      profitable = false;
    }

  /* Unset visited flag.  This is delayed when the subgraph is profitable
     and we process the loop for remaining unvectorized if-converted code.  */
  if (!orig_loop || !profitable)
    FOR_EACH_VEC_ELT (scalar_costs, i, cost)
      gimple_set_visited (cost->stmt_info->stmt, false);

  scalar_costs.release ();
  vector_costs.release ();

  return profitable;
}
/* qsort comparator for lane defs.  */

static int
vld_cmp (const void *a_, const void *b_)
{
  auto *a = (const std::pair<unsigned, tree> *)a_;
  auto *b = (const std::pair<unsigned, tree> *)b_;
  return a->first - b->first;
}
/* Return true if USE_STMT is a vector lane insert into VEC and set
   *THIS_LANE to the lane number that is set.  */

static bool
vect_slp_is_lane_insert (gimple *use_stmt, tree vec, unsigned *this_lane)
{
  gassign *use_ass = dyn_cast<gassign *> (use_stmt);
  if (!use_ass
      || gimple_assign_rhs_code (use_ass) != BIT_INSERT_EXPR
      || (vec
	  ? gimple_assign_rhs1 (use_ass) != vec
	  : ((vec = gimple_assign_rhs1 (use_ass)), false))
      || !useless_type_conversion_p (TREE_TYPE (TREE_TYPE (vec)),
				     TREE_TYPE (gimple_assign_rhs2 (use_ass)))
      || !constant_multiple_p
	    (tree_to_poly_uint64 (gimple_assign_rhs3 (use_ass)),
	     tree_to_poly_uint64 (TYPE_SIZE (TREE_TYPE (TREE_TYPE (vec)))),
	     this_lane))
    return false;
  return true;
}
/* Find any vectorizable constructors and add them to the grouped_store
   array.  */

static void
vect_slp_check_for_roots (bb_vec_info bb_vinfo)
{
  for (unsigned i = 0; i < bb_vinfo->nbbs; ++i)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bb_vinfo->bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gassign *assign = dyn_cast<gassign *> (gsi_stmt (gsi));
	/* This can be used to start SLP discovery for BB early breaks
	   when we get that far.  */
	if (!assign)
	  continue;

	tree rhs = gimple_assign_rhs1 (assign);
	enum tree_code code = gimple_assign_rhs_code (assign);
	use_operand_p use_p;
	gimple *use_stmt;
	if (code == CONSTRUCTOR)
	  {
	    if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		|| maybe_ne (TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)),
			     CONSTRUCTOR_NELTS (rhs))
		|| VECTOR_TYPE_P (TREE_TYPE (CONSTRUCTOR_ELT (rhs, 0)->value))
		|| uniform_vector_p (rhs))
	      continue;

	    unsigned j;
	    tree val;
	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
	      if (TREE_CODE (val) != SSA_NAME
		  || !bb_vinfo->lookup_def (val))
		break;
	    if (j != CONSTRUCTOR_NELTS (rhs))
	      continue;
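	    /* All CTOR elements are in-region SSA defs, so record the CTOR
	       stmt as an SLP root with the element definitions as seed
	       stmts for SLP discovery.  */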
	    vec<stmt_vec_info> roots = vNULL;
	    roots.safe_push (bb_vinfo->lookup_stmt (assign));
	    vec<stmt_vec_info> stmts;
	    stmts.create (CONSTRUCTOR_NELTS (rhs));
	    FOR_EACH_CONSTRUCTOR_VALUE (CONSTRUCTOR_ELTS (rhs), j, val)
	      stmts.quick_push
		(vect_stmt_to_vectorize (bb_vinfo->lookup_def (val)));
	    bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
						 stmts, roots));
	  }
	else if (code == BIT_INSERT_EXPR
		 && VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).is_constant ()
		 && TYPE_VECTOR_SUBPARTS (TREE_TYPE (rhs)).to_constant () > 1
		 && integer_zerop (gimple_assign_rhs3 (assign))
		 && useless_type_conversion_p
		      (TREE_TYPE (TREE_TYPE (rhs)),
		       TREE_TYPE (gimple_assign_rhs2 (assign)))
		 && bb_vinfo->lookup_def (gimple_assign_rhs2 (assign)))
	  {
	    /* We start to match on insert to lane zero but since the
	       inserts need not be ordered we'd have to search both
	       the def and the use chains.  */
	    tree vectype = TREE_TYPE (rhs);
	    unsigned nlanes = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
	    auto_vec<std::pair<unsigned, tree> > lane_defs (nlanes);
	    auto_sbitmap lanes (nlanes);
	    bitmap_clear (lanes);
	    bitmap_set_bit (lanes, 0);
	    tree def = gimple_assign_lhs (assign);
	    lane_defs.quick_push
		      (std::make_pair (0, gimple_assign_rhs2 (assign)));
	    unsigned lanes_found = 1;
	    /* Start with the use chains, the last stmt will be the root.  */
	    stmt_vec_info last = bb_vinfo->lookup_stmt (assign);
	    vec<stmt_vec_info> roots = vNULL;
	    roots.safe_push (last);
	    do
	      {
		use_operand_p use_p;
		gimple *use_stmt;
		if (!single_imm_use (def, &use_p, &use_stmt))
		  break;
		unsigned this_lane;
		if (!bb_vinfo->lookup_stmt (use_stmt)
		    || !vect_slp_is_lane_insert (use_stmt, def, &this_lane)
		    || !bb_vinfo->lookup_def (gimple_assign_rhs2 (use_stmt)))
		  break;
		if (bitmap_bit_p (lanes, this_lane))
		  break;
		lanes_found++;
		bitmap_set_bit (lanes, this_lane);
		gassign *use_ass = as_a<gassign *> (use_stmt);
		lane_defs.quick_push (std::make_pair
				       (this_lane, gimple_assign_rhs2 (use_ass)));
		last = bb_vinfo->lookup_stmt (use_ass);
		roots.safe_push (last);
		def = gimple_assign_lhs (use_ass);
	      }
	    while (lanes_found < nlanes);
	    if (roots.length () > 1)
	      std::swap(roots[0], roots[roots.length () - 1]);
	    if (lanes_found < nlanes)
	      {
		/* Now search the def chain.  */
		def = gimple_assign_rhs1 (assign);
		do
		  {
		    if (TREE_CODE (def) != SSA_NAME
			|| !has_single_use (def))
		      break;
		    gimple *def_stmt = SSA_NAME_DEF_STMT (def);
		    unsigned this_lane;
		    if (!bb_vinfo->lookup_stmt (def_stmt)
			|| !vect_slp_is_lane_insert (def_stmt,
						     NULL_TREE, &this_lane)
			|| !bb_vinfo->lookup_def (gimple_assign_rhs2 (def_stmt)))
		      break;
		    if (bitmap_bit_p (lanes, this_lane))
		      break;
		    lanes_found++;
		    bitmap_set_bit (lanes, this_lane);
		    lane_defs.quick_push (std::make_pair
					   (this_lane,
					    gimple_assign_rhs2 (def_stmt)));
		    roots.safe_push (bb_vinfo->lookup_stmt (def_stmt));
		    def = gimple_assign_rhs1 (def_stmt);
		  }
		while (lanes_found < nlanes);
	      }
	    if (lanes_found == nlanes)
	      {
		/* Sort lane_defs after the lane index and register the root.  */
		lane_defs.qsort (vld_cmp);
		vec<stmt_vec_info> stmts;
		stmts.create (nlanes);
		for (unsigned i = 0; i < nlanes; ++i)
		  stmts.quick_push (bb_vinfo->lookup_def (lane_defs[i].second));
		bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_ctor,
						     stmts, roots));
	      }
	    else
	      roots.release ();
	  }
	else if (!VECTOR_TYPE_P (TREE_TYPE (rhs))
		 && (associative_tree_code (code) || code == MINUS_EXPR)
		 /* ???  This pessimizes a two-element reduction.  PR54400.
		    ???  In-order reduction could be handled if we only
		    traverse one operand chain in vect_slp_linearize_chain.  */
		 && !needs_fold_left_reduction_p (TREE_TYPE (rhs), code)
		 /* Ops with constants at the tail can be stripped here.  */
		 && TREE_CODE (rhs) == SSA_NAME
		 && TREE_CODE (gimple_assign_rhs2 (assign)) == SSA_NAME
		 /* Should be the chain end.  */
		 && (!single_imm_use (gimple_assign_lhs (assign),
				      &use_p, &use_stmt)
		     || !is_gimple_assign (use_stmt)
		     || (gimple_assign_rhs_code (use_stmt) != code
			 && ((code != PLUS_EXPR && code != MINUS_EXPR)
			     || (gimple_assign_rhs_code (use_stmt)
				 != (code == PLUS_EXPR ? MINUS_EXPR : PLUS_EXPR))))))
	  {
	    /* We start the match at the end of a possible association
	       chain.  */
	    auto_vec<chain_op_t> chain;
	    auto_vec<std::pair<tree_code, gimple *> > worklist;
	    auto_vec<gimple *> chain_stmts;
	    gimple *code_stmt = NULL, *alt_code_stmt = NULL;
	    if (code == MINUS_EXPR)
	      code = PLUS_EXPR;
	    internal_fn reduc_fn;
	    if (!reduction_fn_for_scalar_code (code, &reduc_fn)
		|| reduc_fn == IFN_LAST)
	      continue;
	    vect_slp_linearize_chain (bb_vinfo, worklist, chain, code, assign,
				      /* ??? */
				      code_stmt, alt_code_stmt, &chain_stmts);
	    if (chain.length () > 1)
	      {
		/* Sort the chain according to def_type and operation.  */
		chain.sort (dt_sort_cmp, bb_vinfo);
		/* ???  Now we'd want to strip externals and constants
		   but record those to be handled in the epilogue.  */
		/* ???  For now do not allow mixing ops or externs/constants.  */
		bool invalid = false;
		unsigned remain_cnt = 0;
		unsigned last_idx = 0;
		for (unsigned i = 0; i < chain.length (); ++i)
		  {
		    if (chain[i].code != code)
		      {
			invalid = true;
			break;
		      }
		    if (chain[i].dt != vect_internal_def
			/* Avoid stmts where the def is not the LHS, like
			   ASMs.  */
			|| (gimple_get_lhs (bb_vinfo->lookup_def
						      (chain[i].op)->stmt)
			    != chain[i].op))
		      remain_cnt++;
		    else
		      last_idx = i;
		  }
		/* Make sure to have an even number of lanes as we later do
		   all-or-nothing discovery, not trying to split further.  */
		if ((chain.length () - remain_cnt) & 1)
		  remain_cnt++;
		if (!invalid && chain.length () - remain_cnt > 1)
		  {
		    vec<stmt_vec_info> stmts;
		    vec<tree> remain = vNULL;
		    stmts.create (chain.length ());
		    if (remain_cnt > 0)
		      remain.create (remain_cnt);
		    for (unsigned i = 0; i < chain.length (); ++i)
		      {
			stmt_vec_info stmt_info;
			if (chain[i].dt == vect_internal_def
			    && ((stmt_info = bb_vinfo->lookup_def (chain[i].op)),
				gimple_get_lhs (stmt_info->stmt) == chain[i].op)
			    && (i != last_idx
				|| (stmts.length () & 1)))
			  stmts.quick_push (stmt_info);
			else
			  remain.quick_push (chain[i].op);
		      }
		    vec<stmt_vec_info> roots;
		    roots.create (chain_stmts.length ());
		    for (unsigned i = 0; i < chain_stmts.length (); ++i)
		      roots.quick_push (bb_vinfo->lookup_stmt (chain_stmts[i]));
		    bb_vinfo->roots.safe_push (slp_root (slp_inst_kind_bb_reduc,
							 stmts, roots, remain));
		  }
	      }
	  }
      }
}
/* Walk the grouped store chains and replace entries with their
   pattern variant if any.  */

static void
vect_fixup_store_groups_with_patterns (vec_info *vinfo)
{
  stmt_vec_info first_element;
  unsigned i;

  FOR_EACH_VEC_ELT (vinfo->grouped_stores, i, first_element)
    {
      /* We also have CTORs in this array.  */
      if (!STMT_VINFO_GROUPED_ACCESS (first_element))
        continue;
      if (STMT_VINFO_IN_PATTERN_P (first_element))
        {
          stmt_vec_info orig = first_element;
          first_element = STMT_VINFO_RELATED_STMT (first_element);
          DR_GROUP_FIRST_ELEMENT (first_element) = first_element;
          DR_GROUP_SIZE (first_element) = DR_GROUP_SIZE (orig);
          DR_GROUP_GAP (first_element) = DR_GROUP_GAP (orig);
          DR_GROUP_NEXT_ELEMENT (first_element) = DR_GROUP_NEXT_ELEMENT (orig);
          vinfo->grouped_stores[i] = first_element;
        }
      stmt_vec_info prev = first_element;
      while (DR_GROUP_NEXT_ELEMENT (prev))
        {
          stmt_vec_info elt = DR_GROUP_NEXT_ELEMENT (prev);
          if (STMT_VINFO_IN_PATTERN_P (elt))
            {
              stmt_vec_info orig = elt;
              elt = STMT_VINFO_RELATED_STMT (elt);
              DR_GROUP_NEXT_ELEMENT (prev) = elt;
              DR_GROUP_GAP (elt) = DR_GROUP_GAP (orig);
              DR_GROUP_NEXT_ELEMENT (elt) = DR_GROUP_NEXT_ELEMENT (orig);
            }
          DR_GROUP_FIRST_ELEMENT (elt) = first_element;
          prev = elt;
        }
    }
}
/* Check if the region described by BB_VINFO can be vectorized, returning
   true if so.  When returning false, set FATAL to true if the same failure
   would prevent vectorization at other vector sizes, false if it is still
   worth trying other sizes.  N_STMTS is the number of statements in the
   region.  */

static bool
vect_slp_analyze_bb_1 (bb_vec_info bb_vinfo, int n_stmts, bool &fatal,
                       vec<int> *dataref_groups)
{
  DUMP_VECT_SCOPE ("vect_slp_analyze_bb");

  slp_instance instance;
  int i;
  poly_uint64 min_vf = 2;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Analyze the data references.  */

  if (!vect_analyze_data_refs (bb_vinfo, &min_vf, NULL))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data-ref in basic "
                         "block.\n");
      return false;
    }

  if (!vect_analyze_data_ref_accesses (bb_vinfo, dataref_groups))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unhandled data access in "
                         "basic block.\n");
      return false;
    }

  vect_slp_check_for_roots (bb_vinfo);

  /* If there are no grouped stores and no constructors in the region
     there is no need to continue with pattern recog as vect_analyze_slp
     will fail anyway.  */
  if (bb_vinfo->grouped_stores.is_empty ()
      && bb_vinfo->roots.is_empty ())
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: no grouped stores in "
                         "basic block.\n");
      return false;
    }

  /* While the rest of the analysis below depends on it in some way.  */
  fatal = false;

  vect_pattern_recog (bb_vinfo);

  /* Update store groups from pattern processing.  */
  vect_fixup_store_groups_with_patterns (bb_vinfo);

  /* Check the SLP opportunities in the basic block, analyze and build SLP
     trees.  */
  if (!vect_analyze_slp (bb_vinfo, n_stmts, false))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Failed to SLP the basic block.\n");
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "not vectorized: failed to find SLP opportunities "
                           "in basic block.\n");
        }
      return false;
    }

  /* Optimize permutations.  */
  vect_optimize_slp (bb_vinfo);

  /* Gather the loads reachable from the SLP graph entries.  */
  vect_gather_slp_loads (bb_vinfo);

  vect_record_base_alignments (bb_vinfo);

  /* Analyze and verify the alignment of data references and the
     dependence in the SLP instances.  */
  for (i = 0; BB_VINFO_SLP_INSTANCES (bb_vinfo).iterate (i, &instance); )
    {
      vect_location = instance->location ();
      if (! vect_slp_analyze_instance_alignment (bb_vinfo, instance)
          || ! vect_slp_analyze_instance_dependence (bb_vinfo, instance))
        {
          slp_tree node = SLP_INSTANCE_TREE (instance);
          stmt_vec_info stmt_info = SLP_TREE_SCALAR_STMTS (node)[0];
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "removing SLP instance operations starting from: %G",
                             stmt_info->stmt);
          vect_free_slp_instance (instance);
          BB_VINFO_SLP_INSTANCES (bb_vinfo).ordered_remove (i);
          continue;
        }

      /* Mark all the statements that we want to vectorize as pure SLP and
         relevant.  */
      vect_mark_slp_stmts (bb_vinfo, SLP_INSTANCE_TREE (instance));
      vect_mark_slp_stmts_relevant (SLP_INSTANCE_TREE (instance));

      /* Likewise consider instance root stmts as vectorized.  */
      unsigned j;
      stmt_vec_info root;
      FOR_EACH_VEC_ELT (SLP_INSTANCE_ROOT_STMTS (instance), j, root)
        STMT_SLP_TYPE (root) = pure_slp;

      i++;
    }
  if (! BB_VINFO_SLP_INSTANCES (bb_vinfo).length ())
    return false;

  if (!vect_slp_analyze_operations (bb_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: bad operation in basic block.\n");
      return false;
    }

  vect_bb_partition_graph (bb_vinfo);

  return true;
}
/* Subroutine of vect_slp_bb.  Try to vectorize the statements for all
   basic blocks in BBS, returning true on success.
   The region has N_STMTS statements and has the datarefs given by DATAREFS.  */
9485 vect_slp_region (vec
<basic_block
> bbs
, vec
<data_reference_p
> datarefs
,
9486 vec
<int> *dataref_groups
, unsigned int n_stmts
,
9489 bb_vec_info bb_vinfo
;
9490 auto_vector_modes vector_modes
;
9492 /* Autodetect first vector size we try. */
9493 machine_mode next_vector_mode
= VOIDmode
;
9494 targetm
.vectorize
.autovectorize_vector_modes (&vector_modes
, false);
9495 unsigned int mode_i
= 0;
9497 vec_info_shared shared
;
9499 machine_mode autodetected_vector_mode
= VOIDmode
;
9502 bool vectorized
= false;
9504 bb_vinfo
= new _bb_vec_info (bbs
, &shared
);
9506 bool first_time_p
= shared
.datarefs
.is_empty ();
9507 BB_VINFO_DATAREFS (bb_vinfo
) = datarefs
;
9509 bb_vinfo
->shared
->save_datarefs ();
9511 bb_vinfo
->shared
->check_datarefs ();
9512 bb_vinfo
->vector_mode
= next_vector_mode
;
9514 if (vect_slp_analyze_bb_1 (bb_vinfo
, n_stmts
, fatal
, dataref_groups
))
9516 if (dump_enabled_p ())
9518 dump_printf_loc (MSG_NOTE
, vect_location
,
9519 "***** Analysis succeeded with vector mode"
9520 " %s\n", GET_MODE_NAME (bb_vinfo
->vector_mode
));
9521 dump_printf_loc (MSG_NOTE
, vect_location
, "SLPing BB part\n");
9524 bb_vinfo
->shared
->check_datarefs ();
9526 bool force_clear
= false;
9527 auto_vec
<slp_instance
> profitable_subgraphs
;
9528 for (slp_instance instance
: BB_VINFO_SLP_INSTANCES (bb_vinfo
))
9530 if (instance
->subgraph_entries
.is_empty ())
9533 dump_user_location_t saved_vect_location
= vect_location
;
9534 vect_location
= instance
->location ();
9535 if (!unlimited_cost_model (NULL
)
9536 && !vect_bb_vectorization_profitable_p
9537 (bb_vinfo
, instance
->subgraph_entries
, orig_loop
))
9539 if (dump_enabled_p ())
9540 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9541 "not vectorized: vectorization is not "
9543 vect_location
= saved_vect_location
;
9547 vect_location
= saved_vect_location
;
9548 if (!dbg_cnt (vect_slp
))
9554 profitable_subgraphs
.safe_push (instance
);
9557 /* When we're vectorizing an if-converted loop body make sure
9558 we vectorized all if-converted code. */
9559 if ((!profitable_subgraphs
.is_empty () || force_clear
) && orig_loop
)
9561 gcc_assert (bb_vinfo
->nbbs
== 1);
9562 for (gimple_stmt_iterator gsi
= gsi_start_bb (bb_vinfo
->bbs
[0]);
9563 !gsi_end_p (gsi
); gsi_next (&gsi
))
9565 /* The costing above left us with DCEable vectorized scalar
9566 stmts having the visited flag set on profitable
9567 subgraphs. Do the delayed clearing of the flag here. */
9568 if (gimple_visited_p (gsi_stmt (gsi
)))
9570 gimple_set_visited (gsi_stmt (gsi
), false);
9573 if (flag_vect_cost_model
== VECT_COST_MODEL_UNLIMITED
)
9576 if (gassign
*ass
= dyn_cast
<gassign
*> (gsi_stmt (gsi
)))
9577 if (gimple_assign_rhs_code (ass
) == COND_EXPR
)
9579 if (!profitable_subgraphs
.is_empty ()
9580 && dump_enabled_p ())
9581 dump_printf_loc (MSG_NOTE
, vect_location
,
9582 "not profitable because of "
9583 "unprofitable if-converted scalar "
9585 profitable_subgraphs
.truncate (0);
9590 /* Finally schedule the profitable subgraphs. */
9591 for (slp_instance instance
: profitable_subgraphs
)
9593 if (!vectorized
&& dump_enabled_p ())
9594 dump_printf_loc (MSG_NOTE
, vect_location
,
9595 "Basic block will be vectorized "
9599 /* Dump before scheduling as store vectorization will remove
9600 the original stores and mess with the instance tree
9601 so querying its location will eventually ICE. */
9603 for (slp_instance sub
: instance
->subgraph_entries
)
9604 gcc_assert (SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub
)));
9605 unsigned HOST_WIDE_INT bytes
;
9606 if (dump_enabled_p ())
9607 for (slp_instance sub
: instance
->subgraph_entries
)
9609 tree vtype
= SLP_TREE_VECTYPE (SLP_INSTANCE_TREE (sub
));
9610 if (GET_MODE_SIZE (TYPE_MODE (vtype
)).is_constant (&bytes
))
9611 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
,
9613 "basic block part vectorized using %wu "
9614 "byte vectors\n", bytes
);
9616 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS
,
9618 "basic block part vectorized using "
9619 "variable length vectors\n");
9622 dump_user_location_t saved_vect_location
= vect_location
;
9623 vect_location
= instance
->location ();
9625 vect_schedule_slp (bb_vinfo
, instance
->subgraph_entries
);
9627 vect_location
= saved_vect_location
;
9631 /* Generate the invariant statements. */
9632 if (!gimple_seq_empty_p (bb_vinfo
->inv_pattern_def_seq
))
9634 if (dump_enabled_p ())
9635 dump_printf_loc (MSG_NOTE
, vect_location
,
9636 "------>generating invariant statements\n");
9638 bb_vinfo
->insert_seq_on_entry (NULL
,
9639 bb_vinfo
->inv_pattern_def_seq
);
9644 if (dump_enabled_p ())
9645 dump_printf_loc (MSG_NOTE
, vect_location
,
9646 "***** Analysis failed with vector mode %s\n",
9647 GET_MODE_NAME (bb_vinfo
->vector_mode
));
9651 autodetected_vector_mode
= bb_vinfo
->vector_mode
;
9654 while (mode_i
< vector_modes
.length ()
9655 && vect_chooses_same_modes_p (bb_vinfo
, vector_modes
[mode_i
]))
9657 if (dump_enabled_p ())
9658 dump_printf_loc (MSG_NOTE
, vect_location
,
9659 "***** The result for vector mode %s would"
9661 GET_MODE_NAME (vector_modes
[mode_i
]));
9667 if (mode_i
< vector_modes
.length ()
9668 && VECTOR_MODE_P (autodetected_vector_mode
)
9669 && (related_vector_mode (vector_modes
[mode_i
],
9670 GET_MODE_INNER (autodetected_vector_mode
))
9671 == autodetected_vector_mode
)
9672 && (related_vector_mode (autodetected_vector_mode
,
9673 GET_MODE_INNER (vector_modes
[mode_i
]))
9674 == vector_modes
[mode_i
]))
9676 if (dump_enabled_p ())
9677 dump_printf_loc (MSG_NOTE
, vect_location
,
9678 "***** Skipping vector mode %s, which would"
9679 " repeat the analysis for %s\n",
9680 GET_MODE_NAME (vector_modes
[mode_i
]),
9681 GET_MODE_NAME (autodetected_vector_mode
));
9686 || mode_i
== vector_modes
.length ()
9687 || autodetected_vector_mode
== VOIDmode
9688 /* If vect_slp_analyze_bb_1 signaled that analysis for all
9689 vector sizes will fail do not bother iterating. */
9693 /* Try the next biggest vector size. */
9694 next_vector_mode
= vector_modes
[mode_i
++];
9695 if (dump_enabled_p ())
9696 dump_printf_loc (MSG_NOTE
, vect_location
,
9697 "***** Re-trying analysis with vector mode %s\n",
9698 GET_MODE_NAME (next_vector_mode
));
/* Main entry for the BB vectorizer.  Analyze and transform BBS, returns
   true if anything in the basic-block was vectorized.  */

static bool
vect_slp_bbs (const vec<basic_block> &bbs, loop_p orig_loop)
{
  vec<data_reference_p> datarefs = vNULL;
  auto_vec<int> dataref_groups;
  unsigned int insns = 0;
  int current_group = 0;

  for (unsigned i = 0; i < bbs.length (); i++)
    {
      basic_block bb = bbs[i];
      for (gimple_stmt_iterator gsi = gsi_after_labels (bb); !gsi_end_p (gsi);
           gsi_next (&gsi))
        {
          gimple *stmt = gsi_stmt (gsi);
          if (is_gimple_debug (stmt))
            continue;

          insns++;

          if (gimple_location (stmt) != UNKNOWN_LOCATION)
            vect_location = stmt;

          if (!vect_find_stmt_data_reference (NULL, stmt, &datarefs,
                                              &dataref_groups, current_group))
            ++current_group;
        }
      /* New BBs always start a new DR group.  */
      ++current_group;
    }

  return vect_slp_region (bbs, datarefs, &dataref_groups, insns, orig_loop);
}

/* Special entry for the BB vectorizer.  Analyze and transform a single
   if-converted BB with ORIG_LOOPs body being the not if-converted
   representation.  Returns true if anything in the basic-block was
   vectorized.  */

bool
vect_slp_if_converted_bb (basic_block bb, loop_p orig_loop)
{
  auto_vec<basic_block> bbs;
  bbs.safe_push (bb);
  return vect_slp_bbs (bbs, orig_loop);
}
9753 /* Main entry for the BB vectorizer. Analyze and transform BB, returns
9754 true if anything in the basic-block was vectorized. */
9757 vect_slp_function (function
*fun
)
9760 int *rpo
= XNEWVEC (int, n_basic_blocks_for_fn (fun
));
9761 auto_bitmap exit_bbs
;
9762 bitmap_set_bit (exit_bbs
, EXIT_BLOCK
);
9763 edge entry
= single_succ_edge (ENTRY_BLOCK_PTR_FOR_FN (fun
));
9764 unsigned n
= rev_post_order_and_mark_dfs_back_seme (fun
, entry
, exit_bbs
,
  /* For the moment split the function into pieces to avoid making
     the iteration on the vector mode moot.  Split at points we know
     to not handle well which is CFG merges (SLP discovery doesn't
     handle non-loop-header PHIs) and loop exits.  Since pattern
     recog requires reverse iteration to visit uses before defs
     simply chop RPO into pieces.  */
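  /* Illustrative sketch (not GCC code; the predicate and callback names are
     made up): the "chop RPO into pieces" idea amounts to walking blocks in
     reverse post order and starting a fresh region whenever a split
     condition fires, handing each completed piece to the region vectorizer.

         #include <functional>
         #include <vector>

         static void
         chop_rpo (const std::vector<int> &rpo,
                   const std::function<bool (const std::vector<int> &,
                                             int)> &must_split,
                   const std::function<void (const std::vector<int> &)> &vectorize)
         {
           std::vector<int> piece;
           for (int bb : rpo)
             {
               if (!piece.empty () && must_split (piece, bb))
                 {
                   vectorize (piece);   // analyze/transform this region
                   piece.clear ();      // and start a new one at BB
                 }
               piece.push_back (bb);
             }
           if (!piece.empty ())
             vectorize (piece);
         }

     In the real code below the split conditions are the dominance, loop-exit
     and dont-vectorize checks and the callback is vect_slp_bbs.  */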
9773 auto_vec
<basic_block
> bbs
;
9774 for (unsigned i
= 0; i
< n
; i
++)
9776 basic_block bb
= BASIC_BLOCK_FOR_FN (fun
, rpo
[i
]);
9779 /* Split when a BB is not dominated by the first block. */
9780 if (!bbs
.is_empty ()
9781 && !dominated_by_p (CDI_DOMINATORS
, bb
, bbs
[0]))
9783 if (dump_enabled_p ())
9784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9785 "splitting region at dominance boundary bb%d\n",
9789 /* Split when the loop determined by the first block
9790 is exited. This is because we eventually insert
9791 invariants at region begin. */
9792 else if (!bbs
.is_empty ()
9793 && bbs
[0]->loop_father
!= bb
->loop_father
9794 && !flow_loop_nested_p (bbs
[0]->loop_father
, bb
->loop_father
))
9796 if (dump_enabled_p ())
9797 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9798 "splitting region at loop %d exit at bb%d\n",
9799 bbs
[0]->loop_father
->num
, bb
->index
);
9802 else if (!bbs
.is_empty ()
9803 && bb
->loop_father
->header
== bb
9804 && bb
->loop_father
->dont_vectorize
)
9806 if (dump_enabled_p ())
9807 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9808 "splitting region at dont-vectorize loop %d "
9810 bb
->loop_father
->num
, bb
->index
);
9814 if (split
&& !bbs
.is_empty ())
9816 r
|= vect_slp_bbs (bbs
, NULL
);
9820 if (bbs
.is_empty ())
9822 /* We need to be able to insert at the head of the region which
9823 we cannot for region starting with a returns-twice call. */
9824 if (gcall
*first
= safe_dyn_cast
<gcall
*> (first_stmt (bb
)))
9825 if (gimple_call_flags (first
) & ECF_RETURNS_TWICE
)
9827 if (dump_enabled_p ())
9828 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9829 "skipping bb%d as start of region as it "
9830 "starts with returns-twice call\n",
9834 /* If the loop this BB belongs to is marked as not to be vectorized
9835 honor that also for BB vectorization. */
9836 if (bb
->loop_father
->dont_vectorize
)
9842 /* When we have a stmt ending this block and defining a
9843 value we have to insert on edges when inserting after it for
9844 a vector containing its definition. Avoid this for now. */
9845 if (gimple
*last
= *gsi_last_bb (bb
))
9846 if (gimple_get_lhs (last
)
9847 && is_ctrl_altering_stmt (last
))
9849 if (dump_enabled_p ())
9850 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
9851 "splitting region at control altering "
9852 "definition %G", last
);
9853 r
|= vect_slp_bbs (bbs
, NULL
);
9858 if (!bbs
.is_empty ())
9859 r
|= vect_slp_bbs (bbs
, NULL
);
/* Build a variable-length vector in which the elements in ELTS are repeated
   to fill NRESULTS vectors of type VECTOR_TYPE.  Store the vectors in
   RESULTS and add any new instructions to SEQ.

   The approach we use is:

   (1) Find a vector mode VM with integer elements of mode IM.

   (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
       ELTS' has mode IM.  This involves creating NELTS' VIEW_CONVERT_EXPRs
       from small vectors to IM.

   (3) Duplicate each ELTS'[I] into a vector of mode VM.

   (4) Use a tree of interleaving VEC_PERM_EXPRs to create VMs with the
       correct byte contents.

   (5) Use VIEW_CONVERT_EXPR to cast the final VMs to the required type.

   We try to find the largest IM for which this sequence works, in order
   to cut down on the number of interleaves.  */
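/* A minimal standalone sketch (illustrative only, not GCC code) of steps (3)
   and (4) above, using std::vector<int> as a stand-in for one vector
   register: each element is first splat across a whole vector, and
   log2(nvectors) levels of low/high interleaves then recreate the repeating
   element sequence.

     #include <vector>

     typedef std::vector<int> vreg;     // toy stand-in for a vector register

     // Interleave the low (HIGH = false) or high (HIGH = true) halves of A
     // and B: lo gives { a0, b0, a1, b1, ... }.
     static vreg
     interleave (const vreg &a, const vreg &b, bool high)
     {
       vreg r;
       unsigned half = a.size () / 2, base = high ? half : 0;
       for (unsigned i = 0; i < half; ++i)
         {
           r.push_back (a[base + i]);
           r.push_back (b[base + i]);
         }
       return r;
     }

     // ELTS.size () and VL must be powers of two in this toy version.
     static std::vector<vreg>
     duplicate_and_interleave_toy (const std::vector<int> &elts, unsigned vl)
     {
       unsigned nvectors = elts.size ();
       std::vector<vreg> in;
       for (int e : elts)
         in.push_back (vreg (vl, e));           // step (3): splat each element
       unsigned hi_start = nvectors / 2;
       for (unsigned round = 1; round < nvectors; round *= 2)
         {
           std::vector<vreg> out (nvectors);
           for (unsigned i = 0; i < hi_start; ++i)
             {
               out[i * 2] = interleave (in[i], in[i + hi_start], false);
               out[i * 2 + 1] = interleave (in[i], in[i + hi_start], true);
             }
           in = out;                            // step (4): one interleave level
         }
       return in;       // reading the results lane by lane now repeats ELTS
     }

   The real implementation additionally avoids generating the redundant
   vectors this produces; see the comment before step (4) below.  */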
9889 duplicate_and_interleave (vec_info
*vinfo
, gimple_seq
*seq
, tree vector_type
,
9890 const vec
<tree
> &elts
, unsigned int nresults
,
9893 unsigned int nelts
= elts
.length ();
9894 tree element_type
= TREE_TYPE (vector_type
);
9896 /* (1) Find a vector mode VM with integer elements of mode IM. */
9897 unsigned int nvectors
= 1;
9898 tree new_vector_type
;
9900 if (!can_duplicate_and_interleave_p (vinfo
, nelts
, element_type
,
9901 &nvectors
, &new_vector_type
,
9905 /* Get a vector type that holds ELTS[0:NELTS/NELTS']. */
9906 unsigned int partial_nelts
= nelts
/ nvectors
;
9907 tree partial_vector_type
= build_vector_type (element_type
, partial_nelts
);
9909 tree_vector_builder partial_elts
;
9910 auto_vec
<tree
, 32> pieces (nvectors
* 2);
9911 pieces
.quick_grow_cleared (nvectors
* 2);
9912 for (unsigned int i
= 0; i
< nvectors
; ++i
)
9914 /* (2) Replace ELTS[0:NELTS] with ELTS'[0:NELTS'], where each element of
9915 ELTS' has mode IM. */
9916 partial_elts
.new_vector (partial_vector_type
, partial_nelts
, 1);
9917 for (unsigned int j
= 0; j
< partial_nelts
; ++j
)
9918 partial_elts
.quick_push (elts
[i
* partial_nelts
+ j
]);
9919 tree t
= gimple_build_vector (seq
, &partial_elts
);
9920 t
= gimple_build (seq
, VIEW_CONVERT_EXPR
,
9921 TREE_TYPE (new_vector_type
), t
);
9923 /* (3) Duplicate each ELTS'[I] into a vector of mode VM. */
9924 pieces
[i
] = gimple_build_vector_from_val (seq
, new_vector_type
, t
);
  /* (4) Use a tree of VEC_PERM_EXPRs to create a single VM with the
         correct byte contents.

     Conceptually, we need to repeat the following operation log2(nvectors)
     times, where hi_start = nvectors / 2:

        out[i * 2] = VEC_PERM_EXPR (in[i], in[i + hi_start], lo_permute);
        out[i * 2 + 1] = VEC_PERM_EXPR (in[i], in[i + hi_start], hi_permute);

     However, if each input repeats every N elements and the VF is
     a multiple of N * 2, the HI result is the same as the LO result.
     This will be true for the first N1 iterations of the outer loop,
     followed by N2 iterations for which both the LO and HI results
     are needed.  I.e.:

        N1 + N2 = log2(nvectors)

     Each "N1 iteration" doubles the number of redundant vectors and the
     effect of the process as a whole is to have a sequence of nvectors/2**N1
     vectors that repeats 2**N1 times.  Rather than generate these redundant
     vectors, we halve the number of vectors for each N1 iteration.  */
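  /* A concrete instance of the above (illustrative numbers): with
     nvectors = 8, N1 = 2 and N2 = 1, the process is equivalent to building a
     sequence of nvectors / 2**N1 = 2 distinct vectors that repeats
     2**N1 = 4 times.  Instead of materializing all 8 vectors, the vector
     count is halved on each N1 iteration (8 -> 4 -> 2), the single N2
     iteration produces both LO and HI results for those 2 vectors, and
     step (5) below replays the distinct results to fill the NRESULTS
     outputs.  */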
9948 unsigned int in_start
= 0;
9949 unsigned int out_start
= nvectors
;
9950 unsigned int new_nvectors
= nvectors
;
9951 for (unsigned int in_repeat
= 1; in_repeat
< nvectors
; in_repeat
*= 2)
9953 unsigned int hi_start
= new_nvectors
/ 2;
9954 unsigned int out_i
= 0;
9955 for (unsigned int in_i
= 0; in_i
< new_nvectors
; ++in_i
)
9958 && multiple_p (TYPE_VECTOR_SUBPARTS (new_vector_type
),
9962 tree output
= make_ssa_name (new_vector_type
);
9963 tree input1
= pieces
[in_start
+ (in_i
/ 2)];
9964 tree input2
= pieces
[in_start
+ (in_i
/ 2) + hi_start
];
9965 gassign
*stmt
= gimple_build_assign (output
, VEC_PERM_EXPR
,
9967 permutes
[in_i
& 1]);
9968 gimple_seq_add_stmt (seq
, stmt
);
9969 pieces
[out_start
+ out_i
] = output
;
9972 std::swap (in_start
, out_start
);
9973 new_nvectors
= out_i
;
9976 /* (5) Use VIEW_CONVERT_EXPR to cast the final VM to the required type. */
9977 results
.reserve (nresults
);
9978 for (unsigned int i
= 0; i
< nresults
; ++i
)
9979 if (i
< new_nvectors
)
9980 results
.quick_push (gimple_build (seq
, VIEW_CONVERT_EXPR
, vector_type
,
9981 pieces
[in_start
+ i
]));
9983 results
.quick_push (results
[i
- new_nvectors
]);
9987 /* For constant and loop invariant defs in OP_NODE this function creates
9988 vector defs that will be used in the vectorized stmts and stores them
9989 to SLP_TREE_VEC_DEFS of OP_NODE. */
9992 vect_create_constant_vectors (vec_info
*vinfo
, slp_tree op_node
)
9994 unsigned HOST_WIDE_INT nunits
;
9996 unsigned j
, number_of_places_left_in_vector
;
9999 int group_size
= op_node
->ops
.length ();
10000 unsigned int vec_num
, i
;
10001 unsigned number_of_copies
= 1;
10003 gimple_seq ctor_seq
= NULL
;
10004 auto_vec
<tree
, 16> permute_results
;
10006 /* We always want SLP_TREE_VECTYPE (op_node) here correctly set. */
10007 vector_type
= SLP_TREE_VECTYPE (op_node
);
10009 unsigned int number_of_vectors
= SLP_TREE_NUMBER_OF_VEC_STMTS (op_node
);
10010 SLP_TREE_VEC_DEFS (op_node
).create (number_of_vectors
);
10011 auto_vec
<tree
> voprnds (number_of_vectors
);
  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors.  It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector).  The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
     is two).

     If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
     containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}.  */
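  /* A standalone sketch (illustrative only) of the layout described above,
     computing NUMBER_OF_COPIES and distributing the scalar indices
     0 .. GROUP_SIZE-1 over the output vectors for a constant NUNITS:

         #include <cassert>
         #include <vector>

         static std::vector<std::vector<int>>
         layout_constant_ops (unsigned group_size, unsigned nunits,
                              unsigned number_of_vectors)
         {
           unsigned number_of_copies
             = nunits * number_of_vectors / group_size;
           assert (number_of_copies * group_size
                   == nunits * number_of_vectors);
           std::vector<std::vector<int>> vecs (number_of_vectors);
           for (unsigned lane = 0; lane < nunits * number_of_vectors; ++lane)
             vecs[lane / nunits].push_back ((int) (lane % group_size));
           return vecs;
         }

     With group_size = 2, nunits = 4 and one vector this yields the index
     pattern {0, 1, 0, 1}, i.e. {s1, s2, s1, s2} with NUMBER_OF_COPIES = 2;
     with group_size = 8, nunits = 4 and two vectors it yields {0, 1, 2, 3}
     and {4, 5, 6, 7}, matching the {s1..s4} / {s5..s8} example.  Note the
     real code below fills the vectors back to front and inverts the result
     afterwards.  */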
10029 /* When using duplicate_and_interleave, we just need one element for
10030 each scalar statement. */
10031 if (!TYPE_VECTOR_SUBPARTS (vector_type
).is_constant (&nunits
))
10032 nunits
= group_size
;
10034 number_of_copies
= nunits
* number_of_vectors
/ group_size
;
10036 number_of_places_left_in_vector
= nunits
;
10038 tree uniform_elt
= NULL_TREE
;
10039 tree_vector_builder
elts (vector_type
, nunits
, 1);
10040 elts
.quick_grow (nunits
);
10041 stmt_vec_info insert_after
= NULL
;
10042 for (j
= 0; j
< number_of_copies
; j
++)
10045 for (i
= group_size
- 1; op_node
->ops
.iterate (i
, &op
); i
--)
10047 /* Create 'vect_ = {op0,op1,...,opn}'. */
10049 if (number_of_places_left_in_vector
== nunits
)
10051 else if (uniform_elt
&& operand_equal_p (uniform_elt
, op
))
10052 op
= elts
[number_of_places_left_in_vector
];
10054 uniform_elt
= NULL_TREE
;
10055 number_of_places_left_in_vector
--;
10056 if (!types_compatible_p (TREE_TYPE (vector_type
), TREE_TYPE (op
)))
10058 if (CONSTANT_CLASS_P (op
))
10060 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
10062 /* Can't use VIEW_CONVERT_EXPR for booleans because
10063 of possibly different sizes of scalar value and
10065 if (integer_zerop (op
))
10066 op
= build_int_cst (TREE_TYPE (vector_type
), 0);
10067 else if (integer_onep (op
))
10068 op
= build_all_ones_cst (TREE_TYPE (vector_type
));
10070 gcc_unreachable ();
10073 op
= fold_unary (VIEW_CONVERT_EXPR
,
10074 TREE_TYPE (vector_type
), op
);
10075 gcc_assert (op
&& CONSTANT_CLASS_P (op
));
10079 tree new_temp
= make_ssa_name (TREE_TYPE (vector_type
));
10081 if (VECTOR_BOOLEAN_TYPE_P (vector_type
))
10084 = build_all_ones_cst (TREE_TYPE (vector_type
));
10086 = build_zero_cst (TREE_TYPE (vector_type
));
10087 gcc_assert (INTEGRAL_TYPE_P (TREE_TYPE (op
)));
10088 init_stmt
= gimple_build_assign (new_temp
, COND_EXPR
,
10094 op
= build1 (VIEW_CONVERT_EXPR
, TREE_TYPE (vector_type
),
10097 = gimple_build_assign (new_temp
, VIEW_CONVERT_EXPR
,
10100 gimple_seq_add_stmt (&ctor_seq
, init_stmt
);
10104 elts
[number_of_places_left_in_vector
] = op
;
10105 if (!CONSTANT_CLASS_P (op
))
10106 constant_p
= false;
10107 /* For BB vectorization we have to compute an insert location
10108 when a def is inside the analyzed region since we cannot
10109 simply insert at the BB start in this case. */
10110 stmt_vec_info opdef
;
10111 if (TREE_CODE (orig_op
) == SSA_NAME
10112 && !SSA_NAME_IS_DEFAULT_DEF (orig_op
)
10113 && is_a
<bb_vec_info
> (vinfo
)
10114 && (opdef
= vinfo
->lookup_def (orig_op
)))
10117 insert_after
= opdef
;
10119 insert_after
= get_later_stmt (insert_after
, opdef
);
10122 if (number_of_places_left_in_vector
== 0)
10124 auto type_nunits
= TYPE_VECTOR_SUBPARTS (vector_type
);
10126 vec_cst
= gimple_build_vector_from_val (&ctor_seq
, vector_type
,
10128 else if (constant_p
10129 ? multiple_p (type_nunits
, nunits
)
10130 : known_eq (type_nunits
, nunits
))
10131 vec_cst
= gimple_build_vector (&ctor_seq
, &elts
);
10134 if (permute_results
.is_empty ())
10135 duplicate_and_interleave (vinfo
, &ctor_seq
, vector_type
,
10136 elts
, number_of_vectors
,
10138 vec_cst
= permute_results
[number_of_vectors
- j
- 1];
10140 if (!gimple_seq_empty_p (ctor_seq
))
10144 gimple_stmt_iterator gsi
;
10145 if (gimple_code (insert_after
->stmt
) == GIMPLE_PHI
)
10147 gsi
= gsi_after_labels (gimple_bb (insert_after
->stmt
));
10148 gsi_insert_seq_before (&gsi
, ctor_seq
,
10149 GSI_CONTINUE_LINKING
);
10151 else if (!stmt_ends_bb_p (insert_after
->stmt
))
10153 gsi
= gsi_for_stmt (insert_after
->stmt
);
10154 gsi_insert_seq_after (&gsi
, ctor_seq
,
10155 GSI_CONTINUE_LINKING
);
10159 /* When we want to insert after a def where the
10160 defining stmt throws then insert on the fallthru
10162 edge e
= find_fallthru_edge
10163 (gimple_bb (insert_after
->stmt
)->succs
);
10165 = gsi_insert_seq_on_edge_immediate (e
, ctor_seq
);
10166 gcc_assert (!new_bb
);
10170 vinfo
->insert_seq_on_entry (NULL
, ctor_seq
);
10173 voprnds
.quick_push (vec_cst
);
10174 insert_after
= NULL
;
10175 number_of_places_left_in_vector
= nunits
;
10177 elts
.new_vector (vector_type
, nunits
, 1);
10178 elts
.quick_grow (nunits
);
10183 /* Since the vectors are created in the reverse order, we should invert
10185 vec_num
= voprnds
.length ();
10186 for (j
= vec_num
; j
!= 0; j
--)
10188 vop
= voprnds
[j
- 1];
10189 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
10192 /* In case that VF is greater than the unrolling factor needed for the SLP
10193 group of stmts, NUMBER_OF_VECTORS to be created is greater than
10194 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
10195 to replicate the vectors. */
10196 while (number_of_vectors
> SLP_TREE_VEC_DEFS (op_node
).length ())
10197 for (i
= 0; SLP_TREE_VEC_DEFS (op_node
).iterate (i
, &vop
) && i
< vec_num
;
10199 SLP_TREE_VEC_DEFS (op_node
).quick_push (vop
);
/* Get the Ith vectorized definition from SLP_NODE.  */

tree
vect_get_slp_vect_def (slp_tree slp_node, unsigned i)
{
  return SLP_TREE_VEC_DEFS (slp_node)[i];
}

/* Get the vectorized definitions of SLP_NODE in *VEC_DEFS.  */

void
vect_get_slp_defs (slp_tree slp_node, vec<tree> *vec_defs)
{
  vec_defs->create (SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node));
  vec_defs->splice (SLP_TREE_VEC_DEFS (slp_node));
}

/* Get N vectorized definitions for SLP_NODE.  */

void
vect_get_slp_defs (vec_info *,
                   slp_tree slp_node, vec<vec<tree> > *vec_oprnds, unsigned n)
{
  if (n == -1U)
    n = SLP_TREE_CHILDREN (slp_node).length ();

  for (unsigned i = 0; i < n; ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (child, &vec_defs);
      vec_oprnds->quick_push (vec_defs);
    }
}
/* A subroutine of vect_transform_slp_perm_load with two extra arguments:
   - PERM gives the permutation that the caller wants to use for NODE,
     which might be different from SLP_LOAD_PERMUTATION.
   - DUMP_P controls whether the function dumps information.  */
10243 vect_transform_slp_perm_load_1 (vec_info
*vinfo
, slp_tree node
,
10244 load_permutation_t
&perm
,
10245 const vec
<tree
> &dr_chain
,
10246 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
10247 bool analyze_only
, bool dump_p
,
10248 unsigned *n_perms
, unsigned int *n_loads
,
10251 stmt_vec_info stmt_info
= SLP_TREE_SCALAR_STMTS (node
)[0];
10253 tree vectype
= SLP_TREE_VECTYPE (node
);
10254 unsigned int group_size
= SLP_TREE_SCALAR_STMTS (node
).length ();
10255 unsigned int mask_element
;
10256 unsigned dr_group_size
;
10259 if (!STMT_VINFO_GROUPED_ACCESS (stmt_info
))
10263 stmt_info
= DR_GROUP_FIRST_ELEMENT (stmt_info
);
10264 dr_group_size
= DR_GROUP_SIZE (stmt_info
);
10267 mode
= TYPE_MODE (vectype
);
10268 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10269 unsigned int nstmts
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
10271 /* Initialize the vect stmts of NODE to properly insert the generated
10273 if (! analyze_only
)
10274 for (unsigned i
= SLP_TREE_VEC_DEFS (node
).length (); i
< nstmts
; i
++)
10275 SLP_TREE_VEC_DEFS (node
).quick_push (NULL_TREE
);
  /* Generate permutation masks for every NODE.  Number of masks for each NODE
     is equal to GROUP_SIZE.
     E.g., we have a group of three nodes with three loads from the same
     location in each node, and the vector size is 4.  I.e., we have a
     a0b0c0a1b1c1... sequence and we need to create the following vectors:
     for a's: a0a0a0a1 a1a1a2a2 a2a3a3a3
     for b's: b0b0b0b1 b1b1b2b2 b2b3b3b3
     ...

     The masks for a's should be: {0,0,0,3} {3,3,6,6} {6,9,9,9}.
     The last mask is illegal since we assume two operands for permute
     operation, and the mask element values can't be outside that range.
     Hence, the last mask must be converted into {2,5,5,5}.
     For the first two permutations we need the first and the second input
     vectors: {a0,b0,c0,a1} and {b1,c1,a2,b2}, and for the last permutation
     we need the second and the third vectors: {b1,c1,a2,b2} and
     {c2,a3,b3,c3}.  */
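  /* A standalone sketch (illustrative only) that recomputes the example
     above: PERM = {0, 0, 0} for the a's, DR_GROUP_SIZE = GROUP_SIZE = 3,
     NUNITS = 4 and VF = 4.  It prints each mask after conversion to the
     two-input form together with the pair of input vectors it reads.

         #include <cstdio>
         #include <vector>

         int
         main ()
         {
           const unsigned perm[] = { 0, 0, 0 };
           unsigned group_size = 3, dr_group_size = 3, nunits = 4, vf = 4;
           int first = -1, second = -1;
           std::vector<unsigned> mask;
           for (unsigned j = 0; j < vf * group_size; ++j)
             {
               unsigned i = (j / group_size) * dr_group_size
                            + perm[j % group_size];
               int vec_index = (int) (i / nunits);
               unsigned mask_element = i % nunits;
               if (first == -1 || vec_index == first)
                 first = vec_index;
               else
                 {
                   // The real code rejects a third distinct input vector.
                   second = vec_index;
                   mask_element += nunits;      // select from the second input
                 }
               mask.push_back (mask_element);
               if (mask.size () == nunits)
                 {
                   printf ("inputs v%d/v%d mask {", first,
                           second == -1 ? first : second);
                   for (unsigned m : mask)
                     printf (" %u", m);
                   printf (" }\n");
                   mask.clear ();
                   first = second = -1;
                 }
             }
           return 0;
         }

     This prints masks {0,0,0,3}, {3,3,6,6} and {2,5,5,5}, the last one being
     the converted form of {6,9,9,9} described above.  */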
10295 int vect_stmts_counter
= 0;
10296 unsigned int index
= 0;
10297 int first_vec_index
= -1;
10298 int second_vec_index
= -1;
10299 bool noop_p
= true;
10302 vec_perm_builder mask
;
10303 unsigned int nelts_to_build
;
10304 unsigned int nvectors_per_build
;
10305 unsigned int in_nlanes
;
10306 bool repeating_p
= (group_size
== dr_group_size
10307 && multiple_p (nunits
, group_size
));
10310 /* A single vector contains a whole number of copies of the node, so:
10311 (a) all permutes can use the same mask; and
10312 (b) the permutes only need a single vector input. */
10313 mask
.new_vector (nunits
, group_size
, 3);
10314 nelts_to_build
= mask
.encoded_nelts ();
10315 /* It's possible to obtain zero nstmts during analyze_only, so make
10316 it at least one to ensure the later computation for n_perms
10318 nvectors_per_build
= nstmts
> 0 ? nstmts
: 1;
10319 in_nlanes
= dr_group_size
* 3;
10323 /* We need to construct a separate mask for each vector statement. */
10324 unsigned HOST_WIDE_INT const_nunits
, const_vf
;
10325 if (!nunits
.is_constant (&const_nunits
)
10326 || !vf
.is_constant (&const_vf
))
10328 mask
.new_vector (const_nunits
, const_nunits
, 1);
10329 nelts_to_build
= const_vf
* group_size
;
10330 nvectors_per_build
= 1;
10331 in_nlanes
= const_vf
* dr_group_size
;
10333 auto_sbitmap
used_in_lanes (in_nlanes
);
10334 bitmap_clear (used_in_lanes
);
10335 auto_bitmap used_defs
;
10337 unsigned int count
= mask
.encoded_nelts ();
10338 mask
.quick_grow (count
);
10339 vec_perm_indices indices
;
10341 for (unsigned int j
= 0; j
< nelts_to_build
; j
++)
10343 unsigned int iter_num
= j
/ group_size
;
10344 unsigned int stmt_num
= j
% group_size
;
10345 unsigned int i
= (iter_num
* dr_group_size
+ perm
[stmt_num
]);
10346 bitmap_set_bit (used_in_lanes
, i
);
10349 first_vec_index
= 0;
10354 /* Enforced before the loop when !repeating_p. */
10355 unsigned int const_nunits
= nunits
.to_constant ();
10356 vec_index
= i
/ const_nunits
;
10357 mask_element
= i
% const_nunits
;
10358 if (vec_index
== first_vec_index
10359 || first_vec_index
== -1)
10361 first_vec_index
= vec_index
;
10363 else if (vec_index
== second_vec_index
10364 || second_vec_index
== -1)
10366 second_vec_index
= vec_index
;
10367 mask_element
+= const_nunits
;
10372 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10373 "permutation requires at "
10374 "least three vectors %G",
10376 gcc_assert (analyze_only
);
10380 gcc_assert (mask_element
< 2 * const_nunits
);
10383 if (mask_element
!= index
)
10385 mask
[index
++] = mask_element
;
10387 if (index
== count
)
10391 indices
.new_vector (mask
, second_vec_index
== -1 ? 1 : 2, nunits
);
10392 if (!can_vec_perm_const_p (mode
, mode
, indices
))
10396 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10397 "unsupported vect permute { ");
10398 for (i
= 0; i
< count
; ++i
)
10400 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
10401 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
10403 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
10405 gcc_assert (analyze_only
);
10409 tree mask_vec
= NULL_TREE
;
10411 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
10413 if (second_vec_index
== -1)
10414 second_vec_index
= first_vec_index
;
10416 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
10421 /* Generate the permute statement if necessary. */
10422 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
10423 tree second_vec
= dr_chain
[second_vec_index
+ ri
];
10424 gassign
*stmt
= as_a
<gassign
*> (stmt_info
->stmt
);
10426 = vect_create_destination_var (gimple_assign_lhs (stmt
),
10428 perm_dest
= make_ssa_name (perm_dest
);
10430 = gimple_build_assign (perm_dest
, VEC_PERM_EXPR
, first_vec
,
10431 second_vec
, mask_vec
);
10432 vect_finish_stmt_generation (vinfo
, stmt_info
, perm_stmt
,
10436 bitmap_set_bit (used_defs
, first_vec_index
+ ri
);
10437 bitmap_set_bit (used_defs
, second_vec_index
+ ri
);
10440 /* Store the vector statement in NODE. */
10441 SLP_TREE_VEC_DEFS (node
)[vect_stmts_counter
++] = perm_dest
;
10444 else if (!analyze_only
)
10446 for (unsigned int ri
= 0; ri
< nvectors_per_build
; ++ri
)
10448 tree first_vec
= dr_chain
[first_vec_index
+ ri
];
10449 /* If mask was NULL_TREE generate the requested
10450 identity transform. */
10452 bitmap_set_bit (used_defs
, first_vec_index
+ ri
);
10454 /* Store the vector statement in NODE. */
10455 SLP_TREE_VEC_DEFS (node
)[vect_stmts_counter
++] = first_vec
;
10460 first_vec_index
= -1;
10461 second_vec_index
= -1;
10469 *n_loads
= SLP_TREE_NUMBER_OF_VEC_STMTS (node
);
10472 /* Enforced above when !repeating_p. */
10473 unsigned int const_nunits
= nunits
.to_constant ();
10475 bool load_seen
= false;
10476 for (unsigned i
= 0; i
< in_nlanes
; ++i
)
10478 if (i
% const_nunits
== 0)
10484 if (bitmap_bit_p (used_in_lanes
, i
))
10493 for (unsigned i
= 0; i
< dr_chain
.length (); ++i
)
10494 if (!bitmap_bit_p (used_defs
, i
))
10496 tree def
= dr_chain
[i
];
10499 gimple
*stmt
= SSA_NAME_DEF_STMT (def
);
10500 if (is_gimple_assign (stmt
)
10501 && (gimple_assign_rhs_code (stmt
) == VIEW_CONVERT_EXPR
10502 || gimple_assign_rhs_code (stmt
) == CONSTRUCTOR
))
10503 def
= single_ssa_tree_operand (stmt
, SSA_OP_USE
);
10506 gimple_stmt_iterator rgsi
= gsi_for_stmt (stmt
);
10507 gsi_remove (&rgsi
, true);
10508 release_defs (stmt
);
/* Generate vector permute statements from a list of loads in DR_CHAIN.
   If ANALYZE_ONLY is TRUE, only check that it is possible to create valid
   permute statements for the SLP node NODE.  Store the number of vector
   permute instructions in *N_PERMS and the number of vector load
   instructions in *N_LOADS.  If DCE_CHAIN is true, remove all definitions
   that were not needed.  */
10524 vect_transform_slp_perm_load (vec_info
*vinfo
,
10525 slp_tree node
, const vec
<tree
> &dr_chain
,
10526 gimple_stmt_iterator
*gsi
, poly_uint64 vf
,
10527 bool analyze_only
, unsigned *n_perms
,
10528 unsigned int *n_loads
, bool dce_chain
)
10530 return vect_transform_slp_perm_load_1 (vinfo
, node
,
10531 SLP_TREE_LOAD_PERMUTATION (node
),
10532 dr_chain
, gsi
, vf
, analyze_only
,
10533 dump_enabled_p (), n_perms
, n_loads
,
/* Produce the next vector result for SLP permutation NODE by adding a vector
   statement at GSI.  If MASK_VEC is nonnull, add:

      <new SSA name> = VEC_PERM_EXPR <FIRST_DEF, SECOND_DEF, MASK_VEC>

   otherwise add:

      <new SSA name> = FIRST_DEF.  */
10547 vect_add_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
10548 slp_tree node
, tree first_def
, tree second_def
,
10549 tree mask_vec
, poly_uint64 identity_offset
)
10551 tree vectype
= SLP_TREE_VECTYPE (node
);
10553 /* ??? We SLP match existing vector element extracts but
10554 allow punning which we need to re-instantiate at uses
10555 but have no good way of explicitly representing. */
10556 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def
)), TYPE_SIZE (vectype
))
10557 && !types_compatible_p (TREE_TYPE (first_def
), vectype
))
10560 = gimple_build_assign (make_ssa_name (vectype
),
10561 build1 (VIEW_CONVERT_EXPR
, vectype
, first_def
));
10562 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
10563 first_def
= gimple_assign_lhs (conv_stmt
);
10565 gassign
*perm_stmt
;
10566 tree perm_dest
= make_ssa_name (vectype
);
10569 if (operand_equal_p (TYPE_SIZE (TREE_TYPE (first_def
)),
10570 TYPE_SIZE (vectype
))
10571 && !types_compatible_p (TREE_TYPE (second_def
), vectype
))
10574 = gimple_build_assign (make_ssa_name (vectype
),
10575 build1 (VIEW_CONVERT_EXPR
,
10576 vectype
, second_def
));
10577 vect_finish_stmt_generation (vinfo
, NULL
, conv_stmt
, gsi
);
10578 second_def
= gimple_assign_lhs (conv_stmt
);
10580 perm_stmt
= gimple_build_assign (perm_dest
, VEC_PERM_EXPR
,
10581 first_def
, second_def
,
10584 else if (!types_compatible_p (TREE_TYPE (first_def
), vectype
))
10586 /* For identity permutes we still need to handle the case
10587 of offsetted extracts or concats. */
10588 unsigned HOST_WIDE_INT c
;
10589 auto first_def_nunits
10590 = TYPE_VECTOR_SUBPARTS (TREE_TYPE (first_def
));
10591 if (known_le (TYPE_VECTOR_SUBPARTS (vectype
), first_def_nunits
))
10593 unsigned HOST_WIDE_INT elsz
10594 = tree_to_uhwi (TYPE_SIZE (TREE_TYPE (TREE_TYPE (first_def
))));
10595 tree lowpart
= build3 (BIT_FIELD_REF
, vectype
, first_def
,
10596 TYPE_SIZE (vectype
),
10597 bitsize_int (identity_offset
* elsz
));
10598 perm_stmt
= gimple_build_assign (perm_dest
, lowpart
);
10600 else if (constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype
),
10601 first_def_nunits
, &c
) && c
== 2)
10603 tree ctor
= build_constructor_va (vectype
, 2, NULL_TREE
, first_def
,
10604 NULL_TREE
, second_def
);
10605 perm_stmt
= gimple_build_assign (perm_dest
, ctor
);
10608 gcc_unreachable ();
10612 /* We need a copy here in case the def was external. */
10613 perm_stmt
= gimple_build_assign (perm_dest
, first_def
);
10615 vect_finish_stmt_generation (vinfo
, NULL
, perm_stmt
, gsi
);
10616 /* Store the vector statement in NODE. */
10617 node
->push_vec_def (perm_stmt
);
/* Subroutine of vectorizable_slp_permutation.  Check whether the target
   can perform permutation PERM on the (1 or 2) input nodes in CHILDREN.
   If GSI is nonnull, emit the permutation there.

   When GSI is null, the only purpose of NODE is to give properties
   of the result, such as the vector type and number of SLP lanes.
   The node does not need to be a VEC_PERM_EXPR.

   If the target supports the operation, return the number of individual
   VEC_PERM_EXPRs needed, otherwise return -1.  Print information to the
   dump file if DUMP_P is true.  */
10633 vectorizable_slp_permutation_1 (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
10634 slp_tree node
, lane_permutation_t
&perm
,
10635 vec
<slp_tree
> &children
, bool dump_p
)
10637 tree vectype
= SLP_TREE_VECTYPE (node
);
10639 /* ??? We currently only support all same vector input types
10640 while the SLP IL should really do a concat + select and thus accept
10641 arbitrary mismatches. */
10644 poly_uint64 nunits
= TYPE_VECTOR_SUBPARTS (vectype
);
10645 bool repeating_p
= multiple_p (nunits
, SLP_TREE_LANES (node
));
10646 /* True if we're permuting a single input of 2N vectors down
10647 to N vectors. This case doesn't generalize beyond 2 since
10648 VEC_PERM_EXPR only takes 2 inputs. */
10649 bool pack_p
= false;
10650 /* If we're permuting inputs of N vectors each into X*N outputs,
10651 this is the value of X, otherwise it is 1. */
10652 unsigned int unpack_factor
= 1;
10653 tree op_vectype
= NULL_TREE
;
10654 FOR_EACH_VEC_ELT (children
, i
, child
)
10655 if (SLP_TREE_VECTYPE (child
))
10657 op_vectype
= SLP_TREE_VECTYPE (child
);
10661 op_vectype
= vectype
;
10662 FOR_EACH_VEC_ELT (children
, i
, child
)
10664 if ((SLP_TREE_DEF_TYPE (child
) != vect_internal_def
10665 && !vect_maybe_update_slp_op_vectype (child
, op_vectype
))
10666 || !types_compatible_p (SLP_TREE_VECTYPE (child
), op_vectype
)
10667 || !types_compatible_p (TREE_TYPE (vectype
), TREE_TYPE (op_vectype
)))
10670 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10671 "Unsupported vector types in lane permutation\n");
10674 auto op_nunits
= TYPE_VECTOR_SUBPARTS (op_vectype
);
10675 unsigned int this_unpack_factor
;
10676 /* Detect permutations of external, pre-existing vectors. The external
10677 node's SLP_TREE_LANES stores the total number of units in the vector,
10678 or zero if the vector has variable length.
10680 We are expected to keep the original VEC_PERM_EXPR for such cases.
10681 There is no repetition to model. */
10682 if (SLP_TREE_DEF_TYPE (child
) == vect_external_def
10683 && SLP_TREE_SCALAR_OPS (child
).is_empty ())
10684 repeating_p
= false;
10685 /* Check whether the input has twice as many lanes per vector. */
10686 else if (children
.length () == 1
10687 && known_eq (SLP_TREE_LANES (child
) * nunits
,
10688 SLP_TREE_LANES (node
) * op_nunits
* 2))
10690 /* Check whether the output has N times as many lanes per vector. */
10691 else if (constant_multiple_p (SLP_TREE_LANES (node
) * op_nunits
,
10692 SLP_TREE_LANES (child
) * nunits
,
10693 &this_unpack_factor
)
10694 && (i
== 0 || unpack_factor
== this_unpack_factor
))
10695 unpack_factor
= this_unpack_factor
;
10697 repeating_p
= false;
10700 gcc_assert (perm
.length () == SLP_TREE_LANES (node
));
10702 /* Load-lanes permute. This permute only acts as a forwarder to
10703 select the correct vector def of the load-lanes load which
10704 has the permuted vectors in its vector defs like
10705 { v0, w0, r0, v1, w1, r1 ... } for a ld3. All costs are
10706 accounted for in the costing for the actual load so we
10707 return zero here. */
10708 if (node
->ldst_lanes
)
10710 gcc_assert (children
.length () == 1);
10712 /* This is a trivial op always supported. */
10714 slp_tree child
= children
[0];
10715 unsigned vec_idx
= (SLP_TREE_LANE_PERMUTATION (node
)[0].second
10716 / SLP_TREE_LANES (node
));
10717 unsigned vec_num
= SLP_TREE_LANES (child
) / SLP_TREE_LANES (node
);
10718 for (unsigned i
= 0; i
< SLP_TREE_NUMBER_OF_VEC_STMTS (node
); ++i
)
10720 tree def
= SLP_TREE_VEC_DEFS (child
)[i
* vec_num
+ vec_idx
];
10721 node
->push_vec_def (def
);
  /* Set REPEATING_P to true if the permutations are cyclical wrt UNPACK_FACTOR
     and if we can generate the vectors in a vector-length agnostic way.
     This requires UNPACK_STEP == NUNITS / UNPACK_FACTOR to be known at
     compile time.

     The significance of UNPACK_STEP is that, when PACK_P is false,
     output vector I operates on a window of UNPACK_STEP elements from each
     input, starting at lane UNPACK_STEP * (I % UNPACK_FACTOR).  For example,
     when UNPACK_FACTOR is 2, the first output vector operates on lanes
     [0, NUNITS / 2 - 1] of each input vector and the second output vector
     operates on lanes [NUNITS / 2, NUNITS - 1] of each input vector.

     When REPEATING_P is true, NOUTPUTS holds the total number of outputs
     that we actually need to generate.  */
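  /* Worked instance of the window rule above (illustrative numbers): with
     UNPACK_FACTOR = 2 and NUNITS = 8, UNPACK_STEP = 4, so even-numbered
     output vectors read lanes [0, 3] of each input and odd-numbered output
     vectors read lanes [4, 7].  */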
10740 uint64_t noutputs
= 0;
10741 poly_uint64 unpack_step
= 0;
10742 loop_vec_info linfo
= dyn_cast
<loop_vec_info
> (vinfo
);
10744 || !multiple_p (nunits
, unpack_factor
, &unpack_step
)
10745 || !constant_multiple_p (LOOP_VINFO_VECT_FACTOR (linfo
)
10746 * SLP_TREE_LANES (node
), nunits
, &noutputs
))
10747 repeating_p
= false;
10749 /* We can handle the conditions described for REPEATING_P above for
10750 both variable- and constant-length vectors. The fallback requires
10751 us to generate every element of every permute vector explicitly,
10752 which is only possible for constant-length permute vectors.
10756 - NPATTERNS and NELTS_PER_PATTERN to the encoding of the permute
10757 mask vectors that we want to build.
10759 - NCOPIES to the number of copies of PERM that we need in order
10760 to build the necessary permute mask vectors. */
10761 uint64_t npatterns
;
10762 unsigned nelts_per_pattern
;
      /* We need permute mask vectors that have the form:

           { X1, ..., Xn, X1 + n, ..., Xn + n, X1 + 2n, ..., Xn + 2n, ... }

         In other words, the original n-element permute in PERM is
         "unrolled" to fill a full vector.  The stepped vector encoding
         that we use for permutes requires 3n elements.  */
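      /* A standalone sketch (illustrative only) of that encoding: for an
         n-lane permute it emits the 3n encoded elements from which a stepped
         vec_perm_indices can be built.

             #include <vector>

             static std::vector<unsigned>
             encode_unrolled_permute (const std::vector<unsigned> &perm)
             {
               unsigned n = perm.size ();
               std::vector<unsigned> encoded;
               // nelts_per_pattern = 3 rows of npatterns = n lanes each.
               for (unsigned pattern = 0; pattern < 3; ++pattern)
                 for (unsigned lane = 0; lane < n; ++lane)
                   encoded.push_back (perm[lane] + pattern * n);
               return encoded;
             }

         For PERM = { 1, 0 } this yields { 1, 0, 3, 2, 5, 4 }, i.e. a swap of
         adjacent lanes repeated across the whole vector.  */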
10773 npatterns
= SLP_TREE_LANES (node
);
10774 nelts_per_pattern
= ncopies
= 3;
10778 /* Calculate every element of every permute mask vector explicitly,
10779 instead of relying on the pattern described above. */
10780 if (!nunits
.is_constant (&npatterns
)
10781 || !TYPE_VECTOR_SUBPARTS (op_vectype
).is_constant ())
10784 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10785 "unsupported permutation %p on variable-length"
10786 " vectors\n", (void *) node
);
10789 nelts_per_pattern
= ncopies
= 1;
10790 if (linfo
&& !LOOP_VINFO_VECT_FACTOR (linfo
).is_constant (&ncopies
))
10793 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10794 "unsupported permutation %p for variable VF\n",
10801 unsigned olanes
= unpack_factor
* ncopies
* SLP_TREE_LANES (node
);
10802 gcc_assert (repeating_p
|| multiple_p (olanes
, nunits
));
10804 /* Compute the { { SLP operand, vector index}, lane } permutation sequence
10805 from the { SLP operand, scalar lane } permutation as recorded in the
10806 SLP node as intermediate step. This part should already work
10807 with SLP children with arbitrary number of lanes. */
10808 auto_vec
<std::pair
<std::pair
<unsigned, unsigned>, poly_uint64
>> vperm
;
10809 auto_vec
<poly_uint64
> active_lane
;
10810 vperm
.create (olanes
);
10811 active_lane
.safe_grow_cleared (children
.length (), true);
10812 for (unsigned int ui
= 0; ui
< unpack_factor
; ++ui
)
10814 for (unsigned j
= 0; j
< children
.length (); ++j
)
10815 active_lane
[j
] = ui
* unpack_step
;
10816 for (unsigned i
= 0; i
< ncopies
; ++i
)
10818 for (unsigned pi
= 0; pi
< perm
.length (); ++pi
)
10820 std::pair
<unsigned, unsigned> p
= perm
[pi
];
10821 tree vtype
= SLP_TREE_VECTYPE (children
[p
.first
]);
10823 vperm
.quick_push ({{p
.first
, 0},
10824 p
.second
+ active_lane
[p
.first
]});
10827 /* We checked above that the vectors are constant-length. */
10828 unsigned vnunits
= TYPE_VECTOR_SUBPARTS (vtype
)
10830 unsigned lane
= active_lane
[p
.first
].to_constant ();
10831 unsigned vi
= (lane
+ p
.second
) / vnunits
;
10832 unsigned vl
= (lane
+ p
.second
) % vnunits
;
10833 vperm
.quick_push ({{p
.first
, vi
}, vl
});
10836 /* Advance to the next group. */
10837 for (unsigned j
= 0; j
< children
.length (); ++j
)
10838 active_lane
[j
] += SLP_TREE_LANES (children
[j
]);
10844 dump_printf_loc (MSG_NOTE
, vect_location
,
10845 "vectorizing permutation %p", (void *)node
);
10846 for (unsigned i
= 0; i
< perm
.length (); ++i
)
10847 dump_printf (MSG_NOTE
, " op%u[%u]", perm
[i
].first
, perm
[i
].second
);
10849 dump_printf (MSG_NOTE
, " (repeat %d)", SLP_TREE_LANES (node
));
10850 dump_printf (MSG_NOTE
, "\n");
10851 dump_printf_loc (MSG_NOTE
, vect_location
, "as");
10852 for (unsigned i
= 0; i
< vperm
.length (); ++i
)
10856 ? multiple_p (i
, npatterns
)
10857 : multiple_p (i
, TYPE_VECTOR_SUBPARTS (vectype
))))
10858 dump_printf (MSG_NOTE
, ",");
10859 dump_printf (MSG_NOTE
, " vops%u[%u][",
10860 vperm
[i
].first
.first
, vperm
[i
].first
.second
);
10861 dump_dec (MSG_NOTE
, vperm
[i
].second
);
10862 dump_printf (MSG_NOTE
, "]");
10864 dump_printf (MSG_NOTE
, "\n");
10867 /* We can only handle two-vector permutes, everything else should
10868 be lowered on the SLP level. The following is closely inspired
10869 by vect_transform_slp_perm_load and is supposed to eventually
10871 ??? As intermediate step do code-gen in the SLP tree representation
10873 std::pair
<unsigned, unsigned> first_vec
= std::make_pair (-1U, -1U);
10874 std::pair
<unsigned, unsigned> second_vec
= std::make_pair (-1U, -1U);
10875 unsigned int index
= 0;
10876 poly_uint64 mask_element
;
10877 vec_perm_builder mask
;
10878 mask
.new_vector (nunits
, npatterns
, nelts_per_pattern
);
10879 unsigned int count
= mask
.encoded_nelts ();
10880 mask
.quick_grow (count
);
10881 vec_perm_indices indices
;
10882 unsigned nperms
= 0;
10883 /* When REPEATING_P is true, we only have UNPACK_FACTOR unique permute
10884 vectors to check during analysis, but we need to generate NOUTPUTS
10885 vectors during transformation. */
10886 unsigned total_nelts
= olanes
;
10887 unsigned process_nelts
= olanes
;
10890 total_nelts
= (total_nelts
/ unpack_factor
) * noutputs
;
10892 process_nelts
= total_nelts
;
10894 unsigned last_ei
= (total_nelts
- 1) % process_nelts
;
10895 for (unsigned i
= 0; i
< process_nelts
; ++i
)
10897 /* VI is the input vector index when generating code for REPEATING_P. */
10898 unsigned vi
= i
/ olanes
* (pack_p
? 2 : 1);
10899 unsigned ei
= i
% olanes
;
10900 mask_element
= vperm
[ei
].second
;
10903 /* In this case, we have N outputs and the single child provides 2N
10904 inputs. Output X permutes inputs 2X and 2X+1.
10906 The mask indices are taken directly from the SLP permutation node.
10907 Index X selects from the first vector if (X / NUNITS) % 2 == 0;
10908 X selects from the second vector otherwise. These conditions
10909 are only known at compile time for constant-length vectors. */
10910 first_vec
= std::make_pair (0, 0);
10911 second_vec
= std::make_pair (0, 1);
10913 else if (first_vec
.first
== -1U
10914 || first_vec
== vperm
[ei
].first
)
10915 first_vec
= vperm
[ei
].first
;
10916 else if (second_vec
.first
== -1U
10917 || second_vec
== vperm
[ei
].first
)
10919 second_vec
= vperm
[ei
].first
;
10920 mask_element
+= nunits
;
10925 dump_printf_loc (MSG_MISSED_OPTIMIZATION
, vect_location
,
10926 "permutation requires at "
10927 "least three vectors\n");
10932 mask
[index
++] = mask_element
;
10934 if (index
== count
)
10936 indices
.new_vector (mask
, second_vec
.first
== -1U ? 1 : 2,
10937 TYPE_VECTOR_SUBPARTS (op_vectype
));
10938 bool identity_p
= (indices
.series_p (0, 1, mask
[0], 1)
10939 && constant_multiple_p (mask
[0], nunits
));
10940 machine_mode vmode
= TYPE_MODE (vectype
);
10941 machine_mode op_vmode
= TYPE_MODE (op_vectype
);
10942 unsigned HOST_WIDE_INT c
;
10944 && !can_vec_perm_const_p (vmode
, op_vmode
, indices
))
10946 && !known_le (nunits
,
10947 TYPE_VECTOR_SUBPARTS (op_vectype
))
10948 && (!constant_multiple_p (nunits
,
10949 TYPE_VECTOR_SUBPARTS (op_vectype
),
10954 dump_printf_loc (MSG_MISSED_OPTIMIZATION
,
10956 "unsupported vect permute { ");
10957 for (i
= 0; i
< count
; ++i
)
10959 dump_dec (MSG_MISSED_OPTIMIZATION
, mask
[i
]);
10960 dump_printf (MSG_MISSED_OPTIMIZATION
, " ");
10962 dump_printf (MSG_MISSED_OPTIMIZATION
, "}\n");
10969 nperms
+= CEIL (total_nelts
, process_nelts
) - (ei
> last_ei
);
10972 if (second_vec
.first
== -1U)
10973 second_vec
= first_vec
;
10976 first_node
= children
[first_vec
.first
],
10977 second_node
= children
[second_vec
.first
];
10979 tree mask_vec
= NULL_TREE
;
10981 mask_vec
= vect_gen_perm_mask_checked (vectype
, indices
);
10984 = vect_get_slp_vect_def (first_node
, first_vec
.second
+ vi
);
10986 = vect_get_slp_vect_def (second_node
, second_vec
.second
+ vi
);
10987 vect_add_slp_permutation (vinfo
, gsi
, node
, first_def
,
10988 second_def
, mask_vec
, mask
[0]);
10992 first_vec
= std::make_pair (-1U, -1U);
10993 second_vec
= std::make_pair (-1U, -1U);
/* Vectorize the SLP permutations in NODE as specified
   in SLP_TREE_LANE_PERMUTATION which is a vector of pairs of SLP
   child number and lane number.
   Interleaving of two two-lane two-child SLP subtrees (not supported):
     [ { 0, 0 }, { 1, 0 }, { 0, 1 }, { 1, 1 } ]
   A blend of two four-lane two-child SLP subtrees:
     [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ]
   Highpart of a four-lane one-child SLP subtree (not supported):
     [ { 0, 2 }, { 0, 3 } ]
   Where currently only a subset is supported by code generating below.  */
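/* A small standalone sketch (illustrative only) of how such a lane
   permutation maps to a two-input VEC_PERM mask in the simple case where
   there are exactly two children and both provide NUNITS lanes: entry
   { child, lane } becomes lane for child 0 and lane + NUNITS for child 1.

     #include <utility>
     #include <vector>

     static std::vector<unsigned>
     blend_mask (const std::vector<std::pair<unsigned, unsigned>> &perm,
                 unsigned nunits)
     {
       std::vector<unsigned> mask;
       for (const std::pair<unsigned, unsigned> &p : perm)
         mask.push_back (p.second + p.first * nunits);
       return mask;
     }

   For the blend example [ { 0, 0 }, { 1, 1 }, { 0, 2 }, { 1, 3 } ] with
   NUNITS = 4 this yields { 0, 5, 2, 7 }: even lanes come from the first
   child, odd lanes from the second.  */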
11012 vectorizable_slp_permutation (vec_info
*vinfo
, gimple_stmt_iterator
*gsi
,
11013 slp_tree node
, stmt_vector_for_cost
*cost_vec
)
11015 tree vectype
= SLP_TREE_VECTYPE (node
);
11016 lane_permutation_t
&perm
= SLP_TREE_LANE_PERMUTATION (node
);
11017 int nperms
= vectorizable_slp_permutation_1 (vinfo
, gsi
, node
, perm
,
11018 SLP_TREE_CHILDREN (node
),
11019 dump_enabled_p ());
11024 record_stmt_cost (cost_vec
, nperms
, vec_perm
, node
, vectype
, 0, vect_body
);

/* Vectorize SLP NODE.  */

static void
vect_schedule_slp_node (vec_info *vinfo,
			slp_tree node, slp_instance instance)
{
  gimple_stmt_iterator si;
  int i;
  slp_tree child;

  /* Vectorize externals and constants.  */
  if (SLP_TREE_DEF_TYPE (node) == vect_constant_def
      || SLP_TREE_DEF_TYPE (node) == vect_external_def)
    {
      /* ??? vectorizable_shift can end up using a scalar operand which is
	 currently denoted as !SLP_TREE_VECTYPE.  No need to vectorize the
	 node in this case.  */
      if (!SLP_TREE_VECTYPE (node))
	return;

      /* There are two reasons vector defs might already exist.  The first
	 is that we are vectorizing an existing vector def.  The second is
	 when performing BB vectorization shared constant/external nodes
	 are not split apart during partitioning so during the code-gen
	 DFS walk we can end up visiting them twice.  */
      if (! SLP_TREE_VEC_DEFS (node).exists ())
	vect_create_constant_vectors (vinfo, node);
      return;
    }

  gcc_assert (SLP_TREE_VEC_DEFS (node).is_empty ());

  stmt_vec_info stmt_info = SLP_TREE_REPRESENTATIVE (node);

  gcc_assert (SLP_TREE_NUMBER_OF_VEC_STMTS (node) != 0);
  SLP_TREE_VEC_DEFS (node).create (SLP_TREE_NUMBER_OF_VEC_STMTS (node));

  if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
      && STMT_VINFO_DATA_REF (stmt_info))
    {
      /* Vectorized loads go before the first scalar load to make it
	 ready early, vectorized stores go before the last scalar
	 stmt which is where all uses are ready.  */
      stmt_vec_info last_stmt_info = NULL;
      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
	last_stmt_info = vect_find_first_scalar_stmt_in_slp (node);
      else /* DR_IS_WRITE */
	last_stmt_info = vect_find_last_scalar_stmt_in_slp (node);
      si = gsi_for_stmt (last_stmt_info->stmt);
    }
  else if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
	   && (STMT_VINFO_TYPE (stmt_info) == cycle_phi_info_type
	       || STMT_VINFO_TYPE (stmt_info) == induc_vec_info_type
	       || STMT_VINFO_TYPE (stmt_info) == phi_info_type))
    {
      /* For PHI node vectorization we do not use the insertion iterator.  */
    }
  else
    {
      /* Emit other stmts after the children vectorized defs which is
	 earliest possible.  */
      gimple *last_stmt = NULL;
      bool seen_vector_def = false;
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
	if (SLP_TREE_DEF_TYPE (child) == vect_internal_def)
	  {
	    /* For fold-left reductions we are retaining the scalar
	       reduction PHI but we still have SLP_TREE_NUM_VEC_STMTS
	       set so the representation isn't perfect.  Resort to the
	       last scalar def here.  */
	    if (SLP_TREE_VEC_DEFS (child).is_empty ())
	      {
		gcc_assert (STMT_VINFO_TYPE (SLP_TREE_REPRESENTATIVE (child))
			    == cycle_phi_info_type);
		gphi *phi = as_a <gphi *>
		  (vect_find_last_scalar_stmt_in_slp (child)->stmt);
		if (!last_stmt
		    || vect_stmt_dominates_stmt_p (last_stmt, phi))
		  last_stmt = phi;
	      }
	    /* We are emitting all vectorized stmts in the same place and
	       the last one is the last.
	       ??? Unless we have a load permutation applied and that
	       figures to re-use an earlier generated load.  */
	    else
	      {
		tree vdef;
		unsigned j;
		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		  {
		    gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		    if (!last_stmt
			|| vect_stmt_dominates_stmt_p (last_stmt, vstmt))
		      last_stmt = vstmt;
		  }
	      }
	  }
	else if (!SLP_TREE_VECTYPE (child))
	  {
	    /* For externals we use unvectorized at all scalar defs.  */
	    tree def;
	    unsigned j;
	    FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_OPS (child), j, def)
	      if (TREE_CODE (def) == SSA_NAME
		  && !SSA_NAME_IS_DEFAULT_DEF (def))
		{
		  gimple *stmt = SSA_NAME_DEF_STMT (def);
		  if (!last_stmt
		      || vect_stmt_dominates_stmt_p (last_stmt, stmt))
		    last_stmt = stmt;
		}
	  }
	else
	  {
	    /* For externals we have to look at all defs since their
	       insertion place is decided per vector.  But beware
	       of pre-existing vectors where we need to make sure
	       we do not insert before the region boundary.  */
	    if (SLP_TREE_SCALAR_OPS (child).is_empty ()
		&& !vinfo->lookup_def (SLP_TREE_VEC_DEFS (child)[0]))
	      seen_vector_def = true;
	    else
	      {
		tree vdef;
		unsigned j;
		FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (child), j, vdef)
		  if (TREE_CODE (vdef) == SSA_NAME
		      && !SSA_NAME_IS_DEFAULT_DEF (vdef))
		    {
		      gimple *vstmt = SSA_NAME_DEF_STMT (vdef);
		      if (!last_stmt
			  || vect_stmt_dominates_stmt_p (last_stmt, vstmt))
			last_stmt = vstmt;
		    }
	      }
	  }
      /* This can happen when all children are pre-existing vectors or
	 constants.  */
      if (!last_stmt)
	last_stmt = vect_find_first_scalar_stmt_in_slp (node)->stmt;
      if (!last_stmt)
	{
	  gcc_assert (seen_vector_def);
	  si = gsi_after_labels (vinfo->bbs[0]);
	}
      else if (is_ctrl_altering_stmt (last_stmt))
	{
	  /* We split regions to vectorize at control altering stmts
	     with a definition so this must be an external which
	     we can insert at the start of the region.  */
	  si = gsi_after_labels (vinfo->bbs[0]);
	}
      else if (is_a <bb_vec_info> (vinfo)
	       && SLP_TREE_CODE (node) != VEC_PERM_EXPR
	       && gimple_bb (last_stmt) != gimple_bb (stmt_info->stmt)
	       && gimple_could_trap_p (stmt_info->stmt))
	{
	  /* We've constrained possibly trapping operations to all come
	     from the same basic-block, if vectorized defs would allow earlier
	     scheduling still force vectorized stmts to the original block.
	     This is only necessary for BB vectorization since for loop vect
	     all operations are in a single BB and scalar stmt based
	     placement doesn't play well with epilogue vectorization.  */
	  gcc_assert (dominated_by_p (CDI_DOMINATORS,
				      gimple_bb (stmt_info->stmt),
				      gimple_bb (last_stmt)));
	  si = gsi_after_labels (gimple_bb (stmt_info->stmt));
	}
      else if (is_a <gphi *> (last_stmt))
	si = gsi_after_labels (gimple_bb (last_stmt));
      else
	{
	  si = gsi_for_stmt (last_stmt);
	  gsi_next (&si);

	  /* Avoid scheduling internal defs outside of the loop when
	     we might have only implicitly tracked loop mask/len defs.  */
	  if (auto loop_vinfo = dyn_cast <loop_vec_info> (vinfo))
	    if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
		|| LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
	      {
		gimple_stmt_iterator si2
		  = gsi_after_labels (LOOP_VINFO_LOOP (loop_vinfo)->header);
		if ((gsi_end_p (si2)
		     && (LOOP_VINFO_LOOP (loop_vinfo)->header
			 != gimple_bb (last_stmt))
		     && dominated_by_p (CDI_DOMINATORS,
					LOOP_VINFO_LOOP (loop_vinfo)->header,
					gimple_bb (last_stmt)))
		    || (!gsi_end_p (si2)
			&& last_stmt != *si2
			&& vect_stmt_dominates_stmt_p (last_stmt, *si2)))
		  si = si2;
	      }
	}
    }

  /* Handle purely internal nodes.  */
  if (SLP_TREE_CODE (node) == VEC_PERM_EXPR)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "------>vectorizing SLP permutation node\n");
      /* ??? the transform kind is stored to STMT_VINFO_TYPE which might
	 be shared with different SLP nodes (but usually it's the same
	 operation apart from the case the stmt is only there for denoting
	 the actual scalar lane defs ...).  So do not call vect_transform_stmt
	 but open-code it here (partly).  */
      bool done = vectorizable_slp_permutation (vinfo, &si, node, NULL);
      gcc_assert (done);
      stmt_vec_info slp_stmt_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, slp_stmt_info)
	if (slp_stmt_info && STMT_VINFO_LIVE_P (slp_stmt_info))
	  {
	    done = vectorizable_live_operation (vinfo, slp_stmt_info, node,
						instance, i, true, NULL);
	    gcc_assert (done);
	  }
    }
  else
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "------>vectorizing SLP node starting from: %G",
			 stmt_info->stmt);
      vect_transform_stmt (vinfo, stmt_info, &si, node, instance);
    }
}
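
/* For illustration, the insertion point chosen above is:
     - for loads, before the first scalar load of the group, so the
       vector load is ready as early as possible;
     - for stores, before the last scalar store, where all stored
       values are known to be computed;
     - for PHIs, no insertion iterator is used at all;
     - otherwise, right after the latest def of any child, the earliest
       point at which all vector operands exist.
   For a hypothetical basic-block group
       a[0] = x + y;  a[1] = z + w;
   the vector add is emitted after the last def among x, y, z and w, and
   the vector store immediately before the scalar store to a[1].  */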

/* Replace scalar calls from SLP node NODE with setting of their lhs to zero.
   For loop vectorization this is done in vectorizable_call, but for SLP
   it needs to be deferred until end of vect_schedule_slp, because multiple
   SLP instances may refer to the same scalar stmt.  */

static void
vect_remove_slp_scalar_calls (vec_info *vinfo,
			      slp_tree node, hash_set<slp_tree> &visited)
{
  gimple *new_stmt;
  gimple_stmt_iterator gsi;
  int i;
  slp_tree child;
  tree lhs;
  stmt_vec_info stmt_info;

  if (!node || SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    return;

  if (visited.add (node))
    return;

  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    vect_remove_slp_scalar_calls (vinfo, child, visited);

  FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (node), i, stmt_info)
    {
      if (!stmt_info)
	continue;
      gcall *stmt = dyn_cast <gcall *> (stmt_info->stmt);
      if (!stmt || gimple_bb (stmt) == NULL)
	continue;
      if (is_pattern_stmt_p (stmt_info)
	  || !PURE_SLP_STMT (stmt_info))
	continue;
      lhs = gimple_call_lhs (stmt);
      if (lhs)
	new_stmt = gimple_build_assign (lhs, build_zero_cst (TREE_TYPE (lhs)));
      else
	{
	  new_stmt = gimple_build_nop ();
	  unlink_stmt_vdef (stmt_info->stmt);
	}
      gsi = gsi_for_stmt (stmt);
      vinfo->replace_stmt (&gsi, stmt_info, new_stmt);
      if (lhs)
	SSA_NAME_DEF_STMT (lhs) = new_stmt;
    }
}

static void
vect_remove_slp_scalar_calls (vec_info *vinfo, slp_tree node)
{
  hash_set<slp_tree> visited;
  vect_remove_slp_scalar_calls (vinfo, node, visited);
}
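
/* For illustration: for a fully vectorized SLP group containing the
   scalar call
       x_1 = sqrtf (a_2);
   the call is rewritten above to
       x_1 = 0.0;
   (or to a GIMPLE_NOP if it has no lhs), mirroring what vectorizable_call
   does during loop vectorization; the zeroed lhs is expected to be dead
   and to be cleaned up by later DCE.  */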

/* Vectorize the instance root.  */

static void
vectorize_slp_instance_root_stmt (vec_info *vinfo, slp_tree node, slp_instance instance)
{
  gassign *rstmt = NULL;

  if (instance->kind == slp_inst_kind_ctor)
    {
      if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) == 1)
	{
	  tree vect_lhs = SLP_TREE_VEC_DEFS (node)[0];
	  tree root_lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  if (!useless_type_conversion_p (TREE_TYPE (root_lhs),
					  TREE_TYPE (vect_lhs)))
	    vect_lhs = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (root_lhs),
			       vect_lhs);
	  rstmt = gimple_build_assign (root_lhs, vect_lhs);
	}
      else if (SLP_TREE_NUMBER_OF_VEC_STMTS (node) > 1)
	{
	  int nelts = SLP_TREE_NUMBER_OF_VEC_STMTS (node);
	  tree child_def;
	  int j;
	  vec<constructor_elt, va_gc> *v;
	  vec_alloc (v, nelts);

	  /* A CTOR can handle V16HI composition from VNx8HI so we
	     do not need to convert vector elements if the types
	     are compatible.  */
	  FOR_EACH_VEC_ELT (SLP_TREE_VEC_DEFS (node), j, child_def)
	    CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, child_def);
	  tree lhs = gimple_get_lhs (instance->root_stmts[0]->stmt);
	  tree rtype
	    = TREE_TYPE (gimple_assign_rhs1 (instance->root_stmts[0]->stmt));
	  tree r_constructor = build_constructor (rtype, v);
	  rstmt = gimple_build_assign (lhs, r_constructor);
	}
    }
  else if (instance->kind == slp_inst_kind_bb_reduc)
    {
      /* Largely inspired by reduction chain epilogue handling in
	 vect_create_epilog_for_reduction.  */
      vec<tree> vec_defs = vNULL;
      vect_get_slp_defs (node, &vec_defs);
      enum tree_code reduc_code
	= gimple_assign_rhs_code (instance->root_stmts[0]->stmt);
      /* ??? We actually have to reflect signs somewhere.  */
      if (reduc_code == MINUS_EXPR)
	reduc_code = PLUS_EXPR;
      gimple_seq epilogue = NULL;
      /* We may end up with more than one vector result, reduce them
	 to one vector.  */
      tree vec_def = vec_defs[0];
      tree vectype = TREE_TYPE (vec_def);
      tree compute_vectype = vectype;
      bool pun_for_overflow_p = (ANY_INTEGRAL_TYPE_P (vectype)
				 && TYPE_OVERFLOW_UNDEFINED (vectype)
				 && operation_can_overflow (reduc_code));
      if (pun_for_overflow_p)
	{
	  compute_vectype = unsigned_type_for (vectype);
	  vec_def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				  compute_vectype, vec_def);
	}
      for (unsigned i = 1; i < vec_defs.length (); ++i)
	{
	  tree def = vec_defs[i];
	  if (pun_for_overflow_p)
	    def = gimple_build (&epilogue, VIEW_CONVERT_EXPR,
				compute_vectype, def);
	  vec_def = gimple_build (&epilogue, reduc_code, compute_vectype,
				  vec_def, def);
	}
      vec_defs.release ();
      /* ??? Support other schemes than direct internal fn.  */
      internal_fn reduc_fn;
      if (!reduction_fn_for_scalar_code (reduc_code, &reduc_fn)
	  || reduc_fn == IFN_LAST)
	gcc_unreachable ();
      tree scalar_def = gimple_build (&epilogue, as_combined_fn (reduc_fn),
				      TREE_TYPE (compute_vectype), vec_def);
      if (!SLP_INSTANCE_REMAIN_DEFS (instance).is_empty ())
	{
	  tree rem_def = NULL_TREE;
	  for (auto def : SLP_INSTANCE_REMAIN_DEFS (instance))
	    {
	      def = gimple_convert (&epilogue, TREE_TYPE (scalar_def), def);
	      if (!rem_def)
		rem_def = def;
	      else
		rem_def = gimple_build (&epilogue, reduc_code,
					TREE_TYPE (scalar_def),
					rem_def, def);
	    }
	  scalar_def = gimple_build (&epilogue, reduc_code,
				     TREE_TYPE (scalar_def),
				     scalar_def, rem_def);
	}
      scalar_def = gimple_convert (&epilogue,
				   TREE_TYPE (vectype), scalar_def);
      gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
      gsi_insert_seq_before (&rgsi, epilogue, GSI_SAME_STMT);
      gimple_assign_set_rhs_from_tree (&rgsi, scalar_def);
      update_stmt (gsi_stmt (rgsi));
      return;
    }
  else if (instance->kind == slp_inst_kind_gcond)
    {
      /* Only support a single root for now as we can't codegen CFG yet and so we
	 can't support lane > 1 at this time.  */
      gcc_assert (instance->root_stmts.length () == 1);
      auto root_stmt_info = instance->root_stmts[0];
      auto last_stmt = STMT_VINFO_STMT (vect_orig_stmt (root_stmt_info));
      gimple_stmt_iterator rgsi = gsi_for_stmt (last_stmt);
      gimple *vec_stmt = NULL;
      gcc_assert (!SLP_TREE_VEC_DEFS (node).is_empty ());
      bool res = vectorizable_early_exit (vinfo, root_stmt_info, &rgsi,
					  &vec_stmt, node, NULL);
      gcc_assert (res);
      return;
    }
  else
    gcc_unreachable ();

  gcc_assert (rstmt);

  gimple_stmt_iterator rgsi = gsi_for_stmt (instance->root_stmts[0]->stmt);
  gsi_replace (&rgsi, rstmt, true);
}
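
/* For illustration of the slp_inst_kind_ctor handling above: for a root
   statement
       _5 = {_1, _2, _3, _4};
   covered by a single SLP vector def (call it vd_6, a hypothetical name)
   the CONSTRUCTOR is simply replaced by
       _5 = vd_6;
   with a VIEW_CONVERT_EXPR inserted if the vector types differ, while
   with several vector defs a new CONSTRUCTOR of those defs is built and
   assigned instead.  */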

struct slp_scc_info
{
  bool on_stack;
  int dfs;
  int lowlink;
};

/* Schedule the SLP INSTANCE doing a DFS walk and collecting SCCs.  */

static void
vect_schedule_scc (vec_info *vinfo, slp_tree node, slp_instance instance,
		   hash_map<slp_tree, slp_scc_info> &scc_info,
		   int &maxdfs, vec<slp_tree> &stack)
{
  bool existed_p;
  slp_scc_info *info = &scc_info.get_or_insert (node, &existed_p);
  gcc_assert (!existed_p);
  info->dfs = maxdfs;
  info->lowlink = maxdfs;
  maxdfs++;

  /* Leaf.  */
  if (SLP_TREE_DEF_TYPE (node) != vect_internal_def)
    {
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      return;
    }

  info->on_stack = true;
  stack.safe_push (node);

  unsigned i;
  slp_tree child;
  /* DFS recurse.  */
  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (node), i, child)
    {
      if (!child)
	continue;
      slp_scc_info *child_info = scc_info.get (child);
      if (!child_info)
	{
	  vect_schedule_scc (vinfo, child, instance, scc_info, maxdfs, stack);
	  /* Recursion might have re-allocated the node.  */
	  info = scc_info.get (node);
	  child_info = scc_info.get (child);
	  info->lowlink = MIN (info->lowlink, child_info->lowlink);
	}
      else if (child_info->on_stack)
	info->lowlink = MIN (info->lowlink, child_info->dfs);
    }
  if (info->lowlink != info->dfs)
    return;

  auto_vec<slp_tree, 4> phis_to_fixup;

  /* Singleton.  */
  if (stack.last () == node)
    {
      stack.pop ();
      info->on_stack = false;
      vect_schedule_slp_node (vinfo, node, instance);
      if (SLP_TREE_CODE (node) != VEC_PERM_EXPR
	  && is_a <gphi *> (SLP_TREE_REPRESENTATIVE (node)->stmt))
	phis_to_fixup.quick_push (node);
    }
  else
    {
      /* SCC.  */
      int last_idx = stack.length () - 1;
      while (stack[last_idx] != node)
	last_idx--;
      /* We can break the cycle at PHIs who have at least one child
	 code generated.  Then we could re-start the DFS walk until
	 all nodes in the SCC are covered (we might have new entries
	 for only back-reachable nodes).  But it's simpler to just
	 iterate and schedule those that are ready.  */
      unsigned todo = stack.length () - last_idx;
      do
	{
	  for (int idx = stack.length () - 1; idx >= last_idx; --idx)
	    {
	      slp_tree entry = stack[idx];
	      if (!scc_info.get (entry)->on_stack)
		continue;
	      bool phi = (SLP_TREE_CODE (entry) != VEC_PERM_EXPR
			  && is_a <gphi *>
			       (SLP_TREE_REPRESENTATIVE (entry)->stmt));
	      bool ready = !phi;
	      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (entry), i, child)
		{
		  if (!child)
		    ;
		  else if (scc_info.get (child)->on_stack)
		    {
		      if (!phi)
			{
			  ready = false;
			  break;
			}
		    }
		  else
		    {
		      if (phi)
			{
			  ready = true;
			  break;
			}
		    }
		}
	      if (ready)
		{
		  vect_schedule_slp_node (vinfo, entry, instance);
		  scc_info.get (entry)->on_stack = false;
		  --todo;
		  if (phi)
		    phis_to_fixup.safe_push (entry);
		}
	    }
	}
      while (todo != 0);
      stack.truncate (last_idx);
    }

  /* Now fixup the backedge def of the vectorized PHIs in this SCC.  */
  slp_tree phi_node;
  FOR_EACH_VEC_ELT (phis_to_fixup, i, phi_node)
    {
      gphi *phi = as_a <gphi *> (SLP_TREE_REPRESENTATIVE (phi_node)->stmt);
      edge_iterator ei;
      edge e;
      FOR_EACH_EDGE (e, ei, gimple_bb (phi)->preds)
	{
	  unsigned dest_idx = e->dest_idx;
	  child = SLP_TREE_CHILDREN (phi_node)[dest_idx];
	  if (!child || SLP_TREE_DEF_TYPE (child) != vect_internal_def)
	    continue;
	  unsigned n = SLP_TREE_VEC_DEFS (phi_node).length ();
	  /* Simply fill all args.  */
	  if (STMT_VINFO_DEF_TYPE (SLP_TREE_REPRESENTATIVE (phi_node))
	      != vect_first_order_recurrence)
	    for (unsigned i = 0; i < n; ++i)
	      {
		tree phidef = SLP_TREE_VEC_DEFS (phi_node)[i];
		gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
		add_phi_arg (phi, vect_get_slp_vect_def (child, i),
			     e, gimple_phi_arg_location (phi, dest_idx));
	      }
	  else
	    {
	      /* Unless it is a first order recurrence which needs
		 args filled in for both the PHI node and the permutes.  */
	      gimple *perm
		= SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[0]);
	      gimple *rphi = SSA_NAME_DEF_STMT (gimple_assign_rhs1 (perm));
	      add_phi_arg (as_a <gphi *> (rphi),
			   vect_get_slp_vect_def (child, n - 1),
			   e, gimple_phi_arg_location (phi, dest_idx));
	      for (unsigned i = 0; i < n; ++i)
		{
		  gimple *perm
		    = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (phi_node)[i]);
		  if (i > 0)
		    gimple_assign_set_rhs1 (perm,
					    vect_get_slp_vect_def (child,
								   i - 1));
		  gimple_assign_set_rhs2 (perm,
					  vect_get_slp_vect_def (child, i));
		  update_stmt (perm);
		}
	    }
	}
    }
}
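
/* The walk above is essentially Tarjan's SCC algorithm: dfs/lowlink are
   assigned in preorder, lowlink is propagated from the children, and a
   node whose lowlink equals its own dfs number closes an SCC on the
   stack.  A singleton SCC is scheduled directly; a larger SCC (a cycle
   through PHIs) is scheduled by repeatedly emitting those members whose
   already-scheduled children make them ready, and the PHI backedge
   arguments are filled in afterwards from the vectorized defs.  */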

/* Generate vector code for SLP_INSTANCES in the loop/basic block.  */

void
vect_schedule_slp (vec_info *vinfo, const vec<slp_instance> &slp_instances)
{
  slp_instance instance;
  unsigned int i;

  hash_map<slp_tree, slp_scc_info> scc_info;
  int maxdfs = 0;
  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree node = SLP_INSTANCE_TREE (instance);
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Vectorizing SLP tree:\n");
	  /* ??? Dump all?  */
	  if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	    dump_printf_loc (MSG_NOTE, vect_location, "Root stmt: %G",
			     SLP_INSTANCE_ROOT_STMTS (instance)[0]->stmt);
	  vect_print_slp_graph (MSG_NOTE, vect_location,
				SLP_INSTANCE_TREE (instance));
	}
      /* Schedule the tree of INSTANCE, scheduling SCCs in a way to
	 have a PHI be the node breaking the cycle.  */
      auto_vec<slp_tree> stack;
      if (!scc_info.get (node))
	vect_schedule_scc (vinfo, node, instance, scc_info, maxdfs, stack);

      if (!SLP_INSTANCE_ROOT_STMTS (instance).is_empty ())
	vectorize_slp_instance_root_stmt (vinfo, node, instance);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizing stmts using SLP.\n");
    }

  FOR_EACH_VEC_ELT (slp_instances, i, instance)
    {
      slp_tree root = SLP_INSTANCE_TREE (instance);
      stmt_vec_info store_info;
      unsigned int j;

      /* Remove scalar call stmts.  Do not do this for basic-block
	 vectorization as not all uses may be vectorized.
	 ??? Why should this be necessary?  DCE should be able to
	 remove the stmts itself.
	 ??? For BB vectorization we can as well remove scalar
	 stmts starting from the SLP tree root if they have no
	 uses.  */
      if (is_a <loop_vec_info> (vinfo))
	vect_remove_slp_scalar_calls (vinfo, root);

      /* Remove vectorized stores original scalar stmts.  */
      for (j = 0; SLP_TREE_SCALAR_STMTS (root).iterate (j, &store_info); j++)
	{
	  if (!STMT_VINFO_DATA_REF (store_info)
	      || !DR_IS_WRITE (STMT_VINFO_DATA_REF (store_info)))
	    continue;

	  store_info = vect_orig_stmt (store_info);
	  /* Free the attached stmt_vec_info and remove the stmt.  */
	  vinfo->remove_stmt (store_info);

	  /* Invalidate SLP_TREE_REPRESENTATIVE in case we released it
	     to not crash in vect_free_slp_tree later.  */
	  if (SLP_TREE_REPRESENTATIVE (root) == store_info)
	    SLP_TREE_REPRESENTATIVE (root) = NULL;